In [194]:
# Standard library imports
from datetime import datetime, timedelta
from pathlib import Path
from zipfile import ZipFile, is_zipfile

# Related third party imports
import colorcet as cc
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bokeh.models import FixedTicker
from IPython.display import clear_output
from pandas.plotting import scatter_matrix, autocorrelation_plot
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer,
    QuantileTransformer,
    PowerTransformer,
)
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, pacf
from tqdm.notebook import tqdm

clear_output()

In [195]:
# Warning settings
warnings.filterwarnings("ignore", category=FutureWarning)

Initial code block which main task is to unzip the contents of the compressed file.

In [196]:
# Define the path to the zipfile
zip_path = Path("data/ultimate_challenge_3.zip")


# Check if the zipfile exists with a function
def check_zipfile(zip_path: Path) -> bool:
    """Check if the zipfile exists at the given path."""
    return zip_path.exists() and is_zipfile(zip_path)


# Create a function to create a new directory
def create_target_directory(target_dir: Path):
    """Creates a new_directory and returns it."""
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir


def extract_zipfile(zip_path: Path, target_dir: Path):
    """Extracts the contents of zip at zip_path to target_dir."""
    with ZipFile(zip_path, "r") as zip_ref:
        # get a list of all the files in the zipfile
        for file in zip_ref.namelist():
            # if the file does not exist in the target directory, extract it
            if not (target_dir / file).exists():
                zip_ref.extract(file, target_dir)
        print(f"Extracted {zip_path} to {target_dir}")


# use the functions to extract the zipfile
if check_zipfile(zip_path):
    target_dir = zip_path.parent / zip_path.stem
    create_target_directory(target_dir)
    extract_zipfile(zip_path, target_dir)

Extracted data\ultimate_challenge_3.zip to data\ultimate_challenge_3


### Part 1: Exploratory Data Analysis

The attached `logins.json` file contains (simulated) timestamps of user logins in a particular geographic location.
- Aggregate these login counts based on 15minute time intervals, and 
- visualize and describe the resulting time series of login counts in ways that best characterize the underlying patterns of the demand. 

Please report/illustrate important features of the demand, such as daily cycles. If there are data quality issues, please report them.

In [197]:
# Exploratory Data Analysis
logins_path = "data/ultimate_challenge_3/ultimate_challenge/logins.json"

logins_df = pd.read_json(logins_path)
print(
    f"Shape of the dataframe: {logins_df.shape[0]} rows and {logins_df.shape[1]} columns"
)
logins_df.head(10)

Shape of the dataframe: 93142 rows and 1 columns


Unnamed: 0,login_time
0,1970-01-01 20:13:18
1,1970-01-01 20:16:10
2,1970-01-01 20:16:37
3,1970-01-01 20:16:36
4,1970-01-01 20:26:21
5,1970-01-01 20:21:41
6,1970-01-01 20:12:16
7,1970-01-01 20:35:47
8,1970-01-01 20:35:38
9,1970-01-01 20:47:52


In [198]:
logins_df.describe()

Unnamed: 0,login_time
count,93142
mean,1970-02-26 19:09:39.779648278
min,1970-01-01 20:12:16
25%,1970-02-04 10:37:19.750000
50%,1970-03-01 06:33:05.500000
75%,1970-03-22 04:01:10.750000
max,1970-04-13 18:57:38


In [199]:
logins_copy = logins_df.copy()

Series not in perfect chronological order but is generally it is . 

In [200]:
# settings so that scroll is not active for the plot
active_opts = dict(active_tools=["box_zoom"])

logins_copy = logins_copy.reset_index().rename(
    columns={"index": "original_index"})

logins_copy["is_duplicate"] = logins_copy.duplicated(
    subset="login_time", keep=False)

logins_copy["hour"] = logins_copy["login_time"].dt.hour
logins_copy["week"] = logins_copy["login_time"].dt.isocalendar().week

# get the duplicate login times
duplicated_logins = logins_copy.loc[logins_copy["is_duplicate"] == True]

In [201]:
# create a hook for the xaxis lables to be [0, 6, 12, 18]
def hours_hook(plot, element):
    plot.state.xaxis.ticker = FixedTicker(ticks=list(range(0, 24, 6)))

In [202]:
def create_scatter_hist(df, x, y, xbins=16, ybins=24):
    """Create a scatter plot and two histograms for the given DataFrame and columns."""
    # opts to prevent the scroll from being active
    active_options = dict(active_tools=["box_zoom"])
    # Create a scatter plot
    scatter = df.hvplot.scatter(
        x=x, y=y, width=800, height=400, xlabel="", ylabel="", alpha=0.5
    ).opts(**active_options)

    # Create a histogram of the 'hour' column
    yhist = df.hvplot.hist(y=y, width=200, height=400, bins=ybins).opts(
        **active_options
    )

    # Create a histogram of the 'login_time' column
    xhist = df.hvplot.hist(y=x, width=800, height=200, xlabel="", bins=xbins).opts(
        **active_options
    )

    # Combine the plots
    plot = scatter << yhist << xhist

    return plot

In [203]:
create_scatter_hist(duplicated_logins, "login_time", "hour").opts(
    title="Duplicated Logins Distribution"
)

In [204]:
create_scatter_hist(logins_copy, "login_time", "hour").opts(
    title="Logins Distribution")

There are 2 hous in the dataset where no logins happened. 
- 1970-01-08 at 3pm
- 1970-02-19 as 8pm.

There is a general trend upwards for the number of logins each week.

In [205]:
# Aggregate the login on 15 minutes interval

logins_15m_count = (
    logins_copy[["login_time", "original_index"]]
    .set_index("login_time")
    .resample("15min")
    .count()
    .rename(columns={"original_index": "logins_15m_count"})
)
# Confirm with the sum of the logins
print(f"Should be 93142\n{logins_15m_count.sum()=}")
display(logins_15m_count.head())
# Do a plot of the logcounts at 15 minutes interval
logins_15m_count.hvplot.hist(
    height=400,
    width=800,
    title="Logins Count every 15 minutes",
    ylabel="",
).opts(active_tools=["box_zoom"])

Should be 93142
logins_15m_count.sum()=logins_15m_count    93142
dtype: int64


Unnamed: 0_level_0,logins_15m_count
login_time,Unnamed: 1_level_1
1970-01-01 20:00:00,2
1970-01-01 20:15:00,6
1970-01-01 20:30:00,9
1970-01-01 20:45:00,7
1970-01-01 21:00:00,1


The frequency for the number of logins every 15 mins is right-skewed, with the mean greater then the median.

In [206]:
# Create the KDE plot
kde_15m = logins_15m_count["logins_15m_count"].hvplot(
    kind="kde",
    xlim=(0, None),
    ylabel="",
    xlabel="Logins Count at 15 min Intervals",
    title="Logins Kernel Density Plot",
)
mean_value = logins_15m_count["logins_15m_count"].mean()
median_value = logins_15m_count["logins_15m_count"].median()

# Create the vertical lines
mean_line = hv.VLine(mean_value).opts(color="red")
median_line = hv.VLine(median_value).opts(color="green")

# Create the legend
mean_legend = hv.Text(mean_value, 0, " Mean", halign="left", valign="bottom").opts(
    color="red"
)
median_legend = hv.Text(
    median_value, 0, " Median", halign="right", valign="bottom"
).opts(color="green")

# Combine everything
(kde_15m * mean_line * median_line * mean_legend * median_legend).opts(
    active_tools=["box_zoom"]
)

The demand appears seasonal with a weekly frequency. The highest number of logins occurs on the weekends (both Saturday and Sunday).

In [207]:
login_daily = (
    logins_copy[["login_time", "original_index"]]
    .set_index("login_time")
    .resample("D")
    .count()
    .rename(columns={"original_index": "logins_daily_count"})
)
login_daily["is_weekend"] = "blue"
login_daily.loc[login_daily.index.dayofweek >= 5, "is_weekend"] = "red"

display(login_daily.head(3))


login_daily.hvplot() * login_daily.hvplot.scatter(
    title="Daily Logins with Weekends Highlighted",
    color=login_daily["is_weekend"],
).opts(active_tools=["box_zoom"])

Unnamed: 0_level_0,logins_daily_count,is_weekend
login_time,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-01-01,112,blue
1970-01-02,681,blue
1970-01-03,793,red


In [208]:
logins_15m_count["week"] = logins_15m_count.index.isocalendar().week
logins_15m_count["weekday"] = logins_15m_count.index.weekday
logins_15m_count["hour"] = logins_15m_count.index.hour
logins_15m_count["day"] = logins_15m_count.index.day

In [209]:
weekday_dict = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}
logins_15m_count["day_of_week"] = logins_15m_count["weekday"].map(weekday_dict)
weekday_counts = (
    logins_15m_count.groupby(["weekday"], observed=True)["logins_15m_count"]
    .sum()
    .reset_index()
)
weekday_counts["weekday"] = weekday_counts["weekday"].map(weekday_dict)
display(weekday_counts.head(3))
weekday_counts.hvplot.bar(
    x="weekday",
    y="logins_15m_count",
    title="Total Logins Count by Weekday",
    ylabel="",
    xlabel="",
    height=400,
    width=800,
).opts(active_tools=["box_zoom"])

Unnamed: 0,weekday,logins_15m_count
0,Monday,8823
1,Tuesday,9268
2,Wednesday,10339


The hourly profile looks significantly different for the weekday versus the weekend. 

During the week
- there is a gentle undulation with peaks around 11am and 10pm, 
- a minimum point at 6am which is near 0 near zero.
- there is less variance during the week


On the weekends, 
- there is alternations in the number of logins
-  but between 12am and 4am, there is higher login traffic than at any other hours of the day. This is a gradual increase.
-  there is s steep dropoff after 4am.

In [210]:
# Separate weekdays and weekends
weekdays_login = logins_15m_count[logins_15m_count["weekday"] < 5]
weekend_login = logins_15m_count[logins_15m_count["weekday"] >= 5]

# Plot of the weekday days mean aggregation by the hour
weekday_avg = weekdays_login.groupby("hour")["logins_15m_count"].mean()

# Plot of the weekend days mean aggregation by the hour
weekend_avg = weekend_login.groupby("hour")["logins_15m_count"].mean()

# Plot of each day of the week faded out
daily_plots = (
    logins_15m_count.groupby(["weekday", "hour"])["logins_15m_count"]
    .mean()
    .unstack(level=0)
    .rename(columns=weekday_dict)
    .hvplot(alpha=0.2, color="gray", label="")
)

# Plot the average logins by hour for weekdays and weekends overlaid
(
    weekday_avg.hvplot(label="Weekday")
    * weekend_avg.hvplot(label="Weekend")
    * daily_plots
).opts(
    active_tools=["box_zoom"],
    title="Average Logins by Hour of Day",
    xlabel="Hour of Day",
    hooks=[hours_hook],
)

In [211]:
logins_copy["week"] = logins_copy["login_time"].dt.isocalendar().week
logins_copy["weekday"] = logins_copy["login_time"].dt.weekday
logins_copy["hour"] = logins_copy["login_time"].dt.hour
logins_copy["day"] = logins_copy["login_time"].dt.day
weekday_login = logins_copy.loc[logins_copy["weekday"] < 5]
weekend_login = logins_copy.loc[logins_copy["weekday"] >= 5]

In [212]:
create_scatter_hist(weekday_login, "login_time", "hour").opts(
    title="Weekday Logins Distribution"
)

In [213]:
create_scatter_hist(weekend_login, "login_time", "hour").opts(
    title="Weekend Logins Distribution"
)

In [214]:
day_of_week_hour_long = logins_15m_count.groupby(
    ["weekday", "hour"], as_index=False, observed=True
)["logins_15m_count"].sum()
day_of_week_hour_long["weekday"] = day_of_week_hour_long["weekday"].map(weekday_dict)
day_of_week_hour_long["weekday"] = pd.Categorical(
    day_of_week_hour_long["weekday"], categories=weekday_dict.values(), ordered=True
)
display(day_of_week_hour_long.head(3))
day_of_week_hour_long.hvplot.heatmap(
    x="hour",
    y="weekday",
    C="logins_15m_count",
    title="Logins Count by Hour and Weekday",
    xlabel="Hour of Day",
    ylabel="",
    height=400,
    width=800,
    line_width=2,
).opts(
    active_tools=["box_zoom"],
    color_levels=5,
    hooks=[hours_hook],
)

Unnamed: 0,weekday,hour,logins_15m_count
0,Monday,0,531
1,Monday,1,414
2,Monday,2,312


- The least activity happens between 6am and 10am, no matter the day.
- There is some significant activity that occurs between 10am and 12pm on weekdays, but this pattern does not hold for the weekends. 
- The activity peaks at 4am on both Saturday and Sunday. 

In [215]:
def part_of_day(hour):
    if hour < 6:
        return "Night"
    elif hour < 12:
        return "Morning"
    elif hour < 18:
        return "Afternoon"
    else:
        return "Evening"


day_of_week_hour_long["part_of_day"] = day_of_week_hour_long["hour"].apply(part_of_day)
weekday_part_of_day_group = day_of_week_hour_long.groupby(
    ["weekday", "part_of_day"], as_index=False
)["logins_15m_count"].sum()

# set the order of the part_of_day categories
weekday_part_of_day_group["part_of_day"] = pd.Categorical(
    weekday_part_of_day_group["part_of_day"],
    categories=["Morning", "Afternoon", "Evening", "Night"],
    ordered=True,
)
weekday_part_of_day_pivot = weekday_part_of_day_group.pivot(
    index="weekday", columns="part_of_day", values="logins_15m_count"
)
display(weekday_part_of_day_pivot.head(3))

weekday_part_of_day_pivot.hvplot.heatmap(
    C="logins_15m_count",
    title="Logins Count by Part of Day and Weekday",
    xlabel="Part of Day",
    ylabel="",
    height=400,
    width=600,
    line_width=2,
).opts(active_tools=["box_zoom"], color_levels=7)

part_of_day,Morning,Afternoon,Evening,Night
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,2085,2265,2653,1820
Tuesday,1934,2029,3278,2027
Wednesday,2036,2144,3625,2534


Activity increases for the nights as the week progresses  starting from Monday, with the most activity happening on Saturday and Sunday nights.

There is the lowest activity in the mornings, no matter the day of the week.

In [216]:
# draw a scatterplot of weekday_part_of_day_group with the weekdays(M-F) one color and
# the weekends another color
weekday_part_of_day_group["is_weekend"] = weekday_part_of_day_group["weekday"].isin(
    ["Saturday", "Sunday"]
)
display(weekday_part_of_day_group.head(3))

weekday_part_of_day_group.sort_values(by="part_of_day").hvplot.scatter(
    x="part_of_day",
    y="logins_15m_count",
    c="is_weekend",
    cmap=["blue", "red"],
    title="Logins Count by Part of Day and Weekday",
    xlabel="",
    ylabel="",
    height=400,
    width=600,
    hover_cols=["weekday"],
    alpha=0.6,
    size=100,
).opts(active_tools=["box_zoom"], legend_labels={False: "Weekday", True: "Weekend"})

Unnamed: 0,weekday,part_of_day,logins_15m_count,is_weekend
0,Monday,Afternoon,2265,False
1,Monday,Evening,2653,False
2,Monday,Morning,2085,False


Although the weekend days have higher activity than the weekdays for the afternoons and for the nights parts of day, they have the lowest activities for the morning than any other days

Seasonal decomposition of the time series into 
- trend, 
- seasonality 
- residuals

In [217]:
decom_opts = dict(
    height=200,
    active_tools=["box_zoom"],
    xaxis="bare",
    show_grid=True,
)

# Decompose time series into trend, seasonality, and noise
decomposition = seasonal_decompose(logins_15m_count["logins_15m_count"], period=96)

# Extract components
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Plot components
original_series = logins_15m_count["logins_15m_count"].hvplot().opts(**decom_opts)
trend_series = trend.hvplot().opts(**decom_opts)
seasonal_series = seasonal.hvplot().opts(**decom_opts)
residual_series = residual.hvplot().opts(
    height=200, active_tools=["box_zoom"], show_grid=True
)

# Combine plots in a single column layout
decomposition_layout = (
    (original_series + trend_series + seasonal_series + residual_series)
    .opts(shared_axes=True)
    .cols(1)
)

# Display layout
decomposition_layout

In [218]:
# Calculate the autocorrelation and partial autocorrelation values
acf_values = acf(logins_15m_count["logins_15m_count"], nlags=96)
pacf_values = pacf(logins_15m_count["logins_15m_count"], nlags=96)

# Create DataFrames for the autocorrelation and partial autocorrelation values
acf_df = pd.DataFrame({"lag": range(len(acf_values)), "autocorrelation": acf_values})
pacf_df = pd.DataFrame(
    {"lag": range(len(pacf_values)), "partial_autocorrelation": pacf_values}
)

# Plot the autocorrelation and partial autocorrelation values with a hover tool
acf_plot = acf_df.hvplot.scatter(
    "lag", "autocorrelation", grid=True, hover_cols="all"
).opts(active_tools=["box_zoom"], title="Autocorrelation of Login Counts", ylabel="")
pacf_plot = pacf_df.hvplot.scatter(
    "lag", "partial_autocorrelation", grid=True, hover_cols="all"
).opts(
    active_tools=["box_zoom"],
    title="Partial Autocorrelation of Login Counts",
    ylabel="",
)

# Add a zero line to the plots
x_zero_line = hv.HLine(0).opts(color="black", line_width=1)
acf_plot *= x_zero_line
pacf_plot *= x_zero_line

# Display the plots
hv.Layout(acf_plot + pacf_plot).cols(1)

In [219]:
results = adfuller(logins_15m_count["logins_15m_count"])
print(f"ADF Statistic: {results[0]:.3f}")
print(f"p-value: {results[1]:.3f}")
if results[1] < 0.05:
    print("The time series is stationary")
else:
    print("The time series is not stationary")

for key, value in results[4].items():
    print("Critical Values:")
    print(f"{key}, {value:.3f}")

ADF Statistic: -10.338
p-value: 0.000
The time series is stationary
Critical Values:
1%, -3.431
Critical Values:
5%, -2.862
Critical Values:
10%, -2.567


### Part 2: Experiment and Metrics Design

The neighboring cities of Gotham and Metropolis have complementary circadian rhythms: on
weekdays, Ultimate Gotham is most active at night, and Ultimate Metropolis is most active
during the day. On weekends, there is reasonable activity in both cities.

However, a toll bridge, with a two way toll, between the two cities causes driver partners to tend to be exclusive to each city. The Ultimate managers of city operations for the two cities have proposed an experiment to encourage driver partners to be available in both cities, by reimbursing all toll costs.</br>
1. *What would you choose as the key measure of success of this experiment in encouraging driver partners to serve both cities, and why would you choose this metric?* </br>
**Answer**: </br>
The key measure of success would be the increase in the number of driver partners who cross the bridge (cross-city trip). This is because the goal of the experiment is to encourage driver partners to be available in both cities, and the increase in number of drivers who cross the bridge would be a direct measure of success. 

This is a good metric because it is 
- Specific: directly measures the behavior we want to influence
- Measureable: Can be tracked and quantified 
- Actionable: if we see that there is an increase in cross-city trips suggesting that the reimbursement is effective this can easily be continued or expanded
- Relevant: directly relates to the goal of the experiment

1. Describe a practical experiment you would design to compare the effectiveness of the
proposed change in relation to the key measure of success.

Please provide details on:
- a. *how you will implement the experiment*</br>
**Answer**: </br>
The experiment would be implemented by randomly selecting a group of driver partners from the entire pool off drivers to monitor. The sample should be random as to try to be representative of the population. This group can then be split into two equal groups. One group, the experimental group would be reimbursed for all tolls cost in a hassle free manner. The second group, the control group would continue to operate as usual without any changes. We can then measure and compare the cross-city trips made between both groups.

The experiment would be run for a period of 3 month period (12 months ideally), to avoid any influence of seasonal changes.</br>
- b. *what statistical test(s) you will conduct to verify the significance of the observation*</br>
**Answer**: </br>
*The statistical test that would be conducted to verify the significance of the observation would be a t-test. We would use the T test to compare the means of the two groups and determine if there is a significant difference.* </br>


- c. *how you would interpret the results and provide recommendations to the city operations team along with any caveats.*</br>
**Answer**: </br>
The results would help us understand whether reimbursing all toll costs had a significant effect on the number of cross-city trips. If the P-value is below a certain threshold (e.g. 0.05), we could conclude that the toll reimbursement is effective, and reject the null hypothesis. 

Some caveats:
- correlation does not imply causation. 
- other factors could be influencing the results and the experiment will not be able to prove that the toll reimbursement is the sole and primary cause of any increase in cross-city trips

### Part 3: Predictive Modeling

Ultimate is interested in predicting rider retention. To help explore this question, we have provided a sample dataset of a cohort of users who signed up for an Ultimate account in January 2014. The data was pulled several months later; we consider a user retained if they were “active” (i.e. took a trip) in the preceding 30 days.

We would like you to use this data set to help understand what factors are the best predictors for retention, and offer suggestions to operationalize those insights to help Ultimate.

The data is in the attached file ultimate_data_challenge.json. See below for a detailed description of the dataset. Please include any code you wrote for the analysis and delete the dataset when you have finished with the challenge.
1. Perform any cleaning, exploratory analysis, and/or visualizations to use the provided
data for this analysis (a few sentences/plots describing your approach will suffice). What
fraction of the observed users were retained?
2. Build a predictive model to help Ultimate determine whether or not a user will be active in
their 6th month on the system. Discuss why you chose your approach, what alternatives
you considered, and any concerns you have. How valid is your model? Include any key
indicators of model performance.
3. Briefly discuss how Ultimate might leverage the insights gained from the model to
improve its long term rider retention (again, a few sentences will suffice).

In [220]:
ultimate_data_path = (
    "data/ultimate_challenge_3/ultimate_challenge/ultimate_data_challenge.json"
)
ultimate_data = pd.read_json(ultimate_data_path)
# Show dataframe info
ultimate_data.info()

# show dataframe statistics
ultimate_data.describe(include="all").T.sort_values("unique").infer_objects(
    copy=False
).fillna("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    50000 non-null  object 
 1   trips_in_first_30_days  50000 non-null  int64  
 2   signup_date             50000 non-null  object 
 3   avg_rating_of_driver    41878 non-null  float64
 4   avg_surge               50000 non-null  float64
 5   last_trip_date          50000 non-null  object 
 6   phone                   49604 non-null  object 
 7   surge_pct               50000 non-null  float64
 8   ultimate_black_user     50000 non-null  bool   
 9   weekday_pct             50000 non-null  float64
 10  avg_dist                50000 non-null  float64
 11  avg_rating_by_driver    49799 non-null  float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
phone,49604.0,2.0,iPhone,34582.0,,,,,,,
ultimate_black_user,50000.0,2.0,False,31146.0,,,,,,,
city,50000.0,3.0,Winterfell,23336.0,,,,,,,
signup_date,50000.0,31.0,2014-01-18,2948.0,,,,,,,
last_trip_date,50000.0,182.0,2014-06-29,2036.0,,,,,,,
trips_in_first_30_days,50000.0,,,,2.2782,3.792684,0.0,0.0,1.0,3.0,125.0
avg_rating_of_driver,41878.0,,,,4.601559,0.617338,1.0,4.3,4.9,5.0,5.0
avg_surge,50000.0,,,,1.074764,0.222336,1.0,1.0,1.0,1.05,8.0
surge_pct,50000.0,,,,8.849536,19.958811,0.0,0.0,0.0,8.6,100.0
weekday_pct,50000.0,,,,60.926084,37.081503,0.0,33.3,66.7,100.0,100.0


In [235]:
ultimate_df = ultimate_data.copy()
display(ultimate_df.sample(3))

# display sample of the dataframe
ultimate_df.sample().T

Unnamed: 0,15353
city,Astapor
trips_in_first_30_days,0
signup_date,2014-01-18
avg_rating_of_driver,
avg_surge,1.0
last_trip_date,2014-06-08
phone,iPhone
surge_pct,0.0
ultimate_black_user,True
weekday_pct,0.0


In [236]:
# Convert dates to datetime format
ultimate_df["signup_date"] = pd.to_datetime(
    ultimate_df["signup_date"], format="%Y-%m-%d"
)
ultimate_df["last_trip_date"] = pd.to_datetime(
    ultimate_df["last_trip_date"], format="%Y-%m-%d"
)

print(
    f"Signup Date range :{ultimate_df['signup_date'].min()} - {ultimate_df['signup_date'].max()}"
)
print(
    f"Last Trip range :{ultimate_df['last_trip_date'].min()} - {ultimate_df['last_trip_date'].max()}"
)

Signup Date range :2014-01-01 00:00:00 - 2014-01-31 00:00:00
Last Trip range :2014-01-01 00:00:00 - 2014-07-01 00:00:00


In [256]:
# Identify the last date in the dataset
latest_date = max(ultimate_df["last_trip_date"])
cutoff_date = latest_date - timedelta(days=30)

# Identify retained users
ultimate_df["retained"] = ultimate_df["last_trip_date"].apply(
    lambda date: date >= cutoff_date
)

# Calculate the retention rate
retention_rate = (ultimate_df["retained"].sum() / len(ultimate_df)) * 100
print(f"Fraction of retained users: {retention_rate:.3f}%")

Fraction of retained users: 37.608%


In [275]:
ultimate_df["signup_date"].hvplot.hist(title="Signup Date Counts").opts(
    active_tools=["box_zoom"]
)

In [284]:
count_normalized = ultimate_df["signup_date"].value_counts(normalize=True)
count_normalized = count_normalized.sort_index().reset_index(name="initial_proportions")

In [None]:


signup_last_trip_scatter = ultimate_df.sort_values(by="signup_date").hvplot.scatter(
    x="last_trip_date",
    y="signup_date",
    by="retained",
    cmap=["gray", "green"],
    title="Last Trip Date vs Signup Date",
    size=5,
    height=600,
    width=800,
    legend=False,
)
# add text to the plot to indicate the cutoff date, anf the retained users
retained_text = hv.Text(
    cutoff_date,
    ultimate_df["signup_date"].max(),
    "Retained Users",
    halign="left",
    valign="bottom",
).opts(color="red")

cutoff_line = hv.VLine(cutoff_date).opts(
    color="gray", line_dash="dashed", active_tools=["box_zoom"]
)
signup_last_trip_scatter * cutoff_line * retained_text

In [263]:
# Histogram of the signup dates for retained users
ultimate_df.loc[ultimate_df["retained"] == True, "signup_date"].hvplot.hist(
    title="Signup Date Distribution for Retained Users",
).opts(active_tools=["box_zoom"])

In [288]:
retained_users_count_normalized = (
    ultimate_df.loc[ultimate_df["retained"] == True, "signup_date"]
    .value_counts(normalize=True)
    .sort_index()
)
retained_users_count_normalized = retained_users_count_normalized.reset_index(
    name="retained_proportions"
)
# the retained users distribution profile correlates with the initial signup date distribution
count_normalized.merge(retained_users_count_normalized).set_index("signup_date").corr()

Unnamed: 0,initial_proportions,retained_proportions
initial_proportions,1.0,0.978069
retained_proportions,0.978069,1.0


In [269]:
ultimate_df.loc[(ultimate_df["signup_date"] == ultimate_df["last_trip_date"])][
    "signup_date"
].hvplot.hist(title="Signup Date same date as Last Trip Date").opts(
    active_tools=["box_zoom"]
)

In [237]:
ultimate_df["is_black_user"] = ultimate_df["ultimate_black_user"].map(
    {True: "elite", False: "regular"}
)
ultimate_df = ultimate_df.drop(columns=["ultimate_black_user"])



Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,weekday_pct,avg_dist,avg_rating_by_driver,is_black_user
32481,Winterfell,2,2014-01-28,3.5,1.18,2014-06-21,iPhone,5.0,60.0,5.28,4.6,regular
15274,Astapor,0,2014-01-09,,1.0,2014-03-27,Android,0.0,100.0,6.71,5.0,regular
29754,Winterfell,5,2014-01-18,4.5,1.0,2014-06-29,iPhone,0.0,0.0,2.71,4.2,elite


In [238]:
# get the number of null values in each column
null_counts = ultimate_df.isnull().sum()
print(f"Null Values in Columns:\n{null_counts[null_counts > 0]}")
null_counts[null_counts > 0].hvplot.barh(title="Null Values in Columns").opts(
    **active_opts
)

Null Values in Columns:
avg_rating_of_driver    8122
phone                    396
avg_rating_by_driver     201
dtype: int64


In [239]:
# Number columns
grid_opts = dict(active_tools=["box_zoom"], height=200, width=400)

num_cols = ultimate_df.select_dtypes(include=["number"]).columns
num_hists = [
    ultimate_df[num_col].hvplot.hist(
        title=f"{num_col} Distribution").opts(**grid_opts)
    for num_col in num_cols
]
hv.Layout(num_hists)

Both of the ratings of and by the driver seem to be left skewed, but the trips_in_first_30_days is right skewed.

In [240]:
# Define a function for Box-Cox transformation
def boxcox_transform(X):
    # +1 to handle zero values in the data
    X_transformed, _ = stats.boxcox(X + 1)
    return X_transformed


q_transform = QuantileTransformer(output_distribution="normal")
p_transform = PowerTransformer()

num_box_cox = [
    boxcox_transform(ultimate_df.dropna()[num_col]).reshape(-1, 1)
    for num_col in num_cols
]

# Box_cox transform
ultimate_bc_df = pd.DataFrame(np.concatenate(num_box_cox, axis=1), columns=num_cols)
# Log transform
ultimate_lg_df = ultimate_df[num_cols].apply(np.log1p)
# Quantile transform
ultimate_q_df = pd.DataFrame(
    q_transform.fit_transform(ultimate_df[num_cols]), columns=num_cols
)
# Power transform
ultimate_p_df = pd.DataFrame(
    p_transform.fit_transform(ultimate_df[num_cols]), columns=num_cols
)

num_q_hists = [
    ultimate_q_df[num_col]
    .hvplot.hist(title=f"{num_col} Quantile Transformed")
    .opts(**grid_opts)
    for num_col in num_cols
]
hv.Layout(num_q_hists)

In [241]:
# Categorical columns bar plots
cat_cols = ultimate_df.select_dtypes(include=["object"]).columns
cat_plots = [
    ultimate_df[cat_col].value_counts().hvplot.bar(title=f"{cat_col}").opts(**grid_opts)
    for cat_col in cat_cols
]

hv.Layout(cat_plots)

In [64]:
# check correlation between the numerical columns


numerical_cols = ultimate_df.select_dtypes(include=np.number).columns
correlation = ultimate_df[numerical_cols].corr()
mask = np.tril(np.ones_like(correlation, dtype=bool))
correlation_plot = correlation.hvplot.heatmap(
    title="Correlation Heatmap", height=400, width=800, rot=90
)

(correlation.where(~mask)).hvplot.heatmap().opts(
    color_levels=5, **active_opts, width=600, height=400, xrotation=90
)

In [65]:
ultimate_df.loc[ultimate_df["avg_rating_by_driver"].isnull()].describe().T
ultimate_df.loc[ultimate_df["avg_dist"] == 0].describe(include="all").T.sort_values(
    "unique"
).fillna("")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
phone,149.0,2.0,iPhone,92.0,,,,,,,
is_black_user,150.0,2.0,regular,133.0,,,,,,,
city,150.0,3.0,Winterfell,92.0,,,,,,,
signup_date,150.0,30.0,2014-01-24,14.0,,,,,,,
last_trip_date,150.0,89.0,2014-01-25,8.0,,,,,,,
trips_in_first_30_days,150.0,,,,0.64,0.508716,0.0,0.0,1.0,1.0,2.0
avg_rating_of_driver,73.0,,,,4.178082,1.336896,1.0,4.0,5.0,5.0,5.0
avg_surge,150.0,,,,1.038333,0.223863,1.0,1.0,1.0,1.0,3.0
surge_pct,150.0,,,,4.0,19.661566,0.0,0.0,0.0,0.0,100.0
weekday_pct,150.0,,,,56.0,49.804989,0.0,0.0,100.0,100.0,100.0


In [66]:
# look at the rows which has null values
ultimate_df[ultimate_df.isnull().any(axis=1)].describe(include="all").T.sort_values(
    by="unique"
).fillna("")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
phone,8159.0,2.0,iPhone,5230.0,,,,,,,
is_black_user,8555.0,2.0,regular,5620.0,,,,,,,
city,8555.0,3.0,Winterfell,4175.0,,,,,,,
signup_date,8555.0,31.0,2014-01-25,558.0,,,,,,,
last_trip_date,8555.0,182.0,2014-01-26,250.0,,,,,,,
trips_in_first_30_days,8555.0,,,,0.672355,1.063267,0.0,0.0,1.0,1.0,27.0
avg_rating_of_driver,433.0,,,,4.586143,0.726478,1.0,4.5,5.0,5.0,5.0
avg_surge,8555.0,,,,1.079328,0.312312,1.0,1.0,1.0,1.0,5.0
surge_pct,8555.0,,,,8.611315,26.493406,0.0,0.0,0.0,0.0,100.0
weekday_pct,8555.0,,,,59.333139,46.497673,0.0,0.0,100.0,100.0,100.0


In [167]:
feature_columns = [
    "trips_in_first_30_days",
    "is_black_user",
    "avg_dist",
    "weekday_pct",
    "avg_surge",
    "surge_pct",
    "avg_rating_by_driver",
    "avg_rating_of_driver",
    "phone",
    "city",
]

In [168]:
ultimate_df["retained"] = ultimate_df["retained"].astype(int)

In [169]:
plots = []
plot_opts = dict(height=200, width=300, active_tools=["box_zoom"])
cat_cols = [col for col in feature_columns if ultimate_df[col].dtype == "object"]
num_cols = [col for col in feature_columns if ultimate_df[col].dtype != "object"]
cat_plots = []
num_plots = []

# Create plots for the categorical columns
for column in cat_cols:
    plot = (
        ultimate_df.groupby(column)["retained"]
        .mean()
        .hvplot.barh(
            title=(
                f"{column}".replace("_", " ").title()
                if "_" in column
                else column.title()
            ),
            xlabel="",
            ylabel="Retention Rate",
        )
        .opts(**plot_opts)
    )
    cat_plots.append(plot)

# Create plots for the numerical columns
for column in num_cols:
    plot = ultimate_df.hvplot.violin(
        y=column,
        by="retained",
        title=f"{column}".replace("_", " ").title(),
    ).opts(**plot_opts)
    num_plots.append(plot)

hv.Layout(cat_plots).cols(3)

In [170]:
hv.Layout(num_plots).cols(3)

In [187]:
null_rows = ultimate_df[ultimate_df.isna().any(axis=1)]

null_rows.groupby(["retained", "city"]).size().unstack().hvplot.bar()

In [140]:
target = ultimate_df["retained"]



# check the correlation of the target with any other column


correlation = ultimate_df[numerical_cols].corrwith(target).sort_values(ascending=False)


correlation_plot = correlation.hvplot.barh(
    title="Correlation of Retained Users with Numerical Columns",
    width=800,
    height=400,
).opts(**active_opts)
correlation_plot

In [164]:
avg_dist_feature = ["avg_dist"]
numerical_features = [
    "avg_rating_by_driver",
    "avg_rating_of_driver",
    "surge_pct",
    "trips_in_first_30_days",
    "weekday_pct",
]
categorical_features = ["city", "phone", "is_black_user"]

# Split the data into features and target
X = ultimate_df[num_cols + cat_cols]
y = ultimate_df["retained"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=628, stratify=y
)
# avg_dist_pipe
avg_dist_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("quantile", QuantileTransformer(output_distribution="normal")),
    ]
)


# create a pipeline for numerical columns
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
    ]
)

# create a pipeline for categorical columns
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent", add_indicator=True)),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)


# combine the pipelines and transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_features),
        ("avg_dist", avg_dist_pipeline, avg_dist_feature),
        ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)

In [165]:
# create a pipeline for the model
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=628)),
    ]
)

param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [None, 10, 20, 30],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
}

random_search = RandomizedSearchCV(
    model_pipeline,
    param_grid,
    n_iter=10,
    cv=5,
    random_state=628,
    scoring="roc_auc",
    n_jobs=-1,
)

# fit the model
random_search.fit(X_train, y_train)

# best parameters
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# best model
best_model = random_search.best_estimator_


y_pred_proba_tuned = best_model.predict_proba(X_test)[:, 1]
roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)
print(f"Tuned Model ROC AUC Score: {roc_auc_tuned:.3f}")


# evaluate the model
# y_pred = model_pipeline.predict(X_test)
# y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

# # get ROC AUC score
# roc_auc = roc_auc_score(y_test, y_pred_proba)
# print(f"ROC AUC Score: {roc_auc:.2f}")

Best Parameters: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': 10}
Tuned Model ROC AUC Score: 0.85


In [166]:
# Define models and their respective parameter grids
models = {
    "RandomForest": (
        RandomForestClassifier(random_state=628, n_jobs=-1),
        {
            "classifier__n_estimators": [100, 200, 300],
            "classifier__max_depth": [80, 90, 100, 110],
            "classifier__max_features": [2, 3],
            "classifier__min_samples_split": [8, 10, 12],
            "classifier__min_samples_leaf": [3, 4, 5],
        },
    ),
    "LightGBM": (
        LGBMClassifier(random_state=628, n_jobs=-1, is_unbalance=True, num_leaves=31, boosting_type='gbdt'),
        {
            "classifier__learning_rate": [0.01, 0.05],
            "classifier__reg_alpha": [0.1, 0.5],
            "classifier__reg_lambda": [0.1, 0.5],
        },
    ),
    "XGBoost": (
        XGBClassifier(random_state=628),
        {
            "classifier__n_estimators": [100, 200, 500,],
            "classifier__learning_rate": [0.01, 0.1, 0.2, 0.4],
            "classifier__max_depth": [3, 5, 7],
            "classifier__subsample": [0.6, 0.9, 1.0],
            "classifier__colsample_bytree": [0.6, 0.9, 1.0],
            "classifier__colsample_bylevel": [0.6, 0.9, 1.0],
            "classifier__min_child_weight": [1, 5, 20, 100],
        },
    ),
}

# Create an empty dictionary to store the best models
best_models = {}

# Loop through each model and perform random search
for model_name, (model, param_grid) in tqdm(models.items(), desc="Model Tuning"):
    # Create a pipeline for the model
    model_pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        model_pipeline,
        param_grid,
        n_iter=5,
        cv=5,
        random_state=628,
        scoring="roc_auc",
        n_jobs=-1,
    )

    # Fit the model
    random_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = random_search.best_estimator_

    # Print the best parameters and ROC AUC score for each model
    print(f"\nBest Parameters for {model_name}: {random_search.best_params_}")
    y_pred_proba_tuned = random_search.best_estimator_.predict_proba(X_test)[:, 1]
    roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)
    print(f"{model_name} - Tuned Model ROC AUC Score: {roc_auc_tuned:.3f}")

Model Tuning:   0%|          | 0/3 [00:00<?, ?it/s]


Best Parameters for RandomForest: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 12, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 2, 'classifier__max_depth': 110}
RandomForest - Tuned Model ROC AUC Score: 0.850
[LightGBM] [Info] Number of positive: 15043, number of negative: 24957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 852
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376075 -> initscore=-0.506242
[LightGBM] [Info] Start training from score -0.506242

Best Parameters for LightGBM: {'classifier__reg_lambda': 0.5, 'classifier__reg_alpha': 0.1, 'classifier__learning_rate': 0.05}
LightGBM - Tuned Model ROC AUC Score: 0.857

Best Parameters for 

In [163]:
# # Assuming 'preprocessor' is a ColumnTransformer
# ohe = (
#     model_pipeline.named_steps["preprocessor"]
#     .named_transformers_["cat"]
#     .named_steps["onehot"]
# )

# # Get feature names after one-hot encoding

# ohe_feature_names = ohe.get_feature_names_out()
# ohe_feature_names
best_models["XGBoost"]

In [251]:
importances = model_pipeline.named_steps["classifier"].feature_importances_

In [257]:
# Get feature names from your preprocessor
# This depends on how you've set up your preprocessor
feature_names = numerical_features + ohe_feature_names.tolist()

# Create a DataFrame for easy visualization
feature_importances_df = pd.DataFrame(
    {
        "Feature": feature_names,
        "Importance": importances,
    }
)

# Sort features by importance
feature_importances_df.sort_values("Importance", ascending=False, inplace=True)

# Plot feature importances
feature_importances_df.hvplot.barh("Feature").opts(**active_opts)