In [287]:
from IPython.display import clear_output
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import holoviews as hv
from holoviews import opts
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import QuantileTransformer, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    recall_score,
    roc_curve,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy import stats
from tqdm.notebook import tqdm
import helper_functions as hf
clear_output()

In [288]:
hv.extension("bokeh")
hvplot.extension("bokeh")
clear_output()

Use helper functions to get the file

In [289]:
zip_path = Path("data/relax_challenge.zip")

# use the functions to extract the zipfile
if hf.check_zipfile(zip_path):
    target_dir = zip_path.parent / zip_path.stem
    hf.create_target_directory(target_dir)
    hf.extract_zipfile(zip_path, target_dir)

Extracted data\relax_challenge.zip to data\relax_challenge


In [290]:
user_engagement_path = (
    "./data/relax_challenge/relax_challenge/takehome_user_engagement.csv"
)

users_path = "data/relax_challenge/relax_challenge/takehome_users.csv"

users_engagement_df = pd.read_csv(user_engagement_path)
display(users_engagement_df.sample(3))
users_engagement_df.info()
display(users_engagement_df.describe(include="all").T.fillna(""))
# drop visited column
users_engagement_df.drop(columns="visited", inplace=True)

# convert time_stamp to datetime
users_engagement_df["time_stamp"] = pd.to_datetime(
    users_engagement_df["time_stamp"])

Unnamed: 0,time_stamp,user_id,visited
40028,2013-11-24 11:40:13,2447,1
142137,2013-08-15 15:05:54,8137,1
157017,2013-11-17 22:58:02,9031,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
time_stamp,207917.0,207220.0,2013-04-06 21:21:37,2.0,,,,,,,
user_id,207917.0,,,,5913.314197,3394.941674,1.0,3087.0,5682.0,8944.0,12000.0
visited,207917.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [291]:
# show the description of the dataframe
display(users_engagement_df.describe(include="all").T.fillna(""))
# get value counts of user_id
user_id_counts = users_engagement_df["user_id"].value_counts()
user_id_counts

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_stamp,207917.0,2013-10-30 05:06:45.648763648,2012-05-31 08:20:06,2013-07-16 20:17:21,2013-12-03 06:38:34,2014-03-13 08:00:24,2014-06-06 14:58:50,
user_id,207917.0,5913.314197,1.0,3087.0,5682.0,8944.0,12000.0,3394.941674


user_id
3623     606
906      600
1811     593
7590     590
8068     585
        ... 
4699       1
4698       1
4697       1
4696       1
12000      1
Name: count, Length: 8823, dtype: int64

Only 8823 accounts out of 12,000 logged in and were found in the engadgement dataset. The rest of the accounts were not found in the dataset.

In [292]:
# Convert the timestamp to date
users_engagement_df["date"] = pd.to_datetime(
    users_engagement_df["time_stamp"]).dt.date

# Drop duplicates based on user_id and date to get unique logins
user_engagement_datewise = users_engagement_df[["user_id", "date"]].drop_duplicates(
    subset=["user_id", "date"]
)

# Sort the DataFrame by user_id and date
user_engagement_datewise.sort_values(by=["user_id", "date"], inplace=True)

# Calculate the difference in days between the current and previous login date for each user
user_engagement_datewise["date_diff"] = user_engagement_datewise.groupby("user_id")[
    "date"
].diff()
user_engagement_datewise["date_diff"] = user_engagement_datewise["date_diff"].apply(
    lambda x: pd.Timedelta(x).days
)

# Fill NA values with 0
user_engagement_datewise["date_diff"] = user_engagement_datewise["date_diff"].fillna(
    0)

# Calculate the difference in days between the current and the login date two rows back for each user
user_engagement_datewise["date_diff_2"] = (
    user_engagement_datewise.groupby("user_id")["date"]
    .diff(2)
    .apply(lambda x: pd.Timedelta(x).days)
)

# Filter the DataFrame to get users with 3 logins in a 7-day period
adopted_user_df = (
    user_engagement_datewise[user_engagement_datewise["date_diff_2"] < 8]
    .groupby("user_id")
    .first()
)

# Reset the index and rename the columns
adopted_user_df = adopted_user_df[["date"]].reset_index()
adopted_user_df.columns = ["user_id", "date_became_adopted"]

# Mark these users as adopted users
adopted_user_df["adopted_user"] = 1

adopted_users = adopted_user_df["user_id"].unique()
print(f"Number of adopted users: {len(adopted_users)}")

Number of adopted users: 1656


Small number of adopted users, 13% of the users are adopted. The dataset is imbalanced.

In [370]:
users_df = pd.read_csv(users_path, encoding="latin-1")
users_df.info()
display(users_df.sample(3))
print(f"Statistics of the users dataframe")
users_df.describe(include="all").round(2).T.sort_values(by="unique").fillna("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
3020,3021,2014-04-29 11:32:57,Garrison Steven,StevenNGarrison@yahoo.com,ORG_INVITE,1398771000.0,0,0,86,8475.0
9387,9388,2013-04-28 09:22:08,Holden Alisha,AlishaHolden@gustr.com,ORG_INVITE,,0,0,98,408.0
4598,4599,2013-10-12 17:43:22,Bradshaw Freddie,FreddieBradshaw@gmail.com,SIGNUP_GOOGLE_AUTH,1381600000.0,0,0,104,


Statistics of the users dataframe


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
creation_source,12000.0,5.0,ORG_INVITE,4254.0,,,,,,,
name,12000.0,11355.0,Araujo Gabriela,5.0,,,,,,,
email,12000.0,11980.0,AlfieLane@yahoo.com,2.0,,,,,,,
creation_time,12000.0,11996.0,2014-02-11 17:57:53,2.0,,,,,,,
object_id,12000.0,,,,6000.5,3464.25,1.0,3000.75,6000.5,9000.25,12000.0
last_session_creation_time,8823.0,,,,1379279305.7,19531160.79,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0
opted_in_to_mailing_list,12000.0,,,,0.25,0.43,0.0,0.0,0.0,0.0,1.0
enabled_for_marketing_drip,12000.0,,,,0.15,0.36,0.0,0.0,0.0,0.0,1.0
org_id,12000.0,,,,141.88,124.06,0.0,29.0,108.0,238.25,416.0
invited_by_user_id,6417.0,,,,5962.96,3383.76,3.0,3058.0,5954.0,8817.0,11999.0


In [294]:
users_df["last_session_creation_time"] = pd.to_datetime(
    users_df["last_session_creation_time"], unit="s"
)
users_df["creation_time"] = pd.to_datetime(users_df["creation_time"])
# describe the 2 time columns
users_df[["creation_time", "last_session_creation_time"]].describe(
    include="all"
).T.fillna("")

Unnamed: 0,count,mean,min,25%,50%,75%,max
creation_time,12000,2013-07-16 13:25:32.964499968,2012-05-31 00:43:27,2013-01-15 21:28:22.750000128,2013-08-05 21:35:19.500,2014-01-28 10:20:12.249999872,2014-05-30 23:59:19
last_session_creation_time,8823,2013-09-15 21:08:25.700441856,2012-05-31 08:20:06,2013-03-13 17:16:05.000000000,2013-10-27 15:41:10.000,2014-04-25 16:16:44.000000000,2014-06-06 14:58:50


In [295]:
# examine the missing values for the last_session_creation_time
users_df[users_df["last_session_creation_time"].isna()].describe(
    include="all"
).T.fillna("")

# see if the object_id of the null dataframe is in the engagement data as the user_id
null_df = users_df[users_df["last_session_creation_time"].isna()]
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()
print(
    f"Number of users with missing last_session_creation_time: {len(null_df)}")

Number of users with missing last_session_creation_time: 3177


We found that `3177` users werre missing data from the `last_session_creation_time` column. These users were also missing data from the user engagement dataset. We therefore could not determine if these users were adopted users or not. Although this was a sizeable portion of the dataset, we decided to drop these users from the dataset.

In [296]:
# drop the rows with the missing last_session_creation_time
users_df = users_df[~users_df["last_session_creation_time"].isna()]
users_df.info()
# look at the missing values
users_df.isna().sum()
print(
    f"Number of missing values in invited_by_user_id: {users_df['invited_by_user_id'].isna().sum()}"
)

# investigate the missing values in invited_by_user_id
users_df[users_df["invited_by_user_id"].isna()].head(3)

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          4776 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 758.2+ KB
Number of missing values in invited_by_user_id

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,2012-12-20 13:24:32,0,1,37,
10,11,2013-12-26 03:55:54,Paulsen Malthe,MaltheAPaulsen@gustr.com,SIGNUP,2013-12-27 03:55:54,0,0,69,
13,14,2012-10-11 16:14:33,Rivera Bret,BretKRivera@gmail.com,SIGNUP,2012-10-12 16:14:33,0,0,0,


In [297]:
null_df = users_df[users_df["invited_by_user_id"].isna()]
# Check if the object_id of the null dataframe is in the engagement data as the user_id
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()

4047

All these rows with nulll values are i nthe engagement dataset. We can use these rows. the missing values is not a probel . it is because noone invited these users. a  or as they are not from an `ORG_INVITE`  nor a `GUEST_INVITE`. We can fill these missing values with `0`.

In [298]:
# Look at the count of the creation_source and the org_id
null_df["creation_source"].value_counts()
# Look at the invited_by_user_id for the null dataframe
users_df["invited_by_user_id"] = users_df["invited_by_user_id"].fillna(
    0).astype(int)
# look at the info and description of the dataframe
users_df.info()
users_df.describe(include="all").T.fillna("")

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          8823 non-null   int32         
dtypes: datetime64[ns](2), int32(1), int64(4), object(3)
memory usage: 723.8+ KB


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
object_id,8823.0,,,,6019.821716,1.0,3017.5,6034.0,9029.5,12000.0,3464.251001
creation_time,8823.0,,,,2013-07-18 15:48:32.228833792,2012-05-31 00:43:27,2013-01-18 22:39:45.500000,2013-08-09 22:08:11,2014-01-30 00:21:54,2014-05-30 23:59:19,
name,8823.0,8453.0,Correia Leonardo,4.0,,,,,,,
email,8823.0,8810.0,MarkoSeiler@yahoo.com,2.0,,,,,,,
creation_source,8823.0,5.0,ORG_INVITE,3188.0,,,,,,,
last_session_creation_time,8823.0,,,,2013-09-15 21:08:25.700441856,2012-05-31 08:20:06,2013-03-13 17:16:05,2013-10-27 15:41:10,2014-04-25 16:16:44,2014-06-06 14:58:50,
opted_in_to_mailing_list,8823.0,,,,0.252295,0.0,0.0,0.0,1.0,1.0,0.434354
enabled_for_marketing_drip,8823.0,,,,0.151989,0.0,0.0,0.0,0.0,1.0,0.359031
org_id,8823.0,,,,142.572254,0.0,30.0,109.0,239.0,416.0,124.176422
invited_by_user_id,8823.0,,,,3237.316786,0.0,0.0,1055.0,6405.0,11999.0,3888.088044


In [299]:
# get the number of logins for each user
logins_per_user = users_engagement_df["user_id"].value_counts().reset_index()
logins_per_user.columns = ["user_id", "logins"]

# merge on the user_id column
users_who_logged_in = logins_per_user.merge(adopted_user_df, how="left")

# merge in the other user data
all_users_df = users_df.merge(
    users_who_logged_in, how="left", left_on="object_id", right_on="user_id"
)
all_users_df

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,logins,date_became_adopted,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,1,1,,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,2,14,2014-02-09,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,3,1,,
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,4,1,,
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,5,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263,11996,1,,
8819,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,0,11997,1,,
8820,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074,11998,1,,
8821,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,0,11999,1,,


In [300]:
# Fill missing values in 'adopted_user' with 0 and convert the column to integer
all_users_df["adopted_user"] = all_users_df["adopted_user"].fillna(
    0).astype(int)

# Fill missing values in 'logins' with 0 and convert the column to integer
all_users_df["logins"] = all_users_df["logins"].fillna(0).astype(int)

# Convert 'last_session_creation_time' to datetime using seconds as the unit
all_users_df["last_session_creation_time"] = pd.to_datetime(
    all_users_df["last_session_creation_time"], unit="s"
)
all_users_df["last_session_creation_time"].isna().sum()

0

In [301]:
# Convert 'creation_time' to datetime and extract the date
all_users_df["start_date"] = pd.to_datetime(
    all_users_df["creation_time"]).dt.date

# Fill missing values in 'last_session_creation_time' with 'creation_time' and extract the date
all_users_df["last_login"] = (
    all_users_df["last_session_creation_time"]
    .fillna(all_users_df["creation_time"])
    .dt.date
)
all_users_df["start_date"].hvplot(
    kind="hist", bins=52, title="Start Date Distribution"
).opts(active_tools=["box_zoom"])

In [302]:
# Convert 'last_session_creation_time' and 'creation_time' to datetime
all_users_df["last_session_creation_time"] = pd.to_datetime(
    all_users_df["last_session_creation_time"]
)
all_users_df["creation_time"] = pd.to_datetime(all_users_df["creation_time"])

# # Calculate 'account_age' in days
all_users_df["account_age"] = (
    all_users_df["last_session_creation_time"] - all_users_df["creation_time"]
).dt.days
all_users_df["account_age"].describe()

# all_users_df[
#     [
#         "start_date",
#         "last_login",
#         "last_session_creation_time",
#         "creation_time",
#     ]
# ].dtypes

count    8823.000000
mean       59.222147
std       144.172167
min         0.000000
25%         0.000000
50%         1.000000
75%        20.000000
max       729.000000
Name: account_age, dtype: float64

In [303]:
# Extract the month from 'creation_time'
all_users_df["creation_month"] = all_users_df["creation_time"].dt.month

# Drop the 'user_id' column and rename the 'object_id' column to 'user_id'
all_users_df = all_users_df.drop(columns="user_id").rename(
    columns={"object_id": "user_id"}
)

# Extract the domain from the 'email' column
all_users_df["domain"] = (
    all_users_df["email"].str.split("@").str[1].str.split(".").str[0]
)

# Fill missing values in 'invited_by_user_id' with 0, convert to string and add leading zeros
all_users_df["invited_by_user_id"] = (
    all_users_df["invited_by_user_id"]
    .fillna(0)
    .astype(int)
    .astype("string")
    .str.zfill(5)
)

# Convert 'user_id' to string and add leading zeros
all_users_df["user_id"] = all_users_df["user_id"].astype("string").str.zfill(5)

# Replace '00000' in 'invited_by_user_id' with 'Not Invited'
all_users_df["invited_by_user_id"] = all_users_df["invited_by_user_id"].replace(
    "00000", "Not Invited"
)

# Create a DataFrame of adopted users
adopted_users = all_users_df[all_users_df["adopted_user"] == 1]["user_id"]

# Count the number of users each user has referred
referred_df = all_users_df["invited_by_user_id"].value_counts(
).reset_index().iloc[1:]
referred_df.columns = ["user_id", "num_referred"]

# Merge 'referred_df' with 'all_users_df'
all_users_df = all_users_df.merge(referred_df, how="left")

# Fill missing values in 'num_referred' with 0
all_users_df["num_referred"] = all_users_df["num_referred"].fillna(
    0).astype(int)

# Create a new column 'was_invited' that indicates whether a user was invited or not
all_users_df["was_invited"] = all_users_df["invited_by_user_id"].apply(
    lambda x: "Invited" if x != "Not Invited" else "Not Invited"
)

In [304]:
all_users_df[(all_users_df["num_referred"] > 0) & (all_users_df["adopted_user"] == 1)][
    [
        "user_id",
        "num_referred",
        "adopted_user",
        "was_invited",
        "invited_by_user_id",
    ]
].sort_values(by="num_referred", ascending=False)

Unnamed: 0,user_id,num_referred,adopted_user,was_invited,invited_by_user_id
3381,04612,10,1,Invited,04612
3523,04803,8,1,Invited,05826
1100,01525,8,1,Invited,09224
1487,02042,7,1,Not Invited,Not Invited
4665,06370,7,1,Not Invited,Not Invited
...,...,...,...,...,...
4060,05547,1,1,Invited,02271
4064,05552,1,1,Invited,05552
4090,05586,1,1,Invited,02537
4101,05600,1,1,Invited,01754


In [313]:
all_users_df["user_invited"] = all_users_df["num_referred"] > 0
all_users_df["user_invited"] = all_users_df["user_invited"].astype(int)
# all_users_df

In [314]:
# create a column for the users who invited themselves
all_users_df["invited_self"] = (
    all_users_df["user_id"] == all_users_df["invited_by_user_id"]
)
all_users_df["invited_self"] = all_users_df["invited_self"].astype(int)

In [315]:
all_users_df["new_domain"] = all_users_df["domain"].apply(
    lambda x: (
        "other"
        if x not in ["gmail", "yahoo", "jourrapide", "cuvox", "gustr", "hotmail"]
        else x
    )
)

In [316]:
all_users_df.describe(include="all").T.sort_values("unique").fillna("")

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
was_invited,8823.0,2.0,Invited,4776.0,,,,,,,
creation_source,8823.0,5.0,ORG_INVITE,3188.0,,,,,,,
new_domain,8823.0,7.0,gmail,2930.0,,,,,,,
date_became_adopted,1656.0,625.0,2012-10-14,9.0,,,,,,,
start_date,8823.0,730.0,2014-05-30,53.0,,,,,,,
last_login,8823.0,736.0,2014-06-04,373.0,,,,,,,
domain,8823.0,812.0,gmail,2930.0,,,,,,,
invited_by_user_id,8823.0,2230.0,Not Invited,4047.0,,,,,,,
name,8823.0,8453.0,Correia Leonardo,4.0,,,,,,,
email,8823.0,8810.0,MarkoSeiler@yahoo.com,2.0,,,,,,,


In [317]:
all_users_df.sample().T

Unnamed: 0,655
user_id,00903
creation_time,2013-10-16 21:38:27
name,Theiss Nadine
email,NadineTheiss@cuvox.de
creation_source,ORG_INVITE
last_session_creation_time,2013-10-16 21:38:27
opted_in_to_mailing_list,1
enabled_for_marketing_drip,1
org_id,67
invited_by_user_id,10976


Target Variable

In [318]:
all_users_df["adopted_user"].value_counts(normalize=True).reset_index(
    name="adopted_users"
)

Unnamed: 0,adopted_user,adopted_users
0,0,0.812309
1,1,0.187691


In [319]:
def plot_cat_active_stacked_bars(df, cat_cols, target_col="adopted_user"):
    """Create a bar plot for each categorical column showing the proportion of active users."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    cat_plots = []
    for column in cat_cols:
        proportions = (
            df.groupby(column)[target_col].value_counts(
                normalize=True).unstack()
        )
        plot = proportions.hvplot.barh(
            title=(
                f"{column}".replace("_", " ").title()
                if "_" in column
                else column.title()
            ),
            xlabel="",
            ylabel="",
            stacked=True,
            cmap=["lightgray", "green"],
            legend=False,
        ).opts(**plot_opts, legend_position="top_right")
        cat_plots.append(plot)
    return hv.Layout(cat_plots).cols(3)


def plot_num_active_violins(df, list_of_num_cols, target_col="adopted_user"):
    """Create 2 violin plot for each numerical column with the target column."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    num_plots = []
    for column in list_of_num_cols:
        plot = df.hvplot.violin(
            y=column,
            by=target_col,
            c=target_col,
            ylabel="",
            title=f"{column}".replace("_", " ").title(),
            cmap=["lightgray", "green"],
        ).opts(**plot_opts)
        num_plots.append(plot)
    return hv.Layout(num_plots).cols(3)


def plot_num_active_hist(df, list_of_num_cols, target_col="adopted_user"):
    """Creates a histogram plot of each numerical column with target column
    separated by color."""
    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])

    hist_plots = [
        df[[num_col] + [target_col]]
        .hvplot.hist(
            title=f"{num_col} Distribution",
            bins=12,
            xlabel="",
            by=target_col,
            alpha=0.6,
            muted_alpha=0.02,
        )
        .opts(**plot_opts)
        for num_col in list_of_num_cols
    ]
    return hv.Layout(hist_plots)

In [320]:
cat_cols = [
    "new_domain",
    "creation_source",
    "was_invited",
    "creation_month",
    "user_invited",
    "invited_self",
]

all_users_df.dtypes
plot_cat_active_stacked_bars(all_users_df, cat_cols)

In [217]:
all_users_df.dtypes

user_id                       string[python]
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id            string[python]
logins                                 int32
date_became_adopted                   object
adopted_user                           int32
start_date                            object
last_login                            object
account_age                            int64
creation_month                         int32
domain                                object
num_referred                           int32
was_invited                           object
adopted_user_invited                  object
invited_self                           int32
new_domain

In [245]:
plot_num_active_violins(
    all_users_df,
    [
        "logins",
        "account_age",
        "num_referred",
        "opted_in_to_mailing_list",
        "enabled_for_marketing_drip",
    ],
)

In [356]:
qt = QuantileTransformer(output_distribution="normal", n_quantiles=200)
acct_trans = qt.fit_transform(all_users_df[["account_age"]])

pd.DataFrame(acct_trans, columns=["account_age"]).skew()

all_users_df["account_age"].skew()
# all_users_df[all_users_df["account_age"] > 1]["account_age"].hvplot(
#     kind="hist", bins=10, title="Account Age Distribution"
# ).opts(active_tools=["box_zoom"])

2.9160866430145997

We can't use `login` because that is potential leakage. We can use The talent age and this seems to be a good indicator of the person is an adopted user or not. We can also try combining some category features such as
- guest_referral and the month of June
- personal project and invited self'
- adopted referral and guest invite

In [357]:
all_users_df["is_home_project"] = all_users_df["creation_source"].apply(
    lambda x: 1 if x in ("GUEST_INVITE", "PERSONAL_PROJECTS") else 0
)
all_users_df["active_3_sum"] = (
    all_users_df["user_invited"]
    + all_users_df["is_home_project"]
    + all_users_df["invited_self"]
)
all_users_df["active_3_sum"].value_counts(normalize=True)
plot_cat_active_stacked_bars(all_users_df, ["active_3_sum"])

In [358]:
categorical_features = ["active_3_sum", "new_domain", "creation_month"]
numerical_features = ["account_age"]

X = all_users_df[categorical_features + numerical_features]
y = all_users_df["adopted_user"]

In [359]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=628, stratify=y
)

In [360]:
# create a pipeline for numerical columns
numerical_pipeline = Pipeline(
    steps=[
        ("scaler", RobustScaler()),
    ]
)

# create a pipeline for categorical columns
categorical_pipeline = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)

In [361]:
# Models dict
models = {}
models["LogisticRegression"] = (
    LogisticRegression(
        random_state=628,
        n_jobs=-1,
        max_iter=1000,
    ),
    {
        "classifier__C": np.logspace(-3, 3, 7),
        "classifier__class_weight": ["balanced", None],
        "classifier__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    },
)

models["LightGBM"] = (
    LGBMClassifier(
        random_state=628,
        n_jobs=-1,
        is_unbalance=True,
        num_leaves=31,
        boosting_type="gbdt",
        verbose=-1,
    ),
    {
        "classifier__learning_rate": stats.uniform(0.01, 0.5),
        "classifier__reg_alpha": stats.uniform(0.0, 0.05),
        "classifier__reg_lambda": stats.uniform(0.0, 0.05),
        "classifier__min_child_samples": stats.randint(100, 500),
        "classifier__min_data_in_leaf": stats.randint(20, 100),
    },
)

models["XGBoost"] = (
    XGBClassifier(
        random_state=628,
        scale_pos_weight=sum(y_train == 0) / sum(y_train == 1),
    ),
    {
        "classifier__n_estimators": stats.randint(100, 1000),
        "classifier__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.4, 0.5],
        "classifier__max_depth": stats.randint(3, 10),
        "classifier__subsample": stats.uniform(0.6, 0.4),
        "classifier__colsample_bytree": stats.uniform(0.6, 0.4),
        "classifier__colsample_bylevel": stats.uniform(0.6, 0.4),
        "classifier__min_child_weight": stats.randint(1, 200),
    },
)

In [362]:
# Create an empty dictionary to store the best models
best_models = {}

# Loop through each model and perform random search
for model_name, (model, param_grid) in tqdm(models.items(), desc="Model Tuning"):
    # Create a pipeline for the model
    model_pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        model_pipeline,
        param_grid,
        n_iter=15,
        cv=5,
        random_state=628,
        scoring="recall",
        n_jobs=-1,
    )

    # Fit the model
    random_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = random_search.best_estimator_

    # Print the best parameters and ROC AUC score for each model
    print(f"\nBest Parameters for {model_name}: {random_search.best_params_}")
    # Print the mean cross-validation score for the best parameters
    best_index = random_search.best_index_
    mean_cv_score = random_search.cv_results_["mean_test_score"][best_index]
    print(f"{model_name} - Mean CV Score for Best Parameters: {mean_cv_score:.3f}")
    y_pred_proba_tuned = random_search.best_estimator_.predict_proba(X_test)[:, 1]
    y_pred = best_models[model_name].predict(X_test)
    roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)
    recall_score_tuned = recall_score(y_test, y_pred)

    print(f"{model_name} - Tuned Model ROC AUC Score: {roc_auc_tuned:.3f}")
    print(f"{model_name} - Tuned Model Recall Score:{recall_score_tuned:.3f}")

Model Tuning:   0%|          | 0/3 [00:00<?, ?it/s]


Best Parameters for LogisticRegression: {'classifier__solver': 'newton-cg', 'classifier__class_weight': 'balanced', 'classifier__C': 0.1}
LogisticRegression - Mean CV Score for Best Parameters: 0.935
LogisticRegression - Tuned Model ROC AUC Score: 0.992
LogisticRegression - Tuned Model Recall Score:0.940

Best Parameters for LightGBM: {'classifier__learning_rate': 0.07827724005998087, 'classifier__min_child_samples': 366, 'classifier__min_data_in_leaf': 83, 'classifier__reg_alpha': 0.017503009363665636, 'classifier__reg_lambda': 0.004726102044580444}
LightGBM - Mean CV Score for Best Parameters: 0.945
LightGBM - Tuned Model ROC AUC Score: 0.991
LightGBM - Tuned Model Recall Score:0.952

Best Parameters for XGBoost: {'classifier__colsample_bylevel': 0.9165172691366399, 'classifier__colsample_bytree': 0.7387330441498317, 'classifier__learning_rate': 0.001, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 818, 'classifier__subsample': 0.761341513

In [363]:
feature_names = (
    best_models["LogisticRegression"]
    .named_steps["preprocessor"]
    .get_feature_names_out()
)
lr_importance = best_models["LogisticRegression"].named_steps["classifier"].coef_[0]

xgb_feature_importance = (
    best_models["XGBoost"].named_steps["classifier"].feature_importances_
)
lgbm_feature_importance = (
    best_models["LightGBM"].named_steps["classifier"].feature_importances_
)

In [364]:
# Create a DataFrame for easy visualization
feature_importances_df = pd.DataFrame(
    {
        "Logistic Regression": lr_importance,
        "XGBoost": xgb_feature_importance,
        "Light GBM": lgbm_feature_importance,
    },
    index=feature_names,
)


def highlight_topn(s):
    is_top3 = s.isin(s.nlargest(4))
    return ["background-color: salmon" if v else "" for v in is_top3]


# Sort features by importance
feature_importances_df.sort_values("Logistic Regression", ascending=False, inplace=True)
# highlight the 3 highest values in each column
feature_importances_df.style.apply(highlight_topn, axis=0)
# feature_importances_df.round(4)

Unnamed: 0,Logistic Regression,XGBoost,Light GBM
num__account_age,1.528528,0.769143,1259
cat__creation_month_4,0.508315,0.011405,37
cat__creation_month_5,0.384336,0.0572,19
cat__new_domain_gmail,0.380916,0.00725,81
cat__creation_month_6,0.265209,0.008329,22
cat__creation_month_3,0.162689,0.007884,39
cat__active_3_sum_2,0.145446,0.009936,14
cat__new_domain_gustr,0.130498,0.004274,23
cat__active_3_sum_3,0.082977,0.00701,0
cat__creation_month_12,0.080919,0.007882,32


In [365]:
# Plot feature importances
active_opts = dict(
    active_tools=["box_zoom"],
)
hv.Layout(
    [
        feature_importances_df[col]
        .hvplot.barh(
            y=col,
            xlabel="",
            #  title with model name
            title=f"Feature Importance {col}",
            height=600,
        )
        .opts(**active_opts)
        for col in feature_importances_df.columns
    ],
).cols(2)

In [366]:
def get_roc_confusion_matrix(model, X_test, y_test):
    """Get the ROC AUC score and the confusion matrix for the model."""
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    # y_pred = model.predict(X_test)

    # Get the ROC AUC score
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Get the confusion matrix
    fpr, tpr, thresh = roc_curve(y_test, y_pred_proba)
    # Calculate the optimal threshold
    J = tpr - fpr
    optimal_idx = np.argmax(J)
    optimal_threshold = thresh[optimal_idx]

    # Use the optimal threshold to convert probabilities into class predictions
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)

    roc_curve_plot = hv.Curve((fpr, tpr)).opts(
        title=f"ROC Curve (AUC: {roc_auc:.2f} || opt thresh:{optimal_threshold:.2f})",  # noqa: E501
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        line_width=2,
        height=400,
        width=400,
        tools=["hover"],
        active_tools=["box_zoom"],
    )
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix = pd.DataFrame(
        conf_matrix,
        index=["Actual 0", "Actual 1"],
        columns=["Predicted 0", "Predicted 1"],
    )
    print(classification_report(y_test, y_pred))
    conf_heatmap = conf_matrix.hvplot.heatmap(
        height=400, width=400, colorbar=False, title="Confusion Matrix"
    ).opts(**active_opts, invert_yaxis=True, color_levels=5)
    # Create a DataFrame for labels
    labels_df = pd.DataFrame(conf_matrix.stack(), columns=[
                             "value"]).reset_index()
    labels_df.columns = ["y", "x", "value"]

    # Create labels
    labels = hv.Labels(labels_df, ["x", "y"], "value")

    return hv.Layout(
        [
            (conf_heatmap * labels.opts(text_color="gray", text_font_size="14pt")),
            roc_curve_plot,
        ]
    ).cols(2)

Logistic Regression

In [367]:
lr_model = best_models["LogisticRegression"]
get_roc_confusion_matrix(lr_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      1434
           1       0.78      0.97      0.87       331

    accuracy                           0.94      1765
   macro avg       0.89      0.95      0.91      1765
weighted avg       0.95      0.94      0.95      1765



XGBoost

In [368]:
xgboost_model = best_models["XGBoost"]
get_roc_confusion_matrix(xgboost_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1434
           1       0.87      0.94      0.90       331

    accuracy                           0.96      1765
   macro avg       0.93      0.95      0.94      1765
weighted avg       0.96      0.96      0.96      1765



LightGBM

In [369]:
lgbm_model = best_models["LightGBM"]
get_roc_confusion_matrix(lgbm_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1434
           1       0.81      0.95      0.87       331

    accuracy                           0.95      1765
   macro avg       0.90      0.95      0.92      1765
weighted avg       0.95      0.95      0.95      1765

