In [126]:
from IPython.display import clear_output
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import holoviews as hv
from holoviews import opts
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import (
    QuantileTransformer,
    RobustScaler,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    recall_score,
    roc_curve,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy import stats
from tqdm.notebook import tqdm
import helper_functions as hf

clear_output()

In [127]:
hv.extension("bokeh")
hvplot.extension("bokeh")
# for setting the number of columns to display in the notebook
pd.set_option("display.max_columns", 50)
clear_output()

Use helper functions to get the file

In [128]:
zip_path = Path("data/relax_challenge.zip")

# use the functions to extract the zipfile
if hf.check_zipfile(zip_path):
    target_dir = zip_path.parent / zip_path.stem
    hf.create_target_directory(target_dir)
    hf.extract_zipfile(zip_path, target_dir)

Extracted data\relax_challenge.zip to data\relax_challenge


In [129]:
user_engagement_path = (
    "./data/relax_challenge/relax_challenge/takehome_user_engagement.csv"
)

users_path = "data/relax_challenge/relax_challenge/takehome_users.csv"

users_engagement_df = pd.read_csv(user_engagement_path)
display(users_engagement_df.sample(3))
users_engagement_df.info()
display(users_engagement_df.describe(include="all").round(2).T.fillna(""))
# drop visited column
users_engagement_df.drop(columns="visited", inplace=True)

# convert time_stamp to datetime
users_engagement_df["time_stamp"] = pd.to_datetime(
    users_engagement_df["time_stamp"])
# # Convert user_id to string
# users_engagement_df['user_id'] = users_engagement_df['user_id'].astype(
#     'string').str.zfill(5)

Unnamed: 0,time_stamp,user_id,visited
70918,2013-10-26 15:05:07,4086,1
207644,2013-05-11 11:10:11,11975,1
71515,2014-03-09 07:56:40,4141,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
time_stamp,207917.0,207220.0,2013-04-06 21:21:37,2.0,,,,,,,
user_id,207917.0,,,,5913.31,3394.94,1.0,3087.0,5682.0,8944.0,12000.0
visited,207917.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [130]:
# show the description of the dataframe
display(users_engagement_df.describe(include="all").round(2).T.fillna(""))
# get value counts of user_id
user_id_counts = users_engagement_df["user_id"].value_counts()
print(f"number of unique users: {len(user_id_counts)}")
user_id_counts

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_stamp,207917.0,2013-10-30 05:06:45.648763648,2012-05-31 08:20:06,2013-07-16 20:17:21,2013-12-03 06:38:34,2014-03-13 08:00:24,2014-06-06 14:58:50,
user_id,207917.0,5913.31,1.0,3087.0,5682.0,8944.0,12000.0,3394.94


number of unique users: 8823


user_id
3623     606
906      600
1811     593
7590     590
8068     585
        ... 
4699       1
4698       1
4697       1
4696       1
12000      1
Name: count, Length: 8823, dtype: int64

Only 8,823 accounts out of 12,000 logged in and were found in the engagement dataset. The other accounts we do not have any login/engagement data for.

Find the `adopted_user`s.<br>
**Criteria: Users with 3 logins in a 7-day period.**

In [131]:
# Convert the timestamp to date
users_engagement_df["date"] = pd.to_datetime(
    users_engagement_df["time_stamp"]).dt.date

# Drop duplicates based on user_id and date to get unique logins
user_engagement_datewise = users_engagement_df[["user_id", "date"]].drop_duplicates(
    subset=["user_id", "date"]
)

# Sort the DataFrame by user_id and date
user_engagement_datewise.sort_values(by=["user_id", "date"], inplace=True)

# Calculate the difference in days between the current and previous login date for each user
user_engagement_datewise["date_diff"] = user_engagement_datewise.groupby("user_id")[
    "date"
].diff()
user_engagement_datewise["date_diff"] = user_engagement_datewise["date_diff"].apply(
    lambda x: pd.Timedelta(x).days
)

# Fill NA values with 0
user_engagement_datewise["date_diff"] = user_engagement_datewise["date_diff"].fillna(
    0)

# Calculate the difference in days between the current and the login date two rows back for each user
user_engagement_datewise["date_diff_2"] = (
    user_engagement_datewise.groupby("user_id")["date"]
    .diff(2)
    .apply(lambda x: pd.Timedelta(x).days)
)

# Filter the DataFrame to get users with 3 logins in a 7-day period
adopted_users_df = (
    user_engagement_datewise[user_engagement_datewise["date_diff_2"] < 8]
    .groupby("user_id")
    .first()
)

# Reset the index and rename the columns
adopted_users_df = adopted_users_df[["date"]].reset_index()
adopted_users_df.columns = ["user_id", "date_became_adopted"]

# Mark these users as adopted users
adopted_users_df["adopted_user"] = 1

adopted_users = adopted_users_df["user_id"].unique()
print(f"Number of adopted users: {len(adopted_users)}")

Number of adopted users: 1656


In [132]:
# get the highest date in the dataset
max_timestamp = users_engagement_df["time_stamp"].max()
print(f"The most recent user_login date is: \n{max_timestamp}")

The most recent user_login date is: 
2014-06-06 14:58:50


In [133]:
# Get a cumcount of the number of logins for each user at each row
user_engagement_datewise["login_count"] = (
    user_engagement_datewise.groupby("user_id").cumcount() + 1
)
# get a cumsum of the number of days since the first login for each user at each row
user_engagement_datewise["days_since_first_login"] = user_engagement_datewise.groupby(
    "user_id"
)["date_diff"].cumsum()
# get the avg time between logins for each user at each row
user_engagement_datewise["avg_time_between_logins"] = (
    user_engagement_datewise["days_since_first_login"]
    / user_engagement_datewise["login_count"]
).round(1)

user_engagement_datewise.loc[user_engagement_datewise["avg_time_between_logins"] > 0]
# filter the user_engagement_datewise to get the rows of users who are adopted
user_engagement_datewise[user_engagement_datewise["user_id"].isin(
    adopted_users)]
# get the login_count number when they became an adopted user
# merge on the user_id and the date_became_adopted with user_id and date respectively
adopted_users_df = adopted_users_df.rename(
    columns={"date_became_adopted": "date"})
adoption_df = adopted_users_df.merge(
    user_engagement_datewise, on=["user_id", "date"])
# get a cumulative count of the adopted users in chronological order
adoption_df["date"] = pd.to_datetime(adoption_df["date"])
adoption_df.sort_values("date", inplace=True)
adoption_df["cum_adopted"] = adoption_df["adopted_user"].cumsum()
display(adoption_df.head())
# plot the cumulative adopted users
adoption_df.sort_values(by=["date"]).hvplot.scatter(
    x="date", y="cum_adopted", size=5, title="Cumulative Adopted Users"
).opts(active_tools=["box_zoom"])

Unnamed: 0,user_id,date,adopted_user,date_diff,date_diff_2,login_count,days_since_first_login,avg_time_between_logins,cum_adopted
224,1693,2012-06-10,1,1.0,5.0,4,10.0,2.5,1
106,728,2012-06-16,1,1.0,5.0,3,5.0,1.7,2
206,1525,2012-06-16,1,5.0,7.0,3,7.0,2.3,3
1624,11764,2012-06-17,1,2.0,4.0,3,4.0,1.3,4
1020,7590,2012-06-18,1,5.0,7.0,3,7.0,2.3,5


In [134]:
# Get the first adoption date
first_adoption = adoption_df["date"].min()
print(f"The first adoption date is: {first_adoption}")

# Calculate the rate of cumulative adopted users
adoption_df["days_since_first_adoption"] = (adoption_df["date"] -
                                            first_adoption).dt.days
adoption_df["adoption_rate"] = (
    adoption_df["cum_adopted"] /
    adoption_df["days_since_first_adoption"]).round(2)

# Plot the adoption rate over time
adoption_df.hvplot.scatter(x="days_since_first_adoption",
                           y="adoption_rate",
                           title="Adoption Rate",
                           size=5).opts(active_tools=["box_zoom"])

The first adoption date is: 2012-06-10 00:00:00


In [88]:
# get the count of the adopted users for each month
adoption_df["month"] = adoption_df["date"].dt.to_period("M")
adoption_df["month_adoption_count"] = adoption_df.groupby("month")[
    "adopted_user"
].transform("sum")
# check the adoptions for each month and plot on a bar chart
monthly_adoption_count = (
    adoption_df[["month", "month_adoption_count"]].drop_duplicates().set_index("month")
)
monthly_adoption_count.hvplot() * monthly_adoption_count.hvplot.scatter(
    height=600, title="Adoptions per Month"
).opts(active_tools=["box_zoom"])

In [89]:
# get the count of the adopted users for each week
adoption_df["week"] = adoption_df["date"].dt.to_period("W")
adoption_df["week_adoption_count"] = adoption_df.groupby("week")[
    "adopted_user"
].transform("sum")

# check the weekly adoptions
weekly_adoption_count = (
    adoption_df[["date", "adopted_user"]].set_index("date").resample("W").sum()
)
weekly_adoption_count.hvplot() * weekly_adoption_count.hvplot(
    height=600, title="Adoptions per Week"
).opts(active_tools=["box_zoom"])
# smooth out the weekly adoptions
(
    weekly_adoption_count.rolling(window=7, center=True, min_periods=1).mean().hvplot()
    * weekly_adoption_count.hvplot.scatter(title="Adoptions per Week", color="gray")
).opts(active_tools=["box_zoom"], height=500, width=1000, show_grid=True, xlabel="")
# adoption_df

In [11]:
weekly_adoption_count

Unnamed: 0_level_0,adopted_user
date,Unnamed: 1_level_1
2012-06-10,1
2012-06-17,3
2012-06-24,5
2012-07-01,3
2012-07-08,11
...,...
2014-05-11,27
2014-05-18,26
2014-05-25,19
2014-06-01,9


In [12]:
# get the count of the adopted users for each day
adoption_df["day"] = adoption_df["date"].dt.to_period("D")
adoption_df["day_adoption_count"] = adoption_df.groupby(
    "day")["adopted_user"].transform("sum")
# check the daily adoptions
daily_adoption_count = (adoption_df[["date", "adopted_user"
                                     ]].set_index("date").resample("D").sum())
daily_adoption_count.hvplot(grid=True) * daily_adoption_count.hvplot.scatter(
    height=600, title="Adoptions per Day").opts(active_tools=["box_zoom"])

Small number of adopted users, 13% of the users are adopted. The dataset is imbalanced.

In [13]:
users_df = pd.read_csv(users_path, encoding="latin-1")
users_df.info()
display(users_df.sample(3))
print(f"Statistics of the users dataframe")
users_df.describe(include="all").round(2).T.sort_values(by="unique").fillna("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
6808,6809,2013-11-20 01:45:07,Bech Hannah,naosxljo@svdbx.com,ORG_INVITE,1384998000.0,0,0,324,2973.0
253,254,2014-03-04 19:52:58,Simonsen Niels,NielsHSimonsen@jourrapide.com,PERSONAL_PROJECTS,1394308000.0,1,0,11,
10760,10761,2013-10-22 02:48:16,Allan Aidan,AidanAllan@hotmail.com,ORG_INVITE,1382410000.0,0,0,152,11626.0


Statistics of the users dataframe


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
creation_source,12000.0,5.0,ORG_INVITE,4254.0,,,,,,,
name,12000.0,11355.0,Araujo Gabriela,5.0,,,,,,,
email,12000.0,11980.0,AlfieLane@yahoo.com,2.0,,,,,,,
creation_time,12000.0,11996.0,2014-02-11 17:57:53,2.0,,,,,,,
object_id,12000.0,,,,6000.5,3464.25,1.0,3000.75,6000.5,9000.25,12000.0
last_session_creation_time,8823.0,,,,1379279305.7,19531160.79,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0
opted_in_to_mailing_list,12000.0,,,,0.25,0.43,0.0,0.0,0.0,0.0,1.0
enabled_for_marketing_drip,12000.0,,,,0.15,0.36,0.0,0.0,0.0,0.0,1.0
org_id,12000.0,,,,141.88,124.06,0.0,29.0,108.0,238.25,416.0
invited_by_user_id,6417.0,,,,5962.96,3383.76,3.0,3058.0,5954.0,8817.0,11999.0


In [14]:
users_df["last_session_creation_time"] = pd.to_datetime(
    users_df["last_session_creation_time"], unit="s")
users_df["creation_time"] = pd.to_datetime(users_df["creation_time"])
display(users_df.sample(3))
# describe the 2 time columns
users_df[["creation_time",
          "last_session_creation_time"]].describe(include="all").T.fillna("")

# Calculate the recency of the user's last session
users_df["recency"] = (max_timestamp -
                       users_df["last_session_creation_time"]).dt.days
# Describe the recency column
users_df.recency.describe()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
5526,5527,2013-05-27 14:48:15,Grandstaff Christopher,ChristopherHGrandstaff@gmail.com,GUEST_INVITE,2014-02-13 14:48:15,0,0,322,9162.0
11729,11730,2012-10-04 11:58:37,Carpenter Chelsea,ChelseaCarpenter@hotmail.com,GUEST_INVITE,2014-06-02 11:58:37,0,0,69,1400.0
1565,1566,2013-06-10 17:40:52,Roberts Nicholas,NicholasRoberts@yahoo.com,ORG_INVITE,2013-06-10 17:40:52,0,0,204,3742.0


count    8823.000000
mean      263.244361
std       226.056408
min         0.000000
25%        41.000000
50%       221.000000
75%       449.000000
max       736.000000
Name: recency, dtype: float64

In [15]:
# Create a histogram of the recency column
users_df["recency"].hvplot.hist(title="Recency of Last Session",
                                bins=20,
                                color="orange").opts(active_tools=["box_zoom"],
                                                     height=300,
                                                     width=600)

In [16]:
# examine the missing values for the last_session_creation_time
display(users_df[users_df["last_session_creation_time"].isna()].describe(
    include="all").T.fillna(""))
# see if the object_id of the null dataframe is in the engagement data as the user_id
null_df = users_df[users_df["last_session_creation_time"].isna()]
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()
print(
    f"Number of users with missing last_session_creation_time: {len(null_df):,}"
)

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
object_id,3177.0,,,,5946.84073,8.0,2919.0,5941.0,8909.0,11994.0,3464.211901
creation_time,3177.0,,,,2013-07-10 17:30:15.857412608,2012-05-31 18:12:49,2013-01-09 05:27:00,2013-07-21 05:44:10,2014-01-23 05:34:03,2014-05-30 22:34:31,
name,3177.0,3133.0,Rodrigues Ryan,3.0,,,,,,,
email,3177.0,3176.0,ThomasBrandt@gmail.com,2.0,,,,,,,
creation_source,3177.0,4.0,PERSONAL_PROJECTS,1347.0,,,,,,,
last_session_creation_time,0.0,,,,,,,,,,
opted_in_to_mailing_list,3177.0,,,,0.241737,0.0,0.0,0.0,0.0,1.0,0.428203
enabled_for_marketing_drip,3177.0,,,,0.141958,0.0,0.0,0.0,0.0,1.0,0.349062
org_id,3177.0,,,,139.974819,0.0,28.0,104.0,237.0,415.0,123.723159
invited_by_user_id,1641.0,,,,5911.913467,7.0,2998.0,5978.0,8664.0,11999.0,3353.670938


Number of users with missing last_session_creation_time: 3,177


In [17]:
# Missing value data for org_id
users_df.loc[users_df["last_session_creation_time"].isna(
)]["org_id"].value_counts().sort_index().hvplot(
    title="Org ID of rows with missing last_session_creation_time",
    color="silver").opts(active_tools=["box_zoom"], height=300, width=600)

We found that `3177` users were missing data from the `last_session_creation_time` column. These users were also missing data from the `user_engagement` dataset. We therefore could not determine if these users were adopted users or not. Although this was a sizeable portion of the dataset, we decided to drop these users from the dataset.

In [18]:
# non null data
users_df[~users_df["last_session_creation_time"].isna()][
    "org_id"
].value_counts().sort_index().hvplot()

non_null_df = users_df[~users_df["last_session_creation_time"].isna()]
non_null_df["org_id"].value_counts().sort_index().hvplot(
    title="Org ID of rows with non-missing last_session_creation_time", color="orange"
).opts(active_tools=["box_zoom"], height=300, width=600)

In [19]:
# drop the rows with the missing last_session_creation_time
users_df = users_df[~users_df["last_session_creation_time"].isna()]
users_df.info()
# look at the missing values
users_df.isna().sum()
print(
    f"\nNumber of missing values in invited_by_user_id: \n{users_df['invited_by_user_id'].isna().sum()}"
)

# investigate the missing values in invited_by_user_id
users_df[users_df["invited_by_user_id"].isna()].head(3)

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          4776 non-null   float64       
 10  recency                     8823 non-null   float64       
dtypes: datetime64[ns](2), float64(2), int64(4), object(3)
memory

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,2012-12-20 13:24:32,0,1,37,,533.0
10,11,2013-12-26 03:55:54,Paulsen Malthe,MaltheAPaulsen@gustr.com,SIGNUP,2013-12-27 03:55:54,0,0,69,,161.0
13,14,2012-10-11 16:14:33,Rivera Bret,BretKRivera@gmail.com,SIGNUP,2012-10-12 16:14:33,0,0,0,,601.0


In [20]:
null_df = users_df[users_df["invited_by_user_id"].isna()]
# Check if the object_id of the null dataframe is in the engagement data as the user_id
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()

4047

All these rows with null values are IN the `engagement_dataset`. We can use these rows. The missing values is not a problem . They are because noone invited these users as they are not from an `ORG_INVITE`  nor a `GUEST_INVITE`. We can fill these missing values with `00000` as that is not a `user_id`.

In [21]:
# Look at the count of the creation_source and the org_id
display(null_df["creation_source"].value_counts())
# Look at the invited_by_user_id for the null dataframe
users_df["invited_by_user_id"] = users_df["invited_by_user_id"].fillna("00000")
# look at the info and description of the dataframe
users_df.info()
users_df.describe(include="all").T.sort_values(by="unique").fillna("")

creation_source
SIGNUP                1898
SIGNUP_GOOGLE_AUTH    1385
PERSONAL_PROJECTS      764
Name: count, dtype: int64

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          8823 non-null   object        
 10  recency                     8823 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)
memory

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
creation_source,8823.0,5.0,ORG_INVITE,3188.0,,,,,,,
invited_by_user_id,8823.0,2230.0,00000,4047.0,,,,,,,
name,8823.0,8453.0,Correia Leonardo,4.0,,,,,,,
email,8823.0,8810.0,MarkoSeiler@yahoo.com,2.0,,,,,,,
object_id,8823.0,,,,6019.821716,1.0,3017.5,6034.0,9029.5,12000.0,3464.251001
creation_time,8823.0,,,,2013-07-18 15:48:32.228833792,2012-05-31 00:43:27,2013-01-18 22:39:45.500000,2013-08-09 22:08:11,2014-01-30 00:21:54,2014-05-30 23:59:19,
last_session_creation_time,8823.0,,,,2013-09-15 21:08:25.700441856,2012-05-31 08:20:06,2013-03-13 17:16:05,2013-10-27 15:41:10,2014-04-25 16:16:44,2014-06-06 14:58:50,
opted_in_to_mailing_list,8823.0,,,,0.252295,0.0,0.0,0.0,1.0,1.0,0.434354
enabled_for_marketing_drip,8823.0,,,,0.151989,0.0,0.0,0.0,0.0,1.0,0.359031
org_id,8823.0,,,,142.572254,0.0,30.0,109.0,239.0,416.0,124.176422


In [22]:
all_users_df = users_df.rename(columns={
    'object_id': 'user_id'
}).merge(adopted_users_df, how="left")
all_users_df

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency,date,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,45.0,,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,67.0,2014-02-09,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,443.0,,
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,380.0,,
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,500.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263.0,273.0,,
8819,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,00000,506.0,,
8820,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074.0,40.0,,
8821,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,00000,734.0,,


In [23]:
all_users_df["time_to_become_adopted"] = (
    pd.to_datetime(all_users_df["date"]) -
    pd.to_datetime(all_users_df["creation_time"])).dt.days
all_users_df

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency,date,adopted_user,time_to_become_adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,45.0,,,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,67.0,2014-02-09,1.0,85.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,443.0,,,
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,380.0,,,
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,500.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263.0,273.0,,,
8819,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,00000,506.0,,,
8820,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074.0,40.0,,,
8821,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,00000,734.0,,,


In [24]:
adoption_df_slim = adoption_df[[
    "user_id",
    "date",
    "adopted_user",
    "avg_time_between_logins",
    "login_count",
    "days_since_first_login",
]]

adoption_df_slim.columns = [
    "user_id",
    "date",
    "adopted_user",
    "avg_time_bet_logins_at_adopt",
    "login_count_at_adopt",
    "account_age_at_adopt",
]

adoption_df_slim["date"] = adoption_df_slim["date"].dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoption_df_slim["date"] = adoption_df_slim["date"].dt.date


In [25]:
all_users_df = all_users_df.merge(adoption_df_slim, how="left")

In [26]:
all_users_df["time_to_become_adopted"].hvplot.hist(
    title="Time to become adopted (days)", bins=12,
    xlabel="").opts(active_tools=["box_zoom"])

In [27]:
all_users_df.login_count_at_adopt.hvplot.hist(
    title="Login Count at Adoption", xlabel="").opts(active_tools=["box_zoom"])

In [28]:
# Get the frequency of logins for each user
user_frequency = user_engagement_datewise[[
    'user_id', 'login_count'
]].groupby('user_id').max().reset_index()
# merge the user_frequency with the all_users_df
all_users_df = all_users_df.merge(
    user_frequency, how="left").rename(columns={'login_count': 'frequency'})

In [29]:
# Fill missing values in 'adopted_user' with 0 and convert the column to integer
all_users_df["adopted_user"] = all_users_df["adopted_user"].fillna(0).astype(
    int)

# Fill missing values in 'logins' with 0 and convert the column to integer
all_users_df["frequency"] = all_users_df["frequency"].fillna(0).astype(int)

# Convert 'last_session_creation_time' to datetime using seconds as the unit
all_users_df["last_session_creation_time"] = pd.to_datetime(
    all_users_df["last_session_creation_time"], unit="s")
# get the time since last login
all_users_df["days_since_last_login"] = (
    max_timestamp - all_users_df["last_session_creation_time"]).dt.days
all_users_df["last_session_creation_time"].hvplot.hist(
    bins=52, title="Last session creation time",
    xlabel="").opts(active_tools=["box_zoom"])

In [30]:
all_users_df.hvplot.hist(
    "days_since_last_login",
    by="adopted_user",
    alpha=0.5,
    title="Time since last login",
).opts(active_tools=["box_zoom"])

In [31]:
# Convert 'creation_time' to datetime and extract the date
all_users_df["start_date"] = pd.to_datetime(
    all_users_df["creation_time"]).dt.date

all_users_df.hvplot.hist(
    "start_date", alpha=0.5,
    title="Start Date Distribution").opts(active_tools=["box_zoom"])

In [32]:
# Convert 'last_session_creation_time' and 'creation_time' to datetime
all_users_df["last_session_creation_time"] = pd.to_datetime(
    all_users_df["last_session_creation_time"])
all_users_df["creation_time"] = pd.to_datetime(all_users_df["creation_time"])
# Calculate 'account_age' in days
all_users_df["account_age"] = (max_timestamp -
                               all_users_df["creation_time"]).dt.days
# Calculate 'account_age_last_login' in days
all_users_df["account_age_last_login"] = (
    all_users_df["last_session_creation_time"] -
    all_users_df["creation_time"]).dt.days
# Calculate quietness period as avg time between logins
all_users_df["avg_time_bet_logins"] = (all_users_df["account_age"] /
                                       all_users_df["frequency"]).round(1)

all_users_df["account_age_last_login"].describe()

all_users_df.sample(3)

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency,date,adopted_user,time_to_become_adopted,avg_time_bet_logins_at_adopt,login_count_at_adopt,account_age_at_adopt,frequency,days_since_last_login,start_date,account_age,account_age_last_login,avg_time_bet_logins
5236,7161,2013-05-04 16:53:31,Howarth Maya,MayaHowarth@yahoo.com,GUEST_INVITE,2013-05-05 16:53:31,1,0,331,1114.0,396.0,,0,,,,,1,396,2013-05-04,397,1,397.0
3416,4665,2014-04-21 04:40:01,McCulloch Hamish,HamishMcCulloch@yahoo.com,SIGNUP,2014-04-22 04:40:01,1,1,28,0.0,45.0,,0,,,,,1,45,2014-04-21,46,1,46.0
8102,11019,2012-12-05 00:15:25,Peacock Oscar,OscarPeacock@yahoo.com,SIGNUP,2012-12-06 00:15:25,0,0,38,0.0,547.0,,0,,,,,1,547,2012-12-05,548,1,548.0


In [33]:
# Extract the month from 'creation_time'
all_users_df["creation_month"] = all_users_df["creation_time"].dt.month

# Convert 'user_id' to string and add leading zeros
all_users_df["user_id"] = all_users_df["user_id"].astype("string").str.zfill(5)

# Extract the domain from the 'email' column
all_users_df["domain"] = (
    all_users_df["email"].str.split("@").str[1].str.split(".").str[0])

# Fill missing values in 'invited_by_user_id' with 0, convert to string and add leading zeros
all_users_df["invited_by_user_id"] = (
    all_users_df["invited_by_user_id"]
    # .fillna(0)
    .astype(int).astype("string").str.zfill(5))

# Replace '00000' in 'invited_by_user_id' with 'Not Invited'
all_users_df["invited_by_user_id"] = all_users_df[
    "invited_by_user_id"].replace("00000", "Not Invited")

# Create a DataFrame of adopted users
adopted_users = all_users_df[all_users_df["adopted_user"] == 1]["user_id"]

# Count the number of users each user has referred
referred_df = all_users_df["invited_by_user_id"].value_counts().reset_index(
).iloc[1:]
referred_df.columns = ["user_id", "num_referrals"]

# Merge 'referred_df' with 'all_users_df'
all_users_df = all_users_df.merge(referred_df, how="left")

# Fill missing values in 'num_referrals' with 0
all_users_df["num_referrals"] = all_users_df["num_referrals"].fillna(0).astype(
    int)

# Create a new column 'was_invited' that indicates whether a user was invited or not
all_users_df["was_invited"] = all_users_df["invited_by_user_id"].apply(
    lambda x: "Invited" if x != "Not Invited" else "Not Invited")

In [34]:
all_users_df[(all_users_df["num_referrals"] > 0)
             & (all_users_df["adopted_user"] == 1)][[
                 "user_id",
                 "num_referrals",
                 "adopted_user",
                 "was_invited",
                 "invited_by_user_id",
             ]].sort_values(by="num_referrals", ascending=False).describe(
                 include="all").T.sort_values("unique").fillna("")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
was_invited,382.0,2.0,Invited,235.0,,,,,,,
invited_by_user_id,382.0,229.0,Not Invited,147.0,,,,,,,
user_id,382.0,382.0,04612,1.0,,,,,,,
num_referrals,382.0,,,,2.232984,1.590873,1.0,1.0,2.0,3.0,10.0
adopted_user,382.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [35]:
all_users_df["sent_referrals"] = np.where(all_users_df["num_referrals"] > 0,
                                          "sent_referrals",
                                          "no_referrals_sent")

In [36]:
# create a column for the users who invited themselves
all_users_df["invited_self"] = (
    all_users_df["user_id"] == all_users_df["invited_by_user_id"])
all_users_df["invited_self"] = all_users_df["invited_self"].astype(int)

In [37]:
all_users_df["domain"] = all_users_df["domain"].apply(lambda x: (
    "other" if x not in
    ["gmail", "yahoo", "jourrapide", "cuvox", "gustr", "hotmail"] else x))

In [38]:
all_users_df.describe(include="all").T.sort_values("unique").fillna("")

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
was_invited,8823.0,2.0,Invited,4776.0,,,,,,,
sent_referrals,8823.0,2.0,no_referrals_sent,7189.0,,,,,,,
creation_source,8823.0,5.0,ORG_INVITE,3188.0,,,,,,,
domain,8823.0,7.0,gmail,2930.0,,,,,,,
date,1656.0,625.0,2012-10-14,9.0,,,,,,,
start_date,8823.0,730.0,2014-05-30,53.0,,,,,,,
invited_by_user_id,8823.0,2230.0,Not Invited,4047.0,,,,,,,
name,8823.0,8453.0,Correia Leonardo,4.0,,,,,,,
email,8823.0,8810.0,MarkoSeiler@yahoo.com,2.0,,,,,,,
user_id,8823.0,8823.0,00001,1.0,,,,,,,


In [39]:
all_users_df.sample().T

Unnamed: 0,6872
user_id,09359
creation_time,2014-03-24 15:41:00
name,Crotty Charlie
email,CharlieCrotty@gmail.com
creation_source,ORG_INVITE
last_session_creation_time,2014-06-02 15:41:00
opted_in_to_mailing_list,0
enabled_for_marketing_drip,0
org_id,289
invited_by_user_id,09070


Target Variable

In [40]:
all_users_df["adopted_user"].value_counts(normalize=True).reset_index(
    name="adopted_users")

Unnamed: 0,adopted_user,adopted_users
0,0,0.812309
1,1,0.187691


In [41]:
def remove_outer_percentile_outliers(df, col, lower=0.01, upper=0.99):
    """Remove the outer percentile outliers from a column in a DataFrame."""
    lower_bound = df[col].quantile(lower)
    upper_bound = df[col].quantile(upper)
    return df[(df[col] > lower_bound) & (df[col] < upper_bound)]

In [42]:
def plot_cat_active_stacked_bars(df, cat_cols, target_col="adopted_user"):
    """Create a bar plot for each categorical column showing the proportion of active users."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    cat_plots = []
    for col in cat_cols:
        proportions = df.groupby(col)[target_col].value_counts(
            normalize=True).unstack()
        plot = proportions.hvplot.barh(
            title=(f"{col}".replace("_", " ").title()
                   if "_" in col else col.title()),
            xlabel="",
            ylabel="",
            stacked=True,
            cmap=["lightgray", "green"],
            legend=False,
        ).opts(**plot_opts, legend_position="top_right")
        cat_plots.append(plot)
    return hv.Layout(cat_plots).cols(3)


def plot_num_active_violins(df, list_of_num_cols, target_col="adopted_user"):
    """Create 2 violin plot for each numerical column with the target column."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    num_plots = []
    for col in list_of_num_cols:
        plot = df.hvplot.violin(
            y=col,
            by=target_col,
            c=target_col,
            ylabel="",
            title=f"{col}".replace("_", " ").title(),
            cmap=["lightgray", "green"],
        ).opts(**plot_opts)
        num_plots.append(plot)
    return hv.Layout(num_plots).cols(3)


def plot_num_active_hist(df, list_of_num_cols, target_col="adopted_user"):
    """Creates a histogram plot of each numerical column with target column
      separated by color."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    hist_plots = [
        df[[num_col] + [target_col]].hvplot.hist(
            title=(f"{num_col}".replace("_", " ").title()
                   if "_" in num_col else num_col.title() + " Distribution"),
            bins=12,
            xlabel="",
            by=target_col,
            alpha=0.6,
            muted_alpha=0.02,
            color=["silver", "green"],
        ).opts(**plot_opts) for num_col in list_of_num_cols
    ]
    return hv.Layout(hist_plots)


def plot_cat_bars(df, list_of_cat_cols):
    """Creates a bar plot for each categorical column."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    cat_plots = [
        df[col].astype("string").value_counts().sort_index().hvplot.barh(
            title=f"{col}".replace("_", " ").title()
            if "_" in col else col.title(),
            xlabel="",
            ylabel="",
            color="silver",
        ).opts(**plot_opts) for col in list_of_cat_cols
    ]
    return hv.Layout(cat_plots).cols(3)

In [43]:
cat_cols = [
    "domain",
    "creation_source",
    "was_invited",
    "creation_month",
    "sent_referrals",
    "invited_self",
    "opted_in_to_mailing_list",
    "enabled_for_marketing_drip",
]

all_users_df.dtypes
plot_cat_active_stacked_bars(all_users_df, cat_cols)

- `Invited self` seems to have a small lean towards being an adopted user. 
- `GUEST_INVITE` and `PERSONAL_PROJECTS` have a similar distribution of adopted users, which is slightly higher than the other categories.
- `creation month` 6 has a higher proportion of adopted users than the other months, with 5 being the lowest.
- `hotmail` has a higher proportion of adopted users than the other email providers.

We can use a combination of these features in feature engineering to help build our model. Unfortunately, all these features are lower than the others on absolute numbers despite having a higher proportion of adopted users.

In [44]:
plot_cat_bars(all_users_df, cat_cols)

In [45]:
plot_num_active_violins(
    all_users_df,
    [
        "frequency",
        "account_age_last_login",
        "num_referrals",
        "account_age",
        "days_since_last_login",
        "avg_time_bet_logins",
        'recency',
        'org_id',
    ],
)

The numerical features have some more pronounced differences between the adopted and non-adopted users. 
- `Frequency` we should not use as it is closely related to the target variable.
- `Recency` though can be used as it is the time since the last login. What it basically says is that our adopted users login more frequently, so if we see time has passed without any particular user logging in, that is a signal that we are losing that users and may require some intervention. 
- `Org ID` although it is not a numerical feature, it is much easier to visualize if we consider it like this as oppose to 417 cardinality categorical variable. There is not a clear distinction between the adopted and non-adopted users, but there are some differences in the distribution of the two groups.
- `Num referrals` seems indifferent in the distribution of the two groups.

In [46]:
# find the correct transform for the right skewed columns
plot_num_active_hist(
    all_users_df,
    ["frequency", "account_age_last_login", "num_referrals", "account_age"],
).cols(3)

stripped_outliers = []
for col in [
    "frequency",
    "account_age_last_login",
    "num_referrals",
    "account_age",
    "days_since_last_login",
    "avg_time_bet_logins",
    'recency',
    'org_id',
]:
    col_stripped = remove_outer_percentile_outliers(all_users_df, col)

    stripped_outliers.append(plot_num_active_hist(col_stripped, [col]))

hv.Layout(stripped_outliers).cols(3)

In [47]:
adopted_col_features = [
    "time_to_become_adopted",
    "avg_time_bet_logins_at_adopt",
    "login_count_at_adopt",
    "account_age_at_adopt",
]
stripped_outliers = []
for col in adopted_col_features:
    col_stripped = remove_outer_percentile_outliers(all_users_df, col)

    stripped_outliers.append(plot_num_active_hist(
        col_stripped,
        [col],
    ))

hv.Layout(stripped_outliers).cols(3)

In [48]:
all_users_df.dtypes.sort_index()
all_users_df.select_dtypes(np.number).columns
# get the org_id which has the most ORG_INVITE creation_source
org_invite = all_users_df[all_users_df["creation_source"] == "ORG_INVITE"]
org_invite.sort_values(by="sent_referrals", ascending=False)

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency,date,adopted_user,time_to_become_adopted,avg_time_bet_logins_at_adopt,login_count_at_adopt,account_age_at_adopt,frequency,days_since_last_login,start_date,account_age,account_age_last_login,avg_time_bet_logins,creation_month,domain,num_referrals,was_invited,sent_referrals,invited_self
3920,05370,2013-04-05 19:00:43,Alves Gabrielly,GabriellyBarbosaAlves@gmail.com,ORG_INVITE,2013-04-06 19:00:43,1,1,358,04771,425.0,,0,,,,,1,425,2013-04-05,426,1,426.0,4,gmail,1,Invited,sent_referrals,0
7378,10042,2013-05-24 03:48:01,Almeida Sophia,SophiaCardosoAlmeida@yahoo.com,ORG_INVITE,2013-05-24 03:48:01,0,0,374,10221,378.0,,0,,,,,1,378,2013-05-24,378,0,378.0,5,yahoo,1,Invited,sent_referrals,0
3805,05202,2012-07-05 09:31:37,Rasmussen Mohammad,MohammadMRasmussen@hotmail.com,ORG_INVITE,2012-07-08 09:31:37,0,0,53,00093,698.0,,0,,,,,1,698,2012-07-05,701,3,701.0,7,hotmail,2,Invited,sent_referrals,0
1641,02263,2012-09-04 13:38:37,Eisenhauer Melanie,avdcbbaz@wlpro.com,ORG_INVITE,2012-09-04 13:38:37,0,0,406,09785,640.0,,0,,,,,1,640,2012-09-04,640,0,640.0,9,other,1,Invited,sent_referrals,0
3803,05200,2012-11-14 15:31:49,Rhodes Victoria,VictoriaRhodes@cuvox.de,ORG_INVITE,2014-05-26 15:31:49,1,1,97,10649,10.0,2013-01-15,1,61.0,6.1,10.0,61.0,258,10,2012-11-14,568,558,2.2,11,cuvox,1,Invited,sent_referrals,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3259,04440,2014-04-04 04:05:56,Akhtar Emily,EmilyAkhtar@jourrapide.com,ORG_INVITE,2014-04-06 04:05:56,0,0,128,04612,61.0,,0,,,,,1,61,2014-04-04,63,2,63.0,4,jourrapide,0,Invited,no_referrals_sent,0
3261,04442,2014-03-27 08:46:14,Ribeiro Marisa,MarisaCavalcantiRibeiro@jourrapide.com,ORG_INVITE,2014-03-27 08:46:14,1,0,101,07630,71.0,,0,,,,,1,71,2014-03-27,71,0,71.0,3,jourrapide,0,Invited,no_referrals_sent,0
3268,04450,2014-03-08 19:22:51,Bradley Courtney,CourtneyDBradley@hotmail.com,ORG_INVITE,2014-03-09 19:22:51,0,0,140,11040,88.0,,0,,,,,1,88,2014-03-08,89,1,89.0,3,hotmail,0,Invited,no_referrals_sent,0
3271,04453,2014-04-29 20:04:45,Bang Christine,pbnkmgsx@kfumy.com,ORG_INVITE,2014-05-07 20:04:45,0,0,19,06896,29.0,,0,,,,,2,29,2014-04-29,37,8,18.5,4,other,0,Invited,no_referrals_sent,0


In [49]:
# Get the org_id which has the highest portion of adopted users compared to its total users
# bin them into 5 groups
all_users_df["org_id_adopted_ratio"] = all_users_df.groupby(
    "org_id")["adopted_user"].transform("mean")
all_users_df["org_id_count"] = all_users_df["org_id"].map(
    all_users_df["org_id"].value_counts())

# create an organiztion dataframe with the org_id as the index and put the descriptions in the columns
org_df = all_users_df.groupby(
    "org_id")["org_id_adopted_ratio"].mean().reset_index()
org_df["org_id_count"] = org_df["org_id"].map(
    all_users_df["org_id"].value_counts())
# add a column with thhe day the organization first logged in
org_df["org_first_login"] = all_users_df.groupby(
    "org_id")["creation_time"].min()
# Add a column with the day the organization last logged in
org_df["org_last_login"] = all_users_df.groupby(
    "org_id")["last_session_creation_time"].max()
# get the account age for the organization
org_df["org_account_age"] = (org_df["org_last_login"] -
                             org_df["org_first_login"]).dt.days
# get the number of logins for the organization
org_df["number_of_logins"] = all_users_df.groupby(["org_id"
                                                   ])["frequency"].sum()
# get the number of adopted users for the organization
org_df["number_of_adopted_users"] = all_users_df.groupby(
    ["org_id"])["adopted_user"].sum()

all_users_df.hvplot.scatter(
    y="account_age",
    x="recency",
    c="adopted_user",
    cmap=['silver', 'green'],
    size=2,
    alpha=0.7,
    height=500,
    width=800,
    hover_cols=["user_id"],
    grid=True,
).opts(active_tools=["box_zoom"], title="Account Age vs Recency")

In [50]:
# check correlations of the numerical columns in a half matrix
corr_df = all_users_df.drop(columns=[
    'time_to_become_adopted', 'avg_time_bet_logins_at_adopt',
    'login_count_at_adopt', 'account_age_at_adopt'
])
corr_cols = corr_df.select_dtypes(np.number).columns.tolist()
corr_cols.sort()
# put the adopted_user coumn at the end
corr_cols.remove("adopted_user")
corr_cols.append("adopted_user")
# get the correlation matrix
corr = corr_df[corr_cols].corr()
mask = np.tril(np.ones_like(corr, dtype=bool))
corr.mask(mask).hvplot.heatmap(height=600,
                               rot=90,
                               aspect='square',
                               cmap='coolwarm_r').opts(
                                   active_tools=["box_zoom"],
                                   title="Correlation Heatmap",
                                   color_levels=7,
                                   symmetric=True,
                                   line_color='white',
                                   yaxis='right',
                                   line_width=0.5)



In [51]:
# create a copy of the dataframe of the user engagement
new_users_engagement_df = users_engagement_df.copy()
# convert the user_id to string and add leading zeros
new_users_engagement_df["user_id"] = (
    new_users_engagement_df["user_id"].astype("string").str.zfill(5))
# merge the user engagement with the all users dataframe
org_time_df = new_users_engagement_df.merge(
    all_users_df[["org_id", "user_id"]]).drop(["user_id", "date"], axis=1)
# convert the time_stamp to datetime
org_time_df["time_stamp"] = pd.to_datetime(org_time_df["time_stamp"])

# resample timestamp to hourly intervals and count the number of logins
# org_hourly = org_time_df.set_index("time_stamp").groupby("org_id").resample("H").count()

# Extract hour of day from timestamp
org_time_df["hour_of_day"] = org_time_df["time_stamp"].dt.hour

# Group by org_id and hour of day, and count the number of logins
org_hourly = org_time_df.groupby(["org_id", "hour_of_day"]).count()

# unstack the org_id level of the MultiIndex
org_df_unstacked = org_hourly.unstack(level=0).fillna(0)
print(f"Shape of the unstacked DataFrame: {org_df_unstacked.shape}")
org_df_unstacked.columns = org_df_unstacked.columns.droplevel()
org_hourly_df = org_df_unstacked.T.reset_index()
org_hourly_df = org_hourly_df.set_index("org_id").rename_axis(None, axis=1)
org_hourly_df

# Standardize the data
scaler = StandardScaler()
org_df_unstacked_scaled = scaler.fit_transform(org_hourly_df)

# Define the number of clusters you want to find
n_clusters = 26

# Apply K-means clustering
kmeans = KMeans(n_clusters=n_clusters,
                random_state=628).fit(org_df_unstacked_scaled)

# add the cluster labels to the org_daily_df
org_hourly_df = org_hourly_df.reset_index()
org_hourly_df["org_group"] = kmeans.labels_
org_hourly_df[["org_id", "org_group"]]
# org_daily_df

Shape of the unstacked DataFrame: (24, 417)


Unnamed: 0,org_id,org_group
0,0,22
1,1,10
2,2,3
3,3,12
4,4,10
...,...,...
412,412,25
413,413,1
414,414,25
415,415,6


In [52]:
# Merge on org_id to put the org_group in the all_users_df

org_grouped_all_users = all_users_df.merge(
    org_hourly_df[["org_id", "org_group"]],
    how="left",
)

plot_cat_bars(org_grouped_all_users,
              ["org_group"]) + plot_cat_active_stacked_bars(
                  org_grouped_all_users, ["org_group"])

In [53]:
# Hourly distribution of the logins for each user
users_engagement_df['hour'] = users_engagement_df["time_stamp"].dt.hour
users_engagement_df['adopted_user'] = users_engagement_df['user_id'].isin(
    adoption_df['user_id']).astype(int)
users_engagement_df.hvplot.hist("hour",
                                by="adopted_user",
                                alpha=0.5,
                                title="Hourly Login Distribution",
                                bins=24).opts(active_tools=["box_zoom"])

The `non-adopted` appear to be almost quintessentially evenly distributed across the hours of the day in terms of login times.

In [54]:
# show the users who are not adopted but have a high number of logins
all_users_df[(all_users_df["frequency"] > 3)
             & (all_users_df["adopted_user"] == 0)].sort_values(
                 by=["frequency"], ascending=False)

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,recency,date,adopted_user,time_to_become_adopted,avg_time_bet_logins_at_adopt,login_count_at_adopt,account_age_at_adopt,frequency,days_since_last_login,start_date,account_age,account_age_last_login,avg_time_bet_logins,creation_month,domain,num_referrals,was_invited,sent_referrals,invited_self,org_id_adopted_ratio,org_id_count
8329,11321,2013-01-30 18:48:54,Butler Aimee,AimeeButler@yahoo.com,ORG_INVITE,2013-07-05 18:48:54,0,0,158,03515,335.0,,0,,,,,15,335,2013-01-30,491,156,32.7,1,yahoo,0,Invited,no_referrals_sent,0,0.058824,17
4666,06371,2012-11-21 11:01:28,Shepherd Lenard,LenardMShepherd@gmail.com,SIGNUP_GOOGLE_AUTH,2013-05-06 11:01:28,0,0,160,Not Invited,396.0,,0,,,,,13,396,2012-11-21,562,166,43.2,11,gmail,0,Not Invited,no_referrals_sent,0,0.352941,17
1546,02134,2012-12-08 03:58:08,Schröder Andrea,AndreaSchroder@yahoo.com,ORG_INVITE,2013-04-26 03:58:08,0,0,129,02042,406.0,,0,,,,,13,406,2012-12-08,545,139,41.9,12,yahoo,0,Invited,no_referrals_sent,0,0.238095,21
8056,10949,2013-03-17 11:26:42,Crowder Mason,MasonCrowder@gmail.com,ORG_INVITE,2013-07-30 11:26:42,0,0,106,08623,311.0,,0,,,,,12,311,2013-03-17,446,135,37.2,3,gmail,0,Invited,no_referrals_sent,0,0.208333,24
300,00398,2013-06-10 17:48:12,Gomes Igor,IgorRochaGomes@gmail.com,ORG_INVITE,2013-09-01 17:48:12,0,0,105,01758,277.0,,0,,,,,12,277,2013-06-10,360,83,30.0,6,gmail,0,Invited,no_referrals_sent,0,0.150000,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2574,03521,2014-05-07 10:47:50,Sadler Abbey,AbbeySadler@jourrapide.com,GUEST_INVITE,2014-06-01 10:47:50,0,0,244,04215,5.0,,0,,,,,4,5,2014-05-07,30,25,7.5,5,jourrapide,0,Invited,no_referrals_sent,0,0.090909,11
2535,03470,2013-01-05 08:08:30,Humphries Evan,EvanHumphries@gmail.com,GUEST_INVITE,2013-03-22 08:08:30,0,0,327,01775,441.0,,0,,,,,4,441,2013-01-05,517,76,129.2,1,gmail,2,Invited,sent_referrals,0,0.071429,14
5960,08173,2012-10-14 12:19:57,Code Emma,EmmaCode@gustr.com,SIGNUP,2012-11-18 12:19:57,0,0,163,Not Invited,565.0,,0,,,,,4,565,2012-10-14,600,35,150.0,10,gustr,0,Not Invited,no_referrals_sent,0,0.333333,21
6027,08254,2014-04-22 07:11:26,Fernandes Luis,LuisDiasFernandes@yahoo.com,ORG_INVITE,2014-05-25 07:11:26,0,0,250,07572,12.0,,0,,,,,4,12,2014-04-22,45,33,11.2,4,yahoo,0,Invited,no_referrals_sent,0,0.250000,12


In [55]:
new_df = all_users_df.copy()

new_df["is_referrer"] = new_df["sent_referrals"].apply(
    lambda x: 1 if x in ("sent_referrals") else 0)

# Test the Active Level column
new_df["is_home_project"] = new_df["creation_source"].apply(
    lambda x: 1 if x in ("GUEST_INVITE", "PERSONAL_PROJECTS") else 0)

new_df["active_level"] = (new_df["is_referrer"] + new_df["is_home_project"] +
                          new_df["invited_self"])
hv.Layout([
    plot_cat_bars(new_df, ["active_level"]),
    plot_cat_active_stacked_bars(new_df, ["active_level"]),
])

In [56]:
# combine the domains to 'all_others' except for hotmail and yahoo
new_df["new_domain"] = new_df["domain"].apply(
    lambda x: "all_others" if x not in ("hotmail", "yahoo") else x)
(plot_cat_active_stacked_bars(new_df, ["new_domain"]) +
 plot_cat_bars(new_df, ["new_domain"]))

In [57]:
# create feature as the sum of the 2 spam columns
new_df["spam_prod"] = (new_df["opted_in_to_mailing_list"] *
                       new_df["enabled_for_marketing_drip"])
new_df["spam_sum"] = (new_df["opted_in_to_mailing_list"] +
                      new_df["enabled_for_marketing_drip"])
hv.Layout([
    plot_cat_active_stacked_bars(new_df, ["spam_prod"]),
    plot_cat_active_stacked_bars(new_df, ["spam_sum"]),
])

No difference with the combining the spam features. Now we try combining the `new_domain` with the spam features.

In [58]:
# multiply the spam prod and the domain columns
new_df["spam_prod"] = new_df["spam_prod"].astype("string")
new_df["spam_domain"] = new_df["new_domain"] + "_" + new_df["spam_prod"]
(plot_cat_active_stacked_bars(new_df, ["spam_domain"]) +
 plot_cat_bars(new_df, ["spam_domain"]))

- Only the yahoo domain seems to respond positively to the email marketing. 
- Hotmail domains were negatively affected by the email marketing. 
- All the others did not have much of a difference. (< 1% difference)


In [59]:
new_df["spam_source"] = (new_df["creation_source"] + "_" +
                         new_df["spam_prod"].astype("string"))

(plot_cat_active_stacked_bars(new_df, ["spam_source"]) +
 plot_cat_bars(new_df, ["spam_source"]))

- Those who `creation_source` is `PERSONAL_PROJECTS` have a positive response to the email marketing. 
- All others are indifferent ( < 2% difference).


In [60]:
new_df["spam_referrals"] = (new_df["sent_referrals"] + "_" +
                            new_df["spam_prod"].astype("string"))

(plot_cat_active_stacked_bars(new_df, ["spam_referrals"]) +
 plot_cat_bars(new_df, ["spam_referrals"]))

In [61]:
all_users_df.dtypes.sort_index()

account_age                              int64
account_age_at_adopt                   float64
account_age_last_login                   int64
adopted_user                             int32
avg_time_bet_logins                    float64
avg_time_bet_logins_at_adopt           float64
creation_month                           int32
creation_source                         object
creation_time                   datetime64[ns]
date                                    object
days_since_last_login                    int64
domain                                  object
email                                   object
enabled_for_marketing_drip               int64
frequency                                int32
invited_by_user_id              string[python]
invited_self                             int32
last_session_creation_time      datetime64[ns]
login_count_at_adopt                   float64
name                                    object
num_referrals                            int32
opted_in_to_m

In [62]:
# spam_domain feature
all_users_df["new_domain"] = all_users_df["domain"].apply(
    lambda x: "all_others" if x not in ("hotmail", "yahoo") else x)
all_users_df["spam_prod"] = (all_users_df["opted_in_to_mailing_list"] *
                             all_users_df["enabled_for_marketing_drip"])
all_users_df["spam_domain"] = (all_users_df["new_domain"] + "_" +
                               all_users_df["spam_prod"].astype("string"))
# active_level feature
all_users_df["is_referrer"] = all_users_df["sent_referrals"].apply(
    lambda x: 1 if x in ("sent_referrals") else 0)
all_users_df["is_home_project"] = all_users_df["creation_source"].apply(
    lambda x: 1 if x in ("GUEST_INVITE", "PERSONAL_PROJECTS") else 0)
all_users_df["active_level"] = (all_users_df["is_referrer"] +
                                all_users_df["is_home_project"] +
                                all_users_df["invited_self"])

In [63]:
# check chi squared test for the categorical columns
import itertools as it
chi2_cols = [
    "active_level",
    "spam_domain",
    "creation_month",
    "sent_referrals",
]
chi2_cols.sort()

# Get all unique pairs of variables
pairs = list(it.combinations(chi2_cols, 2))

# Initialize a list to store the results
results = []

for pair in pairs:
    contingency_table = pd.crosstab(new_df[pair[0]], new_df[pair[1]])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    # Append the results to the list
    results.append([pair[0], pair[1], chi2, p])

# Convert the results to a DataFrame
results_df = pd.DataFrame(
    results,
    columns=['Variable 1', 'Variable 2', 'Chi-square Statistic', 'P-value'])

# Display the results
sorted_results_df = results_df.sort_values('Variable 1', )
sorted_results_df

Unnamed: 0,Variable 1,Variable 2,Chi-square Statistic,P-value
0,active_level,creation_month,670.61107,6.734566e-120
1,active_level,sent_referrals,4376.431551,0.0
2,active_level,spam_domain,37.622975,0.00102551
3,creation_month,sent_referrals,1210.841732,7.459246e-253
4,creation_month,spam_domain,71.67085,0.06493931
5,sent_referrals,spam_domain,11.424526,0.04358313


In [198]:
all_users_df["recency_over_account_age"] = (
    all_users_df["recency"] / all_users_df["account_age"]
)

In [199]:
categorical_features = [
    # "spam_domain",
    # "creation_month",
    # 'active_level',
]
# numerical_features = ["days_since_last_login", 'account_age']
numerical_features = ["recency_over_account_age"]
X = all_users_df[numerical_features]  # + categorical_features]
y = all_users_df["adopted_user"]

In [200]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=628, stratify=y
)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (7058, 1), X_test shape: (1765, 1)
y_train shape: (7058,), y_test shape: (1765,)


In [201]:
from sklearn.preprocessing import StandardScaler

In [184]:
# create a pipeline for numerical columns
numerical_pipeline = Pipeline(
    steps=[
        ("scaler", RobustScaler()),
    ]
)

# create a pipeline for categorical columns
categorical_pipeline = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_features),
        # ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)

In [185]:
# Models dict
models = {}
models["LogisticRegression"] = (
    LogisticRegression(
        random_state=628,
        n_jobs=-1,
        max_iter=1000,
    ),
    {
        "classifier__C": np.logspace(-3, 3, 7),
        "classifier__class_weight": ["balanced", None],
        "classifier__solver":
        ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    },
)

models["LightGBM"] = (
    LGBMClassifier(
        random_state=628,
        n_jobs=-1,
        is_unbalance=True,
        num_leaves=31,
        boosting_type="gbdt",
        verbose=-1,
    ),
    {
        "classifier__learning_rate": stats.uniform(0.01, 0.5),
        "classifier__reg_alpha": stats.uniform(0.0, 0.05),
        "classifier__reg_lambda": stats.uniform(0.0, 0.05),
        "classifier__min_child_samples": stats.randint(100, 500),
        "classifier__min_data_in_leaf": stats.randint(20, 100),
    },
)

models["XGBoost"] = (
    XGBClassifier(
        random_state=628,
        scale_pos_weight=sum(y_train == 0) / sum(y_train == 1),
    ),
    {
        "classifier__n_estimators": stats.randint(100, 1000),
        "classifier__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.4, 0.5],
        "classifier__max_depth": stats.randint(3, 10),
        "classifier__subsample": stats.uniform(0.6, 0.4),
        "classifier__colsample_bytree": stats.uniform(0.6, 0.4),
        "classifier__colsample_bylevel": stats.uniform(0.6, 0.4),
        "classifier__min_child_weight": stats.randint(1, 200),
    },
)

In [203]:
# Create an empty dictionary to store the best models
best_models = {}

# Loop through each model and perform random search
for model_name, (model, param_grid) in tqdm(models.items(), desc="Model Tuning"):
    # Create a pipeline for the model
    model_pipeline = Pipeline(
        steps=[
            # ("preprocessor", preprocessor),
            ("classifier", model),
        ]
    )

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        model_pipeline,
        param_grid,
        n_iter=50,
        cv=5,
        random_state=628,
        scoring="recall",
        n_jobs=-1,
    )

    # Fit the model
    random_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = random_search.best_estimator_

    # Print the best parameters and ROC AUC score for each model
    print(f"\nBest Parameters for {model_name}: {random_search.best_params_}")
    # Print the mean cross-validation score for the best parameters
    best_index = random_search.best_index_
    mean_cv_score = random_search.cv_results_["mean_test_score"][best_index]
    print(f"{model_name} - Mean CV Score for Best Parameters: {mean_cv_score:.3f}")
    y_pred_proba_tuned = random_search.best_estimator_.predict_proba(X_test)[:, 1]
    y_pred = best_models[model_name].predict(X_test)
    roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)
    recall_score_tuned = recall_score(y_test, y_pred)

    print(f"{model_name} - Tuned Model ROC AUC Score: {roc_auc_tuned:.3f}")
    print(f"{model_name} - Tuned Model Recall Score:{recall_score_tuned:.3f}")

Model Tuning:   0%|          | 0/3 [00:00<?, ?it/s]


Best Parameters for LogisticRegression: {'classifier__solver': 'saga', 'classifier__class_weight': 'balanced', 'classifier__C': 1000.0}
LogisticRegression - Mean CV Score for Best Parameters: 0.869
LogisticRegression - Tuned Model ROC AUC Score: 0.982
LogisticRegression - Tuned Model Recall Score:0.861

Best Parameters for LightGBM: {'classifier__learning_rate': 0.488011690586939, 'classifier__min_child_samples': 258, 'classifier__min_data_in_leaf': 24, 'classifier__reg_alpha': 0.015148096611211454, 'classifier__reg_lambda': 0.006194735076478092}
LightGBM - Mean CV Score for Best Parameters: 0.957
LightGBM - Tuned Model ROC AUC Score: 0.981
LightGBM - Tuned Model Recall Score:0.958

Best Parameters for XGBoost: {'classifier__colsample_bylevel': 0.9951363139606418, 'classifier__colsample_bytree': 0.9927628170122282, 'classifier__learning_rate': 0.5, 'classifier__max_depth': 7, 'classifier__min_child_weight': 27, 'classifier__n_estimators': 845, 'classifier__subsample': 0.71841952356744

In [205]:
# feature_names = (best_models["LogisticRegression"].named_steps["preprocessor"].
#                  get_feature_names_out())

lr_importance = best_models["LogisticRegression"].named_steps["classifier"].coef_[0]

xgb_feature_importance = (

    best_models["XGBoost"].named_steps["classifier"].feature_importances_
)

lgbm_feature_importance = (

    best_models["LightGBM"].named_steps["classifier"].feature_importances_
)

In [206]:
# Create a DataFrame for easy visualization
feature_importances_df = pd.DataFrame(
    {
        "Logistic Regression": lr_importance,
        "XGBoost": xgb_feature_importance,
        "Light GBM": lgbm_feature_importance,
    },
    # index=feature_names,
    index=numerical_features,
)


def highlight_topn(s):
    is_top3 = s.isin(s.nlargest(4))
    return ["background-color: salmon" if v else "" for v in is_top3]


# Sort features by importance
feature_importances_df.sort_values("Logistic Regression", ascending=False, inplace=True)
# highlight the 3 highest values in each column
feature_importances_df.style.apply(highlight_topn, axis=0)
# feature_importances_df.round(4)

Unnamed: 0,Logistic Regression,XGBoost,Light GBM
recency_over_account_age,-8.126869,1.0,3000


In [189]:
# Plot feature importances
active_opts = dict(
    active_tools=["box_zoom"],
)
hv.Layout(
    [
        feature_importances_df[col]
        .hvplot.barh(
            y=col,
            xlabel="",
            #  title with model name
            title=f"Feature Importance {col}",
            height=200,
        )
        .opts(**active_opts)
        for col in feature_importances_df.columns
    ],
).cols(2)

In [207]:
from sklearn.metrics import average_precision_score, precision_recall_curve

In [208]:
def get_curve_and_confusion_matrix(model, X_test, y_test, curve_type="roc"):
    """Get the ROC AUC or Precision-Recall curve and the confusion matrix for the model.
    curve_types are 'roc' or 'pr' for ROC AUC or Precision-Recall curve, respectively.
    """
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    if curve_type == "roc":
        # Get the ROC AUC score
        score = roc_auc_score(y_test, y_pred_proba)
        # Get the curve values
        fpr, tpr, thresh = roc_curve(y_test, y_pred_proba)
        # Calculate the optimal threshold
        J = tpr - fpr
        optimal_idx = np.argmax(J)
        optimal_threshold = thresh[optimal_idx]
        # Plot title
        title = f"ROC Curve (AUC: {score:.2f} || opt thresh:{optimal_threshold:.2f})"
        # Curve plot
        curve_plot = hv.Curve((fpr, tpr))
    elif curve_type == "pr":
        # Get the average precision score
        score = average_precision_score(y_test, y_pred_proba)
        # Get the curve values
        precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
        # Calculate the optimal threshold
        optimal_idx = np.argmax(precision + recall)
        optimal_threshold = thresholds[optimal_idx]
        # Plot title
        title = f"PR Curve (AP: {score:.2f} || opt thresh:{optimal_threshold:.2f})"
        # Curve plot
        curve_plot = hv.Curve((recall, precision))
    else:
        raise ValueError("Invalid curve_type. Choose either 'roc' or 'pr'.")

    # Use the optimal threshold to convert probabilities into class predictions
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)

    curve_plot = curve_plot.opts(
        title=title,
        xlabel="False Positive Rate" if curve_type == "roc" else "Recall",
        ylabel="True Positive Rate" if curve_type == "roc" else "Precision",
        line_width=2,
        height=400,
        width=400,
        tools=["hover"],
        active_tools=["box_zoom"],
        xlim=(0, 1.01),
        ylim=(0, 1.01),
        yaxis="left" if curve_type == "roc" else "right",
    )

    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix = pd.DataFrame(
        conf_matrix,
        index=["Actual 0", "Actual 1"],
        columns=["Predicted 0", "Predicted 1"],
    )
    print(classification_report(y_test, y_pred))
    conf_heatmap = conf_matrix.hvplot.heatmap(
        height=400, width=400, colorbar=False, title="Confusion Matrix"
    ).opts(
        **active_opts,
        invert_yaxis=True,
        color_levels=7,
        line_color="white",
        line_width=2,
    )
    # Create a DataFrame for labels
    labels_df = pd.DataFrame(conf_matrix.stack(), columns=["value"]).reset_index()
    labels_df.columns = ["y", "x", "value"]

    # Create labels
    labels = hv.Labels(labels_df, ["x", "y"], "value")

    return hv.Layout(
        [
            (conf_heatmap * labels.opts(text_color="gray", text_font_size="14pt")),
            curve_plot,
        ]
    ).cols(2)

Logistic Regression

In [209]:
lr_model = best_models["LogisticRegression"]
get_curve_and_confusion_matrix(lr_model, X_test, y_test, curve_type="pr")

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      1434
           1       0.80      0.90      0.85       331

    accuracy                           0.94      1765
   macro avg       0.89      0.92      0.90      1765
weighted avg       0.94      0.94      0.94      1765



XGBoost

In [211]:
xgboost_model = best_models["XGBoost"]
get_curve_and_confusion_matrix(xgboost_model, X_test, y_test, curve_type="pr")

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1434
           1       0.82      0.88      0.85       331

    accuracy                           0.94      1765
   macro avg       0.90      0.92      0.90      1765
weighted avg       0.94      0.94      0.94      1765



LightGBM

In [212]:
lgbm_model = best_models["LightGBM"]
get_curve_and_confusion_matrix(lgbm_model, X_test, y_test, curve_type="pr")

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      1434
           1       0.76      0.92      0.84       331

    accuracy                           0.93      1765
   macro avg       0.87      0.93      0.90      1765
weighted avg       0.94      0.93      0.93      1765

