In [1]:
from IPython.display import clear_output
from IPython.display import display
from pathlib import Path
import pandas as pd
import holoviews as hv
import hvplot.pandas
from panel import widgets as pnw
import panel as pn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (
    RobustScaler,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy import stats
from tqdm.notebook import tqdm
import helper_functions as hf

clear_output()

In [2]:
hv.extension("bokeh")
hvplot.extension("bokeh")
pn.extension("tabulator", sizing_mode="stretch_width", template="fast")
pn.config.throttled = True
# for setting the number of columns to display in the notebook
pd.set_option("display.max_columns", 50)
clear_output()

Use helper functions to get the file

In [3]:
zip_path = Path("data/relax_challenge.zip")

# use the functions to extract the zipfile
if hf.check_zipfile(zip_path):
  target_dir = zip_path.parent / zip_path.stem
  hf.create_target_directory(target_dir)
  hf.extract_zipfile(zip_path, target_dir)

Extracted data\relax_challenge.zip to data\relax_challenge


### Read in user engagement data

In [4]:
user_engagement_path = (
    "./data/relax_challenge/relax_challenge/takehome_user_engagement.csv")

users_path = "data/relax_challenge/relax_challenge/takehome_users.csv"

users_engagement_df = pd.read_csv(user_engagement_path)
display(users_engagement_df.sample(3))
users_engagement_df.info()
display(users_engagement_df.describe(include="all").round(2).T.fillna(""))
# drop visited column
users_engagement_df.drop(columns="visited", inplace=True)

# convert time_stamp to datetime
users_engagement_df["time_stamp"] = pd.to_datetime(
    users_engagement_df["time_stamp"])
# # Convert user_id to string

Unnamed: 0,time_stamp,user_id,visited
130470,2012-10-04 08:42:10,7493,1
202867,2013-05-23 19:17:16,11642,1
48759,2013-10-02 18:12:42,2881,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
time_stamp,207917.0,207220.0,2013-04-06 21:21:37,2.0,,,,,,,
user_id,207917.0,,,,5913.31,3394.94,1.0,3087.0,5682.0,8944.0,12000.0
visited,207917.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# show the description of the dataframe
display(users_engagement_df.describe(include="all").round(2).T.fillna(""))
# get value counts of user_id
user_id_counts = users_engagement_df["user_id"].value_counts()
print(f"number of unique users: {len(user_id_counts)}")
user_id_counts

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
time_stamp,207917.0,2013-10-30 05:06:45.648763648,2012-05-31 08:20:06,2013-07-16 20:17:21,2013-12-03 06:38:34,2014-03-13 08:00:24,2014-06-06 14:58:50,
user_id,207917.0,5913.31,1.0,3087.0,5682.0,8944.0,12000.0,3394.94


number of unique users: 8823


user_id
3623     606
906      600
1811     593
7590     590
8068     585
        ... 
4699       1
4698       1
4697       1
4696       1
12000      1
Name: count, Length: 8823, dtype: int64

Only 8,823 accounts out of 12,000 logged in and were found in the engagement dataset. The other accounts we do not have any login/engagement data for.

Find the `adopted_user`s.<br>
**Criteria: Users with 3 logins in a 7-day period.**

In [6]:
# Convert the timestamp to date
users_engagement_df["date"] = pd.to_datetime(
    users_engagement_df["time_stamp"]).dt.date

# Drop duplicates based on user_id and date to get unique logins
user_engagement_datewise = users_engagement_df[[
    "user_id", "date"
]].drop_duplicates(subset=["user_id", "date"])

# Sort the DataFrame by user_id and date
user_engagement_datewise.sort_values(by=["user_id", "date"], inplace=True)

# Calculate the difference in days between the current and previous login date for each user
user_engagement_datewise["date_diff"] = user_engagement_datewise.groupby(
    "user_id")["date"].diff()
user_engagement_datewise["date_diff"] = user_engagement_datewise[
    "date_diff"].apply(lambda x: pd.Timedelta(x).days)

# Fill NA values with 0
user_engagement_datewise["date_diff"] = user_engagement_datewise[
    "date_diff"].fillna(0)

# Calculate the difference in days between the current and the login date two rows back for each user
user_engagement_datewise["date_diff_2"] = (user_engagement_datewise.groupby(
    "user_id")["date"].diff(2).apply(lambda x: pd.Timedelta(x).days))

# Filter the DataFrame to get users with 3 logins in a 7-day period
adopted_users_df = (user_engagement_datewise[
    user_engagement_datewise["date_diff_2"] < 8].groupby("user_id").first())

# Reset the index and rename the columns
adopted_users_df = adopted_users_df[["date"]].reset_index()
adopted_users_df.columns = ["user_id", "date_became_adopted"]

# Mark these users as adopted users (this is the target variable)
adopted_users_df["adopted_user"] = 1
display(adopted_users_df.head())

adopted_users = adopted_users_df["user_id"].unique()
print(f"Number of adopted users: {len(adopted_users)}")

Unnamed: 0,user_id,date_became_adopted,adopted_user
0,2,2014-02-09,1
1,10,2013-02-06,1
2,20,2014-03-13,1
3,33,2014-03-23,1
4,42,2012-12-25,1


Number of adopted users: 1656


In [7]:
# get the Dataframe with the user_id and the adopted_user column
# with all the users
target_df = users_engagement_df[["user_id"]].drop_duplicates()
target_df["adopted_user"] = target_df["user_id"].apply(
    lambda x: 1 if x in adopted_users else 0
)
target_df.set_index("user_id", inplace=True)
display(target_df.head())

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0
2,1
3,0
4,0
5,0


In [8]:
# save the target_df to a csv file
target_path = 'data/relax_challenge/relax_challenge/takehome_users_target.csv'

target_df.to_csv(target_path)

Small number of adopted users, 13% of the users are adopted. The dataset is imbalanced.

In [9]:
# get the highest date in the dataset
max_timestamp = users_engagement_df["time_stamp"].max()
print(f"The most recent user_login date is: \n{max_timestamp}")

min_timestamp = users_engagement_df["time_stamp"].min()
print(f"The earliest user_login date is: \n{min_timestamp}")

The most recent user_login date is: 
2014-06-06 14:58:50
The earliest user_login date is: 
2012-05-31 08:20:06


In [10]:
# Get a cumcount of the number of logins for each user at each row
user_engagement_datewise["login_count"] = (
    user_engagement_datewise.groupby("user_id").cumcount() + 1)
# get a cumsum of the number of days since the first login for each user at each row
user_engagement_datewise[
    "days_since_first_login"] = user_engagement_datewise.groupby(
        "user_id")["date_diff"].cumsum()
# get the avg time between logins for each user at each row
user_engagement_datewise["avg_time_between_logins"] = (
    user_engagement_datewise["days_since_first_login"] /
    user_engagement_datewise["login_count"]).round(1)

user_engagement_datewise.loc[
    user_engagement_datewise["avg_time_between_logins"] > 0]
# filter the user_engagement_datewise to get the rows of users who are adopted
user_engagement_datewise[user_engagement_datewise["user_id"].isin(
    adopted_users)]
# get the login_count number when they became an adopted user
# merge on the user_id and the date_became_adopted with user_id and date respectively
adopted_users_df = adopted_users_df.rename(
    columns={"date_became_adopted": "date"})
adoption_df = adopted_users_df.merge(user_engagement_datewise,
                                     on=["user_id", "date"])
# get a cumulative count of the adopted users in chronological order
adoption_df["date"] = pd.to_datetime(adoption_df["date"])
adoption_df.sort_values("date", inplace=True)
adoption_df["cum_adopted"] = adoption_df["adopted_user"].cumsum()
display(adoption_df.head())
# plot the cumulative adopted users
adoption_df.sort_values(by=["date"]).hvplot.scatter(
    x="date", y="cum_adopted", size=5,
    title="Cumulative Adopted Users").opts(active_tools=["box_zoom"])

Unnamed: 0,user_id,date,adopted_user,date_diff,date_diff_2,login_count,days_since_first_login,avg_time_between_logins,cum_adopted
224,1693,2012-06-10,1,1.0,5.0,4,10.0,2.5,1
106,728,2012-06-16,1,1.0,5.0,3,5.0,1.7,2
206,1525,2012-06-16,1,5.0,7.0,3,7.0,2.3,3
1624,11764,2012-06-17,1,2.0,4.0,3,4.0,1.3,4
1020,7590,2012-06-18,1,5.0,7.0,3,7.0,2.3,5


In [11]:
# Get the first adoption date
first_adoption = adoption_df["date"].min()
print(f"The first adoption date is: {first_adoption}")

# Calculate the rate of cumulative adopted users
adoption_df["days_since_first_adoption"] = (adoption_df["date"] -
                                            first_adoption).dt.days
adoption_df["adoption_rate"] = (
    adoption_df["cum_adopted"] /
    adoption_df["days_since_first_adoption"]).round(2)

# Plot the adoption rate over time
adoption_df.hvplot.scatter(x="days_since_first_adoption",
                           y="adoption_rate",
                           title="Adoption Rate",
                           size=5).opts(active_tools=["box_zoom"])

The first adoption date is: 2012-06-10 00:00:00


In [12]:
# get the count of the adopted users for each month
adoption_df["month"] = adoption_df["date"].dt.to_period("M")
adoption_df["month_adoption_count"] = adoption_df.groupby(
    "month")["adopted_user"].transform("sum")
# check the adoptions for each month and plot on a bar chart
monthly_adoption_count = (adoption_df[["month", "month_adoption_count"
                                       ]].drop_duplicates().set_index("month"))
monthly_adoption_count.hvplot() * monthly_adoption_count.hvplot.scatter(
    height=600, title="Adoptions per Month").opts(active_tools=["box_zoom"])

In [13]:
# get the count of the adopted users for each week
adoption_df["week"] = adoption_df["date"].dt.to_period("W")
adoption_df["week_adoption_count"] = adoption_df.groupby(
    "week")["adopted_user"].transform("sum")

# check the weekly adoptions
weekly_adoption_count = (adoption_df[["date", "adopted_user"
                                      ]].set_index("date").resample("W").sum())
weekly_adoption_count_plot = weekly_adoption_count.hvplot().opts(alpha=0.2)
# smooth out the weekly adoptions
(weekly_adoption_count.rolling(window=4, center=True,
                               min_periods=1).mean().hvplot() *
 weekly_adoption_count_plot).opts(
     active_tools=["box_zoom"],
     height=500,
     width=1000,
     show_grid=True,
     xlabel="",
     title="Weekly Adoptions and Rolling Mean of Weekly Adoptions",
 )
# adoption_df

In [14]:
# get the count of the adopted users for each day
adoption_df["day"] = adoption_df["date"].dt.to_period("D")
adoption_df["day_adoption_count"] = adoption_df.groupby(
    "day")["adopted_user"].transform("sum")
# check the daily adoptions
daily_adoption_count = (adoption_df[["date", "adopted_user"
                                     ]].set_index("date").resample("D").sum())
# Smooth out the daily adoptions
daily_adoption_count_plot = daily_adoption_count.hvplot().opts(alpha=0.2)
(daily_adoption_count.rolling(window=28, center=True,
                              min_periods=1).mean().hvplot() *
 daily_adoption_count_plot).opts(
     active_tools=["box_zoom"],
     height=500,
     width=1000,
     show_grid=True,
     xlabel="",
     title="Daily Adoptions and Rolling Mean of Daily Adoptions",
 )

In [15]:
user_engagement_datewise.head()

Unnamed: 0,user_id,date,date_diff,date_diff_2,login_count,days_since_first_login,avg_time_between_logins
0,1,2014-04-22,0.0,,1,0.0,0.0
1,2,2013-11-15,0.0,,1,0.0,0.0
2,2,2013-11-29,14.0,,2,14.0,7.0
3,2,2013-12-09,10.0,24.0,3,24.0,8.0
4,2,2013-12-25,16.0,26.0,4,40.0,10.0


### Read in users data

In [16]:
# Read in the users dataset
users_df = pd.read_csv(users_path, encoding="latin-1")
users_df.info()
display(users_df.sample(3))
print(f"Statistics of the users dataframe")
users_df.describe(include="all").round(2).T.sort_values(by="unique").fillna("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
10836,10837,2012-08-31 15:51:31,Berry Gabriel,GabrielBerry@gmail.com,GUEST_INVITE,1346428000.0,0,0,4,5920.0
4145,4146,2014-02-22 22:41:17,Lima Marisa,MarisaBarbosaLima@yahoo.com,GUEST_INVITE,1395182000.0,0,0,352,5678.0
914,915,2013-10-27 09:31:58,Azevedo Bruna,BrunaMartinsAzevedo@jourrapide.com,SIGNUP,1383039000.0,0,0,190,


Statistics of the users dataframe


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
creation_source,12000.0,5.0,ORG_INVITE,4254.0,,,,,,,
name,12000.0,11355.0,Araujo Gabriela,5.0,,,,,,,
email,12000.0,11980.0,AlfieLane@yahoo.com,2.0,,,,,,,
creation_time,12000.0,11996.0,2014-02-11 17:57:53,2.0,,,,,,,
object_id,12000.0,,,,6000.5,3464.25,1.0,3000.75,6000.5,9000.25,12000.0
last_session_creation_time,8823.0,,,,1379279305.7,19531160.79,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0
opted_in_to_mailing_list,12000.0,,,,0.25,0.43,0.0,0.0,0.0,0.0,1.0
enabled_for_marketing_drip,12000.0,,,,0.15,0.36,0.0,0.0,0.0,0.0,1.0
org_id,12000.0,,,,141.88,124.06,0.0,29.0,108.0,238.25,416.0
invited_by_user_id,6417.0,,,,5962.96,3383.76,3.0,3058.0,5954.0,8817.0,11999.0


In [17]:
users_df[users_df['last_session_creation_time'].isna()]

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,
11,12,2014-04-17 23:48:38,Mathiesen Lærke,LaerkeLMathiesen@cuvox.de,ORG_INVITE,,0,0,130,9270.0
14,15,2013-07-16 21:33:54,Theiss Ralf,RalfTheiss@hotmail.com,PERSONAL_PROJECTS,,0,0,175,
15,16,2013-02-11 10:09:50,Engel René,ReneEngel@hotmail.com,PERSONAL_PROJECTS,,0,0,211,
...,...,...,...,...,...,...,...,...,...,...
11975,11976,2013-12-25 22:01:41,Kohl Leah,LeahKohl@hotmail.com,PERSONAL_PROJECTS,,0,0,248,
11977,11978,2014-04-23 16:28:06,Castro Pedro,PedroCunhaCastro@gustr.com,PERSONAL_PROJECTS,,1,0,29,
11984,11985,2013-07-08 17:23:26,Jespersen Marcus,MarcusTJespersen@cuvox.de,PERSONAL_PROJECTS,,0,0,74,
11992,11993,2013-03-28 23:24:21,Townsend Isabel,IsabelTownsend@cuvox.de,PERSONAL_PROJECTS,,0,0,281,


In [18]:
users_df["last_login"] = pd.to_datetime(users_df["last_session_creation_time"],
                                        unit="s")
users_df["creation_time"] = pd.to_datetime(users_df["creation_time"])
display(users_df.sample(3))
# describe the 2 time columns
display(users_df[["creation_time",
                  "last_login"]].describe(include="all").fillna(""))

# Calculate the recency of the user's last session
users_df["recency"] = (max_timestamp - users_df["last_login"]).dt.days
# Describe the recency column
users_df.recency.describe().round()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,last_login
2946,2947,2013-09-26 01:58:05,Fiedler Lucas,LucasFiedler@gustr.com,SIGNUP,1380420000.0,1,0,140,,2013-09-29 01:58:05
9125,9126,2012-09-10 02:34:41,Kjær Mette,MetteCKjaer@yahoo.com,SIGNUP,,1,0,21,,NaT
4594,4595,2013-01-07 14:28:17,Webster Oscar,dhbhirot@cfqpe.com,SIGNUP,1357742000.0,0,0,0,,2013-01-09 14:28:17


Unnamed: 0,creation_time,last_login
count,12000,8823
mean,2013-07-16 13:25:32.964499968,2013-09-15 21:08:25.700441856
min,2012-05-31 00:43:27,2012-05-31 08:20:06
25%,2013-01-15 21:28:22.750000128,2013-03-13 17:16:05
50%,2013-08-05 21:35:19.500000,2013-10-27 15:41:10
75%,2014-01-28 10:20:12.249999872,2014-04-25 16:16:44
max,2014-05-30 23:59:19,2014-06-06 14:58:50


count    8823.0
mean      263.0
std       226.0
min         0.0
25%        41.0
50%       221.0
75%       449.0
max       736.0
Name: recency, dtype: float64

- A third of the `last_login` are missing. Only have values for `8,823` users, which is the same number of users that logged in.
- Half of the `last_login` values are before `2013-10-27`.
- The max of the `creation_time` is `2014-05-30` which is 1 week before the max of `last_login` which is `2014-06-06`.

In [19]:
users_df['creation_time'].sort_values().reset_index(drop=True).hvplot(grid=True,
                                                                      height=600, width=800, title="User Creation Time").opts(active_tools=["box_zoom"])

In [20]:
# Create a histogram of the recency column
users_df["recency"].hvplot.hist(title="Recency of Last Session",
                                bins=20,
                                color="orange").opts(active_tools=["box_zoom"],
                                                     height=300,
                                                     width=600)

In [21]:
# examine the missing values for the last_login
display(users_df[users_df["last_login"].isna()].describe(
    include="all").T.fillna(""))
# see if the object_id of the null dataframe is in the engagement data as the user_id
null_df = users_df[users_df["last_login"].isna()]
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()
print(f"Number of users with missing last_login: {len(null_df):,}")

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
object_id,3177.0,,,,5946.84073,8.0,2919.0,5941.0,8909.0,11994.0,3464.211901
creation_time,3177.0,,,,2013-07-10 17:30:15.857412608,2012-05-31 18:12:49,2013-01-09 05:27:00,2013-07-21 05:44:10,2014-01-23 05:34:03,2014-05-30 22:34:31,
name,3177.0,3133.0,Rodrigues Ryan,3.0,,,,,,,
email,3177.0,3176.0,ThomasBrandt@gmail.com,2.0,,,,,,,
creation_source,3177.0,4.0,PERSONAL_PROJECTS,1347.0,,,,,,,
last_session_creation_time,0.0,,,,,,,,,,
opted_in_to_mailing_list,3177.0,,,,0.241737,0.0,0.0,0.0,0.0,1.0,0.428203
enabled_for_marketing_drip,3177.0,,,,0.141958,0.0,0.0,0.0,0.0,1.0,0.349062
org_id,3177.0,,,,139.974819,0.0,28.0,104.0,237.0,415.0,123.723159
invited_by_user_id,1641.0,,,,5911.913467,7.0,2998.0,5978.0,8664.0,11999.0,3353.670938


Number of users with missing last_login: 3,177


In [22]:
# Missing value data for org_id
users_df.loc[users_df["last_login"].isna()]["org_id"].value_counts(
).sort_index().hvplot(title="Org ID of rows with missing last_login",
                      color="silver").opts(active_tools=["box_zoom"],
                                           height=300,
                                           width=600)

We found that `3177` users were missing data from the `last_login` column. These users were also missing data from the `user_engagement` dataset. We therefore could not determine if these users were adopted users or not. Although this was a sizeable portion of the dataset, we decided to drop these users from the dataset.

In [23]:
# non null data
users_df[~users_df["last_login"].isna()]["org_id"].value_counts().sort_index(
).hvplot()

non_null_df = users_df[~users_df["last_login"].isna()]
non_null_df["org_id"].value_counts().sort_index().hvplot(
    title="Org ID of rows with non-missing last_login",
    color="orange").opts(active_tools=["box_zoom"], height=300, width=600)

In [24]:
# drop the rows with the missing last_login
users_df = users_df[~users_df["last_login"].isna()]
users_df.info()
# look at the missing values
users_df.isna().sum()
print(
    f"\nNumber of missing values in invited_by_user_id: \n{users_df['invited_by_user_id'].isna().sum()}"
)

# investigate the missing values in invited_by_user_id
users_df[users_df["invited_by_user_id"].isna()].head(3)

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   float64       
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          4776 non-null   float64       
 10  last_login                  8823 non-null   datetime64[ns]
 11  recency                     8823 non-null   float64       
d

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,last_login,recency
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,,2012-12-20 13:24:32,533.0
10,11,2013-12-26 03:55:54,Paulsen Malthe,MaltheAPaulsen@gustr.com,SIGNUP,1388117000.0,0,0,69,,2013-12-27 03:55:54,161.0
13,14,2012-10-11 16:14:33,Rivera Bret,BretKRivera@gmail.com,SIGNUP,1350058000.0,0,0,0,,2012-10-12 16:14:33,601.0


In [25]:
null_df = users_df[users_df["invited_by_user_id"].isna()]
# Check if the object_id of the null dataframe is in the engagement data as the user_id
null_df["object_id"].isin(users_engagement_df["user_id"]).sum()

4047

All these rows with null values are IN the `engagement_dataset`. We can use these rows. The missing values is not a problem . They are because noone invited these users as they are not from an `ORG_INVITE`  nor a `GUEST_INVITE`. We can fill these missing values with `00000` as that is not a `user_id`.

In [26]:
# Look at the count of the creation_source and the org_id
display(null_df["creation_source"].value_counts())
# Look at the invited_by_user_id for the null dataframe
users_df["invited_by_user_id"] = users_df["invited_by_user_id"].fillna(0)
# look at the info and description of the dataframe
users_df.info()
users_df.describe(include="all").T.sort_values(by="unique").fillna("")

creation_source
SIGNUP                1898
SIGNUP_GOOGLE_AUTH    1385
PERSONAL_PROJECTS      764
Name: count, dtype: int64

<class 'pandas.core.frame.DataFrame'>
Index: 8823 entries, 0 to 11999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   8823 non-null   int64         
 1   creation_time               8823 non-null   datetime64[ns]
 2   name                        8823 non-null   object        
 3   email                       8823 non-null   object        
 4   creation_source             8823 non-null   object        
 5   last_session_creation_time  8823 non-null   float64       
 6   opted_in_to_mailing_list    8823 non-null   int64         
 7   enabled_for_marketing_drip  8823 non-null   int64         
 8   org_id                      8823 non-null   int64         
 9   invited_by_user_id          8823 non-null   float64       
 10  last_login                  8823 non-null   datetime64[ns]
 11  recency                     8823 non-null   float64       
d

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
creation_source,8823.0,5.0,ORG_INVITE,3188.0,,,,,,,
name,8823.0,8453.0,Correia Leonardo,4.0,,,,,,,
email,8823.0,8810.0,MarkoSeiler@yahoo.com,2.0,,,,,,,
object_id,8823.0,,,,6019.821716,1.0,3017.5,6034.0,9029.5,12000.0,3464.251001
creation_time,8823.0,,,,2013-07-18 15:48:32.228833792,2012-05-31 00:43:27,2013-01-18 22:39:45.500000,2013-08-09 22:08:11,2014-01-30 00:21:54,2014-05-30 23:59:19,
last_session_creation_time,8823.0,,,,1379279305.700442,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0,19531160.787044
opted_in_to_mailing_list,8823.0,,,,0.252295,0.0,0.0,0.0,1.0,1.0,0.434354
enabled_for_marketing_drip,8823.0,,,,0.151989,0.0,0.0,0.0,0.0,1.0,0.359031
org_id,8823.0,,,,142.572254,0.0,30.0,109.0,239.0,416.0,124.176422
invited_by_user_id,8823.0,,,,3237.316786,0.0,0.0,1055.0,6405.0,11999.0,3888.088044


In [27]:
all_users_df = users_df.rename(columns={
    "object_id": "user_id"
}).merge(adopted_users_df, how="left")

# all_users_df

In [28]:
adoption_df_slim = adoption_df.loc[
    :,
    [
        "user_id",
        "date",
        "adopted_user",
        "avg_time_between_logins",
        "login_count",
        "days_since_first_login",
    ],
].copy()

adoption_df_slim.columns = [
    "user_id",
    "date",
    "adopted_user",
    "avg_time_bet_logins_at_adopt",
    "login_count_at_adopt",
    "account_age_at_adopt",
]

adoption_df_slim["date"] = adoption_df_slim["date"].dt.date

In [29]:
all_users_df = all_users_df.merge(adoption_df_slim, how="left")
all_users_df['adopted_user'] = all_users_df['adopted_user'].fillna(0)

In [30]:
all_users_df["account_age_at_adopt"].hvplot.hist(
    title="Time to become adopted (days)", bins=15,
    xlabel="").opts(active_tools=["box_zoom"])

In [31]:
all_users_df.login_count_at_adopt.hvplot.hist(
    title="Login Count at Adoption", xlabel="").opts(active_tools=["box_zoom"])

In [32]:
# Get the frequency of logins for each user
user_frequency = user_engagement_datewise[[
    'user_id', 'login_count'
]].groupby('user_id').max().reset_index()
# merge the user_frequency with the all_users_df
all_users_df = all_users_df.merge(
    user_frequency, how="left").rename(columns={'login_count': 'frequency'})

In [33]:
# Fill missing values in 'adopted_user' with 0 and convert the column to integer
all_users_df["adopted_user"] = all_users_df["adopted_user"].fillna(0).astype(
    int)

# Fill missing values in 'logins' with 0 and convert the column to integer
all_users_df["frequency"] = all_users_df["frequency"].fillna(0).astype(int)

# Convert 'last_session_creation_time' to datetime using seconds as the unit
all_users_df["last_login"] = pd.to_datetime(all_users_df["last_login"],
                                            unit="s")
# get the time since last login
all_users_df["days_since_last_login"] = (max_timestamp -
                                         all_users_df["last_login"]).dt.days
all_users_df.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,last_login,recency,date,adopted_user,avg_time_bet_logins_at_adopt,login_count_at_adopt,account_age_at_adopt,frequency,days_since_last_login
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,2014-04-22 03:53:30,45.0,,0,,,,1,45
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2014-03-31 03:45:04,67.0,2014-02-09,1,9.6,9.0,86.0,14,67
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,2013-03-19 23:14:52,443.0,,0,,,,1,443
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,2013-05-22 08:09:28,380.0,,0,,,,1,380
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,2013-01-22 10:14:20,500.0,,0,,,,1,500


In [34]:
last_login_plot = all_users_df.hvplot.hist(
    "last_login",
    bins=26,
    title="Last Login Time Distribution",
    line_width=0.1,
    xlabel="").opts(active_tools=["box_zoom"], )

last_login_plot

In [35]:
all_users_df.hvplot.hist(
    "days_since_last_login",
    by="adopted_user",
    alpha=0.5,
    title="Days since last login",
    bins=26,
    color=["silver", "green"],
).opts(active_tools=["box_zoom"])

In [36]:
# Convert 'creation_time' to datetime and extract the date
all_users_df["start_date"] = pd.to_datetime(
    all_users_df["creation_time"]).dt.date
# create a histogram of the start_date
start_date_plot = all_users_df.hvplot.hist(
    "start_date",
    alpha=0.5,
    bins=26,
    line_width=0.1,
    title="Start Date Distribution").opts(active_tools=["box_zoom"])

start_date_plot

In [37]:
all_users_df.hvplot.hist(
    ["start_date", "last_login"],
    alpha=0.5, line_width=0.1,
).opts(title="Last login and Start date distribution", show_legend=True, active_tools=["box_zoom"])

Most users don't log in beyond that time period of creating the account. The few who do are among those who may become adopted users.

In [38]:
# Convert 'last_login' and 'creation_time' to datetime
all_users_df["last_login"] = pd.to_datetime(all_users_df["last_login"])
all_users_df["creation_time"] = pd.to_datetime(all_users_df["creation_time"])
# Calculate 'account_age' in days
all_users_df["account_age"] = (max_timestamp -
                               all_users_df["creation_time"]).dt.days

display(all_users_df.account_age.describe().round())
all_users_df.hvplot.hist(
    "account_age",
    alpha=0.5,
    bins=26,
    by="adopted_user",
    line_width=0.5,
    color=["silver", "green"],
    title="Account Age Distribution").opts(active_tools=["box_zoom"])

count    8823.0
mean      322.0
std       216.0
min         6.0
25%       127.0
50%       300.0
75%       503.0
max       736.0
Name: account_age, dtype: float64

In [39]:
# Calculate 'account_age_last_login' in days
all_users_df["account_age_last_login"] = (
    pd.to_datetime(all_users_df["last_login"]) -
    pd.to_datetime(all_users_df["creation_time"])).dt.days
# Calculate quietness period as avg time between logins
all_users_df["avg_time_bet_logins"] = (all_users_df["account_age"] /
                                       all_users_df["frequency"]).round(1)

account_age_last_login_plot = all_users_df.hvplot.box(
    "account_age_last_login",
    by="adopted_user",
    c="adopted_user",
    cmap=["silver", "green"],
    ylabel="",
    legend=True,
    hover_cols=['all'],
    title="Account Age at Last Login").opts(active_tools=["box_zoom"],
                                            tools=['hover'])

account_age_last_login_plot

In [40]:
# Extract the month from 'creation_time'
all_users_df["creation_month"] = all_users_df["creation_time"].dt.month

# Convert 'user_id' to string and add leading zeros
all_users_df["user_id"] = all_users_df["user_id"].astype("string").str.zfill(5)

# Extract the domain from the 'email' column
all_users_df["domain"] = (
    all_users_df["email"].str.split("@").str[1].str.split(".").str[0])

# Fill missing values in 'invited_by_user_id' with 0, convert to string and add leading zeros
all_users_df["invited_by_user_id"] = (
    all_users_df["invited_by_user_id"]
    # .fillna(0)
    .astype(int).astype("string").str.zfill(5))

# Replace '00000' in 'invited_by_user_id' with 'Not Invited'
# all_users_df["invited_by_user_id"] = all_users_df[
#     "invited_by_user_id"].replace("00000", "Not Invited")

# Create a DataFrame of adopted users
adopted_users = all_users_df[all_users_df["adopted_user"] == 1]["user_id"]

# Count the number of users each user has referred
referred_df = all_users_df["invited_by_user_id"].value_counts().reset_index(
).iloc[1:]
referred_df.columns = ["user_id", "num_referrals"]

# Merge 'referred_df' with 'all_users_df'
all_users_df = all_users_df.merge(referred_df, how="left")

# Fill missing values in 'num_referrals' with 0
all_users_df["num_referrals"] = all_users_df["num_referrals"].fillna(0).astype(
    int)

# Create a new column 'was_invited' that indicates whether a user was invited or not
all_users_df["was_invited"] = all_users_df["invited_by_user_id"].apply(
    lambda x: "Invited" if x != "00000" else "Not Invited")

Of the Adopted users who referred others

In [41]:
invitation_columns = [
    "user_id",
    "num_referrals",
    "adopted_user",
    "was_invited",
    "invited_by_user_id",
]

# Adopted users who also referred other users
all_users_df[(all_users_df["num_referrals"] > 0)
             & (all_users_df["adopted_user"] == 1)][invitation_columns].sort_values(by="num_referrals", ascending=False).describe(
                 include="all").T.sort_values("unique").fillna("")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
was_invited,382.0,2.0,Invited,235.0,,,,,,,
invited_by_user_id,382.0,229.0,00000,147.0,,,,,,,
user_id,382.0,382.0,04612,1.0,,,,,,,
num_referrals,382.0,,,,2.232984,1.590873,1.0,1.0,2.0,3.0,10.0
adopted_user,382.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


Of the adopted users who referred others

In [42]:
all_users_df["sent_referrals"] = np.where(all_users_df["num_referrals"] > 0,
                                          "sent_referrals",
                                          "no_referrals_sent")
# users who sent referrals
all_users_df[all_users_df["sent_referrals"] == "sent_referrals"][invitation_columns].describe(
    include="all").T.sort_values("unique").fillna("")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
was_invited,1634.0,2.0,Invited,932.0,,,,,,,
invited_by_user_id,1634.0,794.0,00000,702.0,,,,,,,
user_id,1634.0,1634.0,00003,1.0,,,,,,,
num_referrals,1634.0,,,,2.180539,1.538591,1.0,1.0,2.0,3.0,10.0
adopted_user,1634.0,,,,0.233782,0.423365,0.0,0.0,0.0,0.0,1.0


In [43]:
# create a column for the users who invited themselves
all_users_df["invited_self"] = (
    all_users_df["user_id"] == all_users_df["invited_by_user_id"])
all_users_df["invited_self"] = all_users_df["invited_self"].astype(int)

In [44]:
all_users_df["domain"] = all_users_df["domain"].apply(lambda x: (
    "other" if x not in
    ["gmail", "yahoo", "jourrapide", "cuvox", "gustr", "hotmail"] else x))

In [45]:
all_users_df.describe(include="all").T.sort_values("unique").fillna("")
display(all_users_df.describe().round(3).T.fillna(""))
display(all_users_df.describe(include='object').T.fillna(""))

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
creation_time,8823.0,2013-07-18 15:48:32.228833792,2012-05-31 00:43:27,2013-01-18 22:39:45.500000,2013-08-09 22:08:11,2014-01-30 00:21:54,2014-05-30 23:59:19,
last_session_creation_time,8823.0,1379279305.7,1338452406.0,1363194965.0,1382888470.0,1398442604.0,1402066730.0,19531160.787
opted_in_to_mailing_list,8823.0,0.252,0.0,0.0,0.0,1.0,1.0,0.434
enabled_for_marketing_drip,8823.0,0.152,0.0,0.0,0.0,0.0,1.0,0.359
org_id,8823.0,142.572,0.0,30.0,109.0,239.0,416.0,124.176
last_login,8823.0,2013-09-15 21:08:25.700441856,2012-05-31 08:20:06,2013-03-13 17:16:05,2013-10-27 15:41:10,2014-04-25 16:16:44,2014-06-06 14:58:50,
recency,8823.0,263.244,0.0,41.0,221.0,449.0,736.0,226.056
adopted_user,8823.0,0.188,0.0,0.0,0.0,0.0,1.0,0.39
avg_time_bet_logins_at_adopt,1656.0,6.923,0.7,4.8,7.1,9.0,22.2,3.367
login_count_at_adopt,1656.0,6.418,3.0,4.0,6.0,8.0,25.0,2.977


Unnamed: 0,count,unique,top,freq
name,8823,8453,Correia Leonardo,4
email,8823,8810,MarkoSeiler@yahoo.com,2
creation_source,8823,5,ORG_INVITE,3188
date,1656,625,2012-10-14,9
start_date,8823,730,2014-05-30,53
domain,8823,7,gmail,2930
was_invited,8823,2,Invited,4776
sent_referrals,8823,2,no_referrals_sent,7189


In [46]:
all_users_df.sample().T

Unnamed: 0,5273
user_id,07205
creation_time,2012-07-27 01:57:43
name,Vennard James
email,JamesVennard@hotmail.com
creation_source,GUEST_INVITE
last_session_creation_time,1382234263.0
opted_in_to_mailing_list,1
enabled_for_marketing_drip,1
org_id,2
invited_by_user_id,03801


In [48]:
def remove_outer_percentile_outliers(df, col, lower=0.01, upper=0.99):
    """Remove the outer percentile outliers from a column in a DataFrame."""
    lower_bound = df[col].quantile(lower)
    upper_bound = df[col].quantile(upper)
    return df[(df[col] > lower_bound) & (df[col] < upper_bound)]

In [49]:
def plot_cat_active_stacked_bars(df, cat_cols, target_col="adopted_user"):
    """Create a bar plot for each categorical column showing the proportion of active users."""

    plot_opts = dict(height=300, width=400, active_tools=[
                     "box_zoom"], tools=['hover'])
    cat_plots = []
    for col in cat_cols:
        proportions = df.groupby(col)[target_col].value_counts(
            normalize=True).unstack()
        plot = proportions.hvplot.barh(
            title=(f"{col}".replace("_", " ").title()
                   if "_" in col else col.title()),
            xlabel="",
            ylabel="",
            stacked=True,
            cmap=["lightgray", "green"],
            legend=False,
        ).opts(**plot_opts, legend_position="top_right")
        cat_plots.append(plot)
    return hv.Layout(cat_plots).cols(3)


def plot_num_active_violins(df, list_of_num_cols, target_col="adopted_user"):
    """Create 2 violin plot for each numerical column with the target column."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    num_plots = []
    for col in list_of_num_cols:
        plot = df.hvplot.violin(
            y=col,
            by=target_col,
            c=target_col,
            ylabel="",
            title=f"{col}".replace("_", " ").title(),
            cmap=["lightgray", "green"],
        ).opts(**plot_opts)
        num_plots.append(plot)
    return hv.Layout(num_plots).cols(3)


def plot_num_active_hist(df, list_of_num_cols, target_col="adopted_user", nbins=12):
    """Creates a histogram plot of each numerical column with target column
        separated by color."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    hist_plots = [
        df[[num_col] + [target_col]].hvplot.hist(
            title=(f"{num_col}".replace("_", " ").title()
                   if "_" in num_col else num_col.title() + " Distribution"),
            bins=nbins,
            xlabel="",
            by=target_col,
            alpha=0.6,
            muted_alpha=0.02,
            color=["silver", "green"],
        ).opts(**plot_opts) for num_col in list_of_num_cols
    ]
    return hv.Layout(hist_plots)


def plot_cat_bars(df, list_of_cat_cols):
    """Creates a bar plot for each categorical column."""

    plot_opts = dict(height=300, width=400, active_tools=["box_zoom"])
    cat_plots = [
        df[col].astype("string").value_counts().sort_index().hvplot.barh(
            title=f"{col}".replace("_", " ").title()
            if "_" in col else col.title(),
            xlabel="",
            ylabel="",
            color="silver",
        ).opts(**plot_opts) for col in list_of_cat_cols
    ]
    return hv.Layout(cat_plots).cols(3)

#### Categorical columns

In [50]:
cat_cols = [
    "domain",
    "creation_source",
    "was_invited",
    "creation_month",
    "sent_referrals",
    "invited_self",
    "opted_in_to_mailing_list",
    "enabled_for_marketing_drip",
]

all_users_df.dtypes
plot_cat_active_stacked_bars(all_users_df, cat_cols)

- `Invited self` seems to have a small lean towards being an adopted user. 
- `GUEST_INVITE` and `PERSONAL_PROJECTS` have a similar distribution of adopted users, which is slightly higher than the other categories.
- Those who `sent_referrals` have a higher proportion of adopted users than those who did not.
- `creation month` 6 has a higher proportion of adopted users than the other months, with 5 being the lowest.
- `hotmail` has a higher proportion of adopted users than the other email providers.

We can use a combination of these features in feature engineering to help build our model. Unfortunately, all these features are lower than the others on absolute numbers despite having a higher proportion of adopted users.

In [51]:
plot_cat_bars(all_users_df, cat_cols)

#### Numerical columns

In [52]:
num_cols = [
    "frequency",
    "account_age_last_login",
    "num_referrals",
    "account_age",
    "days_since_last_login",
    "avg_time_bet_logins",
    'recency',
    'org_id',
]
num_cols.sort()

plot_num_active_violins(all_users_df, num_cols)

In [53]:
num_feat_plot = plot_num_active_violins(
    all_users_df, ['account_age', 'account_age_last_login', 'days_since_last_login'])
hv.save(num_feat_plot, 'images/relax/num_feat_plot.png')

num_feat_plot




The numerical features have some more pronounced differences between the adopted and non-adopted users. 
- `Frequency` we should not use as it is closely related to the target variable.
- `Recency` though can be used as it is the time since the last login. What it basically says is that our adopted users login more frequently, so if we see time has passed without any particular user logging in, that is a signal that we are losing that users and may require some intervention. 
- `Org ID` although it is not a numerical feature, it is much easier to visualize if we consider it like this as oppose to 417 cardinality categorical variable. There is not a clear distinction between the adopted and non-adopted users, but there are some differences in the distribution of the two groups.
- `Num referrals` seems indifferent in the distribution of the two groups.

In [54]:
# find the correct transform for the right skewed columns
plot_num_active_hist(
    all_users_df,
    num_cols).cols(2).opts(title="Numerical Column Distributions")

In [55]:
# strip out some of the outliers
stripped_outliers = []
for col in num_cols:
    col_stripped = remove_outer_percentile_outliers(all_users_df, col)

    stripped_outliers.append(plot_num_active_hist(col_stripped, [col]))

hv.Layout(stripped_outliers).cols(2).opts(
    title="Numerical Column Distributions with Outliers Removed")

At adoption columns distributions

In [56]:
adopted_col_features = [
    "avg_time_bet_logins_at_adopt",
    "login_count_at_adopt",
    "account_age_at_adopt",
]
adopted_col_features.sort()
stripped_outliers = []
for col in adopted_col_features:
    col_stripped = remove_outer_percentile_outliers(all_users_df, col)

    stripped_outliers.append(plot_num_active_hist(
        col_stripped,
        [col],
        nbins=6,
    ))

hv.Layout(stripped_outliers).opts(
    title="Features of Adopted Users at Adoption with Outliers Removed")

In [57]:
acct_age_recency_plot = all_users_df.hvplot.scatter(
    y="account_age",
    x="days_since_last_login",
    c="adopted_user",
    cmap=['silver', 'green'],
    size=30,
    alpha=0.7,
    height=500,
    width=800,
    hover_cols=["user_id"],
    grid=True,
    colorbar=False,
    marker=hv.dim('adopted_user').categorize({0: 'x', 1: 'circle'}),
    xlabel="Days Since Last Login",
    ylabel="Account Age",
).opts(active_tools=["box_zoom"],
       title=f'Adopted Users (o) & Non-Adopted Users (x)',
       backend_opts={"toolbar.autohide": True})

# save a pic of the plot
hv.save(acct_age_recency_plot, './images/relax/acct_age_recency_plot.png')
acct_age_recency_plot

In [58]:
# convert the categorical columns with only 2 unique values to astype(int)

all_users_df["is_referrer"] = all_users_df["sent_referrals"].map(
    {"sent_referrals": 1, "no_referrals_sent": 0})
all_users_df['is_invited'] = all_users_df['was_invited'].map(
    {"Invited": 1, "Not Invited": 0})

In [59]:
# check correlations of the numerical columns in a half matrix
corr_df = all_users_df.drop(columns=adopted_col_features)
corr_cols = corr_df.select_dtypes(np.number).columns.tolist()
corr_cols.sort()
# put the adopted_user coumn at the end
corr_cols.remove("adopted_user")
corr_cols.append("adopted_user")
# get the correlation matrix
corr = corr_df[corr_cols].corr()
mask = np.tril(np.ones_like(corr, dtype=bool))
corr.mask(mask).hvplot.heatmap(height=600,
                               rot=90,
                               aspect='square',
                               cmap='coolwarm_r').opts(
                                   active_tools=["box_zoom"],
                                   title="Correlation Heatmap",
                                   color_levels=7,
                                   symmetric=True,
                                   line_color='white',
                                   yaxis='right',
                                   line_width=0.5)



In [60]:
all_users_df['recency_over_account_age'] = all_users_df['recency'] / all_users_df['account_age']
slim_corr_df = all_users_df[['account_age', 'account_age_last_login', 'recency',
                                'recency_over_account_age', 'adopted_user']]


slim_corr = slim_corr_df.corr()
mask = np.tril(np.ones_like(slim_corr, dtype=bool))
slim_corr.mask(mask).hvplot.heatmap(height=400,
                                   rot=90,
                                   aspect='square',
                                   cmap='coolwarm_r').opts(
                                       active_tools=["box_zoom"],
                                       title="Correlation Heatmap",
                                       color_levels=7,
                                       symmetric=True,
                                       line_color='white',
                                       yaxis='right',
                                       line_width=0.5)



In [61]:
# check the correlation witht the target variable
corr["adopted_user"].sort_values(ascending=False).round(3).hvplot.barh().opts(
    title="Correlation with Adopted User",
    color='silver',
    height=400,
    width=600).opts(active_tools=["box_zoom"])

In [62]:
# check the point biserialed correlation of the binary columns with the target variable
point_biserial_results = [
    stats.pointbiserialr(all_users_df['adopted_user'], all_users_df[col])
    for col in corr_cols[:-1]
]
# put the results in a dataframe
corr_w_target = pd.DataFrame(point_biserial_results,
                             index=corr_cols[:-1],
                             columns=['correlation', 'p_value']).round(5)
corr_w_target[['correlation'
               ]].hvplot.heatmap(cmap='coolwarm_r', width=300,
                                 height=600).opts(color_levels=7,
                                                  active_tools=["box_zoom"],
                                                  title='Correlation',
                                                  fontsize={
                                                      'title': 10,
                                                      'labels': 10
                                                  },
                                                  symmetric=True,)

Hourly distribution of the login times

In [63]:
# Hourly distribution of the logins for each user
users_engagement_df['hour'] = users_engagement_df["time_stamp"].dt.hour
users_engagement_df['adopted_user'] = users_engagement_df['user_id'].isin(
    adoption_df['user_id']).astype(int)
users_engagement_df.hvplot.hist("hour",
                                by="adopted_user",
                                alpha=0.5,
                                color=["silver", "green"],
                                title="Hourly Login Distribution",
                                muted_alpha=0.01,
                                bins=24).opts(active_tools=["box_zoom"])

The `non-adopted` appear to be almost quintessentially evenly distributed across the hours of the day in terms of login times.

In [64]:
new_df = all_users_df.copy()

new_df["is_referrer"] = new_df["sent_referrals"].apply(
    lambda x: 1 if x in ("sent_referrals") else 0)

# Test the Active Level column
new_df["is_home_project"] = new_df["creation_source"].apply(
    lambda x: 1 if x in ("GUEST_INVITE", "PERSONAL_PROJECTS") else 0)

new_df["active_level"] = (new_df["is_referrer"] + new_df["is_home_project"] +
                          new_df["invited_self"])
hv.Layout([
    plot_cat_bars(new_df, ["active_level"]),
    plot_cat_active_stacked_bars(new_df, ["active_level"]),
])

In [65]:
# combine the domains to 'all_others' except for hotmail and yahoo
new_df["new_domain"] = new_df["domain"].apply(
    lambda x: "all_others" if x not in ("hotmail", "yahoo") else x)
(plot_cat_active_stacked_bars(new_df, ["new_domain"]) +
 plot_cat_bars(new_df, ["new_domain"]))

In [66]:
# create feature as the sum of the 2 spam columns
new_df["spam_prod"] = (new_df["opted_in_to_mailing_list"] *
                       new_df["enabled_for_marketing_drip"])
new_df["spam_sum"] = (new_df["opted_in_to_mailing_list"] +
                      new_df["enabled_for_marketing_drip"])
hv.Layout([
    plot_cat_active_stacked_bars(new_df, ["spam_prod"]),
    plot_cat_active_stacked_bars(new_df, ["spam_sum"]),
])

No difference with the combining the spam features. Now we try combining the `new_domain` with the spam features.

In [67]:
# multiply the spam prod and the domain columns
new_df["spam_prod"] = new_df["spam_prod"].astype("string")
new_df["spam_domain"] = new_df["new_domain"] + "_" + new_df["spam_prod"]
(plot_cat_active_stacked_bars(new_df, ["spam_domain"]) +
 plot_cat_bars(new_df, ["spam_domain"]))

- Only the yahoo domain seems to respond positively to the email marketing. 
- Hotmail domains were negatively affected by the email marketing. 
- All the others did not have much of a difference. (< 1% difference)


In [68]:
new_df["spam_source"] = (new_df["creation_source"] + "_" +
                         new_df["spam_prod"].astype("string"))

(plot_cat_active_stacked_bars(new_df, ["spam_source"]) +
 plot_cat_bars(new_df, ["spam_source"]))

- Those who `creation_source` is `PERSONAL_PROJECTS` have a positive response to the email marketing. 
- All others are indifferent ( < 2% difference).


In [69]:
new_df["spam_referrals"] = (new_df["sent_referrals"] + "_" +
                            new_df["spam_prod"].astype("string"))

(plot_cat_active_stacked_bars(new_df, ["spam_referrals"]) +
 plot_cat_bars(new_df, ["spam_referrals"]))

In [70]:
# spam_domain feature
all_users_df["new_domain"] = all_users_df["domain"].apply(
    lambda x: "all_others" if x not in ("hotmail", "yahoo") else x)
all_users_df["spam_prod"] = (all_users_df["opted_in_to_mailing_list"] *
                             all_users_df["enabled_for_marketing_drip"])
all_users_df["spam_domain"] = (all_users_df["new_domain"] + "_" +
                               all_users_df["spam_prod"].astype("string"))
# active_level feature
all_users_df["is_referrer"] = all_users_df["sent_referrals"].apply(
    lambda x: 1 if x in ("sent_referrals") else 0)
all_users_df["is_home_project"] = all_users_df["creation_source"].apply(
    lambda x: 1 if x in ("GUEST_INVITE", "PERSONAL_PROJECTS") else 0)
all_users_df["active_level"] = (all_users_df["is_referrer"] +
                                all_users_df["is_home_project"] +
                                all_users_df["invited_self"])

In [71]:
# check chi squared test for the categorical columns
import itertools as it

chi2_cols = [
    "active_level",
    "spam_domain",
    "creation_month",
    "sent_referrals",
]
chi2_cols.sort()
# Get all unique pairs of variables
pairs = list(it.combinations(chi2_cols, 2))
# Initialize a list to store the results
results = []
for pair in pairs:
    contingency_table = pd.crosstab(new_df[pair[0]], new_df[pair[1]])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    # Append the results to the list
    results.append([pair[0], pair[1], chi2, p])
# Convert the results to a DataFrame
results_df = pd.DataFrame(
    results,
    columns=['Variable 1', 'Variable 2', 'Chi-square Statistic', 'P-value'])
# Display the results
sorted_results_df = results_df.sort_values('Variable 1', )
for index, row in sorted_results_df.iterrows():
    if row['P-value'] < 0.05:
        print(
            f"P-value: {round(row['P-value'], 4)}||  {row['Variable 1']} & {row['Variable 2']} ||  dependent")
    else:
        print(
            f"P-value: {round(row['P-value'], 4)}||  {row['Variable 1']} & {row['Variable 2']} ||  independent")
        print(
            f"We fail to reject the null hypothesis that \033[31m{row['Variable 1']} & {row['Variable 2']}\033[0m are independent")

P-value: 0.0||  active_level & creation_month ||  dependent
P-value: 0.0||  active_level & sent_referrals ||  dependent
P-value: 0.001||  active_level & spam_domain ||  dependent
P-value: 0.0||  creation_month & sent_referrals ||  dependent
P-value: 0.0649||  creation_month & spam_domain ||  independent
We fail to reject the null hypothesis that [31mcreation_month & spam_domain[0m are independent
P-value: 0.0436||  sent_referrals & spam_domain ||  dependent


In [72]:
all_users_df["recency_over_account_age"] = (
    all_users_df["days_since_last_login"] / all_users_df["account_age"])
recency_over_account_age_hist_plot = plot_num_active_hist(
    all_users_df, ["recency_over_account_age"]).opts(height=500, width=800)

hv.save(recency_over_account_age_hist_plot,
        './images/relax/recency_over_account_age_hist_plot.png')

recency_over_account_age_hist_plot

#### Pipeline

In [73]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):

  def __init__(self, max_timestamp):
    self.max_timestamp = max_timestamp

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    X = X.copy()
    X["last_login"] = pd.to_datetime(X["last_session_creation_time"], unit="s")
    X["creation_time"] = pd.to_datetime(X["creation_time"])
    X = X[~X["last_login"].isna()]
    X["invited_by_user_id"] = X["invited_by_user_id"].fillna(0).astype(int)
    X["account_age"] = (self.max_timestamp - X["creation_time"]).dt.days
    X["recency"] = (self.max_timestamp - X["last_login"]).dt.days
    X["account_age_last_login"] = (X["last_login"] -
                                   X["creation_time"]).dt.days
    X["recency_over_account_age"] = X["recency"] / X["account_age"]
    X.set_index("object_id", inplace=True)
    return X[['account_age', 'account_age_last_login', 'recency', 'recency_over_account_age']]

  def get_feature_names_out(self, input_features=None):
    return [
        'account_age', 'account_age_last_login', 'recency',
        'recency_over_account_age'
    ]


numerical_features_no_scale = ["recency_over_account_age"]
numerical_features = [
    'account_age', 'account_age_last_login', 'recency', 'recency_over_account_age'
    ]

# create a pipeline to scale the numerical features
num_pipeline = Pipeline([("scaler", RobustScaler())])
# create a pipeline for the numerical features with no scaling
num_pipeline_no_scale = Pipeline([("no_scaler", "passthrough")])
# create a column transformer for the numerical features
preprocessor = ColumnTransformer([
  ("num", num_pipeline, numerical_features),
  # ("num_no_scale", num_pipeline_no_scale, numerical_features_no_scale),
  ])

In [74]:
# read in the users dataset
X = pd.read_csv(users_path, encoding="latin-1")
y = pd.read_csv(target_path)
# Transform the features with the custom preprocessor
my_preprocessor = CustomPreprocessor(max_timestamp)
X_slim = my_preprocessor.fit_transform(X)

y.set_index('user_id', inplace=True)
y = y.squeeze()
# Check that the IDs match
assert all(X_slim.index == y.index)


# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_slim,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=628,
                                                    stratify=y)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

X_train shape: (7058, 4), X_test shape: (1765, 4)
y_train shape: (7058,), y_test shape: (1765,)


In [75]:
# Models dict
models = {}
models["LogisticRegression"] = (
    LogisticRegression(
        random_state=628,
        n_jobs=-1,
        max_iter=1000,
    ),
    {
        "classifier__C": np.logspace(-3, 3, 7),
        "classifier__class_weight": ["balanced", None],
        "classifier__solver":
        ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    },
)

models["LightGBM"] = (
    LGBMClassifier(
        random_state=628,
        n_jobs=-1,
        is_unbalance=True,
        num_leaves=31,
        boosting_type="gbdt",
        verbose=-1,
    ),
    {
        "classifier__learning_rate": stats.uniform(0.01, 0.5),
        "classifier__reg_alpha": stats.uniform(0.0, 0.05),
        "classifier__reg_lambda": stats.uniform(0.0, 0.05),
        "classifier__min_child_samples": stats.randint(100, 500),
        "classifier__min_data_in_leaf": stats.randint(20, 100),
    },
)

models["XGBoost"] = (
    XGBClassifier(
        random_state=628,
        scale_pos_weight=sum(y_train == 0) / sum(y_train == 1),
    ),
    {
        "classifier__n_estimators": stats.randint(100, 1000),
        "classifier__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.4, 0.5],
        "classifier__max_depth": stats.randint(3, 10),
        "classifier__subsample": stats.uniform(0.6, 0.4),
        "classifier__colsample_bytree": stats.uniform(0.6, 0.4),
        "classifier__colsample_bylevel": stats.uniform(0.6, 0.4),
        "classifier__min_child_weight": stats.randint(1, 200),
    },
)

In [76]:
# Create an empty dictionary to store the best models
best_models = {}

# Loop through each model and perform random search
for model_name, (model, param_grid) in tqdm(models.items(),
                                            desc="Model Tuning"):
    # Create a pipeline for the model
    model_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model),
    ])

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        model_pipeline,
        param_grid,
        n_iter=50,
        cv=5,
        random_state=628,
        scoring="average_precision",
        n_jobs=-1,
    )

    # Fit the model
    random_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = random_search.best_estimator_

    # Print the best parameters and ROC AUC score for each model
    print(f"\nBest Parameters for {model_name}: {random_search.best_params_}")
    y_pred_proba_tuned = random_search.best_estimator_.predict_proba(X_test)[:,
                                                                             1]
    best_score = random_search.best_score_
    y_pred = best_models[model_name].predict(X_test)
    recall_score_tuned = metrics.recall_score(y_test, y_pred)
    f1_score_tuned = metrics.f1_score(y_test, y_pred)
    precision_score_tuned = metrics.precision_score(y_test, y_pred)
    #   print the scores in green so they stand out
    print(f"Preliminary scores")
    print(f"{model_name} - Best Score: \033[92m{best_score:.3f}\033[0m")
    print(f"{model_name} - F1 Score: \033[92m{f1_score_tuned:.3f}\033[0m")
    print(
        f"{model_name} - Recall Score: \033[92m{recall_score_tuned:.3f}\033[0m")
    print(
        f"{model_name} - Precision Score: \033[92m{precision_score_tuned:.3f}\033[0m"
    )

Model Tuning:   0%|          | 0/3 [00:00<?, ?it/s]




Best Parameters for LogisticRegression: {'classifier__solver': 'liblinear', 'classifier__class_weight': None, 'classifier__C': 0.1}
Preliminary scores
LogisticRegression - Best Score: [92m0.973[0m
LogisticRegression - F1 Score: [92m0.903[0m
LogisticRegression - Recall Score: [92m0.873[0m
LogisticRegression - Precision Score: [92m0.935[0m

Best Parameters for LightGBM: {'classifier__learning_rate': 0.024007763719353974, 'classifier__min_child_samples': 252, 'classifier__min_data_in_leaf': 77, 'classifier__reg_alpha': 0.04535803193343353, 'classifier__reg_lambda': 0.040575913381990375}
Preliminary scores
LightGBM - Best Score: [92m0.970[0m
LightGBM - F1 Score: [92m0.874[0m
LightGBM - Recall Score: [92m0.955[0m
LightGBM - Precision Score: [92m0.806[0m

Best Parameters for XGBoost: {'classifier__colsample_bylevel': 0.8451831313232718, 'classifier__colsample_bytree': 0.8799280773889193, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__min_child_we

In [77]:
def get_model_importance(model_name):
  """Get the feature importances for a given model."""

  model = best_models[model_name]
  if model_name == "LogisticRegression":
    importance = model.named_steps["classifier"].coef_[0]
  else:
    importance = model.named_steps["classifier"].feature_importances_
  return importance


# create function to plot the feature importances
def plot_feature_importance(model_name):
  """Plot the feature importances for a given model."""
  model = best_models[model_name]
  importance = get_model_importance(model_name)
  features = model.named_steps["preprocessor"].get_feature_names_out()

  importance_df = pd.DataFrame({
      "feature": features,
      "importance": importance
  }).sort_values(by="importance", ascending=False)
  importance_df['feature'] = importance_df['feature'].str.split(
      '__').str[1].str.replace('_', ' ').str.title()
  # Create a bar plot of the feature importances
  plot = importance_df.hvplot.barh(
      x="feature",
      y="importance",
      title=f"{model_name} Feature Importances",
      xlabel="",
      ylabel="",
      width=600,
      height=200,
      color=hv.dim("feature"),
      legend=False,
      cmap=dict(zip(importance_df["feature"], ["green"] +
                    ["silver"] * 3))).opts(active_tools=["box_zoom"])
  return plot


lr_fi_plot = plot_feature_importance("LogisticRegression")
xgb_fi_plot = plot_feature_importance("XGBoost")
lgbm_fi_plot = plot_feature_importance("LightGBM")
feature_importance_layout = hv.Layout([lr_fi_plot, xgb_fi_plot,
           lgbm_fi_plot]).cols(1).opts(shared_axes=False, )

hv.save(feature_importance_layout, 'images/relax/feature_importance_layout.png')
feature_importance_layout

Uses F-beta score, which is a generalization of the `f1_score`. The F-beta score introduces a $\beta$ parameter that weighs recall more as beta increases.

$$F_{\beta} = \frac{(1 + \beta^2) \cdot (\text{{precision}} \cdot \text{{recall}})}{(\beta^2 \cdot \text{{precision}}) + \text{{recall}} + \epsilon}$$

The beta parameter determines the weight of recall in the combined score. 
- `beta` < 1 lends more weight to precision, `beta` -> 0 considers only precision 
- `beta` > 1 favors recall, `beta` -> inf only recall

In [85]:
def get_curve_and_confusion_matrix(model,
                                   X_test,
                                   y_test,
                                   curve_type="roc",
                                   beta=2):
  """Get the ROC AUC or Precision-Recall curve and the confusion matrix for the model.
      curve_types are 'roc' or 'pr' for ROC AUC or Precision-Recall curve, respectively.
      """
  pr_roc_opts = dict(active_tools=["box_zoom"],
                     height=250,
                     width=250,
                     tools=['hover'],
                     backend_opts={"toolbar.autohide": True})
  y_pred_proba = model.predict_proba(X_test)[:, 1]
  curve_type = curve_type.lower()

  if curve_type == "roc":
    # Get the ROC AUC score
    score = metrics.roc_auc_score(y_test, y_pred_proba)
    # Get the curve values
    fpr, tpr, thresh = metrics.roc_curve(y_test, y_pred_proba)
    # Calculate the optimal threshold
    J = tpr - fpr
    optimal_idx = np.argmax(J)
    optimal_threshold = thresh[optimal_idx]
    # Plot title
    title = f"ROC Curve (AUC: {score:.2f} || opt thresh:{optimal_threshold:.2f})"
    # Curve plot
    curve_plot = hv.Curve((fpr, tpr))
    # add in the roc baseline of 0.5
    baseline = hv.Curve([(0, 0), (1, 1)]).opts(color='red', line_dash='dashed')
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)

  elif curve_type == "pr":
    # Get the average precision score
    score = metrics.average_precision_score(y_test, y_pred_proba)
    # Get the curve values
    precision, recall, thresholds = metrics.precision_recall_curve(
        y_test, y_pred_proba)
    # precision = precision[:-1]
    # recall = recall[:-1]
    # Calculate the fbeta scores
    epsilon = 1e-7
    # add small value to avoid division by zero
    fbeta_scores = (1 + beta**2) * (precision * recall) / (
        beta**2 * precision + recall + epsilon)
    f_beta_scores = np.nan_to_num(fbeta_scores)
    idx = np.argmax(f_beta_scores)
    optimal_threshold = thresholds[idx]

    # Plot title
    title = f"PR Curve (AP: {score:.2f} || opt thresh:{optimal_threshold:.2f})"
    # Curve plot
    curve_plot = hv.Curve((recall, precision))
    # fraction of positive class
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)
    positive_frac = y_test.sum() / len(y_test)
    baseline = hv.HLine(y=positive_frac).opts(
        color='red',
        line_dash='dashed',
        line_width=1,
        ylim=(-0.1, 1.1),
        yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        padding=0.1)
  else:
    raise ValueError("Invalid curve_type. Choose either 'roc' or 'pr'.")

  # Use the optimal threshold to convert probabilities into class predictions
  curve_plot = curve_plot.opts(
      title=title,
      color='green',
      xlabel="False Positive Rate" if curve_type == "roc" else "Recall",
      ylabel="True Positive Rate" if curve_type == "roc" else "Precision",
      **pr_roc_opts,
      xticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
      yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
      padding=0.1,
      yaxis="left" if curve_type == "roc" else "right",
      fontsize={'title': '8pt'}) * baseline

  # create table for the positive classification scores
  report = metrics.classification_report(y_test, y_pred, output_dict=True)
  report_scores = report['1']
  report_df = pd.DataFrame(
      report_scores,
      index=[model.named_steps['classifier'].__class__.__name__]).T.round(2)
  # create a table element for the report_df
  report_table = report_df.reset_index().rename(columns={
      'index': 'Metric'
  }).hvplot.table(title="Positive Class Scores").opts(
      height=250,
      width=250,
      fontsize={'title': '8pt'},
      sortable=True,
      selectable=True,
      index_position=None)

  conf_matrix = metrics.confusion_matrix(y_test, y_pred)
  conf_matrix = pd.DataFrame(
      conf_matrix,
      index=["Actual 0", "Actual 1"],
      columns=["Predicted 0", "Predicted 1"],
  )
  conf_heatmap = conf_matrix.hvplot.heatmap(
      colorbar=False,
      logz=True,
      title=f"{model.named_steps['classifier'].__class__.__name__}").opts(
          **pr_roc_opts,
          invert_yaxis=True,
        #   color_levels=5,
          line_color="white",
          line_width=2,
          cmap = 'greens',
          fontsize={'title': '8pt'})
  # Create a DataFrame for labels
  labels_df = pd.DataFrame(conf_matrix.stack(),
                           columns=["value"]).reset_index()
  labels_df.columns = ["y", "x", "value"]

  # Create labels
  labels = hv.Labels(labels_df, ["x", "y"], "value").opts(
      text_color="silver",
      text_font_size="10pt",
      text_font_style="bold",
  )
  heatmap_curve_layout = hv.Layout([
      (conf_heatmap * labels),
      curve_plot,
      report_table,
  ]).opts(height=250).cols(3)

  return y_pred, heatmap_curve_layout

Logistic Regression

In [86]:
lr_model = best_models["LogisticRegression"]
lr_y_pred, lr_layout = get_curve_and_confusion_matrix(lr_model,
                                                      X_test,
                                                      y_test,
                                                      curve_type="pr",
                                                      beta=1)
lr_report = metrics.classification_report(y_test, lr_y_pred, output_dict=True)
lr_scores = lr_report["1"]

XGBoost

In [87]:
xgboost_model = best_models["XGBoost"]
xgboost_y_pred, xgboost_layout = get_curve_and_confusion_matrix(
    xgboost_model,
    X_test,
    y_test,
    curve_type="pr",
    beta=1,
)
xgboost_report = metrics.classification_report(
    y_test,
    xgboost_y_pred,
    output_dict=True,
)
xgb_scores = xgboost_report["1"]
# Save the XGBoost layout for the markdown report
hv.save(xgboost_layout, 'images/relax/xgboost_layout.png')



LightGBM

In [88]:
lgbm_model = best_models["LightGBM"]
lgbm_y_pred, lgbm_layout = get_curve_and_confusion_matrix(
    lgbm_model,
    X_test,
    y_test,
    curve_type="pr",
    beta=1,
)
lgbm_report = metrics.classification_report(y_test,
                                            lgbm_y_pred,
                                            output_dict=True)
lgbm_scores = lgbm_report["1"]

In [89]:
# combine layouts
cm_pr_results = hv.Layout([lr_layout, xgboost_layout, lgbm_layout]).cols(3)
hv.save(cm_pr_results, './images/relax/cm_pr_results.png')
cm_pr_results



In [90]:
# Create a slider for the beta value and a toggle for the curve type
beta_slider = pnw.FloatSlider(name="Beta",
                              start=0.0,
                              end=4.0,
                              step=0.1,
                              value=1.0)
pr_roc_toggle = pnw.RadioButtonGroup(name="Curve Type",
                                     value="PR",
                                     options=['PR', 'ROC'])


# create a function that takes the beta value and returns the layout
def create_model_pane(model_name):

  @pn.depends(beta=beta_slider.param.value, pr_roc=pr_roc_toggle.param.value)
  def get_model_layout(beta, pr_roc):
    model = best_models[model_name]
    y_pred, layout = get_curve_and_confusion_matrix(model,
                                                    X_test,
                                                    y_test,
                                                    curve_type=pr_roc,
                                                    beta=beta)
    return layout

  return pn.pane.HoloViews(get_model_layout, width=750, height=300)

# Callback for the beta slider visibility
def update_beta_slider(event):
  if event.new == 'PR':
    beta_slider.visible = True
  else:
    beta_slider.visible = False


pr_roc_toggle.param.watch(update_beta_slider, 'value')


@pn.depends(beta=beta_slider.param.value, pr_roc=pr_roc_toggle.param.value)
def get_layout_header(beta, pr_roc):
  auc_type = "ROC" if pr_roc == "roc" else "Precision-Recall"
  return pn.pane.Markdown(
      f"# Confusion Matrix and {auc_type} Curve with Beta: {beta}")


model1_pane = create_model_pane("LogisticRegression")
model2_pane = create_model_pane("XGBoost")
model3_pane = create_model_pane("LightGBM")

pn.state.kill_all_servers()

curve_temp = pn.template.FastListTemplate(
    title="Model Evaluation: Confusion Matrix and Precision-Recall Curve",
    sidebar=[
        pn.Spacer(height=10), beta_slider,
        pn.Spacer(height=10), pr_roc_toggle
    ],
)

curve_temp.main.append(get_layout_header)
curve_temp.main.append(model1_pane)
curve_temp.main.append(model2_pane)
curve_temp.main.append(model3_pane)
curve_temp.show()

Launching server at http://localhost:55202


<panel.io.server.Server at 0x20348b7bac0>