In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing     import StandardScaler
from sklearn.model_selection   import GridSearchCV, KFold 
from sklearn.model_selection   import train_test_split
from sklearn.model_selection   import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline          import Pipeline
from sklearn.ensemble     import GradientBoostingClassifier
from sklearn.ensemble     import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn           import metrics
from sklearn.svm       import SVC
from sklearn.metrics   import roc_curve, auc, accuracy_score
from sklearn.metrics   import confusion_matrix,f1_score
from sklearn.metrics   import classification_report
from sklearn.metrics   import log_loss ,recall_score
from sklearn.metrics   import average_precision_score,precision_score
from sklearn.metrics   import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score

In [3]:
users_df = pd.read_csv('takehome_users.csv',parse_dates=['creation_time'],encoding='latin-1')
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'], unit='s')

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [5]:
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


In [6]:
engagement_df = pd.read_csv('takehome_user_engagement.csv')
engagement_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [7]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          6417 non-null   float64       
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 937.6+ KB


In [8]:
engagement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


The time_stamp column appears to be an object/string so we wll change that into the proper data type:

In [9]:
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp']) - pd.to_timedelta(7, unit='d')

# Initiate a week as days
days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']

# Finding how many of the 2248 unique three-logins users are adopted users 
# who have logged in three separate days in at least one 7 day period 
user_days = pd.Series([])

for day in days:
    s = engagement_df.groupby(['user_id', pd.Grouper(key='time_stamp', freq='W-' + day)])['visited'].count().reset_index().sort_values('user_id')
    s = s.groupby('user_id')['visited'].max()
    user_days = pd.concat([user_days, s], axis=1).max(axis=1)

user_days = user_days.fillna(0)

  user_days = pd.Series([])


In [10]:
engagement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


It has been changed.
Now lets look at any null values.

In [11]:
engagement_df.isnull().sum()

time_stamp    0
user_id       0
visited       0
dtype: int64

In [12]:
users_df.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

 Now that there is a user_id as the first column, we will take the third column out.

Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

As this is the assessment, we will created a new df called "adopted_user" by having the users who have logged into the product at least 3 times during a week.

In [13]:
def get_user_status(x):
    if x in user_days and user_days[x] >= 3:
        return 'adopted user'
    else:
        return 'not adopted user'

users_df['adopted_users'] = users_df['object_id'].apply(get_user_status)

In [14]:
users_df['adopted_users'].value_counts()

not adopted user    10398
adopted user         1602
Name: adopted_users, dtype: int64

In [15]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          6417 non-null   float64       
 10  adopted_users               12000 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)


invited_by_user_id will be changed to int:

In [16]:
users_df['invited']=users_df['invited_by_user_id'].apply(lambda x: 0 if np.isnan(x) else 1)

users_df['invited'].value_counts()

1    6417
0    5583
Name: invited, dtype: int64

In [17]:
users_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_users,invited
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,not adopted user,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,adopted user,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,not adopted user,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,not adopted user,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,not adopted user,1


Columns with missing values:

In [18]:
missing_percentage = (users_df['invited_by_user_id'].isnull().sum() / len(users_df)) * 100
print(f"Percentage of missing values in 'invited_by_user_id': {missing_percentage:.2f}%")

Percentage of missing values in 'invited_by_user_id': 46.52%


There is more than 40% of missing data 

In [19]:
users_df.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
adopted_users                    0
invited                          0
dtype: int64

In [20]:
missing_percentage_2 = (users_df['last_session_creation_time'].isnull().sum() / len(users_df)) * 100
print(f"Percentage of missing values in 'invited_by_user_id': {missing_percentage_2:.2f}%")

Percentage of missing values in 'invited_by_user_id': 26.47%


In [21]:
df=users_df.set_index('object_id')

In [22]:
df.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_users,invited
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,not adopted user,1
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,adopted user,1
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,not adopted user,1
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,not adopted user,1
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,not adopted user,1


In [23]:
df['creation_time'] = pd.to_datetime(df['creation_time'])
df['creation_year'] = df['creation_time'].dt.year
df['creation_month'] = df['creation_time'].dt.month
df['creation_day'] = df['creation_time'].dt.day

In [24]:
df['last_session_creation_time'] = df.last_session_creation_time.fillna(
    df.last_session_creation_time.mean())

In [25]:
df['last_session_year'] = df['last_session_creation_time'].dt.year
df['last_session_month'] = df['last_session_creation_time'].dt.month
df['last_session_day'] = df['last_session_creation_time'].dt.day

In [28]:
df['adopted_users'] = df.adopted_users.apply(lambda x: 1 if x=='adopted user' else 0)

In [52]:
adopted_percentage = df['adopted_users'].eq(1).mean() * 100

print('Percentage of adopted users:', adopted_percentage, '%')

Percentage of adopted users: 13.350000000000001 %


In [53]:
print('Percentage of non-adopted users:', (100-adopted_percentage), '%')

Percentage of non-adopted users: 86.65 %


In [29]:
email_domains = ['gmail.com', 'yahoo.com', 'jourrapide.com', 'cuvox.de', 'gustr.com', 'hotmail.com']

# Extracting domain from email
df['domain'] = df['email'].str.split('@').str.get(1)

# Checking if domain is in the specified list
df['email_type'] = df['domain'].apply(lambda x: x.split('.')[0] if x in email_domains else 'other')

In [30]:
df['email_type'].value_counts()

gmail         3562
yahoo         2447
jourrapide    1259
cuvox         1202
other         1186
gustr         1179
hotmail       1165
Name: email_type, dtype: int64

In [None]:
# Dropping some columns 

df.drop(['name', 'org_id', 'invited_by_user_id', 
         'creation_time', 'last_session_creation_time'],
       axis=1,inplace= True)

In [32]:
df.head()

Unnamed: 0_level_0,email,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted_users,invited,creation_year,creation_month,creation_day,last_session_year,last_session_month,last_session_day,domain,email_type
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,AugustCClausen@yahoo.com,GUEST_INVITE,1,0,0,1,2014,4,22,2014,4,22,yahoo.com,yahoo
2,MatthewPoole@gustr.com,ORG_INVITE,0,0,1,1,2013,11,15,2014,3,31,gustr.com,gustr
3,MitchellBottrill@gustr.com,ORG_INVITE,0,0,0,1,2013,3,19,2013,3,19,gustr.com,gustr
4,NicklasSClausen@yahoo.com,GUEST_INVITE,0,0,0,1,2013,5,21,2013,5,22,yahoo.com,yahoo
5,GraceRaw@yahoo.com,GUEST_INVITE,0,0,0,1,2013,1,17,2013,1,22,yahoo.com,yahoo


In [33]:
df =pd.get_dummies(df,prefix='is')

Now we can assign the X and y values for prepartion for modeling.

In [34]:
df.head()

Unnamed: 0_level_0,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted_users,invited,creation_year,creation_month,creation_day,last_session_year,last_session_month,last_session_day,...,is_zsrgb.com,is_zssin.com,is_zwmry.com,is_cuvox,is_gmail,is_gustr,is_hotmail,is_jourrapide,is_other,is_yahoo
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,2014,4,22,2014,4,22,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,1,2013,11,15,2014,3,31,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,1,2013,3,19,2013,3,19,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,1,2013,5,21,2013,5,22,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,1,2013,1,17,2013,1,22,...,0,0,0,0,0,0,0,0,0,1


In [35]:
# Extract features and target

X = df.drop('adopted_users', axis=1)
y = df['adopted_users']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=42)

In [41]:
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = logreg.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score: 0.8675


In [42]:
gradient_boosting = GradientBoostingClassifier()

# Fit the models to the training data
gradient_boosting.fit(X_train, y_train)

# Make predictions on the test data
y_pred_gb = gradient_boosting.predict(X_test)

# Calculate the accuracy scores
accuracy_gb = accuracy_score(y_test, y_pred_gb)

print("Accuracy Score for Gradient Boosting:", accuracy_gb)

Accuracy Score for Gradient Boosting: 0.96625


In [43]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train, y_train)

y_pred_rf = random_forest.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Accuracy Score for Random Forest:", accuracy_rf)

Accuracy Score for Random Forest: 0.95625


In [44]:
svm = SVC()

svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)

print("Accuracy Score for Support Vector Machine:", accuracy_svm)

Accuracy Score for Support Vector Machine: 0.8666666666666667


In [45]:
rf = RandomForestClassifier(class_weight='balanced_subsample')

In [47]:
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.9525

In [48]:
# feature importance ranking
feature_importance = pd.DataFrame()
feature_importance['coef'] = rf.feature_importances_
feature_importance = feature_importance.set_index(X.columns)
feature_importance.coef.nlargest(10)

last_session_month            0.191071
last_session_year             0.147243
last_session_day              0.114650
creation_month                0.066364
creation_day                  0.051866
creation_year                 0.036092
opted_in_to_mailing_list      0.007771
enabled_for_marketing_drip    0.006666
is_PERSONAL_PROJECTS          0.006574
is_GUEST_INVITE               0.005050
Name: coef, dtype: float64

In [49]:
feature_importance.head()


Unnamed: 0,coef
opted_in_to_mailing_list,0.007771
enabled_for_marketing_drip,0.006666
invited,0.003957
creation_year,0.036092
creation_month,0.066364


# Summary:

The majority of the userbase, approximately 87%, consists of non-adopted users. This highlights a significant opportunity to increase the proportion of adopted users relative to the total userbase.

Among the different classification algorithms evaluated, gradient boosting demonstrates the highest performance in predicting adopted users, achieving an impressive accuracy score of 97%. 

Feature importance analysis reveals that 'opted_in_to_mailing_list', 'enabled_for_marketing_drip' and there is a slight decrease, but 'invited' are the most influential features for predicting adopted users.