Import relevant packages

In [27]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas_profiling
from scipy import stats
import numpy as np

import warnings
warnings.filterwarnings('ignore')


Create dataframe from csv files

In [32]:
engagement = pd.read_csv('takehome_user_engagement.csv')
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [33]:
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])
engagement = engagement.set_index('time_stamp')

engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


Group by user_id and create a rolling 7 day sum of logins per user

In [34]:
engagement = engagement.groupby('user_id').rolling('7D').sum()
engagement = engagement.drop(['user_id'], axis=1)

Create adopted user column that is a 1 for any row that has 3 or more visits in the 7 day rolling window

In [35]:
engagement['adopted_user'] = np.where(engagement['visited'] >= 3, 1, 0)

Reset the index and regroup summing all the columns. If there is a value over 1 in the adopted_user column that means that they had at least 1 active 7 day window. Redo adopted_user flag so that any number greater than or equal to 1 is a 1 and all 0's stay zeros. So now we have a binary label, 1=adopted user and 0=non adopted user 

In [66]:
engagement = engagement.reset_index().groupby('user_id').sum()
engagement['adopted_user'] = np.where(engagement['adopted_user'] >= 1, int(1), int(0))

In [71]:
engagement.head()

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0
2,1
3,0
4,0
5,0


Drop visited column, not needed anymore

In [106]:
engagement = engagement.drop(['visited'], axis=1)

In [38]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 1 to 12000
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   adopted_user  8823 non-null   int32
dtypes: int32(1)
memory usage: 103.4 KB


Load users data anc change object_id to user_id as indicated by the supporting documents so that the files have a common column to join on

In [72]:
users = pd.read_csv('takehome_users.csv')
users.rename(columns = {'object_id':'user_id'}, inplace = True)
users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


Merge the datasets on user_id, now all of the user information also has the adopted user column

In [73]:
join = pd.merge(users, engagement, how='left', on='user_id')

In [74]:
join.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


Removed columns that are just personal information, a persons name or email address doesn't have an affect on their engagement. The user id shouldn't either, it was arbitrarily assigned on signup

In [79]:
join = join.drop(['name', 'email', 'last_session_creation_time'], axis=1)
join = join.drop(['creation_time'], axis=1)
join = join.drop(['user_id'], axis=1)

Fill nan values, there is no user 0 that's why I chose that to fill the nans. For adopted user, I assume the reason they didn't have a label from the engagement file is because they signed up but never logged in so they are not an adopted user, assigned a 0

In [80]:
join['invited_by_user_id'] = join['invited_by_user_id'].fillna(0)
join['adopted_user'] = join['adopted_user'].fillna(0)

In [81]:
join['adopted_user'] = join['adopted_user'].astype('int')
join.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,GUEST_INVITE,1,0,11,10803.0,0
1,ORG_INVITE,0,0,1,316.0,1
2,ORG_INVITE,0,0,94,1525.0,0
3,GUEST_INVITE,0,0,1,5151.0,0
4,GUEST_INVITE,0,0,193,5240.0,0


In [57]:
join['enabled_for_marketing_drip'].unique()

array([0, 1], dtype=int64)

Import modeling libraries

In [17]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import sklearn.metrics as metrics
import time


Onehot encode categorical variables. invited by user and org id are numerical but they are just labels so they need to be encoded

In [82]:
join_onehot = pd.get_dummies(join, columns = ['creation_source', 'org_id', 'invited_by_user_id'])
join_onehot.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted_user,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,org_id_0,org_id_1,...,invited_by_user_id_11966.0,invited_by_user_id_11972.0,invited_by_user_id_11973.0,invited_by_user_id_11974.0,invited_by_user_id_11978.0,invited_by_user_id_11981.0,invited_by_user_id_11986.0,invited_by_user_id_11994.0,invited_by_user_id_11997.0,invited_by_user_id_11999.0
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
join_onehot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Columns: 2990 entries, opted_in_to_mailing_list to invited_by_user_id_11999.0
dtypes: int64(2), object(1), uint8(2987)
memory usage: 34.5+ MB


Create X and y variables and split into train/test datasets

In [83]:
X = join_onehot.drop(columns='adopted_user')
y = join_onehot[['adopted_user']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=25)

In [92]:
start = time.time()

n_estimators = [int(x) for x in np.linspace(start = 25, stop = 225, num = 25)]
max_depth = [1,2,3,4,5,6,7,8,9,10]
eta = [.001,.005,.01,.025,.05,.1,.2,.3]
subsample = [.25,.5,.75,1]
colsample_bytree = [.25,.5,.75,1]

random_grid = {'xgbc__n_estimators': n_estimators,
               'xgbc__max_depth': max_depth,
               'xgbc__eta': eta,
               'xgbc__subsample': subsample,
               'xgbc__colsample_bytree': colsample_bytree}

xgboost2 = RandomizedSearchCV(
        estimator=XGBClassifier(),
        param_distributions = random_grid, n_iter = 25, cv = 5, verbose=2, random_state=42, n_jobs = -1)

grid_result = xgboost2.fit(X_train, y_train)
best_params = xgboost2.best_params_
print(best_params)
print('It takes %s minutes' % ((time.time() - start)/60))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { "xgbc__colsample_bytree", "xgbc__eta", "xgbc__max_depth", "xgbc__n_estimators", "xgbc__subsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'xgbc__subsample': 0.75, 'xgbc__n_estimators': 133, 'xgbc__max_depth': 7, 'xgbc__eta': 0.3, 'xgbc__colsample_bytree': 0.75}
It takes 203.00777105490366 minutes


In [86]:
start = time.time()

n_estimators = [50,100,150,200,250,500,750]
max_depth = [1,3,5,7,9]
eta = [.001,.005,.01,.05,.1,.3]
subsample = [.25,.5,.75,1]
colsample_bytree = [.25,.5,.75,1]

random_grid = {'xgbc__n_estimators': n_estimators,
               'xgbc__max_depth': max_depth,
               'xgbc__eta': eta,
               'xgbc__subsample': subsample,
               'xgbc__colsample_bytree': colsample_bytree}

xgboost = RandomizedSearchCV(
        estimator=XGBClassifier(),
        param_distributions = random_grid, n_iter = 5, cv = 5, verbose=2, random_state=42, n_jobs = -1)

grid_result = xgboost.fit(X_train, y_train)
best_params = xgboost.best_params_
print(best_params)
print('It takes %s minutes' % ((time.time() - start)/60))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Parameters: { "xgbc__colsample_bytree", "xgbc__eta", "xgbc__max_depth", "xgbc__n_estimators", "xgbc__subsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'xgbc__subsample': 0.75, 'xgbc__n_estimators': 150, 'xgbc__max_depth': 7, 'xgbc__eta': 0.1, 'xgbc__colsample_bytree': 1}
It takes 21.09429087638855 minutes


In [87]:
print("Accuracy score (training): {0:.3f}".format(xgboost.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(xgboost.score(X_test, y_test)))

Accuracy score (training): 0.871
Accuracy score (validation): 0.853


In [95]:
y_pred=xgboost.predict(X_test)

precision, recall, fscore, support = score(y_test, y_pred)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: [0.85418752 0.        ]
recall: [0.9988295 0.       ]
fscore: [0.92086331 0.        ]


In [93]:
print("Accuracy score (training): {0:.3f}".format(xgboost2.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(xgboost2.score(X_test, y_test)))

Accuracy score (training): 0.871
Accuracy score (validation): 0.853


In [96]:
y_pred2=xgboost2.predict(X_test)

precision, recall, fscore, support = score(y_test, y_pred2)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))

precision: [0.85418752 0.        ]
recall: [0.9988295 0.       ]
fscore: [0.92086331 0.        ]


After determining the optimal hyperparameters the model was retrained using all data to increase the accuracy of the final model

In [90]:
xgb = XGBClassifier(subsample = 0.75, n_estimators = 150, max_depth = 7, eta = 0.1, colsample_bytree = 1)
xgb.fit(X, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.100000001, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [91]:
print("Accuracy score (full data): {0:.3f}".format(xgb.score(X, y)))

Accuracy score (full data): 0.867


In [97]:
xgb2 = XGBClassifier(subsample = 0.75, n_estimators = 133, max_depth = 7, eta = 0.3, colsample_bytree = 0.75)
xgb2.fit(X, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=133, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [98]:
print("Accuracy score (full data): {0:.3f}".format(xgb2.score(X, y)))

Accuracy score (full data): 0.868


In [99]:
importances = pd.DataFrame({
    'Feature': join_onehot.drop('adopted_user', axis=1).columns,
    'Importance': xgb.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)
importances = importances.set_index('Feature')
importances

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
creation_source_PERSONAL_PROJECTS,0.010905
invited_by_user_id_2367.0,0.008625
org_id_392,0.007310
org_id_82,0.007127
org_id_387,0.007117
...,...
invited_by_user_id_3567.0,0.000000
invited_by_user_id_3568.0,0.000000
invited_by_user_id_3572.0,0.000000
invited_by_user_id_3580.0,0.000000


In [105]:
importances.head(40)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
creation_source_PERSONAL_PROJECTS,0.010905
invited_by_user_id_2367.0,0.008625
org_id_392,0.00731
org_id_82,0.007127
org_id_387,0.007117
invited_by_user_id_2994.0,0.007042
invited_by_user_id_2017.0,0.007028
org_id_270,0.006659
org_id_306,0.006595
org_id_218,0.00652


In [104]:
importances2 = pd.DataFrame({
    'Feature': join_onehot.drop('adopted_user', axis=1).columns,
    'Importance': xgb2.feature_importances_
})
importances2 = importances2.sort_values(by='Importance', ascending=False)
importances2 = importances2.set_index('Feature')
importances2

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
creation_source_PERSONAL_PROJECTS,0.009540
invited_by_user_id_4019.0,0.008224
org_id_289,0.007044
org_id_296,0.006951
org_id_387,0.006846
...,...
invited_by_user_id_3781.0,0.000000
invited_by_user_id_3783.0,0.000000
invited_by_user_id_3784.0,0.000000
invited_by_user_id_3788.0,0.000000


Two differnt xgboost classifier models were run. The difference being the size of the available parameters to see if a larger net would result in a better prediction. It did yield a better overall result of 0.1% after running 10x as long, so the effort was not worth the extra compute time. Both models had high precision scores and extremely high recall scores, precision: 0.85418 recall: 0.99883. This indicates that most of the labels are accurately predicted but if they are wrong it is almost never a false negative.