## Relax take-home challenge

In [61]:
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta

In [2]:
users = pd.read_csv('takehome_users.csv', encoding = 'latin-1') #use latin-1 encoding to avoid "invalid continuation byte" error
engagements = pd.read_csv('takehome_user_engagement.csv')

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
engagements.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
engagements.groupby(['visited']).size()

visited
1    207917
dtype: int64

In [6]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [7]:
engagements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


**Determining whether users are adopted**

My strategy will be to sort engagements by date, find the number of days since the beginning of the dataset that the engagement occurred, and then look through these days to see whether the user was adopted.

In [8]:
engagements = engagements.sort_values(by = ['time_stamp'])
engagements.head()

Unnamed: 0,time_stamp,user_id,visited
178140,2012-05-31 08:20:06,10012,1
59486,2012-05-31 15:47:36,3428,1
175638,2012-05-31 17:19:37,9899,1
26821,2012-05-31 21:58:33,1693,1
109716,2012-06-01 00:17:30,6102,1


In [9]:
#convert the column (which is a string) to Pandas datetime
engagements['time_stamp'] = pd.to_datetime(engagements['time_stamp'])

In [10]:
type(engagements['time_stamp'][0])

pandas._libs.tslib.Timestamp

In [11]:
first_day = pd.Timestamp('2012-05-31')

In [12]:
#divide the difference in time stamp and the first day by 1 day, rounding down to the nearest integer
d = timedelta(days = 1)
engagements['day'] = ((engagements['time_stamp'] - first_day) / d).astype(int)

In [13]:
engagements.head()

Unnamed: 0,time_stamp,user_id,visited,day
178140,2012-05-31 08:20:06,10012,1,0
59486,2012-05-31 15:47:36,3428,1,0
175638,2012-05-31 17:19:37,9899,1,0
26821,2012-05-31 21:58:33,1693,1,0
109716,2012-06-01 00:17:30,6102,1,1


In [14]:
#confirm that the day count is correct, and it is rolling over to the next day correctly
engagements.iloc[5005:5015]

Unnamed: 0,time_stamp,user_id,visited,day
65243,2012-10-20 22:52:55,3812,1,142
40268,2012-10-20 23:14:30,2474,1,142
98173,2012-10-20 23:28:26,5378,1,142
162218,2012-10-20 23:34:04,9325,1,142
189036,2012-10-20 23:41:09,10733,1,142
168175,2012-10-20 23:46:31,9558,1,142
57427,2012-10-21 00:04:30,3294,1,143
159343,2012-10-21 00:23:14,9144,1,143
93610,2012-10-21 00:43:33,5152,1,143
56845,2012-10-21 01:03:55,3269,1,143


In [15]:
#determine whether user is adopted by searching for an engagement where the engagement after the next was within 7 days
#this would mean that that engagement and the two after occurred in the same 7-day window
def is_adopted(userid):
    user_df = engagements.loc[engagements['user_id'] == userid]
    days = list(user_df['day'])
    if len(days) < 3:
        return 0
    for i in range(len(days) - 2):
        if days[i+2] - days[i] <= 7:
            return 1
    return 0

In [16]:
users['adopted'] = users['object_id'].apply(is_adopted)

In [17]:
users.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0,0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,,0
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,,0
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,,0
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0,1


In [18]:
users.groupby(['adopted']).size()

adopted
0    10344
1     1656
dtype: int64

**Adding/transforming columns**

There are relatively few usable features in the users table, but we can add some more by transforming information stored in the columns of both users and engagements.

First I create a column indicating whether the user was invited by another user:

In [19]:
users['invited'] = pd.notnull(users['invited_by_user_id']).astype(int)

In [20]:
users[['invited', 'invited_by_user_id']].head(15)

Unnamed: 0,invited,invited_by_user_id
0,1,10803.0
1,1,316.0
2,1,1525.0
3,1,5151.0
4,1,5240.0
5,1,11241.0
6,0,
7,0,
8,0,
9,1,4143.0


Next, I will try to extract useful information from the email address. First, split by the @ symbol to get the email domain name:

In [21]:
def emaildomain(address):
    address = address.split('@')
    return address[1]

In [22]:
users['domain'] = users['email'].apply(emaildomain)

In [23]:
users.groupby(['domain']).size().sort_values(ascending = False).head(12)

domain
gmail.com         3562
yahoo.com         2447
jourrapide.com    1259
cuvox.de          1202
gustr.com         1179
hotmail.com       1165
qgjbc.com            2
xybhi.com            2
oqpze.com            2
luque.com            2
rerwl.com            2
dqwln.com            2
dtype: int64

Looking at the list of most common domains, six are fairly common, and all the others have at most 2 different users. I will create binary variables out of these six domains:

In [24]:
users['gmail'] = users['email'].apply(lambda x: 'gmail' in x).astype(int)
users[['gmail', 'email']].iloc[10:20]

Unnamed: 0,gmail,email
10,0,MaltheAPaulsen@gustr.com
11,0,LaerkeLMathiesen@cuvox.de
12,0,AlexanderDFry@cuvox.de
13,1,BretKRivera@gmail.com
14,0,RalfTheiss@hotmail.com
15,0,ReneEngel@hotmail.com
16,0,AnthonyReynolds@jourrapide.com
17,0,CelinaAGregersen@jourrapide.com
18,1,ArleneRCollins@gmail.com
19,0,lqyvjilf@uhzdq.com


In [25]:
users['yahoo'] = users['email'].apply(lambda x: 'yahoo' in x).astype(int)
users['yahoo'] = users['email'].apply(lambda x: 'yahoo' in x).astype(int)
users['jourrapide'] = users['email'].apply(lambda x: 'jourrapide' in x).astype(int)
users['cuvox'] = users['email'].apply(lambda x: 'cuvox' in x).astype(int)
users['gustr'] = users['email'].apply(lambda x: 'gustr' in x).astype(int)
users['hotmail'] = users['email'].apply(lambda x: 'hotmail' in x).astype(int)

Converting creation_time column to a numerical value by getting the number of days since the first user was created:

In [36]:
users['creation_time'] = pd.to_datetime(users['creation_time'])
first_creation = users.sort_values(by=['creation_time'], ascending = True)['creation_time'].iloc[0]
first_creation

Timestamp('2012-05-31 00:43:27')

In [37]:
users['creation_days'] = users['creation_time'].apply(lambda x: (x - first_creation) / d)

In [39]:
users['creation_days'].head(10)

0    691.131979
1    533.126123
2    292.938484
3    355.309734
4    231.396447
5    565.120590
6    199.528530
7    426.201794
8    523.139549
9    230.892083
Name: creation_days, dtype: float64

Creating dummy variables for the creation_source column:

In [44]:
users = users.merge(pd.get_dummies(users['creation_source'], prefix = 'source', drop_first = True), left_index = True, right_index = True)

In [48]:
users[['creation_source', 'source_ORG_INVITE', 'source_PERSONAL_PROJECTS', 'source_SIGNUP', 'source_SIGNUP_GOOGLE_AUTH']].head(15)

Unnamed: 0,creation_source,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH
0,GUEST_INVITE,0,0,0,0
1,ORG_INVITE,1,0,0,0
2,ORG_INVITE,1,0,0,0
3,GUEST_INVITE,0,0,0,0
4,GUEST_INVITE,0,0,0,0
5,GUEST_INVITE,0,0,0,0
6,SIGNUP,0,0,1,0
7,PERSONAL_PROJECTS,0,1,0,0
8,PERSONAL_PROJECTS,0,1,0,0
9,ORG_INVITE,1,0,0,0


**Classification**

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [50]:
def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold, random_state = 42).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y.iloc[train]) # fit
        result += score_func(clf.predict(x[test]), y.iloc[test]) # evaluate score function on held-out data
    return result / nfold # average

In [51]:
print(users.columns)

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id', 'adopted',
       'invited', 'domain', 'gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr',
       'hotmail', 'creation_days', 'source_ORG_INVITE',
       'source_PERSONAL_PROJECTS', 'source_SIGNUP',
       'source_SIGNUP_GOOGLE_AUTH'],
      dtype='object')


Creating a list of explanatory variables, which are all variables that have been transformed into a machine-learning friendly format, and don't have any NaN values.

In [87]:
variables = ['opted_in_to_mailing_list',
       'enabled_for_marketing_drip',
       'invited', 'gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr',
       'hotmail', 'creation_days', 'source_ORG_INVITE',
       'source_PERSONAL_PROJECTS', 'source_SIGNUP',
       'source_SIGNUP_GOOGLE_AUTH']

In [88]:
users[variables].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 14 columns):
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
invited                       12000 non-null int32
gmail                         12000 non-null int32
yahoo                         12000 non-null int32
jourrapide                    12000 non-null int32
cuvox                         12000 non-null int32
gustr                         12000 non-null int32
hotmail                       12000 non-null int32
creation_days                 12000 non-null float64
source_ORG_INVITE             12000 non-null uint8
source_PERSONAL_PROJECTS      12000 non-null uint8
source_SIGNUP                 12000 non-null uint8
source_SIGNUP_GOOGLE_AUTH     12000 non-null uint8
dtypes: float64(1), int32(7), int64(2), uint8(4)
memory usage: 656.3 KB


In [90]:
df = users[variables]

In [91]:
from sklearn.preprocessing import MinMaxScaler

x = df.values
min_max_scaler = MinMaxScaler()
x = min_max_scaler.fit_transform(x)

In [92]:
df = pd.DataFrame(x, columns = df.columns)

In [96]:
df.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited,gmail,yahoo,jourrapide,cuvox,gustr,hotmail,creation_days,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.946796,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.73034,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.401302,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.486746,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.316995,0.0,0.0,0.0,0.0


In [110]:
Xtrain, Xtest, ytrain, ytest = train_test_split(users[variables].values, users['adopted'], random_state = 42, test_size = 0.2)

In [111]:
rf_clf = RandomForestClassifier(random_state = 42, class_weight = 'balanced')
cv_score(rf_clf, Xtrain, ytrain)

0.78812500000000008

In [112]:
log_clf = LogisticRegression(class_weight = 'balanced')

cv_score(log_clf, Xtrain, ytrain)

0.56812499999999999

Random forests gets substantially higher cross-validated accuracy than logistic regression. Now let's do some tuning:

In [113]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[10,20,100,500], 'min_impurity_decrease':[1e-7,1e-6,1e-5, 1e-4, 1e-3, 1e-2]}
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
rf_clf_cv = GridSearchCV(rf_clf, param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 100, 'min_impurity_decrease': 1e-07}


Narrowing down the grid search:

In [114]:
param_grid = {'max_depth':[50,100,200], 'min_impurity_decrease':[1e-8, 5e-7, 1e-7,5e-6,1e-6]}
rf_clf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)
rf_clf_cv = GridSearchCV(rf_clf, param_grid, cv = 5)
rf_clf_cv.fit(Xtrain, ytrain)

print(rf_clf_cv.best_params_)

{'max_depth': 50, 'min_impurity_decrease': 1e-08}


In [115]:
clf_tuned = RandomForestClassifier(class_weight = 'balanced', random_state = 42, max_depth = 50, min_impurity_decrease = 1e-08)

cv_score(clf_tuned, Xtrain, ytrain)

0.78697916666666667

This accuracy score is similar to the cross-validated accuracy without tuning, suggesting that there is not much to be gained from tuning. Now let's look at the classification report for the test data:

In [120]:
clf_tuned = RandomForestClassifier(class_weight = 'balanced', random_state = 42, max_depth = 50, min_impurity_decrease = 1e-08)
clf_tuned.fit(Xtrain, ytrain)
print(classification_report(ytest, clf_tuned.predict(Xtest)))

             precision    recall  f1-score   support

          0       0.86      0.90      0.88      2063
          1       0.14      0.09      0.11       337

avg / total       0.76      0.79      0.77      2400



Despite using a class_weight of balanced, this model has extremely poor recall and precision for the adopted class, indicating that it is not effective at all at predicting which users will end up being adopters, and has a very high false-positive rate. Thus, the high accuracy was illusory and this model is not useful at all.

In [118]:
log_clf = LogisticRegression(class_weight = 'balanced')
log_clf.fit(Xtrain, ytrain)
print(classification_report(ytest, log_clf.predict(Xtest)))

             precision    recall  f1-score   support

          0       0.91      0.58      0.71      2063
          1       0.20      0.63      0.30       337

avg / total       0.81      0.59      0.65      2400



The logistic regression has better recall for the adopted class, and much worse recall for the non-adopted class, while having poor precision for the adopted class. The precision (0.2) indicates that only 20% of predicted adopters actually end up adopting, which is little better than random chance since around 15% of the sample ended up becoming adopters.

In [121]:
feature_importances = list(clf_tuned.feature_importances_)

ranked = []
for var in zip(variables, feature_importances):
    ranked.append(var)
    
ranked = sorted(ranked, key = lambda x: x[1], reverse = True)
ranked

[('creation_days', 0.9023958739443827),
 ('opted_in_to_mailing_list', 0.016666566790624247),
 ('source_PERSONAL_PROJECTS', 0.012189972897233413),
 ('enabled_for_marketing_drip', 0.011516485309496827),
 ('gmail', 0.0086101407744482352),
 ('source_ORG_INVITE', 0.0081512770875527153),
 ('yahoo', 0.0077837290487533916),
 ('jourrapide', 0.0064943726602586698),
 ('hotmail', 0.0063381638694525264),
 ('gustr', 0.0060032338846055426),
 ('cuvox', 0.0050818422941743581),
 ('invited', 0.0039514132154923765),
 ('source_SIGNUP', 0.0029716567372721722),
 ('source_SIGNUP_GOOGLE_AUTH', 0.0018452714862529061)]

Looking at the most important features in the random forest model, creation days has by far the highest feature importance (even after scaling). This may be a sign of overfitting, where the model uses the number of days passed to pinpoint specific users that were adopted or were not adopted. 

In [124]:
coefficients = log_clf.coef_
coefficients

array([[ 0.01957099,  0.03829126,  0.36551096,  0.05110449, -0.23953723,
        -0.06469129, -0.13290504, -0.100744  ,  0.25907883, -0.00115318,
        -0.28682992, -0.4688863 ,  0.19536791,  0.27615742]])

In [128]:
coefficients = []
for variable, coefficient in zip(variables, log_clf.coef_[0]):
    coefficients.append((variable, coefficient))

coefficients = sorted(coefficients, key = lambda x: abs(x[1]))
coefficients

[('creation_days', -0.0011531784468980619),
 ('opted_in_to_mailing_list', 0.019570994125889387),
 ('enabled_for_marketing_drip', 0.038291257552482139),
 ('gmail', 0.05110449261495062),
 ('jourrapide', -0.064691291240487198),
 ('gustr', -0.1007439950153592),
 ('cuvox', -0.13290503919152907),
 ('source_SIGNUP', 0.19536791315331281),
 ('yahoo', -0.23953723213715361),
 ('hotmail', 0.25907883283225763),
 ('source_SIGNUP_GOOGLE_AUTH', 0.27615741574601804),
 ('source_ORG_INVITE', -0.28682992120752443),
 ('invited', 0.36551096114253956),
 ('source_PERSONAL_PROJECTS', -0.46888629969342682)]

For logistic regression, the best positive predictor of whether a user would become adopted is whether they were invited by another user. The best negative predictors were that the account was created through the user being invited to join another workspace, and the user being invited to an organization.

**Discussion**

Unfortunately, I was unable to create a model that performed well in classifying whether users become adopters or not. The logistic regression coefficients may provide some useful information, but that should be taken with a grain of salt due to the model's poor metrics in both accuracy, precision, and recall.

This analysis would be improved by gathering more data, and utilizing the existing data more effectively. For example, I did not use the "last session creation time" column in users, and I did not use the engagements table other than to figure out whether a user was adopted.