# Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven day period, identify which factors predict future user adoption.

Load Packages

In [1]:
import pandas as pd
import numpy as np

Load Data

In [3]:
df_users = pd.read_csv('takehome_users.csv', encoding='ISO-8859-1')
df_engage = pd.read_csv('takehome_user_engagement.csv', parse_dates=['time_stamp'])

In [4]:
df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
df_engage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [6]:
#rename 'object_id' as 'user_id' 
df_users = df_users.rename({'object_id':'user_id'}, axis=1)

df_users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [7]:
#determine 'adopted_user'
engagement = df_engage.set_index('time_stamp')

users = engagement['user_id'].unique()
adopted = []

for i in users:
    user_id = engagement['user_id'] == i
    adopt = engagement[user_id].resample('1D').count()
    adopt = adopt.rolling(window=7).sum()
    adopt = adopt.dropna()
    adopted.append(any(adopt['visited'].values >=3))
        

In [10]:
users = list(zip(users, adopted))

df_adopted = pd.DataFrame(users)
df_adopted.columns = ['user_id', 'adopted_user']

In [11]:
df = df_users.merge(df_adopted, on='user_id', how='left')

In [12]:
df.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Data columns (total 11 columns):
user_id                       12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_user                  8823 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 1.1+ MB


In [16]:
#drop missing values in adopted_user column
df.dropna(subset=['adopted_user'], inplace=True)
df['adopted_user'] = df['adopted_user'].astype(int)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 11999
Data columns (total 11 columns):
user_id                       8823 non-null int64
creation_time                 8823 non-null object
name                          8823 non-null object
email                         8823 non-null object
creation_source               8823 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      8823 non-null int64
enabled_for_marketing_drip    8823 non-null int64
org_id                        8823 non-null int64
invited_by_user_id            4776 non-null float64
adopted_user                  8823 non-null int64
dtypes: float64(2), int64(5), object(4)
memory usage: 827.2+ KB


In [19]:
#create invited_by_user column
df['invited_by_user'] = [1 if user > 0 else 0 for user in df['invited_by_user_id']]

In [21]:
#DataFrame for modeling
df = df[['adopted_user', 'invited_by_user', 'creation_source','opted_in_to_mailing_list','enabled_for_marketing_drip']]

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

In [26]:
X = df.drop('adopted_user', axis=1)
y = df['adopted_user']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=69)

pipeline = Pipeline(steps=[('encoder', OneHotEncoder()), ('rf', RandomForestClassifier(random_state=34))])

params_rf = {'rf__n_estimators': [25, 50, 75, 100], 'rf__max_depth' : [4, 8, 12]}

cv = GridSearchCV(pipeline, param_grid = params_rf, cv=5)
cv.fit(X_train, y_train)

# Print the optimal parameters and best score
print("Tuned Hyperparameter(s): {}".format(cv.best_params_))
print("Tuned Accuracy Score: {}".format(cv.best_score_))

Tuned Hyperparameter(s): {'rf__max_depth': 4, 'rf__n_estimators': 25}
Tuned Accuracy Score: 0.8189767173722208


In [31]:
#test set
y_pred = cv.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8190404231205138


In [38]:
rf = RandomForestClassifier(n_estimators=25, max_depth=4, random_state=34) 

X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=69)

rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))



0.8189766839378239
0.8190404231205138


In [39]:
importance = pd.DataFrame(rf.feature_importances_, index=X_train.columns, columns=['feature importance']).sort_values(by='feature importance', ascending=False)

In [40]:
importance

Unnamed: 0,feature importance
enabled_for_marketing_drip,0.228553
opted_in_to_mailing_list,0.191837
creation_source_GUEST_INVITE,0.184395
creation_source_PERSONAL_PROJECTS,0.137213
creation_source_SIGNUP,0.085603
creation_source_SIGNUP_GOOGLE_AUTH,0.081956
creation_source_ORG_INVITE,0.058927
invited_by_user,0.031515


# Conclusion

The Features included for modeling included:
1. invited_by_user
2. creation_source
3. opted_in_to_mailing_list
4. enabled_for_marketing_drip

The model performed decently well with an accuracy of 82% on the test set. It appears that marketing drip was the most important feature for this particular model therefore this should be maintained in the strategy in order to keep up the user retention. The mailing list showed importance as well so it is useful to put effort into trying to get new users to sign up for the mailing list. Whether or not a user was invied by another user showed the least importance in user retention. This could be useful as it may affect using promotions and incentives for active users inviting friends to sign up. 