## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read files

In [2]:
users = pd.read_csv('takehome_users.csv', encoding='ISO-8859-1')
user_engagement = pd.read_csv('takehome_user_engagement.csv', encoding='ISO-8859-1')

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


## Data wrangling

In [5]:
# Convert to datetime
users['creation_time'] = pd.to_datetime(users['creation_time'],format='%Y-%m-%d %H:%M:%S')
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'],unit='s')

In [6]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


In [7]:
users['object_id'].nunique()

12000

There are 12000 people signed up for the product.

In [8]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [9]:
user_engagement['user_id'].nunique()

8823

Out of the 12000 people signed up, 8823 have actually used the product.

In [10]:
# Convert time_stamp to datetime and set it as the index
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'],format='%Y-%m-%d %H:%M:%S')
user_engagement = user_engagement.set_index('time_stamp', drop=True)
user_engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


## Determining adopted users

To predict if a user is an adopted user, we first need to assign labels to the users based on whether or not they are an adopted user.

In [11]:
def adopted_users(user_id):
    # Select rows in user_engagement for the specific user_id
    user_id_engagement = user_engagement.loc[user_engagement['user_id']==user_id].groupby('user_id')
    # Resample weekly and sum up number of visits per week
    user_id_engagement = pd.DataFrame(user_id_engagement.visited.resample('W',label='left').sum())
    if((user_id_engagement.visited>2).any()==True):
        return 1
    else: 
        return 0

In [12]:
users['adopted'] = users['object_id'].apply(adopted_users)

In [13]:
users.adopted.value_counts()

0    10555
1     1445
Name: adopted, dtype: int64

I have calculated that 1445 users are adopted users, meaning they have logged into the product on three separate days in at least one seven-day period.

## Feature engineering

In [14]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,0


In [15]:
# See if there are any missing values
missing = pd.concat([users.isnull().sum(), 100 * users.isnull().mean()], axis=1)
missing.columns = ['count','%']
missing.sort_values(by='count')

Unnamed: 0,count,%
object_id,0,0.0
creation_time,0,0.0
name,0,0.0
email,0,0.0
creation_source,0,0.0
opted_in_to_mailing_list,0,0.0
enabled_for_marketing_drip,0,0.0
org_id,0,0.0
adopted,0,0.0
last_session_creation_time,3177,26.475


The missing values for invited_by_user_id most likely signify that those users were not invited by another user. I am going to create a new boolean feature, 'invited_by_user', which will be 1 if they were invited by a user and 0 if they were not invited by a user.

In [20]:
users['invited_by_user'] = users['invited_by_user_id'].isnull() == False
users['invited_by_user'] = users['invited_by_user'].astype(int)
users.drop(columns = ['invited_by_user_id'], inplace=True)

In [21]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,adopted,invited_by_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,0,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,1,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,0,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,0,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,0,1


Next, we can remove irrelevant columns, which includes last_session_creation_time. But before doing so, I will create a feature called usage_length which is the time (number of days) between the last session and the creation.

In [30]:
users['usage_length'] = (users['last_session_creation_time'] - users['creation_time']) / np.timedelta64(1,'D')
users['usage_length'] = users['usage_length'].fillna(0)

In [31]:
# Now remove irrelevant columns
users.drop(columns=['object_id','creation_time','name','email','last_session_creation_time','org_id'], inplace=True)

In [32]:
users.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted,invited_by_user,usage_length
0,GUEST_INVITE,1,0,0,1,0.0
1,ORG_INVITE,0,0,1,1,136.0
2,ORG_INVITE,0,0,0,1,0.0
3,GUEST_INVITE,0,0,0,1,1.0
4,GUEST_INVITE,0,0,0,1,5.0


In [33]:
# Create dummy variables for the categorical variable creation_source
users_dummies = pd.get_dummies(users['creation_source'])

In [34]:
users_features = users.drop(columns=['creation_source'])
df_features = pd.concat([users_features,users_dummies], axis=1)

In [36]:
df_features.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,adopted,invited_by_user,usage_length,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,1,0,0,1,0.0,1,0,0,0,0
1,0,0,1,1,136.0,0,1,0,0,0
2,0,0,0,1,0.0,0,1,0,0,0
3,0,0,0,1,1.0,1,0,0,0,0
4,0,0,0,1,5.0,1,0,0,0,0


## Modeling

In [37]:
# Create X and y
X = df_features.drop(columns=['adopted'])
y = df_features.loc[:,'adopted']

In [38]:
from sklearn.model_selection import train_test_split

# Divide into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Use Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print('Accuracy on training set = {}'.format(rf.score(X_train, y_train)))
print('Accuracy on test set = {}'.format(rf.score(X_test, y_test)))

Accuracy on training set = 0.9888888888888889
Accuracy on test set = 0.9753333333333334


In [40]:
# Generate confusion matrix
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

[[2602   37]
 [  37  324]]


In [41]:
# Generate classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2639
           1       0.90      0.90      0.90       361

    accuracy                           0.98      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.98      0.98      0.98      3000



In [43]:
# Get feature importances from Random Forest model
predictors = list(X_train)
feat_imp = pd.Series(rf.feature_importances_, predictors).sort_values(ascending=False)
feat_imp

usage_length                  0.982779
opted_in_to_mailing_list      0.004591
enabled_for_marketing_drip    0.003615
PERSONAL_PROJECTS             0.002614
ORG_INVITE                    0.001647
SIGNUP                        0.001498
SIGNUP_GOOGLE_AUTH            0.001242
GUEST_INVITE                  0.001237
invited_by_user               0.000776
dtype: float64

The most important feature by far is the usage_length feature that I created. 