In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn_pandas import DataFrameMapper

# Data Preprocessing
The first step is to read in the data to a Pandas DataFrame.

In [2]:
users = pd.read_csv('data/users.csv', encoding='latin1')
engagement = pd.read_csv('data/user-engagement.csv')

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


I converted the object_id column to the users index to join it with the engagement table later.

In [4]:
users.index = users.object_id
users = users.drop('object_id',1)
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


I then converted the time_stamp column to the engagement index to enable rolling window operations.

In [6]:
engagement.index = pd.to_datetime(engagement.time_stamp)
engagement = engagement.drop('time_stamp',1)
engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


Next I grouped all logins by user_id and performed a rolling 7-day count of the visited column. 

In [7]:
weekly_logins = engagement.groupby('user_id')['visited'].rolling('7D').count()
weekly_logins.head()

user_id  time_stamp         
1        2014-04-22 03:53:30    1.0
2        2013-11-15 03:45:04    1.0
         2013-11-29 03:45:04    1.0
         2013-12-09 03:45:04    1.0
         2013-12-25 03:45:04    1.0
Name: visited, dtype: float64

This enabled me to easily find users who logged into to product on three separate days in a seven-day period and mark those users as adopted. I then added the adopted column to the users table.

In [8]:
adopted = weekly_logins.groupby('user_id').max() > 2
users = users.join(adopted)
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,visited
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


Next I accounted for users that have never logged into the product by setting the visited column to False for those users. I also accounted for those that were not invited by another user by setting invited_by_user_id to 0, which is not an actual user ID and becomes a proxy to represent not invited by another user.

In [9]:
users.loc[users.visited.isnull(), 'visited'] = False
users.loc[users.invited_by_user_id.isnull(), 'invited_by_user_id'] = 0

# Predicting Future Adoption
Few users actually adopted the product per the definition in the problem description. This results in a biased dataset, so I experimented with resampling to see if that improves predictive capability of the model.

In [10]:
print('{:.2f}% of all users were adopted'.format(100 * sum(users.visited) / len(users)))

13.35% of all users were adopted


I first split the data into training and test sets.

In [11]:
visited = users.visited
x_train, x_test, y_train, y_test = train_test_split(users, visited, test_size=0.2)

The train_test_split function uses stratified sampling by default. I printed the adoption counts to verify that the adoption percentage of the entire dataset is maintained in the training and test sets.

In [12]:
print('Training set adoption split')
print(y_train.value_counts())
print('\nTest set adoption split')
print(y_test.value_counts())

Training set adoption split
False    8349
True     1251
Name: visited, dtype: int64

Test set adoption split
False    2049
True      351
Name: visited, dtype: int64


Next I created a DataFrameMapper from the sklearn_pandas module. This module provides a class to transform Pandas DataFrames to a desired format suitable for scikit-learn classifiers. It is intended to be used inside a scikit-learn Pipeline, but here I just use it to transform the DataFrame. Each desired column is mapped to a transformer to be applied on that column. Here I used a LabelEncoder to transform the creation_source and org_id columns. A transformer of None tells the DataFrameMapper to pass data from that column as is. Columns not included in the DataFrameMapper are discarded. I used these four variables because they provided the best results after experimenting with various variable combinations.

In [13]:
mapper = DataFrameMapper([
    ('creation_source', LabelEncoder()),
    ('org_id', LabelEncoder()),
    ('opted_in_to_mailing_list', None),
    ('enabled_for_marketing_drip', None)
])

Once I have the mapper I fit it on the entire dataset, then transformed the training and test sets. I also encoded the training and test output from False and True to 0 and 1.

In [14]:
mapper.fit(users)
x_train_encode = mapper.transform(x_train)
x_test_encode = mapper.transform(x_test)
y_train_encode = LabelEncoder().fit_transform(y_train)
y_test_encode = LabelEncoder().fit_transform(y_test)

I then fit a random forest model on the training data and printed accuracy, precision, and recall on the test set. I included precision and recall because accuracy is not always the best metric in a classification task, particularly for biased datasets. A model that always predicts a user is not adopted will be right most of the time, but is not very useful. We are also probably more interested in predicting users that will adopt the product in this dataset.

In [15]:
random_state = 8
model = RandomForestClassifier(random_state=random_state)
model.fit(x_train_encode, y_train_encode)
print('Accuracy: {:.4f}'.format(model.score(x_test_encode, y_test_encode)))
print('Precision: {:.4f}'.format(precision_score(y_test_encode, model.predict(x_test_encode))))
print('Recall: {:.4f}'.format(recall_score(y_test_encode, model.predict(x_test_encode))))

Accuracy: 0.8179
Precision: 0.2278
Recall: 0.1026


Since the data is biased, I oversampled the training data using the SMOTE class from the imbalanced-learn module. This allows me to oversample instances of adopted users to make the data balanced.

In [16]:
sm = SMOTE(random_state=random_state)
x_train_res, y_train_res = sm.fit_sample(x_train_encode, y_train_encode)

I printed the counts of each class to verify the resampled data is balanced.

In [17]:
print('Training set adoption split')
print(np.bincount(y_train_res))

Training set adoption split
[8349 8349]


Lastly, I retrained the model on the resampled data and print the same metrics as above. We lost some accuracy and precision, but gained recall.

In [18]:
model.fit(x_train_res, y_train_res)
print('Accuracy: {:.4f}'.format(model.score(x_test_encode, y_test_encode)))
print('Precision: {:.4f}'.format(precision_score(y_test_encode, model.predict(x_test_encode))))
print('Recall: {:.4f}'.format(recall_score(y_test_encode, model.predict(x_test_encode))))

Accuracy: 0.7958
Precision: 0.2140
Recall: 0.1481
