## Import Packages

In [20]:
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import matplotlib.pyplot as plt
import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

## Read in Data

In [2]:
df_users_engagement = pd.read_csv('takehome_user_engagement.csv', parse_dates = ['time_stamp'])

In [3]:
df_users = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1", parse_dates = ['creation_time'])

In [4]:
def data_check(df):
    display(df.head())
    display(df.info())
    display(df.describe())

In [5]:
data_check(df_users_engagement)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


None

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


In [6]:
data_check(df_users)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 937.6+ KB


None

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968
min,1.0,1338452000.0,0.0,0.0,0.0,3.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0


## Clean/Prepare Data

**Definition of "adopted user"**<br>
A user who has logged into the product on three separate days in at least one seven-day period.

In [7]:
def adopted_user(series):
    """put the login times of the user and get returned the status of the user"""
    if len(series) < 3:
        return 0
    sorted_series = sorted(series)
    for i in range(len(sorted_series)-2):
        if (sorted_series[i + 2] - sorted_series[i]).days <= 7:
            return 1
    else:
        return 0

In [8]:
#define a new dataframe for users that adopted the product
df_user_adopted = df_users_engagement.groupby('user_id', as_index = False).agg({'time_stamp': adopted_user, 'visited': np.sum})

#rename column time_stamp
df_user_adopted = df_user_adopted.rename(columns = {'time_stamp': 'user_adopted'})

In [9]:
#merge the dataframe with user data and the df_user_adopted based on user_id
df = pd.merge(df_users, df_user_adopted, left_on = 'object_id', right_on = 'user_id')
df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,user_adopted,visited
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1,0,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2,1,14
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3,0,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4,0,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5,0,1


In [10]:
#drop unnecessary columns: name, e-mail, object_id
df = df.drop(['object_id', 'creation_time', 'last_session_creation_time', 'name', 'email', 'user_id'], axis=1)

In [11]:
#replace invited_by_user_id the id with 0 for not invited an 1 for invited
df['invited_by_user_id'] = df['invited_by_user_id'].fillna(0)
df.loc[df['invited_by_user_id'] != 0, 'invited_by_user_id'] = 1

In [12]:
#get dummy variables for creation_source
df = pd.get_dummies(df, drop_first = True)
org_id = pd.get_dummies(df['org_id'], drop_first = True)

In [13]:
#merge the dataframes
dfs = [df, org_id]
df = pd.concat(dfs, axis = 1, join = 'inner')
columns = df.columns

In [14]:
#normalize the data in the dataframe
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

In [15]:
df = pd.DataFrame(df)
df.columns = columns
df

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_adopted,visited,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,...,407,408,409,410,411,412,413,414,415,416
0,1.0,0.0,0.026442,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.002404,1.0,1.0,0.021488,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.225962,1.0,0.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.002404,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.463942,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818,0.0,0.0,0.213942,1.0,0.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8819,0.0,0.0,0.480769,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8820,1.0,1.0,0.199519,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8821,0.0,0.0,0.014423,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train Model

### Logistic Regression

In [16]:
#define x and y
y = df['user_adopted']
X = df.loc[:, df.columns != 'user_adopted']

#split data in train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [17]:
clf = LogisticRegression(C = 0.005, penalty = 'l2', random_state = 0, fit_intercept = True, solver='liblinear')

#fit the model
model = clf.fit(X_train, y_train)

In [18]:
#print out the mean accuracy on the given test data and labels.
accuracy = clf.score(X_test, y_test)
print('the mean accuracy of the test set is:', accuracy)

the mean accuracy of the test set is: 0.8087225274725275


### Random Forest Classifier

In [21]:
rfc = RandomForestClassifier(random_state=0)

In [32]:
param_grid_rfc = {
    'n_estimators': [50, 100],
    'criterion': ["gini", "entropy"],
    'max_depth': [5, 10],
    'max_features': [1, 5, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 5],
    'min_samples_split': [2, 5],
}

In [33]:
clf = GridSearchCV(rfc, param_grid_rfc)

In [34]:
clf.fit(X_train, y_train)





GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=20,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=3,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [35]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [36]:
clf.best_score_

0.9634579597360853

In [37]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [38]:
#refit the model
rfc = RandomForestClassifier(criterion = 'gini', max_depth = 20, max_features = 'sqrt', min_samples_leaf = 3, min_samples_split = 2, n_estimators = 100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
#print out the mean accuracy on the given test data and labels.
accuracy = rfc.score(X_test, y_test)
print('the mean accuracy of the test set is:', accuracy)

the mean accuracy of the test set is: 0.9787087912087912


In the case at hand random forest classifier seems to work well better than a logistic regression.