In [1]:
# Import the libraries
import pandas as pd
from datetime import datetime 
import time
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# load the data - use the appropriate encoding
users = pd.read_csv('takehome_users.csv', encoding = 'ISO-8859-1')
engagement = pd.read_csv('takehome_user_engagement.csv')

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
# Convert to datetime
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [6]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [7]:
# function to get the rolling count of the count of user_ids with frequency of 7 days
def get_rolling_count(grp, freq):
    return grp.rolling(freq, on='time_stamp')['user_id'].count()

In [8]:
# implement the rolling function after grouping the engagement table by user_id
engagement['visits_7_days'] = engagement.groupby('user_id', as_index=False, group_keys=False).apply(get_rolling_count, '7D')
engagement.head()

Unnamed: 0,time_stamp,user_id,visited,visits_7_days
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [9]:
# make a list containing the user_ids that satisfy the criteria of 3 logins within a 7 day period.
engagement_counts = []

for index, row in engagement.iterrows():
    if row.visits_7_days >= 3.0:
        if row.user_id not in engagement_counts:
            engagement_counts.append(row.user_id)

In [10]:
# Make another list that shows '1' for user_id present in engagement_counts list and '0' when not. This is the target variabled
# and is added to the users table as 'adopted_user'
adopted_user = []

for i in users.object_id:
    if i in engagement_counts:
        adopted_user.append(1)
    else:
        adopted_user.append(0)
        
users['adopted_user'] = adopted_user

In [11]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
 10  adopted_user                12000 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [13]:
# all the NaNs are filled with 0
users = users.fillna(0)
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  12000 non-null  float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          12000 non-null  float64
 10  adopted_user                12000 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [14]:
# unix timestamp to string
users.last_session_creation_time = users.last_session_creation_time.map(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
# string to datetime
users.last_session_creation_time = pd.to_datetime(users.last_session_creation_time)
# creation time string to datetime
users.creation_time = pd.to_datetime(users.creation_time)

In [15]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-21 20:53:30,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-30 20:45:04,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 16:14:52,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 01:09:28,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 02:14:20,0,0,193,5240.0,0


In [16]:
# creation_time and last_session_creation_time can be represented in the table as another column which shows the differencein the
# time period. How long after the creation was their last login.

last_log_days = users.last_session_creation_time - users.creation_time

In [17]:
# This difference is appended to a list
days = []
for i in last_log_days:
    val = i.days
    if val < 0:
        days.append(0)
    else:
        days.append(val)
    

In [18]:
# the list is added to the users data table
users['last_log_days'] = days
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,last_log_days
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-21 20:53:30,1,0,11,10803.0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-30 20:45:04,0,0,1,316.0,1,135
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 16:14:52,0,0,94,1525.0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 01:09:28,0,0,1,5151.0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 02:14:20,0,0,193,5240.0,0,4


In [19]:
print('Number of adopted_users: ' + str((users.adopted_user == 1).sum()))

Number of adopted_users: 1602


In [20]:
# import all the libraries for preprocessing, modeling and evaluation.

from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef as MCC

In [21]:
# encode the creation_source column
LE = LabelEncoder()
users['creation_source_code'] = LE.fit_transform(users.creation_source)

In [22]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,last_log_days,creation_source_code
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-21 20:53:30,1,0,11,10803.0,0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-30 20:45:04,0,0,1,316.0,1,135,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 16:14:52,0,0,94,1525.0,0,0,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 01:09:28,0,0,1,5151.0,0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 02:14:20,0,0,193,5240.0,0,4,0


In [23]:
print(users['creation_source'].unique())
print(users['creation_source_code'].unique())

['GUEST_INVITE' 'ORG_INVITE' 'SIGNUP' 'PERSONAL_PROJECTS'
 'SIGNUP_GOOGLE_AUTH']
[0 1 3 2 4]


In [24]:
y = users.adopted_user
final_data = users.drop(['object_id', 'name', 'email', 'creation_source', 'creation_time', 'last_session_creation_time', 'adopted_user'], axis=1)
final_cols = final_data.columns
final_data.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,last_log_days,creation_source_code
0,1,0,11,10803.0,0,0
1,0,0,1,316.0,135,1
2,0,0,94,1525.0,0,1
3,0,0,1,5151.0,0,0
4,0,0,193,5240.0,4,0


In [25]:
# use minmax_scale to scale values to [0,1]
X = minmax_scale(final_data)

In [26]:
# split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=10)


In [27]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (8400, 6)
Training Labels Shape: (8400,)
Testing Features Shape: (3600, 6)
Testing Labels Shape: (3600,)


In [28]:
# Initiate the classifier with the 'linear' kernel
svm = SVC(kernel='linear', random_state=10, class_weight='balanced')

In [29]:
# Fit
svm.fit(X_train, y_train)

SVC(class_weight='balanced', kernel='linear', random_state=10)

In [30]:
# predict on training set
y_trainpredSVM = svm.predict(X_train)

In [31]:
print(classification_report(y_train, y_trainpredSVM))
print(MCC(y_train, y_trainpredSVM))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      7279
           1       0.83      0.94      0.88      1121

    accuracy                           0.97      8400
   macro avg       0.91      0.96      0.93      8400
weighted avg       0.97      0.97      0.97      8400

0.8678672490115139


In [32]:
# predict on test set
ypredSVM = svm.predict(X_test)

In [33]:
# Calculate and print the f1 and MCC score
print(classification_report(y_test, ypredSVM))
print(MCC(y_test, ypredSVM))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      3119
           1       0.81      0.96      0.88       481

    accuracy                           0.96      3600
   macro avg       0.90      0.96      0.93      3600
weighted avg       0.97      0.96      0.97      3600

0.8606137026688333


In [34]:
# feature importance
pd.Series(svm.coef_[0], index=final_cols)

opted_in_to_mailing_list       0.007971
enabled_for_marketing_drip    -0.011366
org_id                         0.057643
invited_by_user_id            -0.001985
last_log_days                 18.933494
creation_source_code          -0.010192
dtype: float64

In [35]:
# dropped last_log_days too

final_data1 = users.drop(['object_id', 'name', 'email', 'creation_source', 'creation_time', 'last_session_creation_time', 'adopted_user', 'last_log_days'], axis=1)
final_cols1 = final_data1.columns
final_data1.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,creation_source_code
0,1,0,11,10803.0,0
1,0,0,1,316.0,1
2,0,0,94,1525.0,1
3,0,0,1,5151.0,0
4,0,0,193,5240.0,0


In [36]:
# minmax scale and split the dataset
X1 = minmax_scale(final_data1)
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, stratify=y, random_state=10)


In [37]:
# initialise classifier
svm1 = SVC(kernel='linear', random_state=10, class_weight='balanced')

In [38]:
# fit
svm1.fit(X1_train, y_train)

SVC(class_weight='balanced', kernel='linear', random_state=10)

In [39]:
# predict on training set
y_trainpredSVM1 = svm1.predict(X1_train)

In [40]:
print(classification_report(y_train, y_trainpredSVM1))
print(MCC(y_train, y_trainpredSVM1))

              precision    recall  f1-score   support

           0       0.88      0.68      0.77      7279
           1       0.16      0.40      0.23      1121

    accuracy                           0.64      8400
   macro avg       0.52      0.54      0.50      8400
weighted avg       0.78      0.64      0.69      8400

0.05744249417613867


In [41]:
# predict on test set
ypredSVM1 = svm1.predict(X1_test)

In [42]:
print(classification_report(y_test, ypredSVM1))
print(MCC(y_test, ypredSVM1))

              precision    recall  f1-score   support

           0       0.88      0.67      0.76      3119
           1       0.15      0.39      0.22       481

    accuracy                           0.63      3600
   macro avg       0.51      0.53      0.49      3600
weighted avg       0.78      0.63      0.69      3600

0.041441141101689144


In [43]:
# Feature Importance
pd.Series(svm1.coef_[0], index=final_cols1)

opted_in_to_mailing_list      0.002948
enabled_for_marketing_drip    0.009453
org_id                        2.216791
invited_by_user_id            0.116338
creation_source_code          0.062675
dtype: float64