<H3>Identifying which factors predict future user adoption</H3>

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading data into dataframes
user_engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [3]:
# changing dates to datetime objects
users['creation_time'] = pd.to_datetime(users.creation_time)
users['last_session_creation_time'] = pd.to_datetime(users.last_session_creation_time)
user_engagement['time_stamp'] = pd.to_datetime(user_engagement.time_stamp)

In [4]:
# creating dictionary where user_id is key, and list of timestamps is the value
usage = defaultdict(list)
for ID, time in zip(user_engagement.user_id, user_engagement.time_stamp):
    usage[ID].append(time)

In [5]:
def within_7days(logins):
    '''takes a list of logins and determines if 2 or more occur in 7 days'''
    delta_days = [] # list of 2 logins that happend within 7 days
    for num in range(0, len(logins)-1):
        A = logins[num]
        B = logins[num + 1]
        delta_days.append(np.abs(A - B))
        
    final_logins = [day for day in delta_days if day < timedelta(days=7)]
    if len(final_logins) > 0:
        return 1
    else:
        return 0

In [6]:
user_adoption = defaultdict(int)
for ID, logins in usage.items():
    if len(logins) < 2:
        user_adoption[ID] = 0
    else:
        user_adoption[ID] = within_7days(logins)

In [7]:
# creating adopted column
users['adoption'] = [user_adoption[ID] for ID in users.object_id]

In [8]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adoption
0,1,2014-04-22 03:53:00,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.398138810,1,0,11,10803.0,0
1,2,2013-11-15 03:45:00,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,316.0,1
2,3,2013-03-19 23:14:00,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1970-01-01 00:00:01.363734892,0,0,94,1525.0,0
3,4,2013-05-21 08:09:00,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.369210168,0,0,1,5151.0,0
4,5,2013-01-17 10:14:00,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.358849660,0,0,193,5240.0,0


In [9]:
# Identify features and target
X = users[['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id']]
y = users.adoption

In [10]:
# one-hot encode categorical variables
X = pd.get_dummies(X, columns = ['creation_source', 'org_id'], drop_first=True)

In [11]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [12]:
# create and fit logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# validate with test data
y_pred = clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.822

In [14]:
# determine which features are the most important
i = 0
features = []
for feature in X.columns.values:
    features.append((clf.coef_[0][i], feature))
    i += 1
#print the top 10
sorted(features, reverse=True)[:10]

[(1.2271335490254702, 'org_id_235'),
 (1.2152536404838916, 'org_id_231'),
 (1.2145711676970812, 'org_id_289'),
 (1.1871695257863442, 'org_id_156'),
 (1.130013353496323, 'org_id_399'),
 (1.1033697216291634, 'org_id_415'),
 (1.0725765804645666, 'org_id_58'),
 (1.054525195445269, 'org_id_82'),
 (1.0425218019891997, 'org_id_245'),
 (1.0260580137633066, 'org_id_381')]

<div class="alert alert-info">

Logistic Regression model has accuracy of 82.22%. Organization is the most influential feature for the users to adopt to the product.