In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [2]:
# read in csv files
takehome_user_engagement = pd.read_csv('takehome_user_engagement.csv', header=0)
takehome_users = pd.read_csv('takehome_users.csv', encoding='ISO-8859-1', header=0)

In [3]:
takehome_users.head()
#takehome_user_engagement.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
print(set(takehome_users['creation_source']))

{'SIGNUP', 'GUEST_INVITE', 'SIGNUP_GOOGLE_AUTH', 'ORG_INVITE', 'PERSONAL_PROJECTS'}


In [5]:
# map the creation source to integers
dict_creation_source = {'SIGNUP_GOOGLE_AUTH': 1, 'PERSONAL_PROJECTS': 2, 'GUEST_INVITE': 3, 'SIGNUP': 4, 'ORG_INVITE': 5}
takehome_users['creation_source'] = [dict_creation_source[i] for i in takehome_users['creation_source']]

In [6]:
# cleanup of some of the columns
takehome_users['creation_time'] = pd.to_datetime(takehome_users['creation_time'])
#takehome_users['name'] = takehome_users['name'].to_string()
#takehome_users['email'] = takehome_users['email'].to_string()
takehome_users['invited_by_user_id'] = takehome_users['invited_by_user_id'].fillna(0).astype(int)
takehome_users['last_session_creation_time'] = takehome_users['last_session_creation_time'].fillna(0).astype(int)

In [7]:
takehome_users.info()

takehome_user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null int64
last_session_creation_time    12000 non-null int32
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null int32
dtypes: datetime64[ns](1), int32(2), int64(5), object(2)
memory usage: 843.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [8]:
# data wrangling steps

# convert time_stamp to datetime, drop the visited column (they are all 1's)
takehome_user_engagement['time_stamp'] = pd.to_datetime(takehome_user_engagement['time_stamp'])
takehome_user_engagement = takehome_user_engagement.drop(labels='visited', axis = 1)

# count how many timestamps per user
counts = takehome_user_engagement.groupby('user_id').count().reset_index()
#counts
#takehome_user_engagement

# find the difference between the timestamps in days
deltas = [(takehome_user_engagement['time_stamp'][d+1] - takehome_user_engagement['time_stamp'][d]).days for d in range(len(takehome_user_engagement)-2)]
# if the sum of 3 consecutive deltas is less than 7, label that user as a 1
deltas = [1 if deltas[i] + deltas[i+1] + deltas[i+2] <= timedelta(days=7).days else 0 for i in range(len(deltas)-2)]

# empty list for users 
x = []

for i in range(len(deltas)):
    x.append((takehome_user_engagement['user_id'][i], deltas[i]))
    
x = pd.DataFrame(x, columns=['user_id', 'adopted_user'])

# only include the users with at least 3 time stamps
counts = counts[counts['time_stamp'] >= 3]
counts.drop('time_stamp', axis=1)

# adopted users dataframe where adopted_user is 1
adopted_users = x[x['adopted_user'] == 1]
adopted_users = adopted_users.drop_duplicates() # distinct users

# merge adopted_users with counts to eliminate false adopted users (<3 logins)
adopted_users = adopted_users.merge(counts, left_on='user_id', right_on='user_id')

# merge the original takehome_users with the adopted users by each user
df = takehome_users.merge(adopted_users, on=None, left_on='object_id', right_on='user_id', how='left')
df.fillna(0, inplace=True)
df = df.drop(['time_stamp', 'user_id'], axis=1)

df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,3,1398138810,1,0,11,10803,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,5,1396237504,0,0,1,316,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,5,1363734892,0,0,94,1525,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,3,1369210168,0,0,1,5151,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,3,1358849660,0,0,193,5240,0.0


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

In [10]:
# features for training/testing, dropping non-numerical attributes
X = df.drop(['object_id', 'name', 'email', 'creation_time', 'adopted_user'], axis=1)
y = df['adopted_user']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#clf = LogisticRegression()
clf = DecisionTreeClassifier()
#clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# confusion matrix, classification report, and accuracy score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_pred, y_test))
print('')

# prints the sorted list of features by importance for the classifier
i = 0
features = []
for feature in X.columns.values:
    features.append((clf.feature_importances_[i], feature))
    i += 1
    
sorted(features, reverse=True)

[[2261  231]
 [ 212  296]]
             precision    recall  f1-score   support

        0.0       0.91      0.91      0.91      2492
        1.0       0.56      0.58      0.57       508

avg / total       0.85      0.85      0.85      3000

0.8523333333333334



[(0.6998507775123937, 'last_session_creation_time'),
 (0.1559233730650813, 'org_id'),
 (0.08756637119291258, 'invited_by_user_id'),
 (0.03087422354878446, 'creation_source'),
 (0.01565515818918444, 'opted_in_to_mailing_list'),
 (0.010130096491643396, 'enabled_for_marketing_drip')]

## Importance of Factors

The factors that predict future user adoption are ordered by importance above. The features are also listed below with some of my comments on how they affect user adoption.

last_session_creation_time: the more recent the login (higher unix timestamp) then the more likely it is that the user is currently using the product... however, one must watch out for new users who just signed up.

org_id: depending on the group of users the user belongs to, some groups would be more active than others.. so depending on the org_id, some organization have more adopted users than others.

invited_by_user_id: if a user was invited by another user, either the invited user wanted to join or the user who sent the invite was seeking some sort of benefit for sending so many invitations. Either way, this does not seem like an important factor.

creation_source: there are 5 possible sources for creation: personal projects, guest invitation, organization invitation, standard signup through the website, and signup through google authentification. This factor leads me to believe that adopted users signed up for personal projects, by organization invitation, or by authentification through Google. 

opted_in_to_mailing_list: people who opt in to be part of the mailing list are likely active users and would like to learn more about the product.

enabled_for_marketing_drip: when people allow for marketing emails and promotions, they are likely active users, or at least are willing to learn more about the product. These last two make the most sense to me for adopted users.

## My Approach

The common attribute between the two files is user_id and object_id, indicating the unique user identification number. Since we are working with ID numbers, the number itself is not correlated with the prediction. Meaning, an invitation from a higher user number does not neccessarily lead to user adoption. 

I mapped the creation_source to numbers, making it a bit easier to work with.  I also made sure all attributes were numerical or datetime for the creation date, with the exception of name and email. 

The first step was to group the login times for each user by a week with a 3 login sliding window. If the user had 3 logins within a one week time delta, then they were marked as an adopted user. For users who di dnot have 3 logins, they were automatically labeled as a 0, or an unadopted user. I then merged the adopted users with the original takehome_users on their user_id/object_id and then filling in the blanks with 0's. 

With the final dataframe, I chose to drop the name, email, creation time, object id, and adopted user from the X dataset. The y or target dataset only consists of the adopted user attribute. The decision tree classifier had the highest accuracy in predicted user adoption with ~85% accuracy. The feature importances are also listed in descending order. The dataset did not seem like a very useful set to visualize graphically, since the features were classification based. The only attribute with numerical meaning was the last session creation time, which shows how recent the user logged in. This is not black and white because if a new user just logged in without logging in 3 times, they are not an adopted user. 

As next steps, I think it would be a good idea for Relax to consider time spent logged in rather than just the time of login. This will show that a user was using the product for a while and was engaged. Another form of data I would find helpful for predicting user adoption is clicking on an email that was sent. If the email was read or opened, the user is actively engaged/interested in the product. 