In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import timedelta
from pprint import pprint
import seaborn as sns

In [99]:
engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv', encoding = 'latin-1')

In [44]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [100]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [101]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [102]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [103]:
print('Number Of Users: {}'.format(len(engagement.user_id.unique())))

Number Of Users: 8823


In [104]:
#Convert time_stamp column to type datetime
engagement['date_time'] = pd.to_datetime(engagement.time_stamp)

In [105]:
#Drop the time from datetime
engagement['date'] = engagement.date_time.dt.date

In [106]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited,date_time,date
0,2014-04-22 03:53:30,1,1,2014-04-22 03:53:30,2014-04-22
1,2013-11-15 03:45:04,2,1,2013-11-15 03:45:04,2013-11-15
2,2013-11-29 03:45:04,2,1,2013-11-29 03:45:04,2013-11-29
3,2013-12-09 03:45:04,2,1,2013-12-09 03:45:04,2013-12-09
4,2013-12-25 03:45:04,2,1,2013-12-25 03:45:04,2013-12-25


An "adopted user" as a user who has logged into the product on three separate days in at least one seven day period.
1. Numbers of logins >= 3 on 


In [135]:
return_user=pd.Series.to_dict(engagement.groupby('user_id').date.count() >= 3)

In [142]:
return_users = engagement.copy()
return_users['return_user'] = return_users.user_id.map(return_user)
return_users = return_users[return_users.return_user == True]

In [148]:
print('Number of users with at least 3 login days: {}'.format(len(return_users.user_id.unique())))

Number of users with at least 3 login days: 2248


2. 3 login days in a period of 7 days
Or day 1 and day 3 will have to have a difference of 7. If not, day 2 and day 4 will have to have a difference of 7 till the last day login. 


In [150]:
# Create a list of all the user ID's
user_ids = [k for k,v in return_users.groupby('user_id')]

In [152]:
grouped = return_users.groupby('user_id')['date_time']

We will define a function that take in a list of dates (list) and return a boolean if there are 3 logins in 7 days.

In [163]:
def adopt(l):

    length = len(l)
    idx = 0
    # loop through each date in the list starting with the first date and ending at 2nd to the last date in the list
    while idx < (length - 2):
        #Check if the timestamp that is 2 timestamps away from the current is greater than 7 days
        if (l[idx + 2] - l[idx]).days > 7:
            #Increment idx to make the next timestamp in the list the referenced timestamp
            idx += 1
            continue
            
        else:
            #return true that the critera is satisfied
            return True  
    #after looping through all timestamps return false since the critera isn't satisfied
    return False

Now we will define a function that take in a dataframe of groupby users and a list of user ids and return a dictionary with user ids as keys and boolean as values: adopted or not adopted users.

In [164]:
def adopt_user(grouped, users):
    # Create an empty dictionary to update
    adopted = {}
    
    # Create a list of lists of all the dates for each user
    dates = [list(grouped.get_group(x)) for x in users]
    
    #Iterate through each list of dates for the corresponding user and check criteria
    for pos,user in enumerate(users):
        
        #update dictionary
        adopted[user] = adopt(dates[pos])
    return adopted
    

In [169]:
adopted_users=adopt_user(grouped, user_ids)


In [177]:
return_users['adopt_user'] = return_users.user_id.map(adopted_users)
adopt_users = return_users[return_users.adopt_user == True]

In [180]:
print('Number of "Adopted Users": {}'.format(len(adopt_users.user_id.unique())))

Number of "Adopted Users": 1656


In [186]:
adopted_users_ids = adopt_users.user_id.unique()

In [187]:
users['adopted_user'] = users.object_id.apply(lambda x: 1 if x in adopted_users_ids else 0)

In [188]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [190]:
users.creation_source.unique()

array(['GUEST_INVITE', 'ORG_INVITE', 'SIGNUP', 'PERSONAL_PROJECTS',
       'SIGNUP_GOOGLE_AUTH'], dtype=object)

In [192]:
#preprocessing creation_source
from sklearn.preprocessing import LabelEncoder
users['source_encoded'] = LabelEncoder().fit_transform(users['creation_source'])

In [194]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [196]:
Xtr, Xtest, ytr, ytest = train_test_split(users[['source_encoded','opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id']].values, (users.adopted_user==1).values, random_state=5)
clf=LogisticRegression()
clf.fit(Xtr, ytr)
print(accuracy_score(clf.predict(Xtest),ytest))

0.8593333333333333


To show the relationship between each pair.

In [224]:
adopt_df=users[users.adopted_user==1]
adopt_df.groupby(['creation_source'])['object_id'].count()

creation_source
GUEST_INVITE          369
ORG_INVITE            574
PERSONAL_PROJECTS     172
SIGNUP                302
SIGNUP_GOOGLE_AUTH    239
Name: object_id, dtype: int64

From the first look, Org_invitation has the best conversation rate of 574 adopted users. Second is guest invite with 369. This app should target companies and organizations as well as friends referral.