In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime


In [2]:
#read in related csv files.

engagement = pd.read_csv('takehome_user_engagement.csv', encoding='utf-8')
users = pd.read_csv('takehome_users.csv', encoding='latin-1')
users['last_session_creation_time']=pd.to_datetime(users['last_session_creation_time'])

print(engagement.head())
print()
print(users.head())

            time_stamp  user_id  visited
0  2014-04-22 03:53:30        1        1
1  2013-11-15 03:45:04        2        1
2  2013-11-29 03:45:04        2        1
3  2013-12-09 03:45:04        2        1
4  2013-12-25 03:45:04        2        1

   object_id        creation_time               name  \
0          1  2014-04-22 03:53:30     Clausen August   
1          2  2013-11-15 03:45:04      Poole Matthew   
2          3  2013-03-19 23:14:52  Bottrill Mitchell   
3          4  2013-05-21 08:09:28    Clausen Nicklas   
4          5  2013-01-17 10:14:20          Raw Grace   

                        email creation_source    last_session_creation_time  \
0    AugustCClausen@yahoo.com    GUEST_INVITE 1970-01-01 00:00:01.398138810   
1      MatthewPoole@gustr.com      ORG_INVITE 1970-01-01 00:00:01.396237504   
2  MitchellBottrill@gustr.com      ORG_INVITE 1970-01-01 00:00:01.363734892   
3   NicklasSClausen@yahoo.com    GUEST_INVITE 1970-01-01 00:00:01.369210168   
4          GraceRaw@y

In [3]:
print('engagement users: ',engagement['user_id'].nunique())
print('users in users: ', users['object_id'].nunique())

engagement users:  8823
users in users:  12000


In [4]:
#organize time
engagement['time_stamp']=pd.to_datetime(engagement['time_stamp'])
engagement = engagement.sort_values('time_stamp')

print('first', engagement.time_stamp.min())
print('last', engagement.time_stamp.max())

first 2012-05-31 08:20:06
last 2014-06-06 14:58:50


In [5]:
user_id = engagement.user_id.unique()
adoption_list = []

for i in user_id:
    df = engagement[engagement.user_id == i].reset_index().sort_values(by='time_stamp')
    status = False
    if len(df) < 3:
        pass
    else: 
        for j in range(0, len(df)-2): 
            time_diff = df.time_stamp[j+2] - df.time_stamp[j]
            date1 = df.time_stamp[j].date()
            date2 = df.time_stamp[j+1].date()
            date3 = df.time_stamp[j+2].date()
            if (time_diff < pd.Timedelta('7 days')) & (date1 != date2) & (date2 != date3):
                status = True
            else:
                pass
    adoption_list.append(status)   

adoption_users = pd.DataFrame({'user_id': user_id, 'adopted': adoption_list})
adoption_users.head()

Unnamed: 0,adopted,user_id
0,False,10012
1,False,3428
2,False,9899
3,True,1693
4,False,6102


In [6]:
relax = users.merge(adoption_users, left_on='object_id', right_on='user_id', how='outer')
relax.adopted.fillna(False, inplace=True)
relax.drop(['user_id', 'object_id'], axis=1, inplace=True)
relax.head()

Unnamed: 0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.398138810,1,0,11,10803.0,False
1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,316.0,True
2,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1970-01-01 00:00:01.363734892,0,0,94,1525.0,False
3,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.369210168,0,0,1,5151.0,False
4,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.358849660,0,0,193,5240.0,False


In [7]:
#see types of invites
print('invite source:')
print(relax.creation_source.value_counts())
print()
print(relax.org_id.value_counts())
print()
print(relax.invited_by_user_id.value_counts())

invite source:
ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

0      319
1      233
2      201
3      168
4      159
6      138
5      128
9      124
7      119
10     104
8       97
14      87
11      75
12      75
17      74
18      73
13      72
16      72
20      68
15      64
24      63
25      62
28      61
23      60
30      59
22      58
33      57
40      57
21      56
27      55
      ... 
399     13
410     13
322     12
387     12
346     12
407     12
295     12
354     12
344     12
405     12
381     12
356     12
301     12
232     12
364     11
183     11
365     11
352     10
353     10
294     10
378     10
304     10
395      9
315      9
355      9
396      9
400      8
397      8
386      7
416      2
Name: org_id, Length: 417, dtype: int64

10741.0    13
2527.0     12
2308.0     11
1525.0     11
11770.0    11
10628.0    10
7012.0     10
461

In [8]:
#create dummy variables for invites
invite = pd.get_dummies(relax.creation_source)
relax = relax.join(invite)

#days since sign up
current_time = pd.to_datetime(relax.last_session_creation_time).max()
relax.creation_time = pd.to_datetime(relax.creation_time)
relax['days_since_signup'] = (relax.creation_time - current_time).apply(lambda x: x.days)



In [9]:
main_org = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10]
relax.org_id[relax.org_id.isin(main_org)] = 1
relax.org_id[relax.org_id != 1] = 0

relax.invited_by_user_id[relax.invited_by_user_id.isin(main_org)] = 1
relax.invited_by_user_id[relax.invited_by_user_id != 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
relax.head()

Unnamed: 0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH,days_since_signup
0,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.398138810,1,0,0,0.0,False,1,0,0,0,0,16182
1,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1970-01-01 00:00:01.396237504,0,0,1,0.0,True,0,1,0,0,0,16024
2,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1970-01-01 00:00:01.363734892,0,0,0,0.0,False,0,1,0,0,0,15783
3,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.369210168,0,0,1,0.0,False,1,0,0,0,0,15846
4,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1970-01-01 00:00:01.358849660,0,0,0,0.0,False,1,0,0,0,0,15722


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

#prep data for modeling
data = relax.drop(['creation_time', 'name', 'email', 'creation_source', 'last_session_creation_time'], axis=1)
X = data.drop(['adopted'], axis=1)
y = data.adopted

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# Normalize data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
#log regression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

print( 'R-squared: ', logreg.score(X_test, y_test))


R-squared:  0.872083333333


In [13]:
results = pd.DataFrame()
co_ef = pd.DataFrame(abs(logreg.coef_)).unstack().unstack()
results['Features'] = X.columns
results = pd.concat([results, co_ef], axis=1)
results.columns = ['Features', 'Coefficients']
results.sort_values('Coefficients', ascending=False)

Unnamed: 0,Features,Coefficients
2,org_id,0.251884
9,days_since_signup,0.24557
6,PERSONAL_PROJECTS,0.22228
3,invited_by_user_id,0.115983
4,GUEST_INVITE,0.107122
8,SIGNUP_GOOGLE_AUTH,0.09364
7,SIGNUP,0.035269
0,opted_in_to_mailing_list,0.014802
1,enabled_for_marketing_drip,0.004812
5,ORG_INVITE,0.001242


## Conculsion:

The three biggest predictors found from the model are the whether or not they joined because of a major organization, days since they signed up, and if the creation source was through PERSONAL_PROJECTS.  Having users join a major organization was important to staying connected to the platform.  Also noteworthy is that those invited by other users took third which reinforces the idea that there need to be a sense of community in order to grow the platform.  Users who signed up for personal projects were more likely to become adopted users, a marketing push to focus on this will help grow more users that will stay with the platform.  Lastly but not least, days since sign up was another important factor.  This tells us that there could be something in the past that the company was going that was better than the way things are running now.  Developers should go back and see if there were features of the platform removed or a point where maybe too many features existed to improve the platform.  