https://christophm.github.io/interpretable-ml-book/feature-importance.html

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
users=pd.read_csv('takehome_users.csv',encoding = "ISO-8859-1")
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
user_engagement=pd.read_csv('takehome_user_engagement.csv',encoding = "ISO-8859-1")
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
user_engagement.visited.value_counts()
#the visited column is of no use

1    207917
Name: visited, dtype: int64

In [5]:
#change time_stamp to type datetime
user_engagement['time_stamp']=pd.to_datetime(user_engagement['time_stamp'])

In [6]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [8]:
#we now work on adding to the users table a column that indicates whether the user is an adopted user

In [9]:

user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [10]:
#find which users were adopted
adopted=[]
#loop over each user
for u in users.object_id:
    
    keep=user_engagement[user_engagement['user_id']==u]
    keep=keep.set_index('time_stamp')
    keep=keep.sort_index()
    #do a 7 day rolling count of a user engagement.
    keep=keep.rolling(window='7d').visited.sum().max()
    if keep>=3:
        adopted.append(1)
    else:
        adopted.append(0)
 

In [11]:
users['adopted']=adopted

In [12]:
users.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0,12000.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145,0.1335
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968,0.340128
min,1.0,1338452000.0,0.0,0.0,0.0,3.0,0.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0,0.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0,0.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0,0.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0,1.0


In [13]:
#we see that 13.35 percent of users are adopted users.

In [14]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [19]:
users['creation_time']=pd.to_datetime(users['creation_time'])

#we change the creation time to how many weeks the user has been signed up.
#since there might later users might have a better chance of adoption (maybe the app is better now)

latestTime=users.creation_time.max()

col=users.apply(lambda x: pd.Timedelta(latestTime-x.creation_time).days/7,axis=1)
users['weeks_member']=col
users=users.drop('creation_time',axis=1)

In [21]:
users.head()

Unnamed: 0,object_id,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,weeks_member
0,1,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0,5.428571
1,2,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1,28.0
2,3,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0,62.428571
3,4,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0,53.428571
4,5,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0,71.142857


In [22]:
X=users.drop(['object_id','name','email','adopted','last_session_creation_time'],axis=1)
Y=users['adopted']
X.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,weeks_member
0,GUEST_INVITE,1,0,11,10803.0,5.428571
1,ORG_INVITE,0,0,1,316.0,28.0
2,ORG_INVITE,0,0,94,1525.0,62.428571
3,GUEST_INVITE,0,0,1,5151.0,53.428571
4,GUEST_INVITE,0,0,193,5240.0,71.142857


In [23]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 6 columns):
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
weeks_member                  12000 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 562.6+ KB


In [None]:
#we now have our features, however some of the features are categorical and require to be encoded.
#for simplicity, we go with integer encoding. org_id is already integer encoded
#technically, invited_by_user_id should also be one-hot encoded, however that would be problematic due to the number
#of features that would create. We thus transform this feature into invited_or_notInvited


In [31]:
def is_invited(x):
    if pd.isna(x.invited_by_user_id)==True:
        return 0
    else:
        return 1

X['invited_by_user_id']=X.apply(lambda y: is_invited(y),axis=1 )

In [34]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 6 columns):
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null int64
weeks_member                  12000 non-null float64
dtypes: float64(1), int64(4), object(1)
memory usage: 562.6+ KB


In [36]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
le.fit(X.creation_source)
X['creation_source']=le.transform(X['creation_source'])


758      31.285714
7660     49.285714
11489    76.857143
10761     7.428571
5415      7.285714
Name: weeks_member, dtype: float64

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train,X_test,y_train,y_test=train_test_split(X['weeks_member'],Y,test_size=0.3)

In [57]:
X_train.head()

2993      99.857143
7700     100.142857
8923      77.285714
8176      90.000000
10976     93.714286
Name: weeks_member, dtype: float64

In [59]:
model=RandomForestClassifier(n_estimators=1000)
model.fit(np.reshape(X_train.values,(len(X_train),1)),y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
DF=pd.DataFrame()
DF['FeatureName']=X.columns
DF['importance']=model.feature_importances_

In [None]:
DF.sort_values('importance',ascending=False)

In [61]:
from sklearn.metrics import confusion_matrix,f1_score

#print(confusion_matrix(y_test,model.predict(X_test)))
print(f1_score(y_test,model.predict(np.reshape(X_test.values,(len(X_test),1)))))


#the model is doing very poorly.

0.02008032128514056


In [45]:
print(confusion_matrix(y_train,model.predict(X_train)))
print(f1_score(y_train,model.predict(X_train)))

[[7268    0]
 [   4 1128]]
0.9982300884955753


In [None]:
#keeping it simple, for each feature