In [12]:
import pickle
import numpy as np
import pandas as pd

import datetime
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split,GridSearchCV

In [141]:
with open('./data/processed/answer_time_series_2years.pkl', 'rb') as picklefile:
    answer_time_series_2years = pickle.load(picklefile)

In [142]:
with open('./data/processed/id_signup.pkl', 'rb') as picklefile:
    id_signup = pickle.load(picklefile)

In [143]:
# Join user answer activity time series with signup date, so only activities after sign up is counted
answer_time_series_2years.set_index('id', inplace=True)
id_signup.set_index('id', inplace=True)
answer_time_series_2years = answer_time_series_2years.join(id_signup, how='left')
answer_time_series_2years['signup_month'] = pd.to_datetime(answer_time_series_2years['signup_month'], format='%Y%m')

In [7]:
# Split data into many rows with 12 months' record, use the last n months to determine if user has churned
def monthsSince2016(signup_month):
    baseline_date = datetime.datetime(2016, 1, 1, 00, 00)
    return (signup_month.year-baseline_date.year)*12 + (signup_month.month-baseline_date.month)
def build_timeseries(df):
    ts_colname = ['M', 'M+1', 'M+2', 'M+3',
             'M+4', 'M+5', 'M+6', 'M+7',
             'M+8', 'M+9', 'M+10', 'M+11']
    df['cutoff'] = df.apply(lambda x:monthsSince2016(x[24]), axis=1)
    long_list = df[df['cutoff']<=0].iloc[:,0:12]
    long_list.columns=ts_colname
    print ('round 0 - added', long_list.shape[0], 'rows')
    for i in range(1, df.shape[1] - 14):
        df['cutoff'] = df.apply(lambda x:monthsSince2016(x[24])+i, axis=1)
        eligible_values = df[df['cutoff']<=0].iloc[:,i:i+12]
        eligible_values.columns=ts_colname
        long_list = pd.concat([long_list, eligible_values])
        print ('round', i, '- added', eligible_values.shape[0], 'rows')
    long_list['Active'] = long_list.apply(lambda x:1 if np.sum(x[-4:])>0 else 0, axis=1)
    long_list.drop(['M+8', 'M+9', 'M+10', 'M+11'], inplace=True, axis=1)
    return long_list

In [145]:
%time answer_ts = build_timeseries(answer_time_series_2years)

round 0 - added 540804 rows
round 1 - added 520335 rows
round 2 - added 508354 rows
round 3 - added 497304 rows
round 4 - added 486162 rows
round 5 - added 475787 rows
round 6 - added 465894 rows
round 7 - added 455600 rows
round 8 - added 445698 rows
round 9 - added 436368 rows
round 10 - added 426356 rows
round 11 - added 416322 rows
CPU times: user 3min 28s, sys: 4.84 s, total: 3min 33s
Wall time: 3min 11s


In [149]:
answer_ts['Active'] = answer_ts.apply(lambda x:1 if np.sum(x[-4:])>0 else 0, axis=1)

In [155]:
answer_ts.drop(['M+8', 'M+9', 'M+10', 'M+11'], inplace=True, axis=1)

In [157]:
with open('./data/processed/answer_ts.pkl', 'wb') as picklefile:
    pickle.dump(answer_ts, picklefile)

### Read the user demographic data

In [3]:
with open('./data/processed/answer_ts.pkl', 'rb') as picklefile:
    answer_ts = pickle.load(picklefile)
with open('./data/processed/user_basic.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [4]:
user_basic.head()

Unnamed: 0,id,about_me_length,creation_year,creation_month,last_access_year,last_access_month,location,up_votes,down_votes,profile_image,website_url
0,16399,5,2008,9,2017,1,"Toronto, Canada",501,37,1,http://www.google.com/
1,15351,0,2008,9,2013,1,Austria,2,0,1,
2,47721,43,2008,12,2016,1,"Toronto, Canada",12,3,1,http://www.fuzzylizard.com
3,7984,0,2008,9,2018,1,"Laval, Canada",94,17,1,
4,16954,562,2008,9,2012,1,"Hastings, United Kingdom",66,3,1,http://dominicblackwell.com


In [6]:
user_basic['location'] = user_basic.location.apply(lambda x:x.split(', ')[-1])
website_count = pd.DataFrame(user_basic.website_url.value_counts())
website_count.reset_index(inplace=True)
personal_website = set(website_count[website_count['website_url']==1]['index'].values)
user_basic['personal_website'] = user_basic['website_url'].apply(lambda x:1 if x in personal_website else 0)
user_basic.set_index('id', inplace=True)
user_basic.drop(['last_access_year', 'last_access_month', 'website_url'], axis=1, inplace=True)

In [11]:
user_basic.head()

Unnamed: 0_level_0,about_me_length,creation_year,creation_month,location,up_votes,down_votes,profile_image,personal_website
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16399,5,2008,9,Canada,501,37,1,0
15351,0,2008,9,Austria,2,0,1,0
47721,43,2008,12,Canada,12,3,1,1
7984,0,2008,9,Canada,94,17,1,0
16954,562,2008,9,United Kingdom,66,3,1,1


## Formalize X and Y

In [16]:

answer_ts.head()

Unnamed: 0_level_0,M,M+1,M+2,M+3,M+4,M+5,M+6,M+7,Active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0
13,0,25,19,9,7,21,13,4,1
22,0,0,0,0,0,0,0,0,0


In [15]:
answer_ts.head()

Unnamed: 0_level_0,M,M+1,M+2,M+3,M+4,M+5,M+6,M+7,Active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0
13,0,25,19,9,7,21,13,4,1
22,0,0,0,0,0,0,0,0,0


In [13]:
y = topans['Active']
X = topans.drop('Active', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4444, stratify=y)

NameError: name 'topans' is not defined