In [1]:
import pickle
import patsy
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns

from copy import deepcopy
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report, make_scorer, fbeta_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

### Subset answer_time_series to include only top answerers (Avg 2 answers monthly)

In [34]:
with open('./data/processed/top_answerer_3year.pkl', 'rb') as picklefile:
    top_answerer_3year = pickle.load(picklefile)
with open('./data/processed/user_basic.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

### Segment time series into 18 months, ignoring numbers before user signup

In [39]:
top_answerer_3year.drop(['changes', 'change_history'], axis=1, inplace=True)
user_basic.set_index('id', inplace=True)
top_answerer_ts_creation = pd.merge(top_answerer_3year, user_basic[['creation_year', 'creation_month']], how='left', left_index=True, right_index=True)

In [46]:
# Split data into many rows with 12 months' record, use the last n months to determine if user has churned
def monthsSince2015(signup_year, signup_month):
    baseline_date = datetime(2015, 1, 1, 00, 00)
    return (signup_year-2015)*12 + (signup_month-baseline_date.month)
def build_timeseries(df_ts, mode='convertEndStatus'):
    df = deepcopy(df_ts)
    ts_colname = ['M', 'M+1', 'M+2', 'M+3',
             'M+4', 'M+5', 'M+6', 'M+7',
             'M+8', 'M+9', 'M+10', 'M+11',
             'M+12', 'M+13', 'M+14', 'M+15',
             'M+16', 'M+17']
    df['cutoff'] = df.apply(lambda x:monthsSince2015(x[36], x[37]), axis=1)
    long_list = df[df['cutoff']<=0].iloc[:,0:18]
    long_list.columns=ts_colname
    print ('round 0 - added', long_list.shape[0], 'rows')
    for i in range(1, df.shape[1] - 19):
        eligible_values = df[df['cutoff']<=i].iloc[:,i:i+18]
        eligible_values.columns=ts_colname
        long_list = pd.concat([long_list, eligible_values])
        print ('round', i, '- added', eligible_values.shape[0], 'rows')
    if (mode == 'convertEndStatus'):
        long_list['inactive'] = long_list.apply(lambda x:1 if np.sum(x[-6:])==0 else 0, axis=1)
        long_list.drop(['M+12', 'M+13', 'M+14', 'M+15', 'M+16', 'M+17'], inplace=True, axis=1)
    elif (mode == 'countLast4moAnswer'):
        long_list['last6Month'] = long_list.apply(lambda x:np.sum(x[-6:]), axis=1)
        long_list.drop(['M+12', 'M+13', 'M+14', 'M+15', 'M+16', 'M+17'], inplace=True, axis=1)
    return long_list

In [55]:
%time top_answerer = build_timeseries(top_answerer_ts_creation, 'convertEndStatus')

round 0 - added 13025 rows
round 1 - added 13300 rows
round 2 - added 13587 rows
round 3 - added 13920 rows
round 4 - added 14184 rows
round 5 - added 14491 rows
round 6 - added 14784 rows
round 7 - added 15094 rows
round 8 - added 15376 rows
round 9 - added 15681 rows
round 10 - added 15924 rows
round 11 - added 16192 rows
round 12 - added 16537 rows
round 13 - added 16866 rows
round 14 - added 17183 rows
round 15 - added 17511 rows
round 16 - added 17838 rows
round 17 - added 18139 rows
round 18 - added 18446 rows
round 19 - added 18446 rows
CPU times: user 59.8 s, sys: 464 ms, total: 1min
Wall time: 56.2 s


In [59]:
with open('./data/processed/top_answerer.pkl', 'wb') as picklefile:
    pickle.dump(top_answerer, picklefile)

### Read and format the user demographic data

In [66]:
with open('./data/processed/user_basic.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [67]:
user_basic.head()

Unnamed: 0,id,about_me_length,creation_year,creation_month,last_access_year,last_access_month,location,up_votes,down_votes,profile_image,website_url
0,16399,5,2008,9,2017,1,"Toronto, Canada",501,37,1,http://www.google.com/
1,15351,0,2008,9,2013,1,Austria,2,0,1,
2,47721,43,2008,12,2016,1,"Toronto, Canada",12,3,1,http://www.fuzzylizard.com
3,7984,0,2008,9,2018,1,"Laval, Canada",94,17,1,
4,16954,562,2008,9,2012,1,"Hastings, United Kingdom",66,3,1,http://dominicblackwell.com


In [155]:
user_basic['location'] = user_basic.location.apply(lambda x:x.split(', ')[-1])
website_count = pd.DataFrame(user_basic.website_url.value_counts())
website_count.reset_index(inplace=True)
personal_website = set(website_count[website_count['website_url']==1]['index'].values)
user_basic['personal_website'] = user_basic['website_url'].apply(lambda x:1 if x in personal_website else 0)
user_basic.set_index('id', inplace=True)
user_basic.drop(['last_access_year', 'last_access_month', 'website_url'], axis=1, inplace=True)

In [12]:
with open('./data/processed/user_basic_processed.pkl', 'wb') as picklefile:
    pickle.dump(user_basic, picklefile)

## Merge the user demographic data with answer time series

In [69]:
with open('./data/processed/top_answerer.pkl', 'rb') as picklefile:
    top_answerer = pickle.load(picklefile)
with open('./data/processed/user_basic_processed.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [70]:
user_basic.drop(['up_votes', 'down_votes'], axis=1, inplace=True)

In [71]:
top_answerer_basic = pd.merge(user_basic, top_answerer, how='right', left_index=True, right_index=True)

## Formalize X and Y

In [72]:
with open('./data/processed/location_dict.pkl', 'rb') as picklefile:
    location_dict = pickle.load(picklefile)

In [73]:
location_dict.values()

dict_values(['NDF', 'NDF', 'APAC', 'AM', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'APAC', 'AM', 'EMEA', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'APAC', 'APAC', 'APAC', 'APAC', 'AM', 'APAC', 'AM', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'APAC', 'AM', 'EMEA', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'AM', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'AM', 'AM', 'APAC', 'EMEA', 'APAC', 'EMEA', 'AM', '', 'AM', 'EMEA', 'AM', 'EMEA', 'EMEA', 'EMEA', '', 'AM', 'AM', 'AM', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'AM', 'AM', 'AM', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'AM', 'AM'])

In [75]:
y = top_answerer_basic['inactive']
X = top_answerer_basic.drop('inactive', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4444, stratify=y)

In [76]:
## Dummify location
with open('./data/processed/location_dict.pkl', 'rb') as picklefile:
    location_dict = pickle.load(picklefile)
    
def map_dummify_Locations(df, location_dict):
    df['location'] = df.location.apply(lambda x:location_dict[x] if x in set(location_dict.keys()) else 'Others')
    location_dummy = patsy.dmatrix('location', data=df, return_type='dataframe')
    df = pd.concat([df, location_dummy], axis = 1)
    df.drop('location', axis=1, inplace=True)
    new_colnames = [item.replace('[','-') for item in list(df.columns)]
    new_colnames = [item.replace(']','') for item in new_colnames]
    df.columns = new_colnames
    df.drop('Intercept', axis=1, inplace=True)
    return df

In [77]:
X_train = map_dummify_Locations(X_train, location_dict)
X_test = map_dummify_Locations(X_test, location_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
with open('./data/processed/user_ts_X_train.pkl', 'wb') as picklefile:
    pickle.dump(X_train, picklefile)

## Prepare data for training

In [80]:
# Normalize the data
ssX = StandardScaler()
X_train_norm = ssX.fit_transform(X_train)

In [87]:
X_train_resampled, y_train_resampled = RandomUnderSampler(random_state=4444).fit_sample(X_train_norm, y_train.ravel())

In [95]:
def gridSearchFiveModels(X, y):
    models = [
        ('knn', KNN),
        ('logistic', LogisticRegression),
        ('tree', DecisionTreeClassifier),
        ('forest', RandomForestClassifier),
        ('xgboost', XGBClassifier)
    ]

    param_choices = [
        {
            'n_neighbors': range(2,12)
        },
        {
            'C': np.logspace(-3,6,12),
            'penalty':['l1', 'l2']
        },
        {
            'max_depth': [2,3,4,5],
            'min_samples_leaf': [3,6,10]
        },
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [1,2,3,4,5],
            'min_samples_leaf': [3,6,10]
        },
        {
            'max_depth': [3,4,5],
            'n_estimators': [1, 50, 100,200],
            'objective':['binary:logistic']
        }
    ]

    grids = {}
    #ftwo_scorer = make_scorer(fbeta_score, average='binary', beta=1.41)
    
    for model_info, params in zip(models, param_choices):
        print ('Now Fitting', model_info, '\n')
        name, model = model_info
        grid = RandomizedSearchCV(model(), params, scoring='roc_auc', cv=5, n_jobs=-1)
        grid.fit(X, y)
        s = "{}: best score: {}".format(name, grid.best_score_)
        print(s)
        grids[name] = grid
    return grids

In [96]:
grid_clas = gridSearchFiveModels(X_train_resampled, y_train_resampled)

Now Fitting ('knn', <class 'sklearn.neighbors.classification.KNeighborsClassifier'>) 

knn: best score: 0.7794393876540429
Now Fitting ('logistic', <class 'sklearn.linear_model.logistic.LogisticRegression'>) 

logistic: best score: 0.7926119111453442
Now Fitting ('tree', <class 'sklearn.tree.tree.DecisionTreeClassifier'>) 

tree: best score: 0.8363590972722968
Now Fitting ('forest', <class 'sklearn.ensemble.forest.RandomForestClassifier'>) 

forest: best score: 0.8429240146827498
Now Fitting ('xgboost', <class 'xgboost.sklearn.XGBClassifier'>) 

xgboost: best score: 0.8522513970117119


In [369]:
print(classification_report(y_train_subsample, grid_SMOTE['tree'].best_estimator_.predict(X_train_subsample)))

             precision    recall  f1-score   support

          0       0.79      0.81      0.80      4883
          1       0.81      0.80      0.81      5117

avg / total       0.80      0.80      0.80     10000



In [372]:
print(classification_report(y_train, grid_SMOTE['tree'].best_estimator_.predict(X_train_norm)))

             precision    recall  f1-score   support

          0       0.93      0.81      0.87    107335
          1       0.39      0.67      0.49     19040

avg / total       0.85      0.79      0.81    126375



In [373]:
confusion_matrix(y_train, grid_SMOTE['tree'].best_estimator_.predict(X_train_norm))

array([[87266, 20069],
       [ 6227, 12813]])