In [1]:
import pickle
import patsy
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns

from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report, make_scorer, fbeta_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

### Subset answer_time_series to include only top answerers (Avg 2 answers monthly)

In [5]:
with open('./data/processed/answer_time_series_2years2.pkl', 'rb') as picklefile:
    answer_ts = pickle.load(picklefile)
with open('./data/processed/user_basic.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [7]:
print (answer_ts.shape)
answer_ts.head()

(869963, 24)


Unnamed: 0_level_0,m_201601,m_201602,m_201603,m_201604,m_201605,m_201606,m_201607,m_201608,m_201609,m_201610,...,m_201703,m_201704,m_201705,m_201706,m_201707,m_201708,m_201709,m_201710,m_201711,m_201712
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
13,0,25,19,9,7,21,13,4,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,6,2,0


In [8]:
def setMinAnswer(rowdata):
    if (int(rowdata[2]) < 2016):
        return 48
    else:
        return (2018-int(rowdata[2]))*24 + (12-int(rowdata[3]))*2

In [9]:
user_basic['min_answer'] = user_basic.apply(lambda row:setMinAnswer(row), axis=1)
user_basic.set_index('id', inplace=True)
top_answerer_ts = pd.merge(answer_ts, user_basic['min_answer'].to_frame(), how='left', left_index=True, right_index=True)
top_answerer_ts = top_answerer_ts[top_answerer_ts.iloc[:,:-1].sum(axis=1) > top_answerer_ts.min_answer]
top_answerer_ts.drop('min_answer', inplace=True, axis=1)
print (top_answerer_ts.shape)

(17434, 24)


## SO members who posts more than 2 questions a month are 17434 out of 869963 <span style = "color:red">Ratio is 2%</span>

### Segment time series into 12 months, ignoring numbers before user signup

In [10]:
user_basic.head()

Unnamed: 0_level_0,about_me_length,creation_year,creation_month,last_access_year,last_access_month,location,up_votes,down_votes,profile_image,website_url,min_answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
16399,5,2008,9,2017,1,"Toronto, Canada",501,37,1,http://www.google.com/,48
15351,0,2008,9,2013,1,Austria,2,0,1,,48
47721,43,2008,12,2016,1,"Toronto, Canada",12,3,1,http://www.fuzzylizard.com,48
7984,0,2008,9,2018,1,"Laval, Canada",94,17,1,,48
16954,562,2008,9,2012,1,"Hastings, United Kingdom",66,3,1,http://dominicblackwell.com,48


In [11]:
top_answerer_ts_creation = pd.merge(top_answerer_ts, user_basic[['creation_year', 'creation_month']], how='left', left_index=True, right_index=True)

In [27]:
# Split data into many rows with 12 months' record, use the last n months to determine if user has churned
def monthsSince2016(signup_year, signup_month):
    baseline_date = datetime(2016, 1, 1, 00, 00)
    return (signup_year-2016)*12 + (signup_month-baseline_date.month)
def build_timeseries(df, mode='convertEndStatus'):
    ts_colname = ['M', 'M+1', 'M+2', 'M+3',
             'M+4', 'M+5', 'M+6', 'M+7',
             'M+8', 'M+9', 'M+10', 'M+11']
    df['cutoff'] = df.apply(lambda x:monthsSince2016(x[24], x[25]), axis=1)
    long_list = df[df['cutoff']<=0].iloc[:,0:12]
    long_list.columns=ts_colname
    print ('round 0 - added', long_list.shape[0], 'rows')
    for i in range(1, df.shape[1] - 14):
        df['cutoff'] = df.apply(lambda x:monthsSince2016(x[24], x[25])+i, axis=1)
        eligible_values = df[df['cutoff']<=0].iloc[:,i:i+12]
        eligible_values.columns=ts_colname
        long_list = pd.concat([long_list, eligible_values])
        print ('round', i, '- added', eligible_values.shape[0], 'rows')
    if (mode == 'convertEndStatus'):
        long_list['Inactive'] = long_list.apply(lambda x:1 if np.sum(x[-4:])==0 else 0, axis=1)
        long_list.drop(['M+8', 'M+9', 'M+10', 'M+11'], inplace=True, axis=1)
    elif (mode == 'countLast4moAnswer'):
        long_list['last4Month'] = long_list.apply(lambda x:np.sum(x[-4:]), axis=1)
        long_list.drop(['M+8', 'M+9', 'M+10', 'M+11'], inplace=True, axis=1)
    return long_list

In [None]:
%time top_answerer = build_timeseries(top_answerer_ts_creation, 'convertEndStatus')

In [152]:
with open('./data/processed/top_answer.pkl', 'wb') as picklefile:
    pickle.dump(top_answerer, picklefile)

In [16]:
top_answerer_ts_creation.shape

(17434, 27)

In [31]:
%time top_answerer_12months = build_timeseries(top_answerer_ts_creation, mode='countLast4moAnswer')

round 0 - added 14444 rows
round 1 - added 14229 rows
round 2 - added 13987 rows
round 3 - added 13772 rows
round 4 - added 13526 rows
round 5 - added 13295 rows
round 6 - added 13046 rows
round 7 - added 12803 rows
round 8 - added 12566 rows
round 9 - added 12354 rows
round 10 - added 12097 rows
round 11 - added 11850 rows
round 12 - added 11644 rows
CPU times: user 35.4 s, sys: 24 ms, total: 35.4 s
Wall time: 35.3 s


In [35]:
answer_ts.shape

(869963, 24)

In [33]:
top_answerer_12months.shape

(169613, 9)

### Read and format the user demographic data

In [153]:
with open('./data/processed/user_basic.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [154]:
user_basic.head()

Unnamed: 0,id,about_me_length,creation_year,creation_month,last_access_year,last_access_month,location,up_votes,down_votes,profile_image,website_url
0,16399,5,2008,9,2017,1,"Toronto, Canada",501,37,1,http://www.google.com/
1,15351,0,2008,9,2013,1,Austria,2,0,1,
2,47721,43,2008,12,2016,1,"Toronto, Canada",12,3,1,http://www.fuzzylizard.com
3,7984,0,2008,9,2018,1,"Laval, Canada",94,17,1,
4,16954,562,2008,9,2012,1,"Hastings, United Kingdom",66,3,1,http://dominicblackwell.com


In [155]:
user_basic['location'] = user_basic.location.apply(lambda x:x.split(', ')[-1])
website_count = pd.DataFrame(user_basic.website_url.value_counts())
website_count.reset_index(inplace=True)
personal_website = set(website_count[website_count['website_url']==1]['index'].values)
user_basic['personal_website'] = user_basic['website_url'].apply(lambda x:1 if x in personal_website else 0)
user_basic.set_index('id', inplace=True)
user_basic.drop(['last_access_year', 'last_access_month', 'website_url'], axis=1, inplace=True)

In [12]:
with open('./data/processed/user_basic_processed.pkl', 'wb') as picklefile:
    pickle.dump(user_basic, picklefile)

## Merge the user demographic data with answer time series

In [2]:
with open('./data/processed/top_answer.pkl', 'rb') as picklefile:
    top_answerer = pickle.load(picklefile)
with open('./data/processed/user_basic_processed.pkl', 'rb') as picklefile:
    user_basic = pickle.load(picklefile)

In [3]:
user_basic.drop(['up_votes', 'down_votes'], axis=1, inplace=True)

In [4]:
top_answerer_basic = pd.merge(user_basic, top_answerer, how='right', left_index=True, right_index=True)

## Formalize X and Y

In [5]:
with open('./data/processed/location_dict.pkl', 'rb') as picklefile:
    location_dict = pickle.load(picklefile)

In [6]:
location_dict.values()

dict_values(['NDF', 'NDF', 'APAC', 'AM', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'APAC', 'AM', 'EMEA', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'APAC', 'APAC', 'APAC', 'APAC', 'AM', 'APAC', 'AM', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'APAC', 'AM', 'EMEA', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'AM', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'APAC', 'AM', 'AM', 'APAC', 'EMEA', 'APAC', 'EMEA', 'AM', '', 'AM', 'EMEA', 'AM', 'EMEA', 'EMEA', 'EMEA', '', 'AM', 'AM', 'AM', 'EMEA', 'APAC', 'EMEA', 'EMEA', 'AM', 'EMEA', 'EMEA', 'AM', 'AM', 'AM', 'AM', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'AM', 'AM', 'AM'])

In [7]:
y = top_answerer_basic['Inactive']
X = top_answerer_basic.drop('Inactive', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4444, stratify=y)

In [8]:
## Dummify location
with open('./data/processed/location_dict.pkl', 'rb') as picklefile:
    location_dict = pickle.load(picklefile)
    
def map_dummify_Locations(df, location_dict):
    df['location'] = df.location.apply(lambda x:location_dict[x] if x in set(location_dict.keys()) else 'Others')
    location_dummy = patsy.dmatrix('location', data=df, return_type='dataframe')
    df = pd.concat([df, location_dummy], axis = 1)
    df.drop('location', axis=1, inplace=True)
    new_colnames = [item.replace('[','-') for item in list(df.columns)]
    new_colnames = [item.replace(']','') for item in new_colnames]
    df.columns = new_colnames
    df.drop('Intercept', axis=1, inplace=True)
    return df

In [9]:
X_train = map_dummify_Locations(X_train, location_dict)
X_test = map_dummify_Locations(X_test, location_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
with open('./data/processed/user_ts_X_train.pkl', 'wb') as picklefile:
    pickle.dump(X_train, picklefile)

## Prepare data for training

In [171]:
# Normalize the data
ssX = StandardScaler()
X_train_norm = ssX.fit_transform(X_train)

In [363]:
def gridSearchFiveModels(X, y):
    models = [
        ('knn', KNN),
        ('logistic', LogisticRegression),
        ('tree', DecisionTreeClassifier),
        ('forest', RandomForestClassifier),
        ('xgboost', XGBClassifier)
    ]

    param_choices = [
        {
            'n_neighbors': range(2,12)
        },
        {
            'C': np.logspace(-3,6,12),
            'penalty':['l1', 'l2']
        },
        {
            'max_depth': [2,3,4,5],
            'min_samples_leaf': [3,6,10]
        },
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [1,2,3,4,5],
            'min_samples_leaf': [3,6,10]
        },
        {
            'max_depth': [3,4,5],
            'n_estimators': [1, 50, 100,200],
            'objective':['binary:logistic']
        }
    ]

    grids = {}
    #ftwo_scorer = make_scorer(fbeta_score, average='binary', beta=1.41)
    
    for model_info, params in zip(models, param_choices):
        print ('Now Fitting', model_info, '\n')
        name, model = model_info
        grid = GridSearchCV(model(), params, scoring='accuracy', cv=5, n_jobs=-1)
        grid.fit(X, y)
        s = "{}: best score: {}".format(name, grid.best_score_)
        print(s)
        grids[name] = grid
    return grids

In [364]:
def oversampleSubsampleGV(df_x, df_y, oversampler):
    X_train_resampled, y_train_resampled = oversampler.fit_sample(df_x, df_y)
    subsample = [np.random.choice(X_train_resampled.shape[0], 10000, replace=False)]
    X_train_subsample = X_train_resampled[subsample]
    y_train_subsample = y_train_resampled[subsample]
    return gridSearchFiveModels(X_train_subsample, y_train_subsample)

In [365]:
grid_SMOTE = oversampleSubsampleGV(X_train_norm, y_train, SMOTE(random_state=444))

Now Fitting ('knn', <class 'sklearn.neighbors.classification.KNeighborsClassifier'>) 

knn: best score: 0.7009
Now Fitting ('logistic', <class 'sklearn.linear_model.logistic.LogisticRegression'>) 

logistic: best score: 0.7284
Now Fitting ('tree', <class 'sklearn.tree.tree.DecisionTreeClassifier'>) 

tree: best score: 0.8215
Now Fitting ('forest', <class 'sklearn.ensemble.forest.RandomForestClassifier'>) 

forest: best score: 0.8266
Now Fitting ('xgboost', <class 'xgboost.sklearn.XGBClassifier'>) 



  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:


xgboost: best score: 0.8905


In [369]:
print(classification_report(y_train_subsample, grid_SMOTE['tree'].best_estimator_.predict(X_train_subsample)))

             precision    recall  f1-score   support

          0       0.79      0.81      0.80      4883
          1       0.81      0.80      0.81      5117

avg / total       0.80      0.80      0.80     10000



In [372]:
print(classification_report(y_train, grid_SMOTE['tree'].best_estimator_.predict(X_train_norm)))

             precision    recall  f1-score   support

          0       0.93      0.81      0.87    107335
          1       0.39      0.67      0.49     19040

avg / total       0.85      0.79      0.81    126375



In [373]:
confusion_matrix(y_train, grid_SMOTE['tree'].best_estimator_.predict(X_train_norm))

array([[87266, 20069],
       [ 6227, 12813]])

## Check OverSampler

In [355]:
grid_OverSampler = oversampleSubsampleGV(X_train_norm, y_train, RandomOverSampler(random_state=444))

Now Fitting ('knn', <class 'sklearn.neighbors.classification.KNeighborsClassifier'>) 

knn: best score: 0.7326493638673417
Now Fitting ('logistic', <class 'sklearn.linear_model.logistic.LogisticRegression'>) 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


logistic: best score: 0.7978230897577827
Now Fitting ('tree', <class 'sklearn.tree.tree.DecisionTreeClassifier'>) 

tree: best score: 0.7878882805073041
Now Fitting ('forest', <class 'sklearn.ensemble.forest.RandomForestClassifier'>) 

forest: best score: 0.7864924999426376
Now Fitting ('xgboost', <class 'xgboost.sklearn.XGBClassifier'>) 



  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:


xgboost: best score: 0.8015896195611256


In [356]:
print(classification_report(y_train_subsample, grid_SMOTE['logistic'].best_estimator_.predict(X_train_subsample)))

             precision    recall  f1-score   support

          0       0.75      0.59      0.66      4883
          1       0.68      0.82      0.74      5117

avg / total       0.71      0.71      0.70     10000



In [357]:
confusion_matrix(y_train, grid_OverSampler['logistic'].best_estimator_.predict(X_train_norm))

array([[47582, 59753],
       [ 1504, 17536]])

In [358]:
print(classification_report(y_train, grid_OverSampler['logistic'].best_estimator_.predict(X_train_norm)))

             precision    recall  f1-score   support

          0       0.97      0.44      0.61    107335
          1       0.23      0.92      0.36     19040

avg / total       0.86      0.52      0.57    126375



### Check ADASYN

In [359]:
grid_OverSampler = oversampleSubsampleGV(X_train_norm, y_train, ADASYN(random_state=4444))

Now Fitting ('knn', <class 'sklearn.neighbors.classification.KNeighborsClassifier'>) 

knn: best score: 0.7270766990761682
Now Fitting ('logistic', <class 'sklearn.linear_model.logistic.LogisticRegression'>) 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


logistic: best score: 0.7783184123982735
Now Fitting ('tree', <class 'sklearn.tree.tree.DecisionTreeClassifier'>) 

tree: best score: 0.8287347702945849
Now Fitting ('forest', <class 'sklearn.ensemble.forest.RandomForestClassifier'>) 

forest: best score: 0.8296440563903343
Now Fitting ('xgboost', <class 'xgboost.sklearn.XGBClassifier'>) 



  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:


xgboost: best score: 0.896752625575368


In [360]:
print(classification_report(y_train_subsample, grid_ADASYN['logistic'].best_estimator_.predict(X_train_subsample)))

             precision    recall  f1-score   support

          0       0.80      0.46      0.58      4883
          1       0.63      0.89      0.74      5117

avg / total       0.72      0.68      0.66     10000



In [361]:
confusion_matrix(y_train, grid_ADASYN['logistic'].best_estimator_.predict(X_train_norm))

array([[49766, 57569],
       [ 1832, 17208]])

In [362]:
print(classification_report(y_train, grid_ADASYN['logistic'].best_estimator_.predict(X_train_norm)))

             precision    recall  f1-score   support

          0       0.96      0.46      0.63    107335
          1       0.23      0.90      0.37     19040

avg / total       0.85      0.53      0.59    126375



## Try SVM on the data

In [375]:
svc = SVC(kernel='rbf') # RBF is default
svc.fit(X_train_subsample, y_train_subsample)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [376]:
confusion_matrix(y_train, svc.predict(X_train_norm))

array([[61290, 46045],
       [ 2163, 16877]])

In [378]:
svc_linear = SVC(kernel='linear') # RBF is default
svc_linear.fit(X_train_subsample, y_train_subsample)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [380]:
confusion_matrix(y_train, svc_linear.predict(X_train_norm))

array([[45667, 61668],
       [ 1118, 17922]])

In [381]:
param_grid = {'degree': np.linspace(2,8,7), 'C': np.linspace(0.1,100,101)}
svc_poly = SVC(kernel='poly')
grid = GridSearchCV(svc_poly, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_subsample, y_train_subsample)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'degree': array([2., 3., 4., 5., 6., 7., 8.]), 'C': array([  0.1  ,   1.099, ...,  99.001, 100.   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [382]:
confusion_matrix(y_train, grid.predict(X_train_norm))

array([[69493, 37842],
       [ 3400, 15640]])

In [383]:
accuracy_score(y_train, grid.predict(X_train_norm))

0.6736538081107813