In [None]:
# Author: Jin Zhang
# Use Python 2 
from __future__ import print_function, division

In [None]:
!python --version

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)
plt.rcParams.update({'font.size': 22})


In [None]:
from sklearn import pipeline, preprocessing

from sklearn.model_selection import train_test_split, cross_val_predict,cross_val_score, StratifiedShuffleSplit,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, \
                            accuracy_score, f1_score, roc_auc_score, roc_curve, \
                             precision_recall_curve,log_loss, confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from enchant.checker import SpellChecker
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest


In [None]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
lme4=importr('lme4')


# Import and Clean Data

In [None]:
# Load the modelling and out-of-sample extracts; the same four date columns are parsed in each.
df_use, df_outsample = (
    pd.read_csv(csv_path, parse_dates=['date_posted', 'date_completed',
                                       'date_thank_you_packet_mailed', 'date_expiration'])
    for csv_path in ('./input/df_use.csv', './input/df_outsample.csv')
)

In [None]:
# Give the two expense columns their modelling names.
df_use = df_use.rename(columns={'item_total_expense': 'most_exp_item_cost',
                                'item_expense': 'project_expense'})

# Rows without a most-expensive-item cost fall back to the project expense.
df_use['most_exp_item_cost'] = df_use['most_exp_item_cost'].fillna(df_use['project_expense'])

In [None]:
# Give the two expense columns their modelling names.
df_outsample = df_outsample.rename(columns={'item_total_expense': 'most_exp_item_cost',
                                            'item_expense': 'project_expense'})

# Rows without a most-expensive-item cost fall back to the project expense.
df_outsample['most_exp_item_cost'] = \
    df_outsample['most_exp_item_cost'].fillna(df_outsample['project_expense'])

In [None]:
# Keep only rows that are not 'live' and that have a most-expensive-item cost.
df_outsample = df_outsample[(df_outsample.funding_status != 'live')
                            & df_outsample.most_exp_item_cost.notnull()]

In [None]:
df_outsample.info()

In [None]:
# Separate the 'funding_success' target from the feature columns in both frames.
X_all = df_use.drop('funding_success', axis=1)
y_all = df_use['funding_success']
X_outsample = df_outsample.drop('funding_success', axis=1)
y_outsample = df_outsample['funding_success']


In [None]:
# Stratified shuffle split with test_size=0.05: the small part becomes the holdout set.
rs0 = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=100)

# n_splits=1, so the splitter yields exactly one (modelling, holdout) index pair.
modeling_index, holdout_index = next(rs0.split(X_all, y_all))
X, y = X_all.iloc[modeling_index, :], y_all.iloc[modeling_index]
X_holdout, y_holdout = X_all.iloc[holdout_index, :], y_all.iloc[holdout_index]

In [None]:
# Stratified shuffle split of the modelling set with test_size=0.3.
rs = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=102)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(rs.split(X, y))
X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]

# define pipeline classes and helper functions

In [None]:
# merge dictionary helper function

def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid with those of ``y``.

    Neither input is modified. On duplicate keys the value from ``y`` wins.
    The copy is shallow, so the values themselves are shared with the inputs.
    """
    merged = x.copy()
    merged.update(y)
    return merged

In [None]:
class Column_Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested columns of its input."""

    def __init__(self, selected_columns, **kwargs):
        # Extra keyword arguments are stored but not otherwise used by this class.
        self.selected_columns = selected_columns
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.selected_columns``."""
        columns = self.selected_columns
        return X[columns]
    

In [None]:
class Get_Location_Input(BaseEstimator, TransformerMixin):
    """Turn school coordinates into one 'location' column of KNN class predictions.

    NOTE(review): the classifier uses weights='distance', so rows that were part of
    the fit data sit at distance zero from themselves; transforming those same rows
    may largely echo their own labels. Confirm this is intended.
    """

    def __init__(self, **kwargs):
        self.model = None  # fitted selector + KNN pipeline, set by fit()
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Fit a latitude/longitude selector followed by a 184-neighbour classifier."""
        coordinate_selector = Column_Selector(
            selected_columns=['school_latitude', 'school_longitude'])
        neighbours = KNeighborsClassifier(n_neighbors=184, weights='distance')
        kn1 = pipeline.Pipeline([('test', coordinate_selector), ('kn', neighbours)])
        kn1.fit(X, y)
        # Stored only after a successful fit.
        self.model = kn1
        return self

    def transform(self, X):
        """Return a one-column frame ('location') of predicted classes, indexed like ``X``."""
        predicted = pd.Series(self.model.predict(X), index=X.index, name='location')
        return pd.DataFrame(predicted)

    def _get_param_names(self):
        # Called elsewhere in this notebook (item[1]._get_param_names()) to obtain
        # the output column names of each feature-union member.
        # NOTE(review): this shadows a method of the same name on BaseEstimator -
        # confirm the installed scikit-learn tolerates the override.
        return ['location']

In [None]:
# Tune n_neighbors for the k-nearest-neighbours location classifier.
param_kn1 = {'kn__n_neighbors': range(180, 189, 2)}

steps = [
    ('test', Column_Selector(selected_columns=['school_latitude', 'school_longitude'])),
    ('kn', KNeighborsClassifier(weights='distance')),
]
kn1 = pipeline.Pipeline(steps)

grid_kn1 = GridSearchCV(kn1, param_grid=param_kn1, scoring='roc_auc', n_jobs=-1)
grid_kn1.fit(X_train, y_train)

# Show the winning setting together with its score.
(grid_kn1.best_params_, grid_kn1.best_score_)


In [None]:
class Get_Raw_Features(BaseEstimator, TransformerMixin):
    """Stateless transformer producing the boolean and numeric base features.

    Output columns, in order: the ten 't'/'f' flag columns mapped to 1/0, then
    the seven numeric columns (two of which are derived in ``transform``).
    """

    # Flag columns stored as 't' / 'f' strings in the input.
    _BOOLEAN_COLUMNS = ['school_charter', 'school_magnet', 'school_year_round',
                        'school_nlns', 'school_kipp', 'school_charter_ready_promise',
                        'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
                        'eligible_double_your_impact_match', 'eligible_almost_home_match']
    # Numeric columns; 'has_secondary_focus' and 'project_duration' are derived.
    _NUMERIC_COLUMNS = ['has_secondary_focus', 'students_reached', 'posted_month',
                        'most_exp_item_cost', 'project_expense',
                        'total_price_excluding_optional_support', 'project_duration']
    _FLAG_VALUES = {'t': 1, 'f': 0}

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return the flag and numeric feature columns for ``X``.

        Adds ``project_duration`` (days between 'date_posted' and
        'date_expiration') and ``has_secondary_focus`` (1 when
        'secondary_focus_subject' is present, 0 when it is missing).
        Raises KeyError if a flag column holds anything other than 't' or 'f'.
        """
        flag_values = self._FLAG_VALUES
        X = X.assign(
            project_duration=(X['date_expiration'] - X['date_posted']) / np.timedelta64(1, 'D'),
            # Fixed: the original used isnull(), which set the flag to 1 exactly
            # when there was NO secondary focus subject.
            has_secondary_focus=X['secondary_focus_subject'].notnull().astype(int),
        )

        X_boolean = X[self._BOOLEAN_COLUMNS].applymap(lambda flag: flag_values[flag])
        X_numeric = X[self._NUMERIC_COLUMNS]

        return pd.concat([X_boolean, X_numeric], axis=1)

    def _get_param_names(self):
        # Called elsewhere in this notebook (item[1]._get_param_names()) to obtain
        # the output column names, in the same order transform() emits them.
        # NOTE(review): this shadows a method of the same name on BaseEstimator -
        # confirm the installed scikit-learn tolerates the override.
        return list(self._BOOLEAN_COLUMNS) + list(self._NUMERIC_COLUMNS)

In [None]:
class Get_Category_Mean(BaseEstimator, TransformerMixin):
    """Replace each categorical column by the mean 'funding_success' of its level.

    ``fit`` learns one level -> mean table per categorical column; ``transform``
    left-joins those tables onto the input and returns only the ``*_mean``
    columns. Levels not seen during ``fit`` come out as NaN (left join).
    """

    _CATEGORY_COLUMNS = ['teacher_prefix', 'primary_focus_subject', 'primary_focus_area',
                         'resource_type', 'poverty_level', 'grade_level']

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        # {column: {row: level}, column + '_mean': {row: mean}} for every category column.
        self.stored_result = {}

    @staticmethod
    def _with_filled_categories(X):
        """Return a copy of ``X`` with missing teacher_prefix / grade_level set to 'N/A'.

        The original code assigned the filled columns back onto ``X`` itself,
        silently modifying the caller's frame; ``assign`` leaves it untouched.
        """
        return X.assign(teacher_prefix=X['teacher_prefix'].fillna('N/A'),
                        grade_level=X['grade_level'].fillna('N/A'))

    def fit(self, X, y=None):
        """Learn the per-level target means.

        ``y`` is concatenated column-wise with ``X`` and then read back as the
        column 'funding_success', so it must be a Series with that name whose
        index matches ``X``.
        """
        df = pd.concat([self._with_filled_categories(X), y], axis=1)

        lookup = {}
        for var in self._CATEGORY_COLUMNS:
            level_means = (df.groupby(var, as_index=False)[['funding_success']].mean()
                             .rename(columns={'funding_success': var + '_mean'}))
            lookup.update(level_means.to_dict())

        self.stored_result = lookup
        return self

    def transform(self, X):
        """Return a frame of the six ``*_mean`` columns, one row per row of ``X``.

        The successive merges reset the index, so the result is positionally
        aligned with ``X`` rather than carrying ``X.index``.
        """
        X_merge = self._with_filled_categories(X)

        for var in self._CATEGORY_COLUMNS:
            mean_column = var + '_mean'
            level_table = pd.DataFrame.from_dict({var: self.stored_result[var],
                                                  mean_column: self.stored_result[mean_column]})
            X_merge = X_merge.merge(level_table, how='left', on=var)

        return X_merge.filter(regex='_mean')

    def _get_param_names(self):
        # Called elsewhere in this notebook (item[1]._get_param_names()) to obtain
        # the output column names.
        # NOTE(review): this shadows a method of the same name on BaseEstimator -
        # confirm the installed scikit-learn tolerates the override.
        return [var + '_mean' for var in self._CATEGORY_COLUMNS]

In [None]:
class Get_Category_ShrinkMean(BaseEstimator, TransformerMixin):
    """Replace each categorical column by a per-level intercept fitted in R.

    For every categorical column ``fit`` calls lme4's ``lmer(ry ~ 1 + (1|fid))``
    through rpy2 and stores ``ranef + fixef`` for each level; ``transform``
    left-joins those tables onto the input and returns only the
    ``*_shrink_mean`` columns. Levels not seen during ``fit`` come out as NaN.
    """

    _CATEGORY_COLUMNS = ['teacher_prefix', 'primary_focus_subject', 'primary_focus_area',
                         'resource_type', 'poverty_level', 'grade_level']

    # R helper: random-intercept model on one factor, returning one row per level.
    _R_SHRINK_MEAN = """
            function(rX,ry){
                library(lme4)
                                
                fid <- factor(rX[,1])
                mod <-lmer(ry ~ 1  + (1|fid))            

                df <- data.frame(ranef(mod)$fid+fixef(mod))
                df

            }
            """

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        # {column: {row: level}, column + '_shrink_mean': {row: value}} per category column.
        self.stored_result = {}

    @staticmethod
    def _with_filled_categories(X):
        """Return a copy of ``X`` with missing teacher_prefix / grade_level set to 'N/A'.

        The original code assigned the filled columns back onto ``X`` itself,
        silently modifying the caller's frame; ``assign`` leaves it untouched.
        """
        return X.assign(teacher_prefix=X['teacher_prefix'].fillna('N/A'),
                        grade_level=X['grade_level'].fillna('N/A'))

    def fit(self, X, y=None):
        """Fit one random-intercept model per categorical column and store the level values."""
        X_filled = self._with_filled_categories(X)

        # The target conversion and the compiled R function do not depend on the
        # column being processed, so build them once instead of once per column.
        ry = pandas2ri.py2ri(y.reset_index(drop=True))
        rfunc = robjects.r(self._R_SHRINK_MEAN)

        lookup = {}
        for var in self._CATEGORY_COLUMNS:
            rX = pandas2ri.py2ri(X_filled[[var]].reset_index(drop=True))
            # reset_index turns the R row names (the factor levels) into the first column.
            pred = pandas2ri.ri2py(rfunc(rX, ry)).reset_index()
            pred.columns = [var, var + '_shrink_mean']
            lookup.update(pred.to_dict())

        self.stored_result = lookup
        return self

    def transform(self, X):
        """Return a frame of the six ``*_shrink_mean`` columns, one row per row of ``X``.

        The successive merges reset the index, so the result is positionally
        aligned with ``X`` rather than carrying ``X.index``.
        """
        X_merge = self._with_filled_categories(X)

        for var in self._CATEGORY_COLUMNS:
            shrink_column = var + '_shrink_mean'
            level_table = pd.DataFrame.from_dict({var: self.stored_result[var],
                                                  shrink_column: self.stored_result[shrink_column]})
            X_merge = X_merge.merge(level_table, how='left', on=var)

        return X_merge.filter(regex='_shrink_mean')

    def _get_param_names(self):
        # Called elsewhere in this notebook (item[1]._get_param_names()) to obtain
        # the output column names.
        # NOTE(review): this shadows a method of the same name on BaseEstimator -
        # confirm the installed scikit-learn tolerates the override.
        return [var + '_shrink_mean' for var in self._CATEGORY_COLUMNS]

In [None]:
class Get_Text_Features(BaseEstimator, TransformerMixin):
    """Stateless transformer emitting simple counts for each free-text column.

    For 'title', 'need_statement' and 'essay' it produces, in that order, a
    whitespace-split word count, a count of '!' and a count of '?'.
    """

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return the nine count columns for ``X``."""
        # Each entry: (column suffix, per-column Series operation).
        counters = [
            ('_word_count', lambda col: col.str.split().str.len()),
            ('_exclamation_count', lambda col: col.str.count('!')),
            ('_question_count', lambda col: col.str.count(r'\?')),
        ]
        # Seeded with an empty frame, exactly as the incremental version started.
        pieces = [pd.DataFrame({})]
        for var in ['title', 'need_statement', 'essay']:
            text = X[[var]]
            for suffix, count in counters:
                pieces.append(text.apply(count).rename(columns={var: var + suffix}))
        return pd.concat(pieces, axis=1)

    def _get_param_names(self):
        # Called elsewhere in this notebook (item[1]._get_param_names()) to obtain
        # the output column names, in the order transform() emits them.
        # NOTE(review): this shadows a method of the same name on BaseEstimator -
        # confirm the installed scikit-learn tolerates the override.
        suffixes = ['_word_count', '_exclamation_count', '_question_count']
        return [var + suffix
                for var in ['title', 'need_statement', 'essay']
                for suffix in suffixes]
        
        

In [None]:
def print_feature_importances(model, feature_union_step, index_list=None):
    """Print ``name: weight`` lines for a fitted model, largest magnitude first.

    Parameters
    ----------
    model : fitted estimator
        ``feature_importances_`` is used when present, otherwise the first row
        of ``coef_``. A model with neither prints nothing.
    feature_union_step : object with a ``transformer_list`` attribute
        Sequence of ``(name, transformer)`` pairs; every transformer must
        provide ``_get_param_names()`` returning its output column names.
    index_list : sequence of int, optional
        Positions of the feature names to keep (for example the indices kept by
        a feature-selection step). Empty or None keeps all names.
    """
    if hasattr(model, 'feature_importances_'):
        coefs = list(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        coefs = list(model.coef_[0])
    else:
        # Previously `coefs` was left unbound here and the next line raised.
        coefs = []
    if not coefs:
        return

    # Fixed: the names come from the step passed in, not from an unrelated
    # global (`reg2`) that is never defined in this notebook.
    feature_names = []
    for item in feature_union_step.transformer_list:
        feature_names += item[1]._get_param_names()

    if index_list:
        wanted = set(index_list)
        feature_names = [name for position, name in enumerate(feature_names)
                         if position in wanted]

    ranked = sorted(zip(feature_names, coefs), key=lambda pair: abs(pair[1]), reverse=True)
    for name, weight in ranked:
        print("{}: {}".format(name, weight))

# Compare Models

In [None]:
# dummy model predictions

In [None]:
# Baseline "predictions": an array of ones shaped like each target
# (test, holdout and out-of-sample).
y_test_pred_dummy = np.ones_like(y_test)
y_holdout_pred_dummy = np.ones_like(y_holdout)
y_outsample_pred_dummy = np.ones_like(y_outsample)


# ROC AUC of the all-ones baseline on the test split.
roc_auc_score(y_test,y_test_pred_dummy)

In [None]:
f1_score(y_test, y_test_pred_dummy)

In [None]:
# regular features with categorical mean

In [None]:
def print_model_result(model, model_name, feature, feature_name, X, y):
    """Print the mean cross-validated ROC AUC of a feature-step + model pipeline.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline.
    model_name : str
        Step name for ``model``; also used as the label in the printed line.
    feature : transformer
        First step of the pipeline (for example a FeatureUnion).
    feature_name : str
        Step name for ``feature``.
    X, y :
        Data passed to ``cross_val_score``.
    """
    reg = pipeline.Pipeline([(feature_name, feature), (model_name, model)])

    # cross_val_score fits its own clone of the pipeline on every fold, so the
    # extra fit on all of X that used to precede it only cost time and its
    # result was discarded.
    mean_auc = np.mean(cross_val_score(reg, X, y, scoring='roc_auc'))

    # The score is a cross-validation average over X, not a test-set score,
    # so the label now says so.
    print(model_name + ' cross-validated auc score:', mean_auc)
    

In [None]:
# Candidate classifiers, at library defaults apart from the forest's class weighting.
model_dict = {
    'logistic regression': LogisticRegression(),
    'gaussian naive bayes': GaussianNB(),
    'decision tree classifier': DecisionTreeClassifier(),
    'random forest classifier': RandomForestClassifier(class_weight='balanced_subsample'),
    'gradient boosting classifier': GradientBoostingClassifier(),
}

feature_name = 'combined'
# Feature block: raw columns, per-category target means, text counts, KNN location.
combined_features = pipeline.FeatureUnion([
    ('raw_features', Get_Raw_Features()),
    ('category_mean', Get_Category_Mean()),
    ('text_features', Get_Text_Features()),
    ('location_feature', Get_Location_Input()),
])

for model_name in model_dict:
    model = model_dict[model_name]
    print_model_result(model, model_name, combined_features, feature_name, X, y)
    


In [None]:
# features with shrinkage mean
# performs on par with the categorical mean, so it is not used

In [None]:
# Same candidate classifiers as in the categorical-mean comparison.
model_dict = {
    'logistic regression': LogisticRegression(),
    'gaussian naive bayes': GaussianNB(),
    'decision tree classifier': DecisionTreeClassifier(),
    'random forest classifier': RandomForestClassifier(class_weight='balanced_subsample'),
    'gradient boosting classifier': GradientBoostingClassifier(),
}

feature_name = 'combined'
# Same feature block, but the 'category_mean' slot holds the shrinkage-mean transformer.
combined_features_s = pipeline.FeatureUnion([
    ('raw_features', Get_Raw_Features()),
    ('category_mean', Get_Category_ShrinkMean()),
    ('text_features', Get_Text_Features()),
    ('location_feature', Get_Location_Input()),
])

for model_name in model_dict:
    model = model_dict[model_name]
    print_model_result(model, model_name, combined_features_s, feature_name, X, y)
    

# Tune hyperparameters for gradient boosting classifier

In [None]:
# ({'gb__n_estimators': 50}, 0.70534596882581591)
# Search n_estimators with the remaining gradient boosting settings held fixed.
param_test1 = {'gb__n_estimators': range(20, 81, 10)}

steps = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(learning_rate=0.2, min_samples_split=500,
                                      min_samples_leaf=50, max_depth=8,
                                      max_features='sqrt', subsample=0.8,
                                      random_state=10)),
]
cf1 = pipeline.Pipeline(steps)

grid1 = GridSearchCV(cf1, param_grid=param_test1, scoring='roc_auc', n_jobs=-1)
grid1.fit(X_train, y_train)

(grid1.best_params_, grid1.best_score_)


In [None]:
# ({'gb__max_depth': 6, 'gb__min_samples_split': 800}, 0.70520058108538486)
# Search max_depth and min_samples_split jointly, with n_estimators fixed at 50.
param_test2 = {
    'gb__max_depth': range(3, 7, 1),
    'gb__min_samples_split': range(200, 1001, 200),
}

steps2 = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,
                                      max_features='sqrt', subsample=0.8,
                                      random_state=10)),
]
cf2 = pipeline.Pipeline(steps2)

grid2 = GridSearchCV(cf2, param_grid=param_test2, scoring='roc_auc', n_jobs=-1)
grid2.fit(X_train, y_train)

(grid2.best_params_, grid2.best_score_)

In [None]:
#({'gb__min_samples_leaf': 60, 'gb__min_samples_split': 2000}, 0.70525856924776154)
# Search min_samples_split and min_samples_leaf jointly, with max_depth fixed at 6.
param_test3 = {
    'gb__min_samples_split': range(2000, 4100, 200),
    'gb__min_samples_leaf': range(30, 71, 10),
}

steps3 = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,
                                      max_depth=6, max_features='sqrt',
                                      subsample=0.8, random_state=10)),
]
cf3 = pipeline.Pipeline(steps3)

grid3 = GridSearchCV(cf3, param_grid=param_test3, scoring='roc_auc', n_jobs=-1)
grid3.fit(X_train, y_train)

(grid3.best_params_, grid3.best_score_)

In [None]:
# ({'gb__max_features': 13}, 0.70684376863401222)
# Search max_features with the tree-shape settings fixed.
param_test4 = {'gb__max_features': range(5, 20, 2)}

steps4 = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,
                                      max_depth=6, min_samples_leaf=60,
                                      min_samples_split=2000, subsample=0.8,
                                      random_state=10)),
]
cf4 = pipeline.Pipeline(steps4)

grid4 = GridSearchCV(cf4, param_grid=param_test4, scoring='roc_auc', n_jobs=-1)
grid4.fit(X_train, y_train)

(grid4.best_params_, grid4.best_score_)

In [None]:
# ({'gb__subsample': 0.9}, 0.90704918296693149)
# NOTE(review): 0.907 is far from the ~0.705 scores noted on the other gradient
# boosting searches in this notebook - possibly a typo for 0.707; confirm.
# NOTE(review): min_samples_leaf is 50 below, while the max_features search
# used 60 - confirm which value is intended.
# Search the row subsampling fraction with max_features fixed at 13.
param_test5 = {'gb__subsample': [0.80, 0.85, 0.90, 0.95]}

steps5 = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,
                                      max_depth=6, min_samples_leaf=50,
                                      min_samples_split=2000, random_state=10,
                                      max_features=13)),
]
cf5 = pipeline.Pipeline(steps5)

grid5 = GridSearchCV(cf5, param_grid=param_test5, scoring='roc_auc', n_jobs=-1)
grid5.fit(X_train, y_train)

(grid5.best_params_, grid5.best_score_)

In [None]:
# Gradient boosting pipeline with learning_rate=0.01 and n_estimators=1000.
steps6 = [
    ('combined', combined_features),
    ('gb', GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01,
                                      max_depth=6, min_samples_leaf=50,
                                      min_samples_split=2000, random_state=10,
                                      max_features=13, subsample=0.90)),
]
cf6 = pipeline.Pipeline(steps6)

# Mean cross-validated ROC AUC on the training split.
cf6_cv_scores = cross_val_score(cf6, X_train, y_train, scoring='roc_auc')
np.mean(cf6_cv_scores)

In [None]:
cf6.fit(X_train,y_train)


In [None]:
roc_auc_score(y_test,cf6.predict_proba(X_test)[:,1])

In [None]:
# Print the gradient boosting importances, labelled via the feature-union step.
feature_union_step = cf6.named_steps['combined']
model = cf6.named_steps['gb']
print_feature_importances(model, feature_union_step)

In [None]:
def plot_feature_importances(model, feature_union_step, index_list=None):
    """Draw a horizontal bar chart of feature weights and save it as a PNG.

    Bars are ordered by absolute weight (smallest first in the frame) and the
    figure is written to 'feature_importance.png' in the working directory.

    Parameters
    ----------
    model : fitted estimator
        ``feature_importances_`` is used when present, otherwise the first row
        of ``coef_``. A model with neither produces no plot.
    feature_union_step : object with a ``transformer_list`` attribute
        Sequence of ``(name, transformer)`` pairs; every transformer must
        provide ``_get_param_names()`` returning its output column names.
    index_list : sequence of int, optional
        Positions of the feature names to keep. Empty or None keeps all names.
    """
    if hasattr(model, 'feature_importances_'):
        coefs = list(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        coefs = list(model.coef_[0])
    else:
        # Previously `coefs` was left unbound here and the next line raised.
        coefs = []
    if not coefs:
        return

    # Fixed: the names come from the step passed in, not from an unrelated
    # global (`reg2`) that is never defined in this notebook.
    feature_names = []
    for item in feature_union_step.transformer_list:
        feature_names += item[1]._get_param_names()

    if index_list:
        wanted = set(index_list)
        feature_names = [name for position, name in enumerate(feature_names)
                         if position in wanted]

    ranked = sorted(zip(feature_names, coefs), key=lambda pair: abs(pair[1]))
    ranked_frame = pd.DataFrame({'feature': [pair[0] for pair in ranked],
                                 'importance': [pair[1] for pair in ranked]},
                                columns=['feature', 'importance'])

    # pandas creates the figure itself (figsize is given), so no separate
    # plt.figure() call is needed; save through the returned axes' figure.
    ax = ranked_frame.plot.barh(x='feature', y='importance', figsize=(20, 15))
    # Fixed: the savefig keyword is `format`, not `fmt`.
    ax.get_figure().savefig('feature_importance.png', format='png', dpi=300,
                            bbox_inches='tight')

In [None]:
# Plot the gradient boosting importances, labelled via the feature-union step.
feature_union_step = cf6.named_steps['combined']
model = cf6.named_steps['gb']
plot_feature_importances(model, feature_union_step)

# Tune parameter selection for logistic regression

In [None]:
# Fresh feature block for the logistic-regression pipeline.
combined_features = pipeline.FeatureUnion([
    ('raw_features', Get_Raw_Features()),
    ('category_mean', Get_Category_Mean()),
    ('text_features', Get_Text_Features()),
    ('location_feature', Get_Location_Input()),
])

# Search how many features SelectKBest keeps ahead of the logistic regression.
param_lr1 = {'selectk__k': range(5, 32, 2)}

step_lr = [
    ('combined', combined_features),
    ('selectk', SelectKBest()),
    ('lr', LogisticRegression()),
]
lr2 = pipeline.Pipeline(step_lr)

grid_lr = GridSearchCV(lr2, param_grid=param_lr1, scoring='roc_auc', n_jobs=-1)
grid_lr.fit(X_train, y_train)

(grid_lr.best_params_, grid_lr.best_score_)

In [None]:
grid_lr.best_score_

In [None]:
# Print the logistic-regression coefficients, keeping only the feature names
# at the positions SelectKBest retained.
feature_union_step = grid_lr.best_estimator_.named_steps['combined']
model = grid_lr.best_estimator_.named_steps['lr']

print_feature_importances(
    model,
    feature_union_step,
    list(grid_lr.best_estimator_.named_steps['selectk'].get_support(indices=True)),
)

# Tune parameters for random forest

In [None]:
#({'rf__min_samples_leaf': 60}, 0.69294110314893065)
# Search min_samples_leaf for a 10-tree entropy forest.
rf_param_test1 = {'rf__min_samples_leaf': range(20, 81, 10)}

steps = [
    ('combined', combined_features),
    ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy',
                                  class_weight='balanced_subsample',
                                  random_state=10)),
]
rf_cf1 = pipeline.Pipeline(steps)

rf_grid1 = GridSearchCV(rf_cf1, param_grid=rf_param_test1, scoring='roc_auc', n_jobs=-1)
rf_grid1.fit(X_train, y_train)

(rf_grid1.best_params_, rf_grid1.best_score_)

In [None]:
# ({'rf__max_features': 13}, 0.69574288015765484)
# Search max_features with min_samples_leaf fixed at 60.
rf_param_test2 = {'rf__max_features': range(11, 30, 2)}

steps = [
    ('combined', combined_features),
    ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy',
                                  min_samples_leaf=60,
                                  class_weight='balanced_subsample',
                                  random_state=10)),
]
rf_cf2 = pipeline.Pipeline(steps)

rf_grid2 = GridSearchCV(rf_cf2, param_grid=rf_param_test2, scoring='roc_auc', n_jobs=-1)
rf_grid2.fit(X_train, y_train)

(rf_grid2.best_params_, rf_grid2.best_score_)

In [None]:
# for time efficiency set n_estimators = 100
# Search the number of trees with min_samples_leaf=60 and max_features=13 fixed.
rf_param_test3 = {'rf__n_estimators': [10, 50, 100, 200, 400, 600, 800]}

steps = [
    ('combined', combined_features),
    ('rf', RandomForestClassifier(criterion='entropy', min_samples_leaf=60,
                                  max_features=13,
                                  class_weight='balanced_subsample',
                                  random_state=10)),
]
rf_cf3 = pipeline.Pipeline(steps)

rf_grid3 = GridSearchCV(rf_cf3, param_grid=rf_param_test3, scoring='roc_auc', n_jobs=-1)
rf_grid3.fit(X_train, y_train)

(rf_grid3.best_params_, rf_grid3.best_score_)

In [None]:
# Random forest pipeline with 100 trees, fitted on the training split.
steps_rf = [
    ('combined', combined_features),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=100,
                                  min_samples_leaf=60, max_features=13,
                                  class_weight='balanced_subsample',
                                  random_state=10)),
]
rf1 = pipeline.Pipeline(steps_rf)
rf1.fit(X_train, y_train)

In [None]:
roc_auc_score(y_test,rf1.predict_proba(X_test)[:,1])

In [None]:
# Print the random forest importances, labelled via the feature-union step.
feature_union_step = rf1.named_steps['combined']
model = rf1.named_steps['rf']

print_feature_importances(model, feature_union_step)

# Plot precision recall curve

In [None]:
prec, recall, thresholds = precision_recall_curve(y_test,cf6.predict_proba(X_test)[:,1])

In [None]:

prec_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test,grid_lr.predict_proba(X_test)[:,1])

In [None]:
ensemble_result = np.maximum(rf1.predict_proba(X_test)[:,1], cf6.predict_proba(X_test)[:,1])

In [None]:
prec_en, recall_en, thresholds_en = precision_recall_curve(y_test,ensemble_result)

In [None]:

prec_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test,rf1.predict_proba(X_test)[:,1])

In [None]:
prec_d, recall_d, thresholds_d = precision_recall_curve(y_test,y_test_pred_dummy)

In [None]:
prec_d

In [None]:
recall_d

In [None]:
thresholds_d

In [None]:
# Test-set precision/recall trade-off for the three tuned models.
# The last element of each precision/recall array is dropped before plotting.
plt.figure(figsize=(20, 10))
plt.plot(prec_rf[0:-1], recall_rf[0:-1], color='blue')
plt.plot(prec[0:-1], recall[0:-1], 'red')
plt.plot(prec_lr[0:-1], recall_lr[0:-1], color='black')

plt.legend(['random forest', 'gradient boosting', 'logistic'], fontsize=22)

plt.title('precision-recall curve', fontsize=22)
plt.xlabel('precision', fontsize=22)
plt.ylabel('recall', fontsize=22)
# Fixed: the savefig keyword is `format`, not `fmt`.
plt.savefig('precision_recall.png', format='png', dpi=300, bbox_inches='tight');

In [None]:
# Double check precision/recall on the holdout group.
# Every result gets an `_h` suffix so the test-set `thresholds` and
# `thresholds_lr` computed earlier are no longer overwritten.
prec_h, recall_h, thresholds_h = precision_recall_curve(
    y_holdout, cf6.predict_proba(X_holdout)[:, 1])

prec_lr_h, recall_lr_h, thresholds_lr_h = precision_recall_curve(
    y_holdout, grid_lr.predict_proba(X_holdout)[:, 1])

prec_rf_h, recall_rf_h, thresholds_rf_h = precision_recall_curve(
    y_holdout, rf1.predict_proba(X_holdout)[:, 1])

# Colours follow the test-set figure: blue = random forest,
# red = gradient boosting, black = logistic.
plt.figure()
plt.plot(prec_rf_h[0:-1], recall_rf_h[0:-1], color='blue')
plt.plot(prec_h[0:-1], recall_h[0:-1], 'red')
plt.plot(prec_lr_h[0:-1], recall_lr_h[0:-1], color='black')
plt.legend(['random forest', 'gradient boosting', 'logistic'])

# Fixed: the title used to stop at "precision-recall curve for".
plt.title('precision-recall curve for holdout group')
plt.xlabel('precision')
plt.ylabel('recall');

In [None]:
# set threshold to optimize F1 score

In [None]:
# Sweep decision thresholds over np.arange(0.01, 0.99, 0.01) and record the
# test-set F1 score of the gradient boosting model at each one.
y_pred_test_proba = cf6.predict_proba(X_test)[:, 1]
f1_list = []

for threshold in np.arange(0.01, 0.99, 0.01):
    # Predict the positive class when the probability exceeds the threshold.
    y_test_pred = (y_pred_test_proba > threshold).astype(int)
    f1_list.append(f1_score(y_test, y_test_pred))

In [None]:
max(f1_list)

In [None]:
np.arange(0.01,0.99,0.01)[np.argmax(f1_list)]

In [None]:
confusion_matrix(y_test, np.array(y_pred_test_proba>0.16).astype(int))

In [None]:
precision_score(y_test, np.array(y_pred_test_proba>0.44).astype(int))

In [None]:
recall_score(y_test, np.array(y_pred_test_proba>0.44).astype(int))