In [1]:
# Set width of Jupyter cell
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that stretches the notebook container to 97% of the window.
display(HTML("<style>.container { width:97% !important; }</style>"))

In [3]:
# Import packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import glob
import os
import json
import seaborn as sns
from datetime import date
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from eli5.sklearn import PermutationImportance
import xgboost as xgb
import lifelines
import pickle

import foodie_features
import yelp_data_pulling_and_cleaning
import build_viva_las_foodie

In [4]:
# Show up to 200 rows and 25 columns when a DataFrame is rendered.
# Full option names are spelled out: pandas resolves abbreviated names by pattern
# matching, which breaks as soon as another option with a similar name exists.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 25)

In [5]:
# Load the raw business records through the project's data-pulling helper.
# NOTE(review): the return type is not visible here; presumably a DataFrame — confirm.
raw_businesses_df = yelp_data_pulling_and_cleaning.pull_raw_business_data()

In [6]:
# Clean the raw business data, keeping restaurants in the listed cities.
# The helper hands back the cleaned business table and the category list.
cities = ['Las Vegas']
yelp_businesses_df, categories = yelp_data_pulling_and_cleaning.clean_business_data(
    raw_businesses_df,
    type_of_business_list=['Restaurant'],
    city_filter_list=cities,
    remove_hours=True,
    required_num_of_closed_thresh_in_city=1000,
)

In [None]:
# The index values of the cleaned business table are passed to the review loader
# as the set of businesses whose reviews should be kept.
business_ids = yelp_businesses_df.index.values
reviews_df = yelp_data_pulling_and_cleaning.clean_reviews_data(business_ids)
# Bare expression: renders the reviews table as the cell output.
reviews_df

In [None]:
# Returns two objects, bound to `chains` and `duplicate_locations_df`; both are written
# to CSV by the export cell further down.
# NOTE(review): later cells select columns such as 'is_chain' and 'duplicate_location';
# presumably this helper also adds them to yelp_businesses_df in place — confirm.
chains, duplicate_locations_df = build_viva_las_foodie.calculate_additional_features(yelp_businesses_df, reviews_df)

In [None]:
# Display the business table (the number of rows/columns shown is capped by the pandas display options).
yelp_businesses_df

In [None]:
# Columns used as classifier inputs.
features = [
    'is_chain', 'duplicate_location',
    'cost_1', 'cost_2', 'cost_3', 'cost_4',
    'is_claimed', 'sentiment', 'avg_review_length',
    'review_count_before_date', 'rating_before_date',
]
# Build the modelling table as of 2018-01-01 with closure labels for 1/3/6/9-month horizons.
data = build_viva_las_foodie.build_X_and_y(
    yelp_businesses_df,
    reviews_df,
    date(2018, 1, 1),
    forecast_months=[1, 3, 6, 9],
    ignore_distance=False,
    load_NLP=True,
    do_distance=False,
    features=features,
)
# Feature matrix: only the columns listed above.
X = data[features]

In [None]:
# Column-wise sum of every 'closed_forecast*' label column.
forecast_columns = [name for name in data if name.startswith('closed_forecast')]
data[forecast_columns].sum()

In [None]:
# Forecast horizon to model; it selects the matching 'closed_forecast_<horizon>' label column
# and is reused later for plot titles.
forecast_length = '9_months'
# Target vector for the chosen horizon.
y = data['closed_forecast_%s'%forecast_length]

In [None]:
#X = data.append(pd.read_csv('data_2018_1_1.csv').set_index('business_id'))[features]
#y = data.append(pd.read_csv('data_2018_1_1.csv').set_index('business_id'))['closed_forecast_6_months']

In [None]:
# EDA of features
num_of_close = y[y == 1].shape[0]
num_of_open = y[y == 0].shape[0]
print "Number open(closed) in dataset: %s(%s) \n"%(num_of_open, num_of_close)

num_cost_1_closed = float(X[ (X.index.isin(y[y == 1].index)) & (X.cost_1 == 1) ].shape[0])
num_cost_1_open =  float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_1 == 1)].shape[0])
num_cost_1 = float( X[ X.cost_1 == 1 ].shape[0])

num_cost_2_closed = float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_2 == 1) ].shape[0])
num_cost_2_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_2 == 1)].shape[0])
num_cost_2 = float( X[ X.cost_2 == 1 ].shape[0])

num_cost_3_closed =  float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_3 == 1) ].shape[0])
num_cost_3_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_3 == 1)].shape[0])
num_cost_3 = float( X[ X.cost_3 == 1 ].shape[0])

num_cost_4_closed = float( X[ (X.index.isin(y[y == 1].index)) & (X.cost_4 == 1) ].shape[0])
num_cost_4_open = float( X[ (X.index.isin(y[y == 0].index)) & (X.cost_4 == 1)].shape[0])
num_cost_4 = float( X[ X.cost_4 == 1 ].shape[0])

avg_cost_closed = (1 * num_cost_1_closed + 2 * num_cost_2_closed + 3 * num_cost_3_closed + 4 * num_cost_4_closed) / num_of_close
avg_cost_open = (1 * num_cost_1_open + 2 * num_cost_2_open + 3 * num_cost_3_open + 4 * num_cost_4_open) / num_of_open
avg_cost = (1 * num_cost_1 + 2 * num_cost_2 + 3 * num_cost_3 + 4 * num_cost_4) / (num_of_open + num_of_close)

print "Avg cost for closed restaurants: ", avg_cost_closed
print "Avg cost for open resturants: ", avg_cost_open
print "Avg cost for all restaurants:", avg_cost
print "\n"
print "Avg sentiment for closed:", X[X.index.isin(y[y == 1].index)].sentiment.mean()
print "Avg sentiment for open:", X[X.index.isin(y[y == 0].index)].sentiment.mean()
print "Avg sentiment total:", X.sentiment.mean()
print "\n"
print "Avg review length for closed:", X[X.index.isin(y[y == 1].index)].avg_review_length.mean()
print "Avg review length for open:", X[X.index.isin(y[y == 0].index)].avg_review_length.mean()
print "Avg review length total", X.avg_review_length.mean()
print "\n"
print "Avg rating for closed:", X[X.index.isin(y[y == 1].index)].rating_before_date.mean()
print "Avg rating for open:", X[X.index.isin(y[y == 0].index)].rating_before_date.mean()
print "Avg rating total:", X.rating_before_date.mean()
print "\n"
print "Avg review count for closed:", X[X.index.isin(y[y == 1].index)].review_count_before_date.mean()
print "Avg review count for open:", X[X.index.isin(y[y == 0].index)].review_count_before_date.mean()
print "Avg review count total:", X.review_count_before_date.mean()
print "\n"
print "Is chain for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_chain == 1) ].shape[0]
print "Is not chain for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_chain == 0)].shape[0]
print "Is chain for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_chain == 1) ].shape[0]
print "Is not chain for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_chain == 0) ].shape[0]
print "\n"
print "Dup loc for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.duplicate_location == 1) ].shape[0]
print "Not dup loc for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.duplicate_location == 0)].shape[0] 
print "Dup loc for open:", X[ (X.index.isin(y[y == 0].index)) & (X.duplicate_location == 1) ].shape[0]
print "Not dup loc for open:", X[ (X.index.isin(y[y == 0].index)) & (X.duplicate_location == 0) ].shape[0]
print "\n"
print "Is claimed for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_claimed == 1) ].shape[0]
print "Is not claimed for closed:", X[ (X.index.isin(y[y == 1].index)) & (X.is_claimed == 0)].shape[0] 
print "Is claimed for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_claimed == 1) ].shape[0]
print "Is not claimed for open:", X[ (X.index.isin(y[y == 0].index)) & (X.is_claimed == 0) ].shape[0]

In [None]:
# Bar chart: percentage of chain restaurants among open vs. closed businesses.
# NOTE(review): the counts are hard-coded, presumably copied from the "Is chain ..." lines
# printed by the EDA cell — confirm they still match the current data.
chain_df = pd.DataFrame(data={'Open': [3100, 4155], 'Closed': [132, 473]}, index=['Yes', 'No'])

objects = ('Open', 'Closed')
y_pos = np.arange(len(objects))
performance = []
for status in objects:
    chain_count = chain_df[status]['Yes']
    total_count = chain_count + chain_df[status]['No']
    performance.append(100 * float(chain_count) / total_count)

barplot = plt.bar(y_pos, performance, align='center')#, alpha=0.5)
for bar, colour in zip(barplot, ('cornflowerblue', 'salmon')):
    bar.set_color(colour)
plt.xticks(y_pos, objects)
for which_ticks, label_size in (('major', 20), ('minor', 15)):
    plt.tick_params(axis='both', which=which_ticks, labelsize=label_size)

# Write the rounded percentage just above each bar.
xlocs = [1, 2]
for x_anchor, value in zip(xlocs, performance):
    plt.text(x_anchor - 1.15, value + 1.5, str(round(value, 0)), fontsize=20)

plt.ylim([0, 50])

plt.show()

In [None]:
# Bar chart comparing one hard-coded value for open vs. closed businesses.
# NOTE(review): 478 / 504 are magic numbers; which statistic they represent is not
# stated in this cell — confirm the source and add axis labels accordingly.
objects = ('Open', 'Closed')
y_pos = np.arange(len(objects))
performance = [ 478, 504 ]

barplot = plt.bar(y_pos, performance, align='center')
for bar, colour in zip(barplot, ('cornflowerblue', 'salmon')):
    bar.set_color(colour)
plt.xticks(y_pos, objects)
for which_ticks, label_size in (('major', 20), ('minor', 15)):
    plt.tick_params(axis='both', which=which_ticks, labelsize=label_size)

# Write the value just above each bar.
xlocs = [1, 2]
for x_anchor, value in zip(xlocs, performance):
    plt.text(x_anchor - 1.18, value + 15.1, str(round(value, 2)), fontsize=20)

plt.ylim([0, 600])

plt.show()

In [None]:
# Columns of X to exclude from modelling (none at the moment).
dropped_columns = []
# 80/20 train/test split, then a further 80/20 split of the training part into
# fit/validation sets. `random_state` makes both splits reproducible across kernel
# restarts; `stratify` keeps the closed/open ratio the same in every part, which
# matters because closures are the rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(dropped_columns, axis=1).values, y.values,
    test_size=0.2, random_state=42, stratify=y.values)
X_train_no_val, X_train_val, y_train_no_val, y_train_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=42, stratify=y_train)

In [None]:
# Heatmap of the pairwise correlations between the feature columns of X.
sns.heatmap(X.corr())

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a 2x2 confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix as returned by ``sklearn.metrics.confusion_matrix``; rows are
        true labels and columns are predicted labels. Position 0 is labelled 'Open'
        and position 1 'Closed'.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cell shading.

    Returns
    -------
    None. As a side effect the global matplotlib font size ('font.size') is set to 17.
    """
    plt.rcParams.update({'font.size': 17})
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()
    tick_marks = [0, 1]
    plt.xticks(tick_marks, ['Open','Closed'], rotation=45)
    plt.yticks(tick_marks, ['Open','Closed'])
    # Write each count exactly once, centred on its cell; imshow puts the column
    # index on the x axis and the row index on the y axis.
    for (row, col), count in np.ndenumerate(cm):
        plt.text(col, row, count, ha='center', va='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title(title)
    # Fit the layout only after labels and title exist so they are taken into account.
    plt.tight_layout()

In [None]:
param_grid_logistic = {
    'logistic__C': np.logspace(-4, 4, 4),
    'logistic__solver' : [ 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logistic__max_iter' : [500, 1000, 2000]
}
param_grid_rf = {
    'rf__max_depth' : [4, 6, 8],
    'rf__n_estimators' : [500, 1000, 2000]
}
param_grid_xgb = {
    'xgb__min_child_weight': [1, 5, 10],
    'xgb__gamma': [0.5, 1, 1.5, 2, 5],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__learning_rate': [0.01, 0.02, 0.05, 0.1],
    'xgb__max_depth': [3, 4, 5]
    }

#pipe = Pipeline([ ( 'scaler', StandardScaler() ), ( 'logistic', LogisticRegression(penalty='l2', class_weight='balanced') ) ])
#grid_search = GridSearchCV(pipe, param_grid_logistic, cv=5, scoring='roc_auc', n_jobs=-1) #roc_auc

#pipe = Pipeline([ ( 'rf', RandomForestClassifier(class_weight='balanced') ) ])
#grid_search = GridSearchCV(pipe, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1) #roc_auc

balanced_class_ratio = float((y_train==0).sum())/(y_train==1).sum()
pipe = Pipeline([ ( 'xgb', xgb.XGBClassifier(scale_pos_weight=balanced_class_ratio) ) ])
grid_search = GridSearchCV(pipe, param_grid_xgb, cv=5, scoring='roc_auc', n_jobs=-1)

grid_search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
y_pred_train = grid_search.best_estimator_.predict(X_train)
print "F1 Score:", f1_score(y_train, y_pred_train)
print "Precision Score:", precision_score(y_train, y_pred_train)
print "Recall Score:", recall_score(y_train, y_pred_train)
print "Accuracy Score:", accuracy_score(y_train, y_pred_train)
confusion_matrix(y_true=y_train, y_pred=y_pred_train)

In [None]:
#with open('trained_classifier_%s.pkl'%forecast_length, 'wb') as fid:
#    pickle.dump(grid_search.best_estimator_, fid)

In [None]:
#with open('trained_classifier_6_months.pkl', 'rb') as fid:
#    gs_model = pickle.load(fid)

In [None]:
# ROC curve of the tuned model on the held-out test set, scored by the
# predicted probability of class 1.
test_scores = grid_search.best_estimator_.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)
# Operating point that maximises tpr + (1 - fpr).
best_idx = (tpr + 1 - fpr).argmax()
opt_fpr = fpr[best_idx]
opt_tpr = tpr[best_idx]
opt_threshold = thresholds[best_idx]

In [None]:
# Plot the test-set ROC curve, the chance diagonal and the chosen operating point.
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# The label needs a conversion specifier: formatting a string that has none with
# `% opt_threshold` raises "TypeError: not all arguments converted".
plt.plot([opt_fpr],[opt_tpr],'bo', label='Optimal Threshold = %0.2f' % opt_threshold, markersize=15)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Derive the title from the horizon actually being modelled (e.g. '9_months' -> '9-month')
# instead of a fixed "1-month" string.
plt.title('ROC Curve %s model' % forecast_length.replace('_months', '-month'))
plt.legend(loc="lower right")
plt.show()

In [None]:
y_pred_roc = np.array([1 if grid_search.best_estimator_.predict_proba(X_test)[i,1] > opt_threshold else 0 for i in range(X_test.shape[0]) ])
print "F1 Score:", f1_score(y_test, y_pred_roc)
print "Precision Score:", precision_score(y_test, y_pred_roc)
print "Recall Score:", recall_score(y_test, y_pred_roc)
print "Accuracy Score:", accuracy_score(y_test, y_pred_roc)
plot_confusion_matrix(confusion_matrix(y_true=y_test, y_pred=y_pred_roc), title='%s'%forecast_length.replace('_',' ')[:-1])

In [None]:
y_pred_test = grid_search.best_estimator_.predict(X_test)
print "F1 Score:", f1_score(y_test, y_pred_test)
print "Precision Score:", precision_score(y_test, y_pred_test)
print "Recall Score:", recall_score(y_test, y_pred_test)
print "Accuracy Score:", accuracy_score(y_test, y_pred_test)
plot_confusion_matrix(confusion_matrix(y_true=y_test, y_pred=y_pred_test))

In [None]:
# Investigate coefficients, feature importances, etc.
features_to_names = { 'f{i}'.format(i=i) : X.columns.values[i] for i in range(len(X.columns)) }
print features_to_names, '\n'
if 'scaler' in grid_search.best_estimator_.named_steps:
    print grid_search.best_estimator_.named_steps['scaler'].mean_
    print grid_search.best_estimator_.named_steps['logistic'].coef_
elif 'xgb' in grid_search.best_estimator_.named_steps:
    print 'gain', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_score(importance_type='gain').items(), key = lambda x : x[1], reverse=True), '\n'
    print 'weight', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_score(importance_type='weight').items(), key = lambda x : x[1], reverse=True), '\n'
    print 'cover', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_score(importance_type='cover').items(), key = lambda x : x[1], reverse=True), '\n'
    print 'total_gain', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_score(importance_type='total_gain').items(), key = lambda x : x[1], reverse=True), '\n'
    print 'total_cover', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_score(importance_type='total_cover').items(), key = lambda x : x[1], reverse=True), '\n'
    print 'fscore', sorted(grid_search.best_estimator_.named_steps['xgb'].get_booster().get_fscore().items(), key = lambda x : x[1], reverse=True)
elif 'rf' in grid_search.best_estimator_.named_steps:
    print grid_search.best_estimator_.named_steps['rf'].feature_importances_
else:
    print "Invalid pipeline"

In [None]:
# Importance plots are kept here disabled; only the tree plot is drawn.
# Requires the tuned pipeline to contain an 'xgb' step.
#xgb.plot_importance(grid_search.best_estimator_.named_steps['xgb'].get_booster(), importance_type='gain')
#xgb.plot_importance(grid_search.best_estimator_.named_steps['xgb'].get_booster(), importance_type='weight')
#xgb.plot_importance(grid_search.best_estimator_.named_steps['xgb'].get_booster(), importance_type='cover')
xgb.plot_tree(grid_search.best_estimator_.named_steps['xgb'].get_booster())

In [None]:
# look at 3 and 4 star restaurants, but make sure to separate these because you've trained on some of them
#data[ (data.cost_3 == 1) | (data.cost_4 == 1) ][features]

#grid_search.best_estimator_.predict(X[ (X.cost_3 == 1) | (X.cost_4 == 1) ].values)
#y[ X[ (X.cost_3 == 1) | (X.cost_4 == 1) ].index ].values

In [None]:
# Summary metrics per forecast horizon; the index is the horizon in months.
# NOTE(review): values are hard-coded, presumably transcribed from earlier runs — confirm.
metrics_by_name = {
    'F1':        [0.08, 0.22, 0.37, 0.44],
    'Recall':    [0.80, 0.95, 0.88, 0.96],
    'Precision': [0.04, 0.12, 0.24, 0.28],
    'Accuracy':  [0.88, 0.82, 0.87, 0.85],
}
metrics_df = pd.DataFrame(data=metrics_by_name, index=[1, 3, 6, 9])
metrics_df

In [None]:
# F1 (as a percentage) against forecast horizon; plots of the other metrics are kept disabled.
#plt.plot(metrics_df.index,100*metrics_df.Recall, '-o')
#plt.plot(metrics_df.index,100*metrics_df.Accuracy, '-o')
#plt.plot(metrics_df.index,[100*metrics_df.Accuracy.mean() for i in range(metrics_df.Accuracy.shape[0])], '--')
#plt.plot(metrics_df.index,100*metrics_df.Precision, '-o')
horizon_months = metrics_df.index
f1_percent = 100 * metrics_df['F1']
plt.plot(horizon_months, f1_percent, '-o')
plt.xticks(horizon_months)
plt.xlabel('Months')
plt.ylabel('Percentage')
#plt.yticks([80, 82, 84, 86, 88, 90])
#plt.title('Accuracy across models')
#plt.legend()

In [None]:
model = Pipeline([ ( 'scaler', StandardScaler() ), ( 'lr', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42, class_weight='balanced') ) ])
#model = Pipeline([ ( 'scaler', StandardScaler() ), ( 'lr', RandomForestClassifier(max_depth=5, n_estimators=1000, class_weight='balanced') ) ])
model.fit(X_train_no_val, y_train_no_val)
y_pred_val = model.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val))

In [None]:
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
print "Precision Score:", precision_score(y_train_val, y_pred_val)
print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val))
clf.feature_importances_

In [None]:
clf = LogisticRegression(solver='lbfgs',max_iter=1000,random_state=42,class_weight='balanced')
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 score:", f1_score(y_train_val, y_pred_val)
print "Precision Score:", precision_score(y_train_val, y_pred_val)
print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy score:", accuracy_score(y_train_val, y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val, y_pred=y_pred_val)) #(clf.predict_proba(X_train_val) >= 0.5).astype(int).sum(axis=1)) #y_pred_val

In [None]:
balanced_class_ratio = float((y_train_no_val==0).sum())/(y_train_no_val==1).sum()
clf = xgb.XGBClassifier(scale_pos_weight=balanced_class_ratio, learning_rate=0.05)
clf.fit(X_train_no_val, y_train_no_val)
y_pred_val = clf.predict(X_train_val)
print "F1 Score:", f1_score(y_train_val, y_pred_val)
print "Precision Score:", precision_score(y_train_val, y_pred_val)
print "Recall Score:", recall_score(y_train_val, y_pred_val)
print "Accuracy Score:", accuracy_score(y_train_val, y_pred_val)
print "Confusion matrix:", confusion_matrix(y_true=y_train_val, y_pred=y_pred_val)
plot_confusion_matrix(confusion_matrix(y_true=y_train_val,y_pred=y_pred_val))
clf.feature_importances_

In [None]:
#### Save important files to be used for model in web app
# Restaurant names: each name is UTF-8 encoded, collected into a string array, then decoded
# again element-wise before being written one per line.
# NOTE(review): this encode/str/decode round trip presumably relies on Python 2 string
# semantics (the notebook uses print statements) — re-check before running on Python 3.
business_names_str = np.array([ str(name.encode('utf-8')) for name in yelp_businesses_df.name.values ]).astype(str)
np.savetxt('VivaLasFoodieRestaurantNames.csv', np.vectorize(lambda x: x.decode('UTF-8'))(business_names_str), delimiter=',', fmt='%s')

# Chain names, written with the same encode/decode treatment.
np.savetxt('chains.csv', np.vectorize(lambda x: x.decode('UTF-8'))(np.array([str(name.encode('utf-8')) for name in chains]).astype(str)),delimiter=',', fmt='%s')

# Duplicate-location table, written with savetxt's default number format.
np.savetxt('duplicate_locations.csv', duplicate_locations_df.values, delimiter=',')

# NOTE(review): `name_to_id_dict` and `id_to_features_dict` are not assigned anywhere in the
# visible notebook; unless they are created elsewhere these dumps raise NameError on a fresh
# kernel — confirm where they come from.
with open('name_to_id_dict.json', 'w') as fp:
    json.dump(name_to_id_dict, fp)

with open('id_to_features_dict.json', 'w') as fp:
    json.dump(id_to_features_dict, fp)

In [None]:
def dist_of_open_businesses_in_city(businesses_df, categories):
    """Count open and closed businesses per category, keeping well-populated categories only.

    Parameters
    ----------
    businesses_df : pandas.DataFrame
        Must have a string column 'categories' (missing values are treated as "no match")
        and a column 'is_open' holding 1 for open and 0 for closed.
    categories : iterable of str
        Category names to look for. A business belongs to a category when the name
        occurs as a literal substring of its 'categories' value.

    Returns
    -------
    pandas.DataFrame
        Indexed by category, with columns 'Open' and 'Closed'. Only categories with more
        than 100 businesses in total and more than 50 closed ones are included; the
        frame is empty when no category qualifies.
    """
    open_businesses = []
    closed_businesses = []
    valid_categories = []

    for category in categories:
        # regex=False: category names are plain text. Interpreted as a regular expression,
        # a name such as "American (New)" would treat the parentheses as a group and
        # never match its own literal text.
        in_category = businesses_df['categories'].str.contains(category, na=False, regex=False)
        category_df = businesses_df[in_category]
        num_open = category_df[category_df.is_open == 1].shape[0]
        num_closed = category_df[category_df.is_open == 0].shape[0]
        if num_open + num_closed > 100 and num_closed > 50:
            open_businesses.append(num_open)
            closed_businesses.append(num_closed)
            valid_categories.append(category)

    city_business_distribution = pd.DataFrame(data={'Open' : open_businesses, 'Closed' : closed_businesses}, index=valid_categories)

    return city_business_distribution

# Build the per-category open/closed table for the filtered businesses and display it.
city_business_distribution = dist_of_open_businesses_in_city(yelp_businesses_df, categories)
city_business_distribution

In [None]:
#### Survival Analysis

In [None]:
# Columns for the survival model: the classifier inputs (cost_1 is left out here) plus
# 'age (in days)' and 'is_open', which the Cox fit below uses as duration and event columns.
features = ['is_chain','duplicate_location','cost_2','cost_3','cost_4', 'is_claimed', 'sentiment', 'avg_review_length', \
            'review_count_before_date', 'rating_before_date','age (in days)', 'is_open' ]# + ['city_Las Vegas']#['city_%s'%city for city in cities]
# Snapshot date for the survival table. The name was used without being assigned anywhere
# in the notebook (NameError on a fresh kernel); the value is inferred from the name — TODO confirm.
NOV_14_2018 = date(2018, 11, 14)
# build_X_and_y lives in the build_viva_las_foodie module and is never imported as a bare
# name, so it has to be called through the module.
data_survival = build_viva_las_foodie.build_X_and_y(yelp_businesses_df, reviews_df, NOV_14_2018, forecast_months=None, load_NLP=True, ignore_distance=True, do_distance=True, features=features)
# Fixed random_state keeps the train/test partition (and everything derived from it) reproducible.
data_survival_train, data_survival_test = train_test_split(data_survival, test_size=0.2, random_state=42)
data_survival_train

In [None]:
# Fit a Cox proportional-hazards model (lifelines) on the survival training split.
# NOTE(review): `.replace()` is called with no arguments; what it does depends on the pandas
# version (it may be a no-op, a fill, or an error) — confirm what was intended.
# NOTE(review): `event_col` normally flags rows where the event was observed; here it is
# 'is_open', which by its name looks like the opposite of a closure event — confirm the encoding.
cph = lifelines.CoxPHFitter()
cph.fit(data_survival_train.replace(), duration_col='age (in days)', event_col='is_open')
# Print the fitted model's summary table.
cph.print_summary()

In [None]:
# ROC curve for the Cox model on the survival test split: `is_open` is the label and the
# predicted survival probability at index label 94 of the survival function is the score.
survival_curves = cph.predict_survival_function(data_survival_test.drop(['is_open'], axis=1))
fpr, tpr, thresholds = roc_curve(data_survival_test.is_open.values, survival_curves.loc[94])
roc_auc = auc(fpr, tpr)
# Operating point that maximises tpr + (1 - fpr).
best_idx = (tpr + 1 - fpr).argmax()
opt_fpr = fpr[best_idx]
opt_tpr = tpr[best_idx]
opt_threshold = thresholds[best_idx]

In [None]:
# Plot the survival-model ROC curve, the chance diagonal and the chosen operating point.
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# The label needs a conversion specifier: formatting a string that has none with
# `% opt_threshold` raises "TypeError: not all arguments converted". Survival-probability
# thresholds can be very small, hence %g rather than a fixed number of decimals.
plt.plot([opt_fpr],[opt_tpr],'bo', label='Optimal Threshold = %g' % opt_threshold, markersize=15)
plt.xlim([-0.01, 1.0])
plt.ylim([-0.01, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# This curve belongs to the Cox survival model, not to the 1-month classifier.
plt.title('ROC Curve survival model')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Predicted survival functions for the test rows, with the 'is_open' column removed from the input.
# The next cells index the result by time with `.loc[...]` and by test row with `.iloc[i]`.
survival_probs = cph.predict_survival_function(data_survival_test.drop(['is_open'],axis=1))

In [None]:
# Index label of the survival-function table at which probabilities are read.
# NOTE(review): presumably days, matching the 'age (in days)' duration column — confirm;
# the label must exist in the table's index for `.loc` to succeed.
survival_length = 94

In [None]:
y_pred_roc = np.array([1 if survival_probs.loc[survival_length].iloc[i] > 0.00009 else 0 for i in range(data_survival_test.shape[0]) ])
print "F1 Score:", f1_score(data_survival_test.is_open.values, y_pred_roc)
print "Precision Score:", precision_score(data_survival_test.is_open.values, y_pred_roc)
print "Recall Score:", recall_score(data_survival_test.is_open.values, y_pred_roc)
print "Accuracy Score:", accuracy_score(data_survival_test.is_open.values, y_pred_roc)
plot_confusion_matrix(confusion_matrix(y_true=data_survival_test.is_open.values, y_pred=y_pred_roc))#, title='3 months')