In [157]:
# Standard library
from collections import Counter

# Third-party: plotting, numerics, database access
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

# scikit-learn.
# train_test_split / cross_val_score are imported from sklearn.model_selection:
# the old sklearn.cross_validation module was deprecated in 0.18 and removed
# in 0.20, so importing from it raises ImportError on current releases.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SQLAlchemy URL template for the local `rpred` database; the `{}` placeholder
# is filled with the credential string via PSQL.format(...).
# Uses the canonical "postgresql" dialect name: the legacy "postgres" alias is
# no longer accepted by SQLAlchemy 1.4+, while "postgresql" works everywhere.
PSQL = 'postgresql://{}@localhost:5432/rpred'


## [Apply Models](#models)

In [74]:
# Read the database credential string from a local file (kept out of the
# notebook itself), dropping surrounding newline characters.
cred = ""
with open("credentials/localhost/jessica.txt") as credfile:
    raw_cred = credfile.read()
    cred = raw_cred.strip("\n")

# Build the connection URL from the template and open an engine whose
# connections run in AUTOCOMMIT isolation.
db_url = PSQL.format(cred)
cnx = create_engine(db_url, isolation_level='AUTOCOMMIT')

In [155]:
# fit model and return scores (classification report?)
def apply_model(model, data, target, scoring=('accuracy',), cv=10):
    """Cross-validate ``model`` on standardized features and return the scores.

    The estimator is placed behind a ``StandardScaler`` in a pipeline, so the
    scaling is part of what gets fitted inside each cross-validation split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator used as the final pipeline step.
    data : array-like or DataFrame
        Feature matrix.
    target : array-like or Series
        Labels aligned with ``data``.
    scoring : str, sequence of str, or dict, optional
        Passed unchanged to ``cross_validate``. Defaults to accuracy only.
        (An immutable tuple replaces the former mutable list default.)
    cv : int or cross-validation splitter, optional
        Passed unchanged to ``cross_validate``. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    dict
        The ``cross_validate`` result: ``fit_time``, ``score_time`` and, for
        each metric, per-fold ``test_<metric>`` / ``train_<metric>`` arrays.
    """
    # Keep the caller's estimator name untouched; the pipeline is a new object.
    pipeline = make_pipeline(StandardScaler(), model)
    return cross_validate(pipeline, data, target, scoring=scoring,
                          cv=cv, return_train_score=True)
            
# grid search?
def optimize():
    """Placeholder for a future hyper-parameter search; currently returns None."""
    return

In [189]:
# Pull the feature and target tables from the database, each ordered by name.
features = pd.read_sql_query(sql='''SELECT * from features order by name''', con=cnx)
target = pd.read_sql_query(sql='''SELECT * FROM target order by name''', con=cnx)

# Composite "<name> <full_address>" string stored as 'key'; the later merge
# cell joins features to target on this column.
features = features.assign(key=features['name'] + " " + features['full_address'])
features.head()

Unnamed: 0,name,full_address,review_count,stars,bar,fast_food,mexican,chinese,key
0,1000 Grammes,1495 Rue Sainte-Catherine East Ville-Marie Mon...,19,3.5,0,0,0,0,1000 Grammes 1495 Rue Sainte-Catherine East Vi...
1,108 Chinese Take Away,108 Portobello High Street Edinburgh EH15 1AL,3,5.0,0,1,0,0,108 Chinese Take Away 108 Portobello High Stre...
2,10-to-10 In Delhi,67 Nicolson Street Newington Edinburgh EH8 9BZ,50,4.5,0,0,0,0,10-to-10 In Delhi 67 Nicolson Street Newington...
3,180g,6546 Rue Waverly Rosemont-La Petite-Patrie Mon...,4,5.0,0,0,0,0,180g 6546 Rue Waverly Rosemont-La Petite-Patri...
4,181 Delicatessen,181 Bruntsfield Road Bruntsfield Edinburgh EH1...,5,4.0,0,0,0,0,181 Delicatessen 181 Bruntsfield Road Bruntsfi...


In [190]:
# Add the same "<name> <full_address>" key used on the feature table and
# rename the label column from 'is_open' to 'stay_open'.
target = (
    target
    .assign(key=target['name'] + " " + target['full_address'])
    .rename(columns={'is_open': 'stay_open'})
)

# Class balance of the raw target table (fraction of rows per label).
n_target = len(target)
print("open:", int((target['stay_open'] == 1).sum()) / n_target)
print("closed:", int((target['stay_open'] == 0).sum()) / n_target)
target.head()

open: 0.744009305932532
closed: 0.255990694067468


Unnamed: 0,full_address,name,stay_open,key
0,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes,1,1000 Grammes 1495 Rue Sainte-Catherine East Vi...
1,108 Portobello High Street Edinburgh EH15 1AL,108 Chinese Take Away,1,108 Chinese Take Away 108 Portobello High Stre...
2,1153 E Jefferson St Phoenix AZ 85034,12 East Cafe,0,12 East Cafe 1153 E Jefferson St Phoenix AZ 8...
3,3459 S Jones Blvd Chinatown Las Vegas NV 89146,168 Market,1,168 Market 3459 S Jones Blvd Chinatown Las Veg...
4,75 S 17th St South Side Pittsburgh PA 15203,17th Street Cafe,0,17th Street Cafe 75 S 17th St South Side Pitts...


In [202]:
# Inner-join features and labels on the composite key, then discard the
# duplicated name/address columns that the merge suffixes with _x / _y.
redundant_cols = ['name_y', 'name_x', 'full_address_x', 'full_address_y']
df = pd.merge(features, target, on='key', how='inner').drop(columns=redundant_cols)

# Class balance after the join (rows without a matching key are gone).
n_rows = len(df)
print("open:", int((df['stay_open'] == 1).sum()) / n_rows)
print("closed:", int((df['stay_open'] == 0).sum()) / n_rows)
df.head()

open: 0.9277450338354071
closed: 0.07225496616459288


Unnamed: 0,review_count,stars,bar,fast_food,mexican,chinese,key,stay_open
0,19,3.5,0,0,0,0,1000 Grammes 1495 Rue Sainte-Catherine East Vi...,1
1,3,5.0,0,1,0,0,108 Chinese Take Away 108 Portobello High Stre...,1
2,4,5.0,0,0,0,0,180g 6546 Rue Waverly Rosemont-La Petite-Patri...,1
3,5,4.0,0,0,0,0,181 Delicatessen 181 Bruntsfield Road Bruntsfi...,1
4,4,5.0,0,0,0,0,1865 Coffee 40 E University Dr Tempe AZ 85281,1


In [203]:
#TODO: try to remove open venues...
# Oversample the minority class by including every closed venue twice.
#
# The cell both reads and overwrites `df`, so it must be safe to re-run:
# rows duplicated by a previous run share their index label with the
# original row, so dropping repeated index labels restores the merged frame
# and the duplication never compounds.
# (Assumes `df` had a unique index before this cell, as the RangeIndex
# produced by pd.merge does.)
base = df[~df.index.duplicated(keep='first')]
closed = base[base['stay_open'] == 0]
stay_open = base[base['stay_open'] == 1]

# NOTE(review): the duplicates are created before cross-validation, so copies
# of the same closed venue can land in both the training and the test part of
# a split; test scores for the closed class may be optimistic. Consider
# oversampling inside each training fold instead.
df = pd.concat([closed, closed, stay_open])

print(len(df))
print("open:", len(df[df['stay_open'] == 1])/len(df))
print("closed:", len(df[df['stay_open'] == 0])/len(df))

9824
open: 0.865228013029316
closed: 0.13477198697068404


In [204]:
# Split the frame into the label vector and the feature matrix. The 'key'
# column is the name/address join string, so it is dropped from the features
# together with the label itself.
label_col = 'stay_open'
y = df[label_col]
X = df.drop(columns=[label_col, 'key'])

 <a class="anchor" id="models"></a>
# Run Models

metrics = ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']

Class balance after the feature/target merge (before closed venues are duplicated):

* open: 0.9277450338354071
* closed: 0.07225496616459288

In [183]:
# Metrics passed to cross_validate for every model below.
# NOTE(review): 'explained_variance' is a regression metric; on class labels
# it is hard to interpret (see the near-zero / negative values in the
# outputs) -- consider dropping it.
scoring = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc',
    'explained_variance',
]

In [None]:
# baseline: random forest trained on the star rating alone.
# random_state is fixed so the numbers are reproducible across re-runs.
scores = apply_model(RandomForestClassifier(random_state=42), X[['stars']], y, scoring)
# Show the per-metric mean over the folds; previously the baseline result was
# computed but never displayed before the next cell overwrote `scores`.
[(name, np.mean(values)) for name, values in scores.items()]

In [205]:
# Random forest on all features. The forest is stochastic, so random_state is
# fixed to make the reported scores reproducible on Restart & Run All.
scores = apply_model(RandomForestClassifier(random_state=42), X, y, scoring)
# Mean of each cross_validate entry (timings and per-metric fold scores).
[(name, np.mean(values)) for name, values in scores.items()]

[('fit_time', 0.057198619842529295),
 ('score_time', 0.02292914390563965),
 ('test_accuracy', 0.856777954348155),
 ('train_accuracy', 0.888911468976114),
 ('test_precision', 0.8838582463790357),
 ('train_precision', 0.8977503511154923),
 ('test_recall', 0.9607058823529411),
 ('train_recall', 0.9836470588235293),
 ('test_f1', 0.9206622400291342),
 ('train_f1', 0.9387348987414594),
 ('test_roc_auc', 0.7824495865331779),
 ('train_roc_auc', 0.864120941090434),
 ('test_explained_variance', -0.17916042110624164),
 ('train_explained_variance', 0.10621367614435977)]

In [206]:
# Gradient boosting on all features. random_state is fixed so that any
# randomness in the estimator does not change the reported scores between runs.
scores = apply_model(GradientBoostingClassifier(random_state=42), X, y, scoring)
# Mean of each cross_validate entry (timings and per-metric fold scores).
[(name, np.mean(values)) for name, values in scores.items()]

[('fit_time', 0.35144758224487305),
 ('score_time', 0.013536787033081055),
 ('test_accuracy', 0.8649229363538609),
 ('train_accuracy', 0.8663703513127097),
 ('test_precision', 0.8655598525882556),
 ('train_precision', 0.8662676761085073),
 ('test_recall', 0.9990588235294118),
 ('train_recall', 0.9999215686274511),
 ('test_f1', 0.9275298981250458),
 ('train_f1', 0.9283084474142397),
 ('test_roc_auc', 0.6506763633683137),
 ('train_roc_auc', 0.7151192987797568),
 ('test_explained_variance', -0.005638514736037903),
 ('train_explained_variance', 0.006860745133175472)]

In [207]:
# Logistic regression on all features, evaluated with the same metric list.
scores = apply_model(LogisticRegression(), X, y, scoring)
# Mean of each cross_validate entry (timings and per-metric fold scores).
[(name, np.mean(values)) for name, values in scores.items()]

[('fit_time', 0.013365864753723145),
 ('score_time', 0.005452680587768555),
 ('test_accuracy', 0.865228228147344),
 ('train_accuracy', 0.8652280156856943),
 ('test_precision', 0.865228228147344),
 ('train_precision', 0.8652280156856943),
 ('test_recall', 1.0),
 ('train_recall', 1.0),
 ('test_f1', 0.9277451001412723),
 ('train_f1', 0.9277450346540974),
 ('test_roc_auc', 0.6069945518877408),
 ('train_roc_auc', 0.6118128887094126),
 ('test_explained_variance', 6.661338147750939e-17),
 ('train_explained_variance', -1.3322676295501878e-16)]

# Scratch

f1 -> 1