Both my features and target are ordinal data. For instance, Michelin stars have an order, but they're categorical data not numeric (there's no reason to expect the difference between 1 star and 2 stars is the same as the difference between 2 and 3 stars). Same goes for data like Zagat ratings and Yelp reviews.
If my feature data (independent variables) were numeric, I might consider ordered logistic regression, or a technique that relies on distance between observations - maybe SVM, KNN or DBSCAN.
Since my features are also ordinal... random forest seems the most logical.

In [42]:
import pandas as pd
import numpy as np
%matplotlib inline

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [44]:
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [45]:
# Reading in the merged data
nyc_zm = pd.read_pickle('../nyc_zm.pkl')
sf_zm = pd.read_pickle('../sf_zm.pkl')
chicago_zm = pd.read_pickle('../chicago_zm.pkl')
dc = pd.read_pickle('../dc.pkl')
dc_prop = pd.read_pickle('../dc_prop_clean.pkl') 

In [46]:
# Sanity check: row/column counts of each city's merged frame.
li = [nyc_zm, sf_zm, chicago_zm, dc]
for i in li:
    # Parenthesised form works as a statement in Python 2 and a call in Python 3.
    print(i.shape)

(2771, 15)
(2029, 15)
(1376, 15)
(1199, 13)


In [47]:
#Dropping michelin restaurants with no Zagat data
# np.isfinite is False for NaN, so rows whose 'food' score is missing are removed.
ntrain = nyc_zm[np.isfinite(nyc_zm['food'])]
strain = sf_zm[np.isfinite(sf_zm['food'])]
ctest = chicago_zm[np.isfinite(chicago_zm['food'])]

li = [ntrain, strain, ctest]
for i in li:
    # Parenthesised form works as a statement in Python 2 and a call in Python 3.
    print(i.shape)

(2767, 15)
(2023, 15)
(1376, 15)


In [48]:
# Function for dummy variable for whether there is an open table link
def ot_dum(X):
    """Return 1 if the row has an OpenTable link, otherwise 0.

    Parameters
    ----------
    X : row-like object exposing an ``open_table`` attribute (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        0 when ``open_table`` is the empty string or missing (None/NaN),
        1 for any other value.
    """
    link = X.open_table
    # A missing value compares unequal to '' and would otherwise be counted
    # as "has a link", so it is handled explicitly first.
    if pd.isnull(link):
        return 0
    return 1 if link != '' else 0

In [49]:
#Imputer to replace missing values with mean of the column

imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)

In [50]:
# set up labelencoder for restaurants - I'm training the encoder on all restaurants

le = LabelEncoder()

In [51]:
# Pool the cuisine labels of every city so the encoder is fit on all categories.
# pd.concat replaces the chained Series.append calls, which recent pandas releases removed.
m = pd.concat([ntrain.cuisine, strain.cuisine, dc.cuisine, ctest.cuisine])

In [52]:
# .values gives the underlying NumPy array; as_matrix() no longer exists in recent pandas.
m = m.values
le.fit(m)

LabelEncoder()

In [823]:
# Function to transform the data for analysis - I want to do this as a function both to tweak it, and to keep the datasets
# separate. If I add normalization that needs to be within cities.
def engineer_data(X_data):
    """Turn one city's restaurant frame into the numeric frame used for modelling.

    Steps: keep the cost, Zagat score and star columns; derive ``ot_dummy``
    with ``ot_dum``; treat a missing ``stars`` value as 0; map ``price_level``
    codes ('I', 'M', 'E', 'VE') to 1-4; encode ``cuisine`` with the
    module-level ``le`` encoder; mean-impute remaining NaNs with the
    module-level ``imp`` imputer, which is re-fit on this frame only.

    Parameters
    ----------
    X_data : DataFrame with columns 'cost', 'cuisine', 'open_table',
        'price_level', 'food', 'decor', 'service' and 'stars'.
        It is not modified.

    Returns
    -------
    DataFrame with columns 'cost', 'food', 'decor', 'service', 'stars',
    'ot_dummy', 'price_lev_map', 'cuisine_code' and a fresh default index
    (the index of ``X_data`` is discarded).
    """
    # Explicit copy: the derived columns are written to our own frame instead
    # of relying on the chained-assignment warning being switched off.
    X2 = X_data[['cost', 'cuisine', 'open_table', 'price_level',
                 'food', 'decor', 'service', 'stars']].copy()
    X2['ot_dummy'] = X2.apply(ot_dum, axis=1)
    # Assign the result back rather than replacing in place on a column
    # selection, which is not guaranteed to reach the parent frame.
    X2['stars'] = X2['stars'].fillna(0)
    X2['price_lev_map'] = X2.price_level.map({'VE': 4, 'E': 3, 'M': 2, 'I': 1})
    X2['cuisine_code'] = le.transform(X2.cuisine)
    X2 = X2.drop(['open_table', 'price_level', 'cuisine'], axis=1)
    # The imputer returns a bare array, so remember the column names first.
    feature_names = list(X2.columns)
    imputed = imp.fit_transform(X2)
    return pd.DataFrame(imputed, columns=feature_names)

In [824]:
ntrain1 = engineer_data(ntrain)
strain1 = engineer_data(strain)
ctest1 = engineer_data(ctest)

In [825]:
ntrain1.head()

Unnamed: 0,cost,food,decor,service,stars,ot_dummy,price_lev_map,cuisine_code
0,176.0,4.9,4.8,4.9,2.0,0.0,4.0,60.0
1,141.0,4.9,4.8,4.8,1.0,1.0,4.0,60.0
2,182.0,4.9,4.8,4.9,3.0,1.0,4.0,119.0
3,341.0,4.8,4.7,4.8,3.0,1.0,4.0,97.0
4,58.0,4.8,4.3,4.7,0.0,0.0,3.0,5.0


In [729]:
# ROUND 1: No normalization on anything
# Stack NYC and SF into one training frame (pd.concat instead of the removed DataFrame.append).
training1 = pd.concat([ntrain1, strain1])

In [751]:
X_train = training1.drop(['stars'], axis=1)
y_train = training1.stars
X_test = ctest1.drop(['stars'], axis=1)
y_test = ctest1.stars

In [752]:
rfg = RandomForestRegressor(100, max_depth=5, min_samples_leaf=3)

In [753]:
rfg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=3,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [754]:
pred = rfg.predict(X_test).round()

In [756]:
# I tried changing max_depth, and adjusting min_samples_leaf, but nothing made a difference
# Pin the label set so the matrix is always 4x4 and lines up with the row/column
# names, even if some star level is absent from both y_test and pred.
result1 = pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)
result1

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,1351,3,1,0
actual1,10,5,1,0
actual2,1,2,0,0
actual3,1,0,0,1


In [518]:
# Price is crazy important
rfg.feature_importances_

array([ 0.84449215,  0.04993122,  0.02429525,  0.03879858,  0.00449126,
        0.00913733,  0.02885421])

Let's try a second round, normalizing price (cost) this time

In [963]:
def engineer_data2(X_data):
    """Build the modelling frame for one city with ``cost`` standardized.

    Same preparation as the unscaled variant (OpenTable dummy via ``ot_dum``,
    missing ``stars`` set to 0, ``price_level`` mapped to 1-4, ``cuisine``
    encoded with the module-level ``le``, NaNs mean-imputed with the
    module-level ``imp`` re-fit on this frame), followed by a
    ``StandardScaler`` fit on this frame's ``cost`` column only, so the
    scaling is relative to the city passed in.

    Parameters
    ----------
    X_data : DataFrame with columns 'cost', 'cuisine', 'open_table',
        'price_level', 'food', 'decor', 'service' and 'stars'.
        It is not modified.

    Returns
    -------
    DataFrame with columns 'cost', 'food', 'decor', 'service', 'stars',
    'ot_dummy', 'price_lev_map', 'cuisine_code' and a fresh default index.
    """
    # Explicit copy so the derived columns are written to our own frame.
    X2 = X_data[['cost', 'cuisine', 'open_table', 'price_level',
                 'food', 'decor', 'service', 'stars']].copy()
    X2['ot_dummy'] = X2.apply(ot_dum, axis=1)
    X2['stars'] = X2['stars'].fillna(0)
    X2['price_lev_map'] = X2.price_level.map({'VE': 4, 'E': 3, 'M': 2, 'I': 1})
    X2['cuisine_code'] = le.transform(X2.cuisine)
    X2 = X2.drop(['open_table', 'price_level', 'cuisine'], axis=1)
    feature_names = list(X2.columns)
    X2 = pd.DataFrame(imp.fit_transform(X2), columns=feature_names)
    # Scalers expect a 2-D (n_samples, n_features) input; pass a one-column
    # frame and flatten the result back into a column.
    X2['cost'] = preprocessing.StandardScaler().fit_transform(X2[['cost']]).ravel()
    return X2

In [964]:
ntrain2 = engineer_data2(ntrain)
strain2 = engineer_data2(strain)
ctest2 = engineer_data2(ctest)

In [965]:
# NYC + SF form the training set; Chicago is held out as the test set.
# pd.concat is used because DataFrame.append is gone from recent pandas.
training2 = pd.concat([ntrain2, strain2])
X_train = training2.drop(['stars'], axis=1)
y_train = training2.stars
X_test = ctest2.drop(['stars'], axis=1)
y_test = ctest2.stars

In [977]:
# Normalizing price within these datasets increased false positives a ton, until I set max_depth
# Normalizing price looks helpful

rfg2 = RandomForestRegressor(100, max_depth=5, min_samples_leaf=3)
rfg2.fit(X_train, y_train)
# Round the regression output to the nearest whole star count.
pred = rfg2.predict(X_test).round()
# labels=... keeps the matrix 4x4 even when a star level never appears.
best = pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)
best

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,743,7,2,0
actual1,6,9,1,0
actual2,1,1,1,0
actual3,0,0,0,1


In [967]:
# Even normalized, price is crazy-important -- actually more important
rfg2.feature_importances_

array([ 0.88020743,  0.0238489 ,  0.02371696,  0.02389785,  0.00452905,
        0.00514243,  0.03865738])

In [968]:
#Try a classifier just on stars or no -- good roc_auc, would need to move the probability cutoff to avoid all the false
#positives
# Targets come from training2/ctest2, the frames X_train/X_test were built from
# just above; trainingc/ctestc are only created further down in this file.
y_train = (training2.stars > 0).astype(int)
y_test = (ctest2.stars > 0).astype(int)

rfc = RandomForestClassifier(n_estimators = 100,class_weight = 'balanced', min_samples_leaf = 3, max_depth=5)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
print(pd.DataFrame(confusion_matrix(y_test, pred, labels=[0, 1]), columns = ['pred0', 'pred1'], index = ['actual0', 'actual1']))

# Second probability column is the "has at least one star" class.
proba = rfc.predict_proba(X_test)
proba = pd.DataFrame(proba, columns=['zero', 'one'])
roc_auc_score(y_test, proba.one)

         pred0  pred1
actual0   1287     68
actual1      4     17


0.94751361799332279

What if instead I adjust cost for relative restaurant meal costs across cities? 
https://www.numbeo.com/cost-of-living/
NYC: 80.00
Chicago: 66.50
San Francisco: 75.00
Washington, DC: 72.50

In [860]:
ntrainc = ntrain1.copy(deep=True)
ntrainc['cost'] = ntrainc['cost']/80.00

In [861]:
strainc = strain1.copy(deep=True)
strainc['cost'] = strainc['cost']/75.00

In [862]:
ctestc = ctest1.copy(deep=True)
ctestc['cost'] = ctestc['cost']/66.50

In [863]:
# Cost-of-living-adjusted NYC + SF for training, adjusted Chicago for testing.
# pd.concat is used because DataFrame.append is gone from recent pandas.
trainingc = pd.concat([ntrainc, strainc])
X_train = trainingc.drop(['stars'], axis=1)
y_train = trainingc.stars
X_test = ctestc.drop(['stars'], axis=1)
y_test = ctestc.stars

In [864]:
#This doesn't seem better than normalizing price

rfgc = RandomForestRegressor(100,max_depth=8, min_samples_leaf=3)
rfgc.fit(X_train, y_train)
# Round the regression output to the nearest whole star count.
pred = rfgc.predict(X_test).round()
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,1346,7,2,0
actual1,8,7,1,0
actual2,1,1,1,0
actual3,1,0,0,1


In [865]:
#Trying with classifier... not really better

rfc = RandomForestClassifier(n_estimators = 100,class_weight = 'balanced', min_samples_leaf = 3)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,1343,10,0,2
actual1,6,8,1,1
actual2,0,2,1,0
actual3,1,0,0,1


In [866]:
# I do really like that importance of cost goes down here
rfc.feature_importances_

array([ 0.33985302,  0.12110115,  0.12034092,  0.18248442,  0.03619662,
        0.11638105,  0.08364282])

In [867]:
# Trying to just predict stars (not number of stars) so that I can produce ROC_AUC
# Collapse 1/2/3 stars into a single positive class.
y_train = trainingc.stars.map({0:0, 1:1, 2:1, 3:1})
y_test = ctestc.stars.map({0:0, 1:1, 2:1, 3:1})

rfc = RandomForestClassifier(n_estimators = 100,class_weight = 'balanced', min_samples_leaf = 3)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
pd.DataFrame(confusion_matrix(y_test, pred, labels=[0, 1]), columns = ['pred0', 'pred1'], index = ['actual0', 'actual1'])

# Second probability column is the "has at least one star" class.
proba = rfc.predict_proba(X_test)
proba = pd.DataFrame(proba, columns=['zero', 'one'])
roc_auc_score(y_test, proba.one)

0.94269899841855542

In [868]:
rfc = RandomForestClassifier(n_estimators = 100,class_weight = 'balanced', min_samples_leaf = 3)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# labels=[0, 1] keeps the matrix 2x2 even if one class is never seen or predicted.
pd.DataFrame(confusion_matrix(y_test, pred, labels=[0, 1]), columns = ['pred0', 'pred1'], index = ['actual0', 'actual1'])

Unnamed: 0,pred0,pred1
actual0,1342,13
actual1,6,15


In [969]:
# What about keeping normalized cost and normalizing the Zagat ratings? 
# I have to use min_max because normal distribution doesn't make sense with a cutoff
# of 5

def engineer_data4(X_data):
    """Build the modelling frame with standardized cost and min-max scaled Zagat scores.

    Preparation: OpenTable dummy via ``ot_dum``, missing ``stars`` set to 0,
    ``price_level`` mapped to 1-4, ``cuisine`` encoded with the module-level
    ``le``, NaNs mean-imputed with the module-level ``imp`` (re-fit on this
    frame). Afterwards 'food', 'service' and 'decor' are each rescaled with
    their own ``MinMaxScaler`` and 'cost' with a ``StandardScaler``; every
    scaler is fit on this frame only.

    Parameters
    ----------
    X_data : DataFrame with columns 'cost', 'cuisine', 'open_table',
        'price_level', 'food', 'decor', 'service' and 'stars'.
        It is not modified.

    Returns
    -------
    DataFrame with columns 'cost', 'food', 'decor', 'service', 'stars',
    'ot_dummy', 'price_lev_map', 'cuisine_code' and a fresh default index.
    """
    # Explicit copy so the derived columns are written to our own frame.
    X2 = X_data[['cost', 'cuisine', 'open_table', 'price_level',
                 'food', 'decor', 'service', 'stars']].copy()
    X2['ot_dummy'] = X2.apply(ot_dum, axis=1)
    X2['stars'] = X2['stars'].fillna(0)
    X2['price_lev_map'] = X2.price_level.map({'VE': 4, 'E': 3, 'M': 2, 'I': 1})
    X2['cuisine_code'] = le.transform(X2.cuisine)
    X2 = X2.drop(['open_table', 'price_level', 'cuisine'], axis=1)
    feature_names = list(X2.columns)
    X2 = pd.DataFrame(imp.fit_transform(X2), columns=feature_names)
    # Scalers expect 2-D (n_samples, n_features) input, hence the one-column
    # frames; each score column gets its own independently fit scaler.
    for score in ('food', 'service', 'decor'):
        X2[score] = preprocessing.MinMaxScaler().fit_transform(X2[[score]]).ravel()
    X2['cost'] = preprocessing.StandardScaler().fit_transform(X2[['cost']]).ravel()
    return X2

In [970]:
# Each city is engineered (and therefore scaled) on its own.
ntrain4 = engineer_data4(ntrain)
strain4 = engineer_data4(strain)
ctest4 = engineer_data4(ctest)

# pd.concat is used because DataFrame.append is gone from recent pandas.
training4 = pd.concat([ntrain4, strain4])
X_train = training4.drop(['stars'], axis=1)
y_train = training4.stars
X_test = ctest4.drop(['stars'], axis=1)
y_test = ctest4.stars

In [971]:
ntrain4.head()

Unnamed: 0,cost,food,decor,service,stars,ot_dummy,price_lev_map,cuisine_code
0,5.02143,1.0,1.0,1.0,2.0,0.0,4.0,60.0
1,3.684074,1.0,1.0,0.979592,1.0,1.0,4.0,60.0
2,5.250691,1.0,1.0,1.0,3.0,1.0,4.0,119.0
3,11.326105,0.979592,0.979167,0.979592,3.0,1.0,4.0,97.0
4,0.512632,0.979592,0.895833,0.959184,0.0,0.0,3.0,5.0


In [972]:
#Played around with parameters, but there's no real benefit from the scaling

rfg4 = RandomForestRegressor(max_depth=20, min_samples_leaf=10)
rfg4.fit(X_train, y_train)
# Round the regression output to the nearest whole star count.
pred = rfg4.predict(X_test).round()
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,1343,11,1,0
actual1,7,8,1,0
actual2,1,1,1,0
actual3,1,0,1,0


In [24]:
#What if I omit restaurants with a food score of 4.2 or less? Note: cost is normalized before the cutoff is applied
def engineer_data5(X_data, min_food=4.2):
    """Build the modelling frame with standardized cost, keeping only high food scores.

    Preparation: OpenTable dummy via ``ot_dum``, missing ``stars`` set to 0,
    ``price_level`` mapped to 1-4, ``cuisine`` encoded with the module-level
    ``le``, NaNs mean-imputed with the module-level ``imp`` (re-fit on this
    frame), then ``cost`` standardized with a ``StandardScaler`` fit on this
    frame. The food-score filter is applied last, so imputation and scaling
    statistics are computed on all rows of ``X_data``.

    Parameters
    ----------
    X_data : DataFrame with columns 'cost', 'cuisine', 'open_table',
        'price_level', 'food', 'decor', 'service' and 'stars'.
        It is not modified.
    min_food : rows are kept only when ``food`` is strictly greater than
        this value (default 4.2).

    Returns
    -------
    DataFrame with columns 'cost', 'food', 'decor', 'service', 'stars',
    'ot_dummy', 'price_lev_map', 'cuisine_code'. Its index labels are the
    row positions of the surviving rows within ``X_data``.
    """
    # Explicit copy so the derived columns are written to our own frame.
    X2 = X_data[['cost', 'cuisine', 'open_table', 'price_level',
                 'food', 'decor', 'service', 'stars']].copy()
    X2['ot_dummy'] = X2.apply(ot_dum, axis=1)
    X2['stars'] = X2['stars'].fillna(0)
    X2['price_lev_map'] = X2.price_level.map({'VE': 4, 'E': 3, 'M': 2, 'I': 1})
    X2['cuisine_code'] = le.transform(X2.cuisine)
    X2 = X2.drop(['open_table', 'price_level', 'cuisine'], axis=1)
    feature_names = list(X2.columns)
    X2 = pd.DataFrame(imp.fit_transform(X2), columns=feature_names)
    # Scalers expect 2-D (n_samples, n_features) input.
    X2['cost'] = preprocessing.StandardScaler().fit_transform(X2[['cost']]).ravel()
    X3 = X2[X2.food > min_food]
    return X3

In [1000]:
# Each city is engineered (scaled and filtered) on its own.
ntrain5 = engineer_data5(ntrain)
strain5 = engineer_data5(strain)
ctest5 = engineer_data5(ctest)

# pd.concat is used because DataFrame.append is gone from recent pandas.
training = pd.concat([ntrain5, strain5])
X_train = training.drop(['stars'], axis=1)
y_train = training.stars
X_test = ctest5.drop(['stars'], axis=1)
y_test = ctest5.stars

In [975]:
ntrain5.head()

Unnamed: 0,cost,food,decor,service,stars,ot_dummy,price_lev_map,cuisine_code
0,5.02143,4.9,4.8,4.9,2.0,0.0,4.0,60.0
1,3.684074,4.9,4.8,4.8,1.0,1.0,4.0,60.0
2,5.250691,4.9,4.8,4.9,3.0,1.0,4.0,119.0
3,11.326105,4.8,4.7,4.8,3.0,1.0,4.0,97.0
4,0.512632,4.8,4.3,4.7,0.0,0.0,3.0,5.0


In [1001]:
#Looks good - but only very slightly better due to cutting off data at food scores of 4.2
rfg = RandomForestRegressor(100, max_depth=8)
rfg.fit(X_train, y_train)
# Round the regression output to the nearest whole star count.
pred = (rfg.predict(X_test)).round()
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3'],
    index=['0 stars', '1 star', '2 star', '3 star'],
)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
0 stars,743,7,2,0
1 star,5,9,2,0
2 star,1,1,1,0
3 star,0,0,0,1


In [992]:
rfg.feature_importances_

array([ 0.78672429,  0.03754881,  0.04733046,  0.04078128,  0.01284803,
        0.01114307,  0.06362405])

In [1014]:
# Pair each feature name with its importance; list(...) materialises the pairs
# so this does not depend on zip returning a list.
featimp = pd.DataFrame(list(zip(X_test.columns, rfg.feature_importances_)), columns=['Feature', 'Importance'])

In [1023]:
featimp

Unnamed: 0,Feature,Importance
5,price_lev_map,0.008971
4,ot_dummy,0.011915
1,food,0.03333
3,service,0.045291
2,decor,0.050029
6,cuisine_code,0.068189
0,cost,0.782274


In [919]:
# Try random forest classifier - not looking great
rfc = RandomForestClassifier(n_estimators = 100,class_weight = 'balanced')
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,744,7,0,1
actual1,11,4,1,0
actual2,1,1,1,0
actual3,0,0,0,1


In [893]:
#Try adaboost - actually, seems too conservative
from sklearn.ensemble import AdaBoostClassifier

In [894]:
adb = AdaBoostClassifier(rfc)

In [895]:
adb.fit(X_train, y_train)
pred = adb.predict(X_test)
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,745,6,0,1
actual1,9,3,4,0
actual2,0,2,1,0
actual3,0,0,0,1


In [559]:
from sklearn.ensemble import AdaBoostRegressor

In [896]:
adr = AdaBoostRegressor(rfg)

In [897]:
#...and this isn't conservative enough
adr.fit(X_train, y_train)
# Round the regression output to the nearest whole star count.
pred = adr.predict(X_test).round()
# labels=... keeps the matrix 4x4 even when a star level never appears.
pd.DataFrame(
    confusion_matrix(y_test, pred, labels=[0, 1, 2, 3]),
    columns=['pred0', 'pred1', 'pred2', 'pred3'],
    index=['actual0', 'actual1', 'actual2', 'actual3'],
)

Unnamed: 0,pred0,pred1,pred2,pred3
actual0,730,19,3,0
actual1,4,8,4,0
actual2,0,2,1,0
actual3,0,0,0,1


In [924]:
def engineer_data_dc5(X_data, min_food=4.2):
    """Build the feature frame for unlabeled data (no ``stars`` column).

    Preparation: OpenTable dummy via ``ot_dum``, ``price_level`` mapped to
    1-4, ``cuisine`` encoded with the module-level ``le``, NaNs mean-imputed
    with the module-level ``imp`` (re-fit on this frame), then ``cost``
    standardized with a ``StandardScaler`` fit on this frame. The food-score
    filter is applied last, after imputation and scaling.

    Parameters
    ----------
    X_data : DataFrame with columns 'cost', 'cuisine', 'open_table',
        'price_level', 'food', 'decor' and 'service'. It is not modified.
    min_food : rows are kept only when ``food`` is strictly greater than
        this value (default 4.2).

    Returns
    -------
    DataFrame with columns 'cost', 'food', 'decor', 'service', 'ot_dummy',
    'price_lev_map', 'cuisine_code'. Its index labels are the row positions
    of the surviving rows within ``X_data``.
    """
    # Explicit copy so the derived columns are written to our own frame.
    X2 = X_data[['cost', 'cuisine', 'open_table', 'price_level',
                 'food', 'decor', 'service']].copy()
    X2['ot_dummy'] = X2.apply(ot_dum, axis=1)
    X2['price_lev_map'] = X2.price_level.map({'VE': 4, 'E': 3, 'M': 2, 'I': 1})
    X2['cuisine_code'] = le.transform(X2.cuisine)
    X2 = X2.drop(['open_table', 'price_level', 'cuisine'], axis=1)
    feature_names = list(X2.columns)
    X2 = pd.DataFrame(imp.fit_transform(X2), columns=feature_names)
    # Scalers expect 2-D (n_samples, n_features) input.
    X2['cost'] = preprocessing.StandardScaler().fit_transform(X2[['cost']]).ravel()
    X3 = X2[X2.food > min_food]
    return X3

In [951]:
dctest = engineer_data_dc5(dc_prop)

In [952]:
# Keep only rows above the food cutoff; take an explicit copy because a
# prediction column is added to this frame afterwards.
dc_prop = dc_prop[dc_prop.food>4.2].copy()

In [993]:
dc_prop['pred_stars_rf'] = (rfg.predict(dctest)).round()

In [994]:
dc_prop[dc_prop.pred_stars_rf>0]

Unnamed: 0,addr_city,cost,cuisine,latitude,longitude,neighborhood,open_table,price_level,title,url,food,decor,service,pred_stars_rf
1,Washington,89,Italian,38.8939056,-77.0208435,Penn Quarter/Chinatown,,E,fiola,https://www.zagat.com/r/fiola-washington,4.8,4.7,4.7,1.0
3,Washington,89,,38.9050827,-77.0242615,Mount Vernon Square,http://www.opentable.com/restaurant/profile/10...,E,corduroy,https://www.zagat.com/r/corduroy-washington,4.8,4.5,4.6,1.0
5,Washington,325,Eclectic,38.8963737,-77.0235977,Penn Quarter/Chinatown,,VE,minibar by jose andres,https://www.zagat.com/r/minibar-by-jose-andres...,4.7,4.7,4.7,3.0
10,Washington,103,Belgian,38.9034767,-77.0521851,West End,,VE,marcel's by robert wiedmaier,https://www.zagat.com/r/marcels-washington,4.7,4.6,4.6,1.0
14,Washington,180,,38.713604,-78.1595459,,http://www.opentable.com/restaurant/profile/19...,VE,inn at little washington,https://www.zagat.com/r/the-inn-at-little-wash...,4.7,4.8,4.7,2.0
16,Washington,109,,38.9059944,-77.036972,White House/World Bank,http://www.opentable.com/restaurant/profile/30...,VE,plume,https://www.zagat.com/r/plume-washington,4.6,4.8,4.9,1.0
35,Washington,176,Greek,38.9100838,-77.0382767,Dupont Circle,,VE,komi,https://www.zagat.com/r/komi-washington,4.6,4.2,4.6,2.0
36,Washington,97,Seafood,38.9016228,-77.060997,Georgetown,,E,fiola mare,https://www.zagat.com/r/fiola-mare-washington,4.6,4.7,4.6,1.0
39,Washington,90,Japanese,38.9099731,-77.0381851,Dupont Circle,http://www.opentable.com/restaurant/profile/32...,E,sushi taro,https://www.zagat.com/r/sushi-taro-washington,4.6,4.2,4.4,1.0
44,Washington,80,Organic,38.9127617,-77.0472641,Dupont Circle,http://www.opentable.com/restaurant/profile/66...,E,nora,https://www.zagat.com/r/restaurant-nora-washin...,4.5,4.3,4.4,1.0


In [995]:
predictions = dc_prop[dc_prop.pred_stars_rf>0]

In [996]:
#Removing Inn at Little Washington which somehow is coded as Washington in Zagat
# Filter by title instead of dropping row label 14: the forest is not seeded, so
# that row is not guaranteed to be among the predictions, and drop() would raise.
predictions = predictions[predictions.title != 'inn at little washington']

In [997]:
predictions = predictions.reset_index()
del predictions['index']
predictions = predictions[['title', 'pred_stars_rf']]
predictions.columns = ['restaurant', 'stars']
predictions

Unnamed: 0,restaurant,stars
0,fiola,1.0
1,corduroy,1.0
2,minibar by jose andres,3.0
3,marcel's by robert wiedmaier,1.0
4,plume,1.0
5,komi,2.0
6,fiola mare,1.0
7,sushi taro,1.0
8,nora,1.0
9,obelisk,1.0


In [998]:
predictions.to_csv('C:/Users/elynchklarup/Desktop/dc-michelin-challenge/submissions/erinlynch-klarup/submission_lynchkl.csv')

In this section, just trying to create a regression tree visualization to explain my model...

In [53]:
# Rebuild the filtered, cost-standardized frames used for the final model.
ntrain5 = engineer_data5(ntrain)
strain5 = engineer_data5(strain)
ctest5 = engineer_data5(ctest)

# pd.concat is used because DataFrame.append is gone from recent pandas.
training = pd.concat([ntrain5, strain5])
X_train = training.drop(['stars'], axis=1)
y_train = training.stars
X_test = ctest5.drop(['stars'], axis=1)
y_test = ctest5.stars

In [54]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

In [55]:
r = DecisionTreeRegressor(max_depth=3, min_samples_leaf = 3)
r = r.fit(X_train, y_train)


In [56]:
export_graphviz(r, out_file='tree2.dot')

In [57]:
r.feature_importances_

array([ 0.95502306,  0.        ,  0.02535983,  0.        ,  0.00394718,
        0.        ,  0.01566993])

In [59]:
training.columns

Index([u'cost', u'food', u'decor', u'service', u'stars', u'ot_dummy',
       u'price_lev_map', u'cuisine_code'],
      dtype='object')

In [60]:
# Pair each feature name with its importance; list(...) materialises the pairs
# so this does not depend on zip returning a list.
pd.DataFrame(list(zip(X_train.columns, r.feature_importances_)))

Unnamed: 0,0,1
0,cost,0.955023
1,food,0.0
2,decor,0.02536
3,service,0.0
4,ot_dummy,0.003947
5,price_lev_map,0.0
6,cuisine_code,0.01567


In [63]:
#Finding the restaurants in one leaf node -- for demo purposes in presentation

training[(training.cost>8.44) & (training.ot_dummy==1)]

Unnamed: 0,cost,food,decor,service,stars,ot_dummy,price_lev_map,cuisine_code
3,11.326105,4.8,4.7,4.8,3.0,1.0,4.0,97.0
6,10.714743,4.8,4.8,4.8,3.0,1.0,4.0,0.0
1,11.142142,4.9,4.7,4.8,3.0,1.0,4.0,60.0
9,9.817739,4.8,4.6,4.8,3.0,1.0,4.0,97.0
31,8.689543,4.7,4.7,4.8,3.0,1.0,4.0,3.0
127,8.738595,4.6,4.8,4.7,3.0,1.0,4.0,0.0


In [74]:
# The row labels shown by `training` are positions within ntrain/strain
# (engineer_data5 rebuilds the index from 0), so select by position rather
# than by the original frames' own labels.
nyc_test = ntrain.iloc[[3, 6]]
sf_test = strain.iloc[[1, 9, 31, 127]]

In [75]:
# Stack the NYC and SF example rows (pd.concat instead of the removed DataFrame.append).
example = pd.concat([nyc_test, sf_test])

In [80]:
example[['title', 'cost', 'food', 'decor','service', 'cuisine', 'open_table', 'price_level','cuisine', 'stars']]

Unnamed: 0,title,cost,food,decor,service,cuisine,open_table,price_level,cuisine.1,stars
3,per se,341,4.8,4.7,4.8,New American,http://www.opentable.com/restaurant/profile/27...,VE,New American,3.0
6,eleven madison park,325,4.8,4.8,4.8,,http://www.opentable.com/restaurant/profile/21...,VE,,3.0
1,french laundry,263,4.9,4.7,4.8,French,http://www.opentable.com/restaurant/profile/11...,VE,French,3.0
9,benu,236,4.8,4.6,4.8,New American,http://www.opentable.com/restaurant/profile/45...,VE,New American,3.0
31,manresa,213,4.7,4.7,4.8,American,http://www.opentable.com/restaurant/profile/20...,VE,American,3.0
127,at meadowood,214,4.6,4.8,4.7,,http://www.opentable.com/restaurant/profile/50...,VE,,3.0
