In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import json
from urlparse import urljoin
import urllib
import urllib2
import re

In [7]:
#Functions to retrieve restaurant name, address and stars from the Michelin site
def michelin_restaurant_name (soup):
    name = []
    for row in soup.findAll('div', {'class':'poi-item-name truncate'}):
        try:
            name.append(row.getText().strip('\n'))
            
        except:
            name.append("NA")
    return name

def address (soup):
    address = []
    for row in soup.findAll('div', {'class':'poi-item-address truncate'}):
        try:
            address.append(row.getText().strip('\n').strip('\t').strip('\n'))
            
        except:
            address.append("NA")
    return address

def stars (soup):
    star = []
    for row in soup.findAll(lambda tag: tag.name == 'span' and tag.get('class') == ['poi-item-stars']):
        count = 0
        try:
            for cnt in row.findAll('span', {'class':'star'}):
                count = count+1
            star.append(count)
        except:
            star.append(count)
    return star

In [8]:
#Iterates through the Michelin pages filtered on United States
results = pd.DataFrame()

for page in range(1,72):
    url = 'https://www.viamichelin.com/web/Restaurants/Restaurants-United_States?page='+str(page)
    r = requests.get(url)
    soup = BeautifulSoup(r.text,"html.parser")
    name_df = pd.DataFrame(michelin_restaurant_name(soup), columns = ['Restaurant_Name'])
    address_df= pd.DataFrame(address(soup), columns = ['Address'])
    star_df = pd.DataFrame(stars(soup), columns = ['Stars'])
    aggregate = pd.concat([name_df, address_df, star_df], axis = 1)
    results = results.append(aggregate)

In [9]:
#No nulls in the data
results.isnull().sum()

Restaurant_Name    0
Address            0
Stars              0
dtype: int64

In [11]:
#Datatypes are appropriate
results.dtypes

Restaurant_Name     object
Address             object
Stars              float64
dtype: object

In [14]:
results.shape

(1586, 3)

In [13]:
#Write file to csv for later use
results.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\Michelin_Restaurants.csv', sep='\t', encoding='utf-8')

In [53]:
#Gayot
location = 'NY'
page = 0
gayot_url = 'http://www.gayot.com/restaurants/searchresult.php?search=&rating=13&code=NY&start=50&jump='
r = requests.get(gayot_url)
soup = BeautifulSoup(r.text,"html.parser")


In [94]:
#Functions to pull restaurant name, rating and address from Gayot.com
def gayot_restaurant_name(soup):
    name = []
    for row in soup.findAll('a', {'class':'hoveru'}):
        try:
            name.append(row.getText())
            
        except:
            name.append("NA")
    return name

def gayot_rating(soup):
    rating = []
    
    for row in soup.findAll('td', {'width': '100'}):
        try:
            if row.find('strong').getText() is not None:
                #Orginal format was '13/20'. Pulled out score and convert to float
                rating.append(float(row.find('strong').getText()[0:2]))
            
        except:
            pass
    return rating

def gayot_address(soup):
    address = []
    for row in soup.findAll('td', {'width': '278'}):
        try:
            #Pull address and remove extra tags/characters
            address.append(row.getText()[2:len(row.getText())-11])
            
        except:
            pass
    return address

In [152]:
#Loop to pull Gayot data for different cities. Restaurants of rating 12 and up were pulled
#Cities DC, NY, SF and CH had to be run individually as errors occured for me while doing them together
results = pd.DataFrame()
for city in ['NY']: #Come back and do SF,CH, and DC
    for rating in range(12,21): #Start with rating 12 as 10 and 11 are too low for Michelin
        for page in range(0,760,10): #The max number of listings per rating per city is 762 (NY with rating 13)
            gayot_url = 'http://www.gayot.com/restaurants/searchresult.php?search=&rating='+str(rating)+'&code='+str(city)+'&start='+str(page)+'&jump='
            r = requests.get(gayot_url)
            soup = BeautifulSoup(r.text,"html.parser")
            name_df = pd.DataFrame(gayot_restaurant_name(soup), columns = ['Restaurant_Name'])
            address_df= pd.DataFrame(gayot_address(soup), columns = ['Address'])
            rating_df = pd.DataFrame(gayot_rating(soup), columns = ['Rating'])
            aggregate = pd.concat([name_df, address_df, rating_df], axis = 1)
            results = results.append(aggregate)

In [154]:
#Drop duplicates for each city
results.drop_duplicates(inplace=True)

In [157]:
#Write Gayot data to csv. Repeat for each city
results.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\NY_Gayot_Restaurants.csv', sep='\t', encoding='utf-8')

In [4]:
#Pull Zagat data. Zagat url had to be found by watching the Network and what url was being called to pull the data
#Run loop for each city
import re
import sys
import time
import json
import urllib
import sqlite3

sf = pd.DataFrame()
for page in range(1, 50):
    zagat_url= "https://www.zagat.com/proxy/v1.4?vertical=46&orderby=score_food&sort=desc&page="+str(page)+"&city=1021&query=&key=abbc09b7c840c10937a4db331422c98b&mobile_only_content=false&limit=15&m=filter&a=place"
    html = urllib.urlopen(zagat_url).read()
    soup = BeautifulSoup(html,"html.parser")
    city=json.loads(str(soup))
    cityadd = pd.DataFrame(city["data"])
    food = pd.DataFrame()
    decor = pd.DataFrame()
    service = pd.DataFrame()
    
    #Ratings are in a dictionary within the JSON file
    for row in range (0,len(city["data"])):
        food = food.append(pd.Series(city['data'][row]['score']['score5_food']),ignore_index=True)
        decor = decor.append(pd.Series(city['data'][row]['score']['score5_decor']),ignore_index=True)
        service = service.append(pd.Series(city['data'][row]['score']['score5_service']),ignore_index=True)
    cityadd['food_rating'] = food
    cityadd['decor_rating'] = decor
    cityadd['service_rating'] = service
    sf = sf.append(cityadd)

sf = sf.reset_index(drop = True)

ny = pd.DataFrame()
for page in range(1, 50):
    zagat_url= "https://www.zagat.com/proxy/v1.4?vertical=46&orderby=score_food&sort=desc&page="+str(page)+"&city=1020&query=&key=abbc09b7c840c10937a4db331422c98b&mobile_only_content=false&limit=15&m=filter&a=place"
    html = urllib.urlopen(zagat_url).read()
    soup = BeautifulSoup(html,"html.parser")
    city=json.loads(str(soup))
    cityadd = pd.DataFrame(city["data"])
    food = pd.DataFrame()
    decor = pd.DataFrame()
    service = pd.DataFrame()
    for row in range (0,len(city["data"])):
        food = food.append(pd.Series(city['data'][row]['score']['score5_food']),ignore_index=True)
        decor = decor.append(pd.Series(city['data'][row]['score']['score5_decor']),ignore_index=True)
        service = service.append(pd.Series(city['data'][row]['score']['score5_service']),ignore_index=True)
    cityadd['food_rating'] = food
    cityadd['decor_rating'] = decor
    cityadd['service_rating'] = service
    ny = ny.append(cityadd)

ny = ny.reset_index(drop = True)

#Get fewer cities for CH and DC as they are smaller cities. Don't want too much class imbalance
ch = pd.DataFrame()
for page in range(1, 25):
    zagat_url= "https://www.zagat.com/proxy/v1.4?vertical=46&orderby=score_food&sort=desc&page="+str(page)+"&city=1013&query=&key=abbc09b7c840c10937a4db331422c98b&mobile_only_content=false&limit=15&m=filter&a=place"
    html = urllib.urlopen(zagat_url).read()
    soup = BeautifulSoup(html,"html.parser")
    city=json.loads(str(soup))
    cityadd = pd.DataFrame(city["data"])
    food = pd.DataFrame()
    decor = pd.DataFrame()
    service = pd.DataFrame()
    for row in range (0,len(city["data"])):
        food = food.append(pd.Series(city['data'][row]['score']['score5_food']),ignore_index=True)
        decor = decor.append(pd.Series(city['data'][row]['score']['score5_decor']),ignore_index=True)
        service = service.append(pd.Series(city['data'][row]['score']['score5_service']),ignore_index=True)
    cityadd['food_rating'] = food
    cityadd['decor_rating'] = decor
    cityadd['service_rating'] = service
    ch = ch.append(cityadd)

ch = ch.reset_index(drop = True)

dc = pd.DataFrame()
for page in range(1, 25):
    #zagat_url= "https://www.zagat.com/proxy/v1.4?addr_city=Washington&score5_food=4.0&vertical=46&orderby=score_food&sort=desc&page="+str(page)+"&city=1024&query=&key=abbc09b7c840c10937a4db331422c98b&mobile_only_content=false&limit=15&m=filter&a=place"
    zagat_url= "https://www.zagat.com/proxy/v1.4?vertical=46&orderby=score_food&sort=desc&page="+str(page)+"&city=1024&query=&key=abbc09b7c840c10937a4db331422c98b&mobile_only_content=false&limit=15&m=filter&a=place"
    html = urllib.urlopen(zagat_url).read()
    soup = BeautifulSoup(html,"html.parser")
    city=json.loads(str(soup))
    cityadd = pd.DataFrame(city["data"])
    food = pd.DataFrame()
    decor = pd.DataFrame()
    service = pd.DataFrame()
    for row in range (0,len(city["data"])):
        food = food.append(pd.Series(city['data'][row]['score']['score5_food']),ignore_index=True)
        decor = decor.append(pd.Series(city['data'][row]['score']['score5_decor']),ignore_index=True)
        service = service.append(pd.Series(city['data'][row]['score']['score5_service']),ignore_index=True)
    cityadd['food_rating'] = food
    cityadd['decor_rating'] = decor
    cityadd['service_rating'] = service
    dc = dc.append(cityadd)

dc = dc.reset_index(drop = True)

In [None]:
#Pull the relevant columns from the Zagat data
columns = ['title', 'cuisine', 'cost', 'price_level', 'food_rating', 'decor_rating', 'service_rating']
sf = sf[columns]
ny = ny[columns]
ch = ch[columns]
dc = dc[columns]

In [None]:
#Write the Zagat data to csv for future use
ny.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\NY_Zagat_Restaurants.csv', sep='\t', encoding='utf-8')
sf.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\SF_Zagat_Restaurants.csv', sep='\t', encoding='utf-8')
ch.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\CH_Zagat_Restaurants.csv', sep='\t', encoding='utf-8')
dc.to_csv(r'C:\Users\An\Desktop\DSI-DC-2\Portfolio\Michelin\DC_Zagat_Restaurants.csv', sep='\t', encoding='utf-8')

## Read, standardize and join csv data

In [162]:
#Read Gayot, Zagat and Michelin data
ny_gayot = pd.read_csv('NY_Gayot_Restaurants.csv',sep='\t', encoding='utf-8')
sf_gayot = pd.read_csv('SF_Gayot_Restaurants.csv',sep='\t', encoding='utf-8')
ch_gayot = pd.read_csv('CH_Gayot_Restaurants.csv',sep='\t', encoding='utf-8')
dc_gayot = pd.read_csv('DC_Gayot_Restaurants.csv',sep='\t', encoding='utf-8')
ny_zagat = pd.read_csv('NY_Zagat_Restaurants.csv',sep='\t', encoding='utf-8')
sf_zagat = pd.read_csv('SF_Zagat_Restaurants.csv',sep='\t', encoding='utf-8')
ch_zagat = pd.read_csv('CH_Zagat_Restaurants.csv',sep='\t', encoding='utf-8')
dc_zagat = pd.read_csv('DC_Zagat_Restaurants.csv',sep='\t', encoding='utf-8')
michelin = pd.read_csv('Michelin_Restaurants.csv',sep='\t', encoding='utf-8')

#Ensure the restaurant names are formatted consistently in order to join 
ny_zagat.rename(columns = {'title':'Restaurant_Name'}, inplace = True)
ch_zagat.rename(columns = {'title':'Restaurant_Name'}, inplace = True)
sf_zagat.rename(columns = {'title':'Restaurant_Name'}, inplace = True)
dc_zagat.rename(columns = {'title':'Restaurant_Name'}, inplace = True)
ny_gayot['Restaurant_Name'] = ny_gayot['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
ch_gayot['Restaurant_Name'] = ch_gayot['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
sf_gayot['Restaurant_Name'] = sf_gayot['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
dc_gayot['Restaurant_Name'] = dc_gayot['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
ny_zagat['Restaurant_Name'] = ny_zagat['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
ch_zagat['Restaurant_Name'] = ch_zagat['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
dc_zagat['Restaurant_Name'] = dc_zagat['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
sf_zagat['Restaurant_Name'] = sf_zagat['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())
michelin['Restaurant_Name'] = michelin['Restaurant_Name'].map(lambda x: x.lower().lstrip().rstrip())

#Drop chain restaurants
ny_gayot.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
ch_gayot.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
sf_gayot.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
dc_gayot.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
ny_zagat.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
ch_zagat.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
sf_zagat.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)
dc_zagat.drop_duplicates(['Restaurant_Name'],keep=False, inplace = True)

ny_gayot.reset_index(inplace=True, drop = True)
ch_gayot.reset_index(inplace=True, drop = True)
sf_gayot.reset_index(inplace=True, drop = True)
dc_gayot.reset_index(inplace=True, drop = True)
ny_zagat.reset_index(inplace=True, drop = True)
ch_zagat.reset_index(inplace=True, drop = True)
sf_zagat.reset_index(inplace=True, drop = True)
dc_zagat.reset_index(inplace=True, drop = True)
michelin.reset_index(inplace=True, drop = True)

del ny_gayot['Unnamed: 0']
del sf_gayot['Unnamed: 0']
del ch_gayot['Unnamed: 0']
del dc_gayot['Unnamed: 0']
del ny_zagat['Unnamed: 0']
del sf_zagat['Unnamed: 0']
del ch_zagat['Unnamed: 0']
del dc_zagat['Unnamed: 0']
del michelin['Unnamed: 0']

In [163]:
#Get the zipcode from the Michelin address
def zip(x):
    return x[len(x)-5:len(x)]

michelin['zip']= michelin['Address'].map(zip)

In [164]:
#Create functions to group data by cities. Zipcodes beginning with 1 are from NY. 6 for Chicago. 9 for San Francico
def ny (x):
    if (x[0]=='1'):
        return 1
    else:
        return 0
    
def ch (x):
    if (x[0]=='6') or (x=='icago'):  #Some addresses don't have zipcodes causing the 'icago'
        return 1
    else:
        return 0
    
def sf (x):
    if (x[0] == '9'):
        return 1
    else:
        return 0

#Indicates City of the row
michelin['ny'] = michelin['zip'].apply(ny)
michelin['ch'] = michelin['zip'].apply(ch)
michelin['sf'] = michelin['zip'].apply(sf)

In [165]:
#Merge Michelin, Gayot and Zagat data by restaurant name
ny = pd.merge(michelin[michelin['ny']==1], ny_gayot, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
ny = pd.merge(ny, ny_zagat, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
sf = pd.merge(michelin[michelin['sf']==1], sf_gayot, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
sf = pd.merge(sf, sf_zagat, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
ch = pd.merge(michelin[michelin['ch']==1], ch_gayot, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
ch = pd.merge(ch, ch_zagat, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')
dc = pd.merge(dc_zagat, dc_gayot, left_on = 'Restaurant_Name', right_on = 'Restaurant_Name', how = 'left')

In [166]:
#Drop major non-DC restaurants. The below restaurants are in Virginia
dc = dc[dc['Restaurant_Name']!='the inn at little washington']
dc = dc[dc['Restaurant_Name']!='restaurant eve']
dc = dc[dc['Restaurant_Name']!='chima brazilian steakhouse']
dc = dc[dc['Restaurant_Name']!='tachibana japanese restaurant']

dc.drop_duplicates(['Restaurant_Name'], keep=False, inplace = True)
dc.reset_index(inplace=True, drop = True)

In [167]:
#Create training set based off of NY and SF data
train = pd.concat([ny,sf], ignore_index = True)[['Restaurant_Name', 'Stars', 'Rating', 'cost', 'food_rating', 'decor_rating', 'service_rating']]

### Gayot Rating imputation

In [168]:
#Gayot and Zagat do not completely overlap. Missing Gayot rating are imputed by finding the median or mean Gayot rating for 
#different Zagat ratings
#Using mean to increase differentiation and the median Gayot rating is the same for many Zagat ratings

train.groupby('food_rating')['Rating'].mean()

food_rating
4.3    14.000000
4.4    13.666667
4.5    13.983607
4.6    14.285714
4.7    14.454545
4.8    16.400000
4.9    17.333333
Name: Rating, dtype: float64

In [169]:
#Impute Gayot rating on training data
import numpy as np
train['Rating'].fillna(-5.0, inplace=True) #Searching for nulls did not work, so I had to fill nulls with a number to search propoerly

for i in range(0,len(train)):
    if (train.loc[i,('Rating')] == -5.0) and (train.loc[i,('food_rating')]): 
        if train.loc[i,('food_rating')] < 4.7:
            train.loc[i,('Rating')] = 14.0 #Assume 13.5
        elif train.loc[i,('food_rating')] == 4.7:
            train.loc[i,('Rating')] = 14.5
        elif train.loc[i,('food_rating')] == 4.8:
            train.loc[i,('Rating')] = 16.5
        else:
            train.loc[i,('Rating')] = 17.0

In [170]:
#Chicago means
ch.groupby('food_rating')['Rating'].mean()

food_rating
4.4          NaN
4.5    13.833333
4.6    14.307692
4.7    14.818182
4.8    15.500000
4.9          NaN
Name: Rating, dtype: float64

In [171]:
#Chicago imputation
ch['Rating'].fillna(-5.0, inplace=True)

for i in range(0,len(ch)):
    if (ch.loc[i,('Rating')] == -5.0) and (ch.loc[i,('food_rating')]): 
        if ch.loc[i,('food_rating')] < 4.6:
            ch.loc[i,('Rating')] = 13.5
        elif ch.loc[i,('food_rating')] <4.8:
            ch.loc[i,('Rating')] = 14.5
        else:
            ch.loc[i,('Rating')] = 15.5

In [172]:
#DC means
dc.groupby('food_rating')['Rating'].mean() 

food_rating
4.3    13.666667
4.4    13.600000
4.5    14.133333
4.6    14.111111
4.7    14.800000
4.8    14.250000
Name: Rating, dtype: float64

In [173]:
#DC imputation
dc['Rating'].fillna(-5.0, inplace=True)

for i in range(0,len(dc)):
    if (dc.loc[i,('Rating')] == -5.0) and (dc.loc[i,('food_rating')]): 
        if dc.loc[i,('food_rating')] < 4.7:
            dc.loc[i,('Rating')] = 14.0
        else:
            dc.loc[i,('Rating')] = 15.0

In [174]:
#Create dataframe to use for DC predictions. Price level and cuisine were tested to not be helpful and are not included here
#Cost is the average cost per meal while price level is a category with Inexpensive, Medium, Expensive and Very Expensive
dc_test = dc[['Rating', 'cost', 'food_rating', 'decor_rating', 'service_rating']]
dc_test.dropna(inplace=True)
dc_test.reset_index(inplace =True, drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [175]:
#Create X and y (target) from the train set, which is NY and SF
train.dropna(inplace = True)
X_train = train[train.columns -['Restaurant_Name'] - ['Stars']]
y_train = train['Stars']

#Create test data from Chicago data
test = ch[['Stars', 'Rating', 'cost', 'food_rating', 'decor_rating', 'service_rating']]
test.dropna(inplace=True)

X_test = test[test.columns - ['Stars']]
y_test = test['Stars']

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [176]:
#Takes model.predict(dc_test) as input and displays predicted DC Michelin star restaurants
def dc_predictions (x):
    dc_result = pd.DataFrame(x)
    dc_result.rename(columns = {0:'predicted_stars'}, inplace=True)
    dc_dummy = dc_test.reset_index(drop = True)
    temp = pd.concat([dc_result, dc_dummy], axis = 1)
    dc_final = pd.merge(dc, temp, on =['Rating','cost','food_rating','decor_rating','service_rating'] )
    dc_final = dc_final[['Restaurant_Name', 'predicted_stars']]
    print '\n',dc_final[dc_final['predicted_stars'] >0]

In [128]:
#Random Forest with no tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf.predict(dc_test))

Accuracy 0.858974358974
Confusion Matrix 
[[63  2  0  0]
 [ 5  3  2  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        65
        1.0       0.43      0.30      0.35        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.84      0.86      0.85        78


                                 Restaurant_Name  predicted_stars
5                                          fiola              1.0
10                        minibar by josé andrés              3.0
13                                     prime rib              1.0
15                  marcel's by robert wiedmaier              1.0
46                                          komi              1.0
48                                  little serow              1.0
72   joe's seafood, prime steak &amp; stone crab              1.0
98                      

In [222]:
#Run with balanced classes
rf = RandomForestClassifier(class_weight = 'balanced')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf.predict(dc_test))

Accuracy 0.897435897436
Confusion Matrix 
[[62  3  0  0]
 [ 3  7  0  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.95      0.95      0.95        65
        1.0       0.58      0.70      0.64        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.88      0.90      0.89        78


            Restaurant_Name  predicted_stars
10   minibar by josé andrés              3.0
33            the oval room              1.0
46                     komi              1.0
57          restaurant nora              1.0
89          2941 restaurant              1.0
98                  equinox              1.0
99          1789 restaurant              1.0
154       sakedokoro makoto              1.0


In [178]:
#Gridsearch on Random Forest
from sklearn.grid_search import GridSearchCV

param_grid = {'n_estimators': [5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20], 'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,10,12,15], 'min_samples_split': [2,3,4], 'min_samples_leaf':[1,2,3] }
gsrf = GridSearchCV(RandomForestClassifier(), param_grid, verbose = 2, cv= 6, n_jobs = -1)
gsrf.fit(X_train, y_train)
y_pred = gsrf.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
gsrf.best_params_

Fitting 6 folds for each of 1188 candidates, totalling 7128 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 223 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 629 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 1195 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1925 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2815 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3869 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 5083 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 6461 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 7128 out of 7128 | elapsed:  6.4min finished


Accuracy 0.884615384615
Confusion Matrix 
[[63  1  1  0]
 [ 5  5  0  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        65
        1.0       0.62      0.50      0.56        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.87      0.88      0.87        78



{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 8}

### Run multiple Random Forest models with optimal paramters to see distribution of predicted restaurants
### Use the recurring restaurants

In [117]:
rf_best1 = RandomForestClassifier(criterion= 'gini', max_depth= 3, min_samples_leaf= 2, min_samples_split = 3, n_estimators = 8)
rf_best1.fit(X_train, y_train)
y_pred = rf_best1.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf_best1.predict(dc_test))

Accuracy 0.858974358974
Confusion Matrix 
[[63  2  0  0]
 [ 6  3  0  1]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.97      0.94        65
        1.0       0.43      0.30      0.35        10
        2.0       0.00      0.00      0.00         2
        3.0       0.50      1.00      0.67         1

avg / total       0.82      0.86      0.84        78


                  Restaurant_Name  predicted_stars
5                           fiola              1.0
6                        corduroy              1.0
10         minibar by josé andrés              3.0
13                      prime rib              1.0
15   marcel's by robert wiedmaier              1.0
46                           komi              1.0
53                     fiola mare              1.0
138                 bourbon steak              1.0
154             sakedokoro makoto              1.0


In [92]:
rf_best2 = RandomForestClassifier(criterion= 'gini', max_depth= 3, min_samples_leaf= 2, min_samples_split = 3, n_estimators = 8)
rf_best2.fit(X_train, y_train)
y_pred = rf_best2.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)

print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf_best2.predict(dc_test))

Accuracy 0.884615384615
Confusion Matrix 
[[63  2  0  0]
 [ 5  5  0  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        65
        1.0       0.56      0.50      0.53        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.86      0.88      0.87        78


                  Restaurant_Name  predicted_stars
10         minibar by josé andrés              3.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
46                           komi              1.0
53                     fiola mare              1.0
138                 bourbon steak              1.0
154             sakedokoro makoto              1.0


In [99]:
rf_best3 = RandomForestClassifier(criterion= 'gini', max_depth= 3, min_samples_leaf= 2, min_samples_split = 3, n_estimators = 8)
rf_best3.fit(X_train, y_train)
y_pred = rf_best3.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf_best3.predict(dc_test))

Accuracy 0.858974358974
Confusion Matrix 
[[63  2  0  0]
 [ 6  4  0  0]
 [ 0  2  0  0]
 [ 0  0  1  0]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.97      0.94        65
        1.0       0.50      0.40      0.44        10
        2.0       0.00      0.00      0.00         2
        3.0       0.00      0.00      0.00         1

avg / total       0.82      0.86      0.84        78


                  Restaurant_Name  predicted_stars
5                           fiola              1.0
10         minibar by josé andrés              2.0
13                      prime rib              1.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
47                     sushi taro              1.0
53                     fiola mare              1.0
138                 bourbon steak              1.0


In [100]:
rf_best4 = RandomForestClassifier(criterion= 'gini', max_depth= 3, min_samples_leaf= 2, min_samples_split = 3, n_estimators = 8)
rf_best4.fit(X_train, y_train)
y_pred = rf_best4.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf_best4.predict(dc_test))

Accuracy 0.897435897436
Confusion Matrix 
[[64  1  0  0]
 [ 5  5  0  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.98      0.96        65
        1.0       0.62      0.50      0.56        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.87      0.90      0.88        78


                  Restaurant_Name  predicted_stars
5                           fiola              1.0
10         minibar by josé andrés              3.0
13                      prime rib              1.0
15   marcel's by robert wiedmaier              1.0
46                           komi              1.0
47                     sushi taro              1.0
53                     fiola mare              1.0
138                 bourbon steak              1.0
154             sakedokoro makoto              1.0


In [103]:
rf_best5 = RandomForestClassifier(criterion= 'gini', max_depth= 3, min_samples_leaf= 2, min_samples_split = 3, n_estimators = 8)
rf_best5.fit(X_train, y_train)
y_pred = rf_best5.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(rf_best5.predict(dc_test))

Accuracy 0.846153846154
Confusion Matrix 
[[63  2  0  0]
 [ 7  3  0  0]
 [ 0  2  0  0]
 [ 1  0  0  0]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.89      0.97      0.93        65
        1.0       0.43      0.30      0.35        10
        2.0       0.00      0.00      0.00         2
        3.0       0.00      0.00      0.00         1

avg / total       0.79      0.85      0.82        78


                  Restaurant_Name  predicted_stars
6                        corduroy              1.0
10         minibar by josé andrés              3.0
13                      prime rib              1.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
46                           komi              1.0
53                     fiola mare              1.0
138                 bourbon steak              1.0
154             sakedokoro makoto              1.0


### Cost is the most important feature

In [48]:
rf_best.feature_importances_

array([ 0.16995427,  0.57301382,  0.06301736,  0.01831185,  0.17570271])

In [181]:
train[train.columns -['Restaurant_Name'] - ['Stars']].head(1)

  if __name__ == '__main__':


Unnamed: 0,Rating,cost,decor_rating,food_rating,service_rating
0,13.0,43.0,3.8,4.5,4.2


In [42]:
#ExtraTrees model with no tuning
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

ex = ExtraTreesClassifier()
ex.fit(X_train, y_train)
y_pred = ex.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex.predict(dc_test))

Accuracy 0.858974358974
Confusion Matrix 
[[62  2  1  0]
 [ 5  4  1  0]
 [ 0  1  1  0]
 [ 0  0  1  0]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.95      0.94        65
        1.0       0.57      0.40      0.47        10
        2.0       0.25      0.50      0.33         2
        3.0       0.00      0.00      0.00         1

avg / total       0.85      0.86      0.85        78


                       Restaurant_Name  predicted_stars
8    fogo de chão brazilian steakhouse              1.0
10              minibar by josé andrés              3.0
19                               plume              1.0
46                                komi              1.0
57                     restaurant nora              1.0
89                     2941 restaurant              1.0
98                             equinox              1.0
99                     1789 restaurant              1.0
154                  sakedokoro makoto          

In [184]:
param_grid = {'n_estimators': [5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20], 'criterion':['gini', 'entropy'], 'max_depth':[None,3,5,7,10,12,15], 'min_samples_split': [2,3,4,5,6], 'min_samples_leaf':[1,2,3] }
gsex = GridSearchCV(ExtraTreesClassifier(), param_grid, verbose = 2, n_jobs = -1)
gsex.fit(X_train, y_train)
y_pred = gsex.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
gsex.best_params_

Fitting 3 folds for each of 2310 candidates, totalling 6930 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 571 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 1137 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done 1867 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2757 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3811 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 5025 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 6403 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 6930 out of 6930 | elapsed:  5.2min finished


Accuracy 0.871794871795
Confusion Matrix 
[[63  1  1  0]
 [ 5  4  0  1]
 [ 1  1  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.97      0.94        65
        1.0       0.67      0.40      0.50        10
        2.0       0.00      0.00      0.00         2
        3.0       0.50      1.00      0.67         1

avg / total       0.85      0.87      0.86        78



{'criterion': 'entropy',
 'max_depth': 12,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 9}

In [133]:
ex_best = ExtraTreesClassifier(criterion='entropy', max_depth = 12, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 9)
ex_best.fit(X_train, y_train)
y_pred = ex_best.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex_best.predict(dc_test))

Accuracy 0.871794871795
Confusion Matrix 
[[63  1  1  0]
 [ 6  4  0  0]
 [ 1  1  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.90      0.97      0.93        65
        1.0       0.67      0.40      0.50        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.85      0.87      0.85        78


            Restaurant_Name  predicted_stars
5                     fiola              1.0
10   minibar by josé andrés              3.0
13                prime rib              1.0
19                    plume              1.0
46                     komi              1.0
57          restaurant nora              1.0
98                  equinox              1.0
154       sakedokoro makoto              1.0


In [145]:
ex_best2 = ExtraTreesClassifier(criterion='entropy', max_depth = 12, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 9)
ex_best2.fit(X_train, y_train)
y_pred = ex_best2.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex_best2.predict(dc_test))

Accuracy 0.871794871795
Confusion Matrix 
[[61  3  1  0]
 [ 5  5  0  0]
 [ 1  0  1  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.94      0.92        65
        1.0       0.62      0.50      0.56        10
        2.0       0.50      0.50      0.50         2
        3.0       1.00      1.00      1.00         1

avg / total       0.86      0.87      0.87        78


            Restaurant_Name  predicted_stars
10   minibar by josé andrés              3.0
19                    plume              2.0
27                lightfoot              1.0
46                     komi              1.0
53               fiola mare              1.0
57          restaurant nora              1.0
98                  equinox              1.0
138           bourbon steak              1.0
154       sakedokoro makoto              1.0


In [135]:
ex_best3 = ExtraTreesClassifier(criterion='entropy', max_depth = 12, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 9)
ex_best3.fit(X_train, y_train)
y_pred = ex_best3.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex_best3.predict(dc_test))

Accuracy 0.871794871795
Confusion Matrix 
[[61  3  1  0]
 [ 5  5  0  0]
 [ 0  1  1  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.92      0.94      0.93        65
        1.0       0.56      0.50      0.53        10
        2.0       0.50      0.50      0.50         2
        3.0       1.00      1.00      1.00         1

avg / total       0.87      0.87      0.87        78


                  Restaurant_Name  predicted_stars
10         minibar by josé andrés              3.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
46                           komi              1.0
53                     fiola mare              1.0
57                restaurant nora              1.0
154             sakedokoro makoto              1.0
237                      i ricchi              1.0


In [136]:
ex_best4 = ExtraTreesClassifier(criterion='entropy', max_depth = 12, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 9)
ex_best4.fit(X_train, y_train)
y_pred = ex_best4.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex_best4.predict(dc_test))

Accuracy 0.871794871795
Confusion Matrix 
[[62  2  1  0]
 [ 5  5  0  0]
 [ 0  2  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.95      0.94        65
        1.0       0.56      0.50      0.53        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.86      0.87      0.86        78


                  Restaurant_Name  predicted_stars
5                           fiola              1.0
7         l'auberge chez francois              1.0
9                           tosca              1.0
10         minibar by josé andrés              3.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
46                           komi              1.0
154             sakedokoro makoto              1.0


In [138]:
ex_best5 = ExtraTreesClassifier(criterion='entropy', max_depth = 12, min_samples_leaf = 1, min_samples_split = 6, n_estimators = 9)
ex_best5.fit(X_train, y_train)
y_pred = ex_best5.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(ex_best5.predict(dc_test))

Accuracy 0.846153846154
Confusion Matrix 
[[62  2  1  0]
 [ 6  4  0  0]
 [ 1  1  0  0]
 [ 1  0  0  0]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.89      0.95      0.92        65
        1.0       0.57      0.40      0.47        10
        2.0       0.00      0.00      0.00         2
        3.0       0.00      0.00      0.00         1

avg / total       0.81      0.85      0.83        78


                  Restaurant_Name  predicted_stars
5                           fiola              1.0
7         l'auberge chez francois              1.0
10         minibar by josé andrés              3.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
27                      lightfoot              1.0
46                           komi              1.0
53                     fiola mare              1.0
57                restaurant nora              1.0
98                        equinox         

In [149]:
from sklearn import svm

svm_rbf = svm.SVC()
svm_rbf.fit(X_train, y_train)
y_pred = svm_rbf.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(svm_rbf.predict(dc_test))

Accuracy 0.846153846154
Confusion Matrix 
[[64  1  0  0]
 [ 7  2  0  1]
 [ 0  2  0  0]
 [ 0  0  1  0]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.90      0.98      0.94        65
        1.0       0.40      0.20      0.27        10
        2.0       0.00      0.00      0.00         2
        3.0       0.00      0.00      0.00         1

avg / total       0.80      0.85      0.82        78


                  Restaurant_Name  predicted_stars
13                      prime rib              1.0
15   marcel's by robert wiedmaier              1.0
19                          plume              1.0
47                     sushi taro              1.0
154             sakedokoro makoto              1.0


In [150]:
#SVM linear
svm_l = svm.SVC(kernel = 'linear')
svm_l.fit(X_train, y_train)
y_pred = svm_l.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(svm_l.predict(dc_test))

Accuracy 0.884615384615
Confusion Matrix 
[[64  1  0  0]
 [ 5  4  1  0]
 [ 1  1  0  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.98      0.95        65
        1.0       0.67      0.40      0.50        10
        2.0       0.00      0.00      0.00         2
        3.0       1.00      1.00      1.00         1

avg / total       0.86      0.88      0.87        78


           Restaurant_Name  predicted_stars
10  minibar by josé andrés              3.0
46                    komi              2.0


In [177]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(dt.predict(dc_test))

Accuracy 0.846153846154
Confusion Matrix 
[[60  4  1  0]
 [ 6  4  0  0]
 [ 0  1  1  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.91      0.92      0.92        65
        1.0       0.44      0.40      0.42        10
        2.0       0.50      0.50      0.50         2
        3.0       1.00      1.00      1.00         1

avg / total       0.84      0.85      0.84        78


                                 Restaurant_Name  predicted_stars
5                                          fiola              1.0
6                                       corduroy              1.0
8              fogo de chão brazilian steakhouse              1.0
10                        minibar by josé andrés              3.0
11                                 the lafayette              1.0
13                                     prime rib              1.0
15                  marcel's by robert wiedmaier              1.0
18                      

In [178]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bdt = BaggingClassifier(DecisionTreeClassifier())
bdt.fit(X_train, y_train)
y_pred = bdt.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(bdt.predict(dc_test))

Accuracy 0.897435897436
Confusion Matrix 
[[63  1  1  0]
 [ 5  5  0  0]
 [ 0  1  1  0]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        65
        1.0       0.71      0.50      0.59        10
        2.0       0.50      0.50      0.50         2
        3.0       1.00      1.00      1.00         1

avg / total       0.89      0.90      0.89        78


                       Restaurant_Name  predicted_stars
8    fogo de chão brazilian steakhouse              1.0
9                                tosca              1.0
10              minibar by josé andrés              3.0
15        marcel's by robert wiedmaier              1.0
33                       the oval room              1.0
46                                komi              2.0
48                        little serow              1.0
154                  sakedokoro makoto              1.0


## Minibar and Komi are consistent across models

In [157]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
y_pred = kn.predict(X_test)
print 'Accuracy', accuracy_score(y_test, y_pred)
print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
print 'Classification Report','\n', classification_report(y_test, y_pred)
dc_predictions(kn.predict(dc_test))

Accuracy 0.0128205128205
Confusion Matrix 
[[ 0  0  0 65]
 [ 0  0  0 10]
 [ 0  0  0  2]
 [ 0  0  0  1]]
Classification Report 
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00        65
        1.0       0.00      0.00      0.00        10
        2.0       0.00      0.00      0.00         2
        3.0       0.01      1.00      0.03         1

avg / total       0.00      0.01      0.00        78


                                    Restaurant_Name  predicted_stars
0                      monocacy crossing restaurant              3.0
1                                   rasika west end              3.0
2                                   rasika west end              3.0
3                                            rasika              3.0
4                                            rasika              3.0
5                                             fiola              3.0
6                                          corduroy              3.0