In [28]:
import requests
import bs4
from bs4 import BeautifulSoup
import urllib

import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix
from sklearn import svm, datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, log_loss, precision_score, recall_score 


# Data Aquisition/Wrangling/Scraping
Scraping michelin restaurant reviews for only the United States, which only exist for San Francisco, New-York and Chicago.  I elected to only scrape the information for results within the United States for maintaining cultural similarities.  

My Test data will be scraped from the Washintonian top 100 Resturants for 2016.  The structure by which the reviews are written is very similar to actual Michelin reviews.  Additionally I don't have to scrap every restaurant in DC and eliminate things like chain restaurants and navigate around biased unprofessional reviews.  

For both Michelin's website and The Washingtonian I had to scrape in two parts.  First, get the url's for the actual review pages. Second,  run those urls in a different scraper to get the desired data.

In [13]:
#Baseline/test scraper
def michelin_scraper(url):
    page = requests.get(url)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
    
    for item in page_soup.findAll('li',{'class':'poi-item poi-item-restaurant'}):
    
        direction = item.find("a").get("href")
        print 'https://www.viamichelin.com'+direction

In [16]:
# scrapers for individual cities.

# New York, New York
NY_URL = []
def michelin_scraper_ny(url):
    page = requests.get(url)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
    
    for item in page_soup.findAll('li',{'class':'poi-item poi-item-restaurant'}):
    
        direction = item.find("a").get("href")
        NY_URL.append('https://www.viamichelin.com'+direction)

# Chicago, Illinois
CHI_URL = []
def michelin_scraper_chi(url):
    page = requests.get(url)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
    
    for item in page_soup.findAll('li',{'class':'poi-item poi-item-restaurant'}):
    
        direction = item.find("a").get("href")
        CHI_URL.append('https://www.viamichelin.com'+direction)

# San Fransisco, California        
SF_URL = []        
def michelin_scraper_sf(url):
    page = requests.get(url)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
    
    for item in page_soup.findAll('li',{'class':'poi-item poi-item-restaurant'}):
    
        direction = item.find("a").get("href")
        SF_URL.append('https://www.viamichelin.com'+direction)


# ----------------------------------------------------------------------------------

In [19]:
# Scraping New York's Michelin URLs
# I know there are 36 pages of Michelin rated restuarants in New York, thats why I stop there.
for i in range(1,37):

    link = 'https://www.viamichelin.com/web/Restaurants/Restaurants-New_York-_-New_York-United_States?strLocid=31NG9zOXAxMGNOREF1TnpFek1EVT1jTFRjMExqQXdOekl6&page='+str(i)
    michelin_scraper_ny(link)

In [21]:
# Scraping Chicago's Michelin URLs
# Theres only 13 pages for Chicago
for i in range(1,14):
    link = 'https://www.viamichelin.com/web/Restaurants/Restaurants-Chicago-_-Illinois-United_States?strLocid=31NG9zYWgxMGNOREV1T0Rnek1qTT1jTFRnM0xqWXpNalE9&page='+str(i)
    michelin_scraper_chi(link) 

In [24]:
for i in range(1,13):
# 13 pages for SF as well
    link = 'https://www.viamichelin.com/web/Restaurants/Restaurants-San_Francisco-_-California-United_States?strLocid=31NG9zOHAxMGNNemN1Tnpnd01EZz1nTFRFeU1pNDBNakF4Tnc9PQ==&page='+str(i)
    michelin_scraper_sf(link)

In [125]:
# Create a few dataframes for all the information
cols = ['restaurant','type','price','rating', 'review', 'url']
NY_DF = pd.DataFrame(columns = cols)
CHI_DF = pd.DataFrame(columns = cols)
SF_DF = pd.DataFrame(columns = cols)

### Pulling all the Data for Michelin New York restaurants.

In [123]:
# Building my scraper for testing
def ms_page_scraper_ny(link):
    page= requests.get(link)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
   
    for element in page_soup.findAll('div',{'class':'view-container'}):
        # Getting Restaurant name
        name =  element.find('div',{'class': 'datasheet-item datasheet-name'}).text
        name = name.strip()
    
        # Cuisine Type
        cuisine =  element.find('div',{'class':'datasheet-cooking-type' }).text
    
        # From Price
        price = element.find('div',{'class':'datasheet-price'}).text
        price = price.strip()
        price =  re.sub('[\s+]', ' ', price)
        #to and from prices will need to be separated, or averaged later.
        
        # Guide Review
        review = element.find('blockquote').text
        review = review.strip()
        
        # Stars (text)
        rating = element.find('div',{'class':'datasheet-quotation'}).text
        rating = rating.strip()
        

    NY_DF.loc[len(NY_DF)] = [name, cuisine, price, rating, review, link]
    

In [126]:
# running the scraper on all the URLs, I got earlier
for item in NY_URL:
    ms_page_scraper_ny(item)

In [128]:
# Saving new dataframe as a csv incase kernal gets lost.  
NY_DF.to_csv('michelin_NY.csv', encoding = 'utf-8')

### Pulling all the Data for Michelin Chicago restaurants.

In [133]:
# Defining scraping function
def ms_page_scraper_chi(link):
    page= requests.get(link)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
   
    for element in page_soup.findAll('div',{'class':'view-container'}):
        # Getting Restaurant name
        name =  element.find('div',{'class': 'datasheet-item datasheet-name'}).text
        name = name.strip()
    
        # Cuisine Type
        cuisine =  element.find('div',{'class':'datasheet-cooking-type' }).text
    
        # From Price
        price = element.find('div',{'class':'datasheet-price'}).text
        price = price.strip()
        price =  re.sub('[\s+]', ' ', price)
        #to and from prices will need to be separated, or averaged later.
        
        # Guide Review
        review = element.find('blockquote').text
        review = review.strip()
        
        # Stars (text)
        rating = element.find('div',{'class':'datasheet-quotation'}).text
        rating = rating.strip()
        

    CHI_DF.loc[len(CHI_DF)] = [name, cuisine, price, rating, review, link]

In [135]:
# Running Scraping function
for item in CHI_URL:
    ms_page_scraper_chi(item)

In [141]:
# Saving results
CHI_DF.to_csv('michelin_CHI.csv', encoding = 'utf-8')

### Pulling all the Data for Michelin San Francisco restaurants.

In [138]:
# Defining scraping function
def ms_page_scraper_sf(link):
    page= requests.get(link)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
   
    for element in page_soup.findAll('div',{'class':'view-container'}):
        # Getting Restaurant name
        name =  element.find('div',{'class': 'datasheet-item datasheet-name'}).text
        name = name.strip()
    
        # Cuisine Type
        cuisine =  element.find('div',{'class':'datasheet-cooking-type' }).text
    
        # From Price
        price = element.find('div',{'class':'datasheet-price'}).text
        price = price.strip()
        price =  re.sub('[\s+]', ' ', price)
        #to and from prices will need to be separated, or averaged later.
        
        # Guide Review
        review = element.find('blockquote').text
        review = review.strip()
        
        # Stars (text)
        rating = element.find('div',{'class':'datasheet-quotation'}).text
        rating = rating.strip()
        

    SF_DF.loc[len(SF_DF)] = [name, cuisine, price, rating, review, link]

In [139]:
# Running Scraping function
for item in SF_URL:
    ms_page_scraper_sf(item)

In [142]:
# Saving results
SF_DF.to_csv('michelin_SF.csv', encoding = 'utf-8')

### Location where Test Data is being pulled from.
https://www.washingtonian.com/2016/02/08/100-very-best-restaurants/2016/
- this is a two part scrap where first I have to get the links to the individual pages and then scrape the individual web pages.  While this can be done in a single function, I choose to keep it separate for simplicity. 

In [174]:
# Defining scraper (Part. 1) function for Washintonian
washington_100 = []

def washintonian_scraper(link):
    page = requests.get(link)
    page_data = page.content
    page_soup = BeautifulSoup(page_data, "lxml")
    
    links = page_soup.find_all('tr')
    for tag in links:
        link = tag.get('data-href',None)
        washington_100.append(link)

In [None]:
# Running scraper Part. 1)
washintonian_scraper('https://www.washingtonian.com/2016/02/08/100-very-best-restaurants/2016/')

In [203]:
# Defining scraper (Part. 2)
def washingtonian_page_scraper(link):
    page_r = requests.get(link)
    page_r_data = page_r.content
    page_r_soup = BeautifulSoup(page_r_data, "lxml")
    restaurant_page = page_r_soup.find_all('div',{'id': 'content'})
    for obj in restaurant_page:
        
        name = obj.find("div",{'class':'section'}).text
        
        cuisine = obj.find('div',{'class':'type'}).text
        cuisine =cuisine.strip()
        
        price = obj.find('div',{'class':'price'}).text
        price = price.strip()
        
        review = obj.find('p').text
        
        8



In [207]:
# Initializing washingtonian 100 dataframe
column = ['restaurant','type','price','review','url']
wash_100_df  = pd.DataFrame(columns = column)

In [197]:
# there was a None value in my list that needed to be removed.
washington_100 = filter(None, washington_100)

In [209]:
for item in washington_100:
    washingtonian_page_scraper(item)

In [4]:
# Saving to CSV end results.
NY_DF  = pd.read_csv('data/michelin_NY.csv')
CHI_DF = pd.read_csv('data/michelin_CHI.csv')
SF_DF = pd.read_csv('data/michelin_SF.csv')
washington_100 = pd.read_csv('data/wash_100.csv')

# -----------------------------------------------------------------------------------------

# Data Cleaning, Joining and Preprocessing

#### Merging Train Datasets

In [5]:
# Concating all the city DFs into one DF
MICH_US_DF = pd.concat([NY_DF,CHI_DF,SF_DF])

#Dropping extra column that came from reading the csvs back in.
MICH_US_DF.drop('Unnamed: 0', axis =1, inplace =True)

##### Converting Price Column into a numeric value from a string

In [6]:
# Converting Price column to numeric values.
#MICH_US_DF['price'].value_counts()

Price = []

for row in MICH_US_DF['price']:
    if row == 'From 13 USD    to 24 USD':
        Price.append(1)  

    elif row == 'From 25 USD    to 49 USD':
        Price.append(2)

    elif row == 'From 50 USD    to 74 USD':
        Price.append(3)

    elif row == 'From 75 USD    to 150 USD':
        Price.append(4)

# There are only 4 values that appear in this column
# 'From 13 USD    to 24 USD' : 1
# 'From 25 USD    to 49 USD' : 2
# 'From 50 USD    to 74 USD' : 3
# 'From 75 USD    to 150 USD': 4

MICH_US_DF['Price'] = Price

##### Converting Rating to a numeric Value from Strings
While there are considerably more categories in the this feature I am only intersted in distinguishing stars from no stars and counting the number of starts.  Michelin has ratings like "Good Standard" and "Bib Gourmand" and these will be our false and 0 values.  Im creating both a categorical Y and Boolean Y train sets.

Rating : How many starts (if any) did that restaurant recieve. This will be used for classification models.

Star : Boolean as to whether the restaurant did recieve a star(s) or not.  This will be useful for logistic regressions and  SVMs

In [7]:
# MICH_US_DF['rating'].value_counts()

rates = []
star = []

for rate in MICH_US_DF['rating']:
    if 'A MICHELIN star' in str(rate):
        rates.append(1)
        star.append(1)
        
    elif 'Two MICHELIN stars' in str(rate):
        rates.append(2)
        star.append(1)
        
    elif 'Three MICHELIN stars' in str(rate):
        rates.append(3)
        star.append(1)
        
    else:
        rates.append(0)
        star.append(0)
        
MICH_US_DF['Rating'] = rates
MICH_US_DF['Star'] = star

In [9]:
# Indexes from initial dataframes were kept and thus needed to be reset.
MICH_US_DF.reset_index(drop = True, inplace = True)

Stripping out the word 'Cuisine' from the type column.

In [10]:
type_list = MICH_US_DF['type'].tolist()

new_list = []
for item in type_list:
    new = item.replace('Cuisine ', '')
    new_list.append(new)
    
MICH_US_DF['Type'] = new_list

In [11]:
# dropping a column that appeared (I believe this occured when I imported back from a csv)
washington_100.drop('Unnamed: 0', axis =1, inplace = True)

##### Converting a Price stated in dollar signs to a numeric value

In [12]:
# washington_100['price'].value_counts()

price = []

for dollars in washington_100['price']:
    if dollars == '$':
        price.append(1)
    elif dollars == '$$':
        price.append(2)
    elif dollars == '$$$':
        price.append(3)
    elif dollars == '$$$$':
        price.append(4)
        
washington_100['Price'] = price       

##### Going to have to remove the second adjective from the type column
Some of the 'Type' categories had two or even three descriptive values.  It will make things much easier to eliminate the secondary types.

Additionally, this can be useful in a dummies varable as a restaurant can easily have more than one category while using dummies.

In [13]:
wash_type_list = washington_100['type'].tolist()

wash_type_new = []
for item in wash_type_list:

    sep = ','
    new = item.split(sep)[0]
    wash_type_new.append(new)
    
washington_100['Type'] = wash_type_new

### Creating Dataframes that will be used for the modeling and predictcting (Data Refinement)

In [14]:
MICH = MICH_US_DF[['restaurant', 'Type','Price','review']]
WASH = washington_100[['restaurant','Type','Price','review']]

MICH2 = MICH_US_DF[['restaurant', 'Type','Price','review', 'Rating', 'Star']]

In [66]:
MICH2.head()

Unnamed: 0,restaurant,Type,Price,review,Rating,Star
0,Racines NY,French,4,The American outpost of this popular Parisian ...,0,0
1,Little Park,American,2,"Chef/owner Andrew Carmellini strikes again, th...",0,0
2,Blaue Gans,Austrian,2,"This sleek, unbridled Viennese-style café feel...",0,0
3,Rosanjin,Japanese,4,From its ultra-discrete entrance to its heavil...,1,1
4,Brushstroke,Japanese,4,The name may not give too much away but as soo...,1,1


In [67]:
MICH2.to_csv('mich_df.csv')

# ----------------------------------------------------------------------------------


# Data processing 
In order to vectorize the review data with the same number of features so that the test data is compatible with the train model I will need to TFIDF them simultaneously.
The Michelin reviews (MICH) will be added to the bottom of the washington reviews (WASH) as 100 is a good cut off point that is easy to remember when splitting.

In [16]:
# adding train and test sets together.
ALL = pd.concat([WASH[['Type','review']], MICH[['Type', 'review']]])

# while the gangs all here, lets conver the 'Type' feature into dummie values.
dummies = pd.get_dummies(ALL['Type'], drop_first=True)

# Put dummies back in ALL
ALL = pd.concat([ALL, dummies], axis = 1 )

Dummies seems to have created properly with no missing values.

In [17]:
# Once again indexes have to be reset as ALL has indexes '0'-'99' twice.
ALL.reset_index(drop = True, inplace = True)

TFIDF - Term Frequency Inverse Document Frequency
-  Takes all the words in our target feature ('review') and calculates the relation of each individual word to that specific cell (review) as well as the entire corpus (all reviews).  With this we can find the predictive power of each word in comparison with that particular review and the entire corpus.

In [18]:
# Term Frequency Inverse Document Frequency-ing the data
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(stop_words='english')

review_TFIDF = tvec.fit_transform(ALL['review'])

x = pd.DataFrame(review_TFIDF.toarray())



In [19]:
ALL_TFIDF = pd.concat([ALL, x], axis = 1)

In [20]:
ALL_TFIDF.drop(['Type', 'review'], axis = 1, inplace = True)

In [21]:
# Splitting the data to original train(MICH) and test(WASH)
mich_tfidf = ALL_TFIDF.loc[100: ,]
wash_tfidf = ALL_TFIDF.loc[0:99 ,]

# reseting index so it can be merged back with Rating and Star
mich_tfidf.reset_index(drop = True, inplace = True)


I Have the Michelin and Washington Data that has been TFIDF'd as well as Dummie variabled.  Price is not included in these dataframes so i will need to add it.  I think I am going to make the restaurant the index so I know what rows are what.

In [22]:
# Merging the categorical turned numerical features back to the descriptive features.  
mich_all = pd.concat([MICH2, mich_tfidf], axis = 1 )
wash_all = pd.concat([WASH, wash_tfidf], axis = 1 )

In [23]:
mich_all.set_index('restaurant', inplace = True)
wash_all.set_index('restaurant', inplace = True)

## X Variables

In [24]:
# Removing features that will not be used in the model ()
WASH_X = wash_all.drop(['review', 'Type',], axis = 1)

MICH_X = mich_all.drop(['review', 'Type', 'Rating','Star'], axis =1)

## Y Variables

In [25]:
# Making the Y Value(s)
MICH_Y = mich_all['Star']

MICH_Y2 = mich_all['Rating']

In [26]:
MICH_X.to_csv('MICH_X.csv', encoding = 'utf-8')
MICH_Y.to_csv('MICH_Y.csv', encoding = 'utf-8')
MICH_Y2.to_csv('MICH_Y2.csv', encoding = 'utf-8')
WASH_X.to_csv('WASH_X.csv', encoding = 'utf-8')

# -----------------------------------------------------------------------------------------

# EDA: Logistic Regression


In [545]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, test_size = 0.5)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.946933962264


array([[770,  11],
       [ 34,  33]])

In [554]:
wash_pred = lr.predict(WASH_X)
WASH_X_LR_PRED = WASH_X
WASH_X_LR_PRED['pred'] = wash_pred
WASH_LR_PRED = WASH_X_LR_PRED.where(WASH_X_LR_PRED['pred'] == 1)
WASH_LR_PRED.dropna()

Unnamed: 0_level_0,Price,American,Argentinian,Asian,Austrian,Barbecue,Basque,Belgian,Brazilian,Burmese,...,15873,15874,15875,15876,15877,15878,15879,15880,15881,pred
restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Komi,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Plume,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Inn at Little Washington,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Source,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Blue Duck Tavern,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Bistro Bis,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Marcel's,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Minibar,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Del Campo,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Ashby Inn & Restaurant,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


##### None from the Honerable Mentions list made it in, so theres that.  Looks like 'Price is a highly distinguishing factor.  I double checked and there are 19 restaurants with price of 4 and there are only 14 in this prediction so it is not entirely based upon price.
- Bad Saint
- Bidwell
- Boqueria
- Chercher
- China Chilcano
- Das
- Doi Moi
- Jaleo
- Kyirisan
- Lapis
- Maketto
- Ottoman Taverna
- Oyamel
- Pearl Dive Oyster Palace
- Red Hen
- Royal
- Thip Khao
- 2Amys
- Zaytinya

## Logistic Regression w/ multiple categories.

In [597]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, test_size = 0.5)
lr2 = LogisticRegression(penalty = 'l1')
lr2.fit(X_train, y_train)
pred_lr2 = lr2.predict(X_test)
print accuracy_score(y_test, pred_lr2)
confusion_matrix(y_test, pred_lr2)

0.920990566038


array([[762,  15,   0,   0],
       [ 36,  19,   0,   0],
       [  2,   8,   0,   0],
       [  0,   6,   0,   0]])

In [602]:
WASH_X.drop('pred', axis =1, inplace = True)

In [603]:
wash_pred_2 = lr2.predict(WASH_X)
WASH_X_LR2_PRED = WASH_X
WASH_X_LR2_PRED['pred'] = wash_pred_2
WASH_LR2_PRED = WASH_X_LR2_PRED.where(WASH_X_LR2_PRED['pred'] == 1)
WASH_LR2_PRED.dropna()

Unnamed: 0_level_0,Price,American,Argentinian,Asian,Austrian,Barbecue,Basque,Belgian,Brazilian,Burmese,...,15873,15874,15875,15876,15877,15878,15879,15880,15881,pred
restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Komi,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Little Serow,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Plume,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Inn at Little Washington,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Source,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Blue Duck Tavern,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Minibar,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Del Campo,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Ashby Inn & Restaurant,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Restaurant Eve,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# -----------------------------------------------------------------------------------------

# Feature Importance Using Random Forests 
- MICH_X
- MICH_Y - Boolean Start
- MICH_Y2 - Rating 0-3


In [517]:
# Fitting the random Forest Model
from sklearn.ensemble import ExtraTreesClassifier

ETC = ExtraTreesClassifier(n_estimators=25)
ETC.fit(MICH_X, MICH_Y)


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [518]:
# Getting the importance value of each of the feature.
imp = ETC.feature_importances_

In [522]:
# Creating a dataframe of the column(feature name) and its importance.
imp_df = pd.DataFrame()
imp_df['col'] = MICH_X.columns
imp_df['importance'] = imp

In [532]:
# Sorting the feature importance column and dropping the index
imp_df.sort(['importance'], ascending = False, inplace = True)
imp_df.reset_index(inplace=True, drop = True)

  if __name__ == '__main__':


##### - "imp_250_ls" -  List of the column names of the 250 most important features according to Extra Trees Feature importance
##### - "imp_features_ls" - List of the column names of features whose importance is greater than 0

In [535]:
# Getting the top 250 features and converting them to a list
important_250 = imp_df[:250]
imp_250_ls = important_250['col']

In [565]:
# Getting all features whose importance is greater than 0
imp_df =  imp_df[(imp_df['importance'] != 0)]
imp_features_ls = imp_df['col']

##### - IMP_MICH_X  - Dataframe with features that have an importance greater than 0
##### - IMP_MICH_X_250 - Dataframe of only the top 250 features as far as predictive capability.
##### - IMP_WASH_X - Test dataframe with only features whos imporatance is greater than 0
#####  - IMP_WASH_X_250 - Test Dataframe with only the top 250 features as far as predictive capability.

In [572]:
IMP_MICH_X = MICH_X[imp_features_ls]
IMP_MICH_X_250 = MICH_X[imp_250_ls]

IMP_WASH_X = WASH_X[imp_features_ls]
IMP_WASH_X_250 = WASH_X[imp_250_ls]

# -----------------------------------------------------------------------------------------

# Washington Bib Gourmand Restaurants
The List of Michelin Honerable Mention (Bib Gourmand) restaurants was released on Thursday October 6th.  There are only 19 restaurants which leads me to believe the that number of restaurants that will recieve a Michelin star will be less (around half).  Less than half of these appear in my test dataset which could be a good or a bad thing.  Either many of the restaurants in my test data are above a 'honerable mention' grad and are more likely to get stars, or I have created a great deal of bias by not having every posible restaurant in my test data.  

I Can use this two ways for model tuning.  
- Compare my results to these and make sure my model doesnt give these a star (or just take them off my final list). 
- Take their rows out of test dataset and put them into the test dataset.

In [500]:
# Michelin Bib Gourmand list 
'''https://www.washingtonpost.com/news/going-out-guide/wp/2016/10/06/
michelin-announces-its-first-d-c-honors-the-bib-gourmand-list-of-affordable-restaurants/'''
wash_bib = ['Bad Saint', 'Bidwell', 'Boqueria','Chercher', 'China Chilcano', 
            'Das', 'Doi Moi', 'Jaleo','Kyirisan', 'Lapis','Maketto',
            'Ottoman Taverna', 'Oyamel','Pearl Dive Oyster Palace',
            'The Red Hen','Royal','Thip Khao','2Amys','Zaytinya']

# Bib Gourmand Restaurants in my test data.  
wash_bib_ls = []
for item in washington_100['restaurant']:
    if item in wash_bib:
        wash_bib_ls.append(item)

In [498]:
# Extrating the Bib Gourmand restaurants from test data to incorporate encorporate into the Train dataset.  
# surveys_df.loc[[0,10], :]
WASH_BIB = WASH_X.loc[wash_bib_ls]
WASH_BIB

In [None]:
WASH_X.where(WASH_X is in wash_)

# --------------------------------------------------------------------------------------


# Logistic Regression 2: 
Logistic Regression is now worthless.

### Logistic Regression with Standard data (Bool)

In [718]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.925343811395


array([[443,  13],
       [ 25,  28]])

### Logistic Regression with Standard data (Categorical)

In [719]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.907662082515


array([[450,   6,   0,   0],
       [ 26,  12,   0,   0],
       [  2,   6,   0,   0],
       [  1,   6,   0,   0]])

### Logistic Regression with Selected Data (Bool)

In [720]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.895874263261


array([[456,   0],
       [ 53,   0]])

### Logistic Regression with Selected Data (Categorical)

In [721]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y2, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.895874263261


array([[456,   0,   0,   0],
       [ 38,   0,   0,   0],
       [  8,   0,   0,   0],
       [  7,   0,   0,   0]])

### Logistic Regression with Selected 250 Features (Bool)

In [722]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.895874263261


array([[456,   0],
       [ 53,   0]])

### Logistic Regression with Selected 250 Features  (Categorical)

In [723]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y2, test_size = 0.3, random_state = 19)

lr = LogisticRegression(penalty = 'l1')
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

print accuracy_score(y_test, pred_lr)

confusion_matrix(y_test, pred_lr)

0.895874263261


array([[456,   0,   0,   0],
       [ 38,   0,   0,   0],
       [  8,   0,   0,   0],
       [  7,   0,   0,   0]])

# ---------------------------------------------------------------------------------------

# K-Nearest Kneighbors

## KNN Original Data (Bool)
I will run a few with different k (5, 7 & 13) for EDA.

##### 5 Neighbors

In [737]:
knn = KNeighborsClassifier()
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.941060903733


array([[455,  12],
       [ 18,  24]])

##### 7 Neighbors

In [736]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.94695481336


array([[457,  10],
       [ 17,  25]])

##### 13 Neighbors

In [738]:
knn = KNeighborsClassifier(n_neighbors = 13)
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.939096267191


array([[460,   7],
       [ 24,  18]])

## KNN Original Data (Category)
I will run a few with different k (5, 7 & 13) for EDA.

##### 5 Neighbors

In [742]:
# 5 Neighbors
knn = KNeighborsClassifier()
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.919449901768


array([[456,  10,   0,   1],
       [ 18,  11,   0,   0],
       [  3,   3,   1,   0],
       [  1,   4,   1,   0]])

##### 7 Neighbors

In [747]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)


0.919449901768


array([[457,  10,   0,   0],
       [ 18,  11,   0,   0],
       [  2,   5,   0,   0],
       [  1,   3,   2,   0]])

##### 13 Neighbors

In [739]:
knn = KNeighborsClassifier(n_neighbors = 13)
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.92141453831


array([[460,   7,   0,   0],
       [ 20,   9,   0,   0],
       [  3,   4,   0,   0],
       [  1,   4,   1,   0]])

## KNN Selected Data (Bool)

In [732]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.917485265226


array([[467,   0],
       [ 42,   0]])

## KNN Selected Data (Categorical)

In [733]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.917485265226


array([[467,   0,   0,   0],
       [ 29,   0,   0,   0],
       [  7,   0,   0,   0],
       [  6,   0,   0,   0]])

## KNN Selected Data 250 (Bool)

In [734]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.923379174853


array([[467,   0],
       [ 39,   3]])

## KNN Selected Data 250 (Categorical)

In [735]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

0.917485265226


array([[467,   0,   0,   0],
       [ 29,   0,   0,   0],
       [  6,   1,   0,   0],
       [  6,   0,   0,   0]])

# -------------------------------------------------------------------------------------------

# SVM.SVC 2: Using Selected Features.
#### Support Vector Machine Support Vector Classification
- MICH_X
- IMP_MICH_X, 
- IMP_MICH_X_250

### Support Vector Machine with Standard data (Bool)

In [726]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0],
       [ 44,   0]])

### Support Vector Machine with Standard data (Categorical)

In [727]:
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0,   0,   0],
       [ 37,   0,   0,   0],
       [  3,   0,   0,   0],
       [  4,   0,   0,   0]])

### Support Vector Machine with Selected Features (Bool)

In [731]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0],
       [ 44,   0]])

### Support Vector Machine with Selected Features (Categorical)

In [728]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X, MICH_Y2, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0,   0,   0],
       [ 37,   0,   0,   0],
       [  3,   0,   0,   0],
       [  4,   0,   0,   0]])

### Support Vector Machine with Selected Features 250 (Bool)

In [730]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0],
       [ 44,   0]])

### Support Vector Machine with Selected Features 250 (Categorical)

In [729]:
X_train, X_test, y_train, y_test = train_test_split(IMP_MICH_X_250, MICH_Y2, test_size = 0.3, random_state = 22)
clf = svm.SVC()
clf.fit(X_train, y_train)
svm_y_pred = clf.predict(X_test)

print accuracy_score(y_test, svm_y_pred)
confusion_matrix(y_test, svm_y_pred)

0.913555992141


array([[465,   0,   0,   0],
       [ 37,   0,   0,   0],
       [  3,   0,   0,   0],
       [  4,   0,   0,   0]])

# ----------------------------------------------------------------------------------------

##### Current findings

I have built several models so far and one thing is definitely conclussive.  My selected features are terrible!  The predictive power of my models drop off significantly after I incorporate selected features.  It looks like my KNN model using the Original data and predicting a categorical outcome is performing the best.  Specifically the one using 7 neighbors.  I will need to run an optimization function (Gridsearch) to see what the optimal parameters are.

In [751]:
knn = KNeighborsClassifier(n_neighbors = 7)
X_train, X_test, y_train, y_test = train_test_split(MICH_X, MICH_Y2, random_state = 18, test_size = 0.3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print accuracy_score(y_test, y_pred)
print f1_score(y_test, y_pred)
#print log_loss(y_test, y_pred)
print precision_score(y_test, y_pred)
print recall_score(y_test, y_pred)
#print roc_auc_score(y_test, y_pred) 'Multiclass format not supported'
confusion_matrix(y_test, y_pred)


0.919449901768
0.908998866956
0.898788337128
0.919449901768


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([[457,  10,   0,   0],
       [ 18,  11,   0,   0],
       [  2,   5,   0,   0],
       [  1,   3,   2,   0]])

# Optimized Parameters with GridSearchCV


With a dataset of the size that I have, the model I am using and the parameters I am searching, it is unreasonable to ask my local device to try to perform this task.  I tried and it ran for 2 hours without before I realized I was using the wrong Y values.  Alternatively, I am going to set up an AWS instance and use that to run my GridSearchCV.  I will outline what I will be doing on the AWS instance below so process and methods do not get lost.

In [755]:
knn = KNeighborsClassifier(n_jobs= -1)

params = {'n_neighbors':[3,5,7,9,11,13], 'weights': ['uniform','distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
          'leaf_size' : [10,20,30,40,50]}

gs_knn = GridSearchCV(knn, params, scoring = )

gs_knn.fit(MICH_X, MICH_Y2)

Not alot of code for something that is extremely computationaly expensive.

In [53]:
KNN = KNeighborsClassifier(n_neighbors = 13, leaf_size = 10, weights = 'distance', n_jobs = -1)
KNN.fit(MICH_X, MICH_Y2)
KNN.predict(WASH_X)

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [55]:
KNN = KNeighborsClassifier(n_neighbors = 13, leaf_size = 10, weights = 'distance', n_jobs = -1)
KNN.fit(MICH_X, MICH_Y)
wash_pred = KNN.predict(WASH_X)
wash_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [56]:
WASH_X['pred'] = wash_pred

In [61]:
final = WASH_X['pred'].where(WASH_X['pred'] == 1).dropna()

In [63]:
final_df = pd.DataFrame(final)

In [68]:
final_df

Unnamed: 0_level_0,pred
restaurant,Unnamed: 1_level_1
Fiola Mare,1.0
Masseria,1.0
Fiola,1.0
Preserve,1.0
Obelisk,1.0
Del Campo,1.0
Woodberry Kitchen,1.0
Centrolina,1.0


My models could not predict number of starts, even the KNN classifier with train data with 4 catagories, so this is my list of restaurants that will get starts for dc

In [65]:
final_df.to_csv('MichelinPredictions.csv')