## Merge files

In [419]:
#imports
import pandas as pd
import numpy as np
import re

### Google merge and cleaning

In [420]:
#read test data from Google scraping
ny_goog = pd.read_csv("ny_goog.csv")
ca_goog = pd.read_csv("ca_goog.csv")
ch_goog = pd.read_csv("ch_goog.csv")
zagat = pd.read_csv("dc_google_zagat.csv")
post = pd.read_csv("post_goog.csv")

In [421]:
us_goog = pd.concat([ny_goog,ca_goog,ch_goog])
dc = pd.concat([zagat,post])

In [422]:
us_goog = us_goog.reset_index(drop=True)
dc = dc.reset_index(drop=True)

In [423]:
#drop description for this part
dc.drop("desc",axis=1,inplace=True)

In [424]:
us_goog = us_goog.applymap(lambda x: np.nan if x=="None" else x)
dc = dc.applymap(lambda x: np.nan if x=="None" else x)
#post = post.applymap(lambda x: np.nan if x=="None" else x)

In [425]:
#now check null values
us_goog.isnull().sum()

name      17
rev      113
price     76
dtype: int64

In [426]:
dc.isnull().sum()

name     78
price    90
rev      91
dtype: int64

In [427]:
post.isnull().sum()

name     0
rev      0
price    0
dtype: int64

In [428]:
#drop them
us_goog.dropna(inplace=True)
dc.dropna(inplace=True)
#post.dropna(inplace=True)

In [429]:
#let's clean the names
def cleanName(x):
    x = str(x).replace("\n", "")
    if x[-17:].lower()=='websitedirections':
        return x[:-17]
    else:
        return x

In [430]:
us_goog["name"] = us_goog["name"].apply(cleanName)
dc["name"] = dc["name"].apply(cleanName)
#post["name"] = post["name"].apply(cleanName)

In [431]:
#separate rev 
us_goog["rating"] = us_goog["rev"].apply(lambda x: x.split("\n")[0])
dc["rating"] = dc["rev"].apply(lambda x: x.split("\n")[0])
#post["rating"] = post["rev"].apply(lambda x: x.split("\n")[0])

In [432]:
#get the reviews if available
def getReviews(x):
    try:
        return x.split("\n")[1]
    except:
        return np.nan

In [433]:
us_goog["numrev"] = us_goog["rev"].apply(getReviews)
us_goog.dropna(inplace=True)

dc["numrev"] = dc["rev"].apply(getReviews)
dc.dropna(inplace=True)

# post["numrev"] = post["rev"].apply(getReviews)
# post.dropna(inplace=True)

In [434]:
#let's split up the price to price and cuisine
def getPrice(x):
    price = x.split("·")[0]
    if price[0] == "$":
        return price.replace(" ","")
    else:
        return np.nan

In [435]:
us_goog["goog_price"] = us_goog["price"].apply(getPrice)
us_goog.dropna(inplace=True)

dc["goog_price"] = dc["price"].apply(getPrice)
dc.dropna(inplace=True)

# post["goog_price"] = post["price"].apply(getPrice)
# post.dropna(inplace=True)

In [436]:
def getCuisine(x):
    try:
        cuisine = x.split("·")[1]
        if cuisine[0] != "$":
            if "Restaurant" in cuisine:
                return cuisine.replace("Restaurant","")
            else:
                return cuisine
        else:
            if x.split("·")[0][0] != "$": 
                return x.split("·")[0].replace("Restaurant","")
    except:
        return np.nan
        

In [437]:
us_goog["cuisine"] = us_goog["price"].apply(getCuisine)
us_goog.dropna(inplace=True)

dc["cuisine"] = dc["price"].apply(getCuisine)
dc.dropna(inplace=True)

# post["cuisine"] = post["price"].apply(getCuisine)
# post.dropna(inplace=True)

In [438]:
def cleanReviews(x):
    if re.findall(r"\d",x):
        return "".join(re.findall(r"\d",x))
    else:
        return np.nan

In [439]:
us_goog.numrev = us_goog["numrev"].apply(cleanReviews)
us_goog.dropna(inplace=True)

dc.numrev = dc["numrev"].apply(cleanReviews)
dc.dropna(inplace=True)

# post.numrev = post["numrev"].apply(cleanReviews)
# post.dropna(inplace=True)

In [440]:
us_goog.drop(["rev","price"],axis=1,inplace=True)
dc.drop(["rev","price"],axis=1,inplace=True)
#post.drop(["rev","price"],axis=1,inplace=True)

In [441]:
#save clean test data
us_goog.to_csv("us_goog.csv",encoding="utf-8",index=False)
dc.to_csv("dc_test.csv",encoding="utf-8",index=False)
# post.to_csv("post_test.csv",encoding="utf-8",index=False)

### Michelin and Google Merge

In [442]:
michelin = pd.read_csv("combined_clean.csv")

In [443]:
#let's get the US data only
def findCountry(x):
    if re.search("[9]{1}[0-9]{4}",x[-5:]):
        return "CA"
    elif re.search("[1]{1}[0-9]{4}",x[-5:]):
        return "NY"
    elif re.search("[6]{1}[0-9]{4}",x[-5:]):
        return "IL"
    else:
        return "NA"

In [444]:
michelin["state"] = michelin["address"].apply(findCountry)

In [445]:
us = michelin[(michelin["state"]=="NY")|(michelin["state"]=="CA")|(michelin["state"]=="IL")]

In [446]:
#merge on name keep left (michelin restaurants)
us_mg = us.merge(us_goog,how="left",on="name")

In [447]:
#drop nulls and reset index
us_mg.dropna(inplace=True)
us_mg.reset_index(drop=True,inplace=True)

In [448]:
#change the price to numeric instead of $$$$
us_mg.goog_price = us_mg.goog_price.map({"$":1,"$$":2,"$$$":3,"$$$$":4})
dc["goog_price"] = dc.goog_price.map({"$":1,"$$":2,"$$$":3,"$$$$":4})
#post["goog_price"] = post.goog_price.map({"$":1,"$$":2,"$$$":3,"$$$$":4})

In [449]:
def generalizeC(x):
    cuisines = {"Chinese":["dim","dimsum","sichuan","taiwanese","tea","teahouse","mandarin"],
                "Mexican":["taco"],"International":["eclectic"],"Latin American":["nuevo"],
                "Japanese":["sushi","ramen","yakitori"],
                "American":["diner","family","hamburger","cafe","newamerican","traditionalamerican"],
               "Steakhouse":["steakhouse"],
               "Contemporary":["finedining"],
               "Spanish":["tapasbar"]}
    
    y = x.replace(" ","").lower()
    for key,val in cuisines.iteritems():
        if y in val:
            return key
    return x

In [450]:
us_mg.cuisine = us_mg.cuisine.apply(generalizeC)
dc.cuisine = dc.cuisine.apply(generalizeC)
#post.cuisine = post.cuisine.apply(generalizeC)

In [451]:
def cleanCuisine(x):
    x = re.sub(r"^\s*", "", x)
    x = re.sub(r"\s*$","",x)
    
    if x != "":
        return x
    else:
        return np.nan

In [452]:
us_mg["cuisine"] = us_mg["cuisine"].apply(cleanCuisine)
us_mg.dropna(inplace=True)

dc["cuisine"] = dc["cuisine"].apply(cleanCuisine)
dc.dropna(inplace=True)

# post["cuisine"] = post["cuisine"].apply(cleanCuisine)
# post.dropna(inplace=True)

In [453]:
us_mg.numrev = pd.to_numeric(us_mg.numrev)
us_mg.rating = pd.to_numeric(us_mg.rating)

dc.numrev = pd.to_numeric(dc.numrev)
dc.rating = pd.to_numeric(dc.rating)

# post.numrev = pd.to_numeric(post.numrev)
# post.rating = pd.to_numeric(post.rating)

In [454]:
us_mg.stars.value_counts()

0.0    891
1.0    144
2.0     24
3.0     18
Name: stars, dtype: int64

In [455]:
us_mg["bstars"] = us_mg.stars.map({0.0:0,1.0:1,2.0:1,3.0:1})
us_mg["cstars"] = us_mg.stars.map({0.0:0,1.0:1,2.0:2,3.0:2})

In [456]:
price_cuisine = pd.DataFrame(us_mg.groupby(["cuisine","goog_price"]).usd_avg.median())
avgPrice = pd.DataFrame(us_mg.groupby("goog_price").usd_avg.median())

In [457]:
ap = avgPrice.to_dict()
pc = price_cuisine.to_dict()

In [458]:
def getAvgPriceByCuisine(x,y):
    xy = (x,y)
    
    try:
        return pc["usd_avg"][xy]
    except:
        return ap["usd_avg"][y]

In [459]:
us_mg["pc"] = np.vectorize(getAvgPriceByCuisine)(us_mg["cuisine"],us_mg["goog_price"])
dc["pc"] = np.vectorize(getAvgPriceByCuisine)(dc["cuisine"],dc["goog_price"])
#post["pc"]= np.vectorize(getAvgPriceByCuisine)(post["cuisine"],post["goog_price"])

In [460]:
us_mg.dropna(inplace=True)
dc.dropna(inplace=True)
#post.dropna(inplace=True)

In [461]:
us_mg.drop_duplicates(inplace=True)
dc.drop_duplicates(inplace=True)
#post.drop_duplicates(inplace=True)

In [462]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(list(us_mg["cuisine"].values) + list(dc["cuisine"].values))

# le = LabelEncoder()
# le.fit(list(us_mg["cuisine"].values) + list(post["cuisine"].values))

LabelEncoder()

In [463]:
us_mg["cuisine"] = le.transform(us_mg["cuisine"].values)
dc["cuisine"] = le.transform(dc["cuisine"].values)

# us_mg["cuisine"] = le.transform(us_mg["cuisine"].values)
# post["cuisine"] = le.transform(post["cuisine"].values)

In [464]:
#save again
us_mg.to_csv("us_mg.csv",encoding="utf-8",index=False)
dc.to_csv("dc_test.csv",encoding="utf-8",index=False)
#post.to_csv("post_test.csv",encoding="utf-8",index=False)

## Modeling 

In [465]:
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix,classification_report,roc_curve,auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [727]:
X = us_mg[["goog_price","rating","numrev"]]
y = us_mg["cstars"]

In [728]:
X_scaled = preprocessing.scale(X)

In [879]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=0.4)

In [880]:
baseline = y.value_counts()/len(y)
baseline

0    0.860520
1    0.106383
2    0.033097
Name: cstars, dtype: float64

In [881]:
cv = StratifiedKFold(y, n_folds=5,shuffle=True)

In [882]:
def score(model, name, myX,myy):
    s = cross_val_score(model, myX, myy, cv=cv)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

In [883]:
def fitAndPrint(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print confusion_matrix(y_test,y_pred)

In [884]:
lm = LogisticRegression()

In [885]:
score(lm,"Logistic Regression",X_scaled,y)

Logistic Regression Score:	0.869 ± 0.009


In [886]:
fitAndPrint(lm)

[[277   5   0]
 [ 37   8   0]
 [  0  12   0]]


In [887]:
X_dc = dc[["goog_price","rating","numrev"]]
X_scaled_dc = preprocessing.scale(X_dc)

In [888]:
y_pred = lm.predict(X_scaled_dc)

In [889]:
count=0
for a,b in zip(dc["name"].values,y_pred):
    if b>=1:
        print a,b

Minibar 1
The Lafayette 1
Marcel's 1
Plume 1
Komi 1
Obelisk 1
Marcel’s 1


In [890]:
dt = DecisionTreeClassifier(class_weight="balanced")

In [891]:
score(dt,"Decision Tree",X_scaled,y)

Decision Tree Score:	0.818 ± 0.023


In [892]:
fitAndPrint(dt)

[[265  15   2]
 [ 29  14   2]
 [  5   3   4]]


In [893]:
dt.fit(X_scaled,y)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [894]:
predictions_dt = dt.predict(X_scaled_dc)

In [895]:
for a,b in zip(dc["name"].values,predictions_dt):
    if b>=1:
        print a,b

Minibar 2
The Lafayette 1
Ristorante Tosca 1
Marcel's 1
Plume 2
Daikaya - Ramen (1F), Izakaya (2F) 1
Fiola Mare 1
BLT Steak 1
Equinox 1
Obelisk 2
Brasserie Beck 1
BOURBON STEAK 1
Hank's Oyster Bar 1
Lupo Verde 1
Toro Toro 1
Sakuramen Ramen Bar 1
Flight Wine Bar 1
Al Dente Italian Restaurant D.C. 1
the pig 1
Room 11 1
Mourayo 1
New Heights Restaurant 1
The Grill Room 1
Pesce 1
The Hamilton 1
Teaism Dupont Circle 1
Hank’s Oyster Bar 1
Daikaya Ramen 1
Garrison 1
Marcel’s 1


In [896]:
rf = RandomForestClassifier(class_weight="balanced")

In [897]:
score(rf,"Random Forest",X_scaled,y)

Random Forest Score:	0.843 ± 0.016


In [898]:
fitAndPrint(rf)

[[271   9   2]
 [ 34  10   1]
 [  5   2   5]]


In [899]:
for a,b in zip(X.columns.tolist(), rf.feature_importances_):
    print a, b

goog_price 0.329903969331
rating 0.192983360485
numrev 0.477112670184


In [900]:
rf.fit(X_scaled,y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [901]:
predictions = rf.predict(X_scaled_dc)

In [902]:
count=0
for a,b in zip(dc["name"].values,predictions):
    if b>=1:
        print a,b

Marcel's 1
Fiola Mare 1
Restaurant Nora 1
BLT Steak 1
Acadiana 1
Room 11 1
Marcel’s 1


In [903]:
ada = AdaBoostClassifier()

In [904]:
score(ada,"Adaptive Boosting",X_scaled,y)

Adaptive Boosting Score:	0.787 ± 0.086


In [905]:
fitAndPrint(ada)

[[275   7   0]
 [ 34  10   1]
 [  5   6   1]]


In [906]:
predictions = ada.predict(X_scaled_dc)

In [868]:
count=0
for a,b in zip(dc["name"].values,predictions):
    if b>=1:
        print a,b
        count+=1
print count

Rasika 1
Minibar 2
The Oval Room 1
Daikaya - Ramen (1F), Izakaya (2F) 1
Restaurant Nora 1
Obelisk 2
BOURBON STEAK 1
Good Stuff Eatery 1
Open City 1
Hill Country Barbecue Market 1
Clyde's of Gallery Place 1
BreadLine 1
Daikaya Ramen 1
13


In [907]:
gb = GradientBoostingClassifier()

In [908]:
score(gb,"Gradient Boosting",X_scaled,y)

Gradient Boosting Score:	0.858 ± 0.022


In [909]:
fitAndPrint(gb)

[[273   7   2]
 [ 34   9   2]
 [  6   4   2]]


In [910]:
predictions = gb.predict(X_scaled_dc)

In [911]:
count=0
for a,b in zip(dc["name"].values,predictions):
    if b>=1:
        print a,b

The Lafayette 2
Marcel's 1
Plume 2
Bibiana 1
Obelisk 2
Brasserie Beck 1
Sakedokoro Makoto 1
Beuchert’s Saloon 1
Al Dente Italian Restaurant D.C. 1
Mourayo 1
New Heights Restaurant 1
Westend Bistro 1
Matisse 1
Bearnaise 1
The Grill Room 1
Pesce 1
Brasserie Beck 1
Masseria 1
Garrison 1
Marcel’s 1


In [912]:
et = ExtraTreesClassifier()

In [913]:
score(et,"Extra Trees Classifier",X_scaled,y)

Extra Trees Classifier Score:	0.832 ± 0.02


In [914]:
fitAndPrint(et)

[[267  12   3]
 [ 35  10   0]
 [  5   4   3]]


In [915]:
predictions = gb.predict(X_scaled_dc)

In [916]:
count=0
for a,b in zip(dc["name"].values,predictions):
    if b>=1:
        print a,b

The Lafayette 2
Marcel's 1
Plume 2
Bibiana 1
Obelisk 2
Brasserie Beck 1
Sakedokoro Makoto 1
Beuchert’s Saloon 1
Al Dente Italian Restaurant D.C. 1
Mourayo 1
New Heights Restaurant 1
Westend Bistro 1
Matisse 1
Bearnaise 1
The Grill Room 1
Pesce 1
Brasserie Beck 1
Masseria 1
Garrison 1
Marcel’s 1


In [917]:
mich_test = pd.read_csv("michelin.csv")

In [919]:
mich_test

Unnamed: 0,restaurant,stars
0,Minibar,2
1,Marcel's,1
2,Plume,2
3,Komi,2
4,Obelisk,2
5,The Lafayette,2
6,Matisse,1
7,Restaurant Nora,1
8,Fiola Mare,1
9,Bibiana,1
