In [1]:
import requests
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

## Pulling the data

In [2]:
API_KEY = 'jUkhESUhw68Xd0nxttDKkDyfMqIWcDiBLdgVbYTrE6Kz5GS8AFuqEyAnyvFCS-JTekXw006bjKYLBjoP_WB-6jkXnKLimf1DBv9JTjgHHeeVa6HNtKxDINVgqhAXXHYx'

In [3]:
# see api reference: https://www.yelp.com/developers/documentation/v3/business_search
def get_data(num_rows=1000):
    limit = 50
    url_params = {
        'term': 'chinese',
        'location': 'NYC', 
        'limit': limit,
        'offset': 0
    }
    headers = {
        'Authorization': 'Bearer %s' % API_KEY,
    }
    
    df = pd.DataFrame()
    
    num_requests = int(num_rows / limit)
    for i in range(num_requests):
        print(u'Querying ...')
        response = requests.request('GET', 'https://api.yelp.com/v3/businesses/search', headers=headers, params=url_params)
        new_df = pd.DataFrame(response.json()['businesses'])
        df = pd.concat([df, new_df])
        
        url_params['offset'] += limit
    
    return df

In [4]:
data = get_data()

Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...
Querying ...


In [5]:
data.head()

Unnamed: 0,alias,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url
0,dun-huang-grand-central-new-york-4,"[{'alias': 'chinese', 'title': 'Chinese'}, {'a...","{'latitude': 40.74922, 'longitude': -73.97785}",(646) 964-4007,5062.808036,mUl0wtO7tPkBocuJqZ-7Dw,https://s3-media3.fl.yelpcdn.com/bphoto/PysnbQ...,False,"{'address1': '320 Lexington Ave', 'address2': ...",Dun Huang - Grand Central,16469644007,,4.5,26,"[pickup, delivery]",https://www.yelp.com/biz/dun-huang-grand-centr...
1,chinos-rotisserie-chicken-new-york,"[{'alias': 'chickenshop', 'title': 'Chicken Sh...","{'latitude': 40.71474, 'longitude': -73.9983}",(646) 559-0331,1085.627609,PBvQjFOcMBn76pb8okbHOg,https://s3-media3.fl.yelpcdn.com/bphoto/AYqTd2...,False,"{'address1': '23 Pell St', 'address2': None, '...",Chino's Rotisserie Chicken,16465590331,,4.5,25,"[pickup, delivery]",https://www.yelp.com/biz/chinos-rotisserie-chi...
2,mission-chinese-food-brooklyn,"[{'alias': 'chinese', 'title': 'Chinese'}]","{'latitude': 40.70933, 'longitude': -73.92315}",(718) 628-3731,6011.362329,IoAUbSudW1Mds5_XY_01Rg,https://s3-media3.fl.yelpcdn.com/bphoto/5xZaXa...,False,"{'address1': '599 Johnson Ave', 'address2': ''...",Mission Chinese Food,17186283731,,4.0,13,"[pickup, delivery]",https://www.yelp.com/biz/mission-chinese-food-...
3,shu-jiao-fu-zhou-cuisine-restaurant-new-york,"[{'alias': 'chinese', 'title': 'Chinese'}]","{'latitude': 40.7184290357839, 'longitude': -7...",(212) 625-2532,1460.31286,lynQoI3w_pzYfHGeuUU-Qg,https://s3-media2.fl.yelpcdn.com/bphoto/TALCyE...,False,"{'address1': '118 Eldridge St', 'address2': No...",Shu Jiao Fu Zhou Cuisine Restaurant,12126252532,$,4.5,587,[pickup],https://www.yelp.com/biz/shu-jiao-fu-zhou-cuis...
4,hunan-delight-brooklyn,"[{'alias': 'chinese', 'title': 'Chinese'}]","{'latitude': 40.6755282563532, 'longitude': -7...",(718) 789-1400,3596.767124,_XLLSXPIjgoZQtwU3RCcQA,https://s3-media2.fl.yelpcdn.com/bphoto/2M169b...,False,"{'address1': '752 Union St', 'address2': '', '...",Hunan Delight,17187891400,$$,4.0,267,[],https://www.yelp.com/biz/hunan-delight-brookly...


In [6]:
data.shape

(1000, 16)

In [7]:
data[data.alias.duplicated()]

Unnamed: 0,alias,categories,coordinates,display_phone,distance,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url


In [8]:
data.to_csv("nyc_chinese_restaurants_1k.csv", index=False)

## Good vs. Bad

In [9]:
data = pd.read_csv("nyc_chinese_restaurants_1k.csv")

In [10]:
data.columns

Index(['alias', 'categories', 'coordinates', 'display_phone', 'distance', 'id',
       'image_url', 'is_closed', 'location', 'name', 'phone', 'price',
       'rating', 'review_count', 'transactions', 'url'],
      dtype='object')

In [11]:
data[['rating', 'review_count']].describe()

Unnamed: 0,rating,review_count
count,1000.0,1000.0
mean,3.747,144.194
std,0.534113,337.260097
min,2.0,1.0
25%,3.5,15.0
50%,4.0,39.0
75%,4.0,140.0
max,5.0,5539.0


The average NYC Chinese restaurant has a 4 star rating with 39 reviews. 

In [12]:
data['is_good'] = data.apply(lambda r: True if r.review_count > 10 and r.rating >= 4 else False, axis=1)

In [13]:
data.is_good.value_counts()

False    608
True     392
Name: is_good, dtype: int64

39% of Chinese restaurants are "good" (i.e. have >10 reviews and and have greater than a 4 star rating).

## Bag of Words

In [14]:
def get_ngram_counts(v, corpus):
    # given a sparse matrix, return the top ngrams as a Pandas series with their counts
    m = v.fit_transform(corpus)
    ngrams = list(zip(*sorted(v.vocabulary_.items(), key=lambda x: x[1])))[0]
    totals = m.toarray().sum(axis=0)
    return pd.Series(data=totals, index=ngrams)

### Unigrams

In [15]:
vectorizer = CountVectorizer()
unigrams = get_ngram_counts(vectorizer, data.name).sort_values(ascending=False).to_frame()
unigrams.columns = ['ct']
unigrams.head(10)

Unnamed: 0,ct
restaurant,194
chinese,145
kitchen,100
new,71
garden,69
china,68
house,50
king,38
noodle,33
golden,33


### Bigrams

In [16]:
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
get_ngram_counts(bigram_vectorizer, data.name).sort_values(ascending=False).head(10)

chinese restaurant    79
chinese kitchen       17
hong kong             16
xi an                 13
famous foods          13
an famous             12
great wall            11
chinese food          11
garden chinese        10
noodle shop            9
dtype: int64

## Training the model

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data.name, data.is_good, test_size=0.2, random_state=42)
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42))
])

clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [18]:
np.mean(clf.predict(X_test) == y_test)

0.63500000000000001

In [19]:
metrics.roc_auc_score(y_test, clf.predict(X_test))

0.58921302578018997

In [20]:
print(metrics.classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

      False       0.73      0.72      0.73       134
       True       0.45      0.45      0.45        66

avg / total       0.64      0.64      0.64       200



In [21]:
tn, fp, fn, tp = metrics.confusion_matrix(y_test, clf.predict(X_test)).ravel()

print('true negatives: {:d}'.format(tn))
print('false positives: {:d}'.format(fp))
print('false negatives: {:d}'.format(fn))
print('true positives: {:d}'.format(tp))

true negatives: 97
false positives: 37
false negatives: 36
true positives: 30


It's easier to identify a bad restaurant than it is to identify a good one.

## Biggest word red-flags

In [22]:
model_vect = clf.get_params()['steps'][0][1]

model = clf.get_params()['steps'][2][1]

In [23]:
weights = pd.DataFrame(data={'coef': model.coef_[0]}, index=model_vect.get_feature_names())

In [24]:
weights_with_counts = weights.merge(unigrams, how='left', left_index=True, right_index=True)

In [25]:
weights_with_counts.query("ct >= 10").sort_values('coef', ascending=True).head(10)

Unnamed: 0,coef,ct
golden,-4.246894,33
wah,-4.206801,16
hot,-4.018966,11
restaurant,-3.522196,194
peking,-2.874666,12
city,-2.31796,11
star,-2.030389,11
kitchen,-2.006599,100
mr,-1.867034,16
lee,-1.783159,14


In [26]:
data[data.name.apply(lambda x: 'golden' in x.lower())][['name', 'rating', 'review_count']]

Unnamed: 0,name,rating,review_count
101,Trinidad Golden Place Restaurant,4.5,107
126,Golden Z,4.0,70
153,Golden Forest,4.0,139
158,Golden Chopstick,3.5,61
248,Golden Dragon,3.5,68
272,Golden Fried Dumpling,3.5,329
291,New Golden Star,3.5,72
329,Christopher's Golden Woks,3.5,49
330,Golden City Chinese Restaurant,3.5,76
363,New Golden Dragon Restaurant,4.0,25


## Random name generator

In [27]:
def gen_name_and_predict(length=3):
    get_words = random.sample(model_vect.vocabulary_.keys(), length)
    name = ' '.join([w.capitalize() for w in get_words])
    m = model_vect.transform([name])
    if model.predict(m)[0]:
        return name

In [28]:
i = 1
max_iterations = 100
while True:
    name = gen_name_and_predict(2)
    if name:
        print("Successful name found after {:d} iterations!".format(i))
        print(name)
        break
    
    i += 1
    
    if i == max_iterations:
        print("No name found after {:d} iterations".format(max_iterations))
        break

Successful name found after 6 iterations!
Chowtime He


Some fun ones:
• Super Kathy
• Wong Gate
• Uncle Chowtime
• Great Fatt

In [29]:
model.predict(model_vect.transform(['hua long']))[0]

False