In [1]:
import datetime

import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from scipy.sparse import csr_matrix, hstack, coo_matrix

import string
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

from nltk.stem.snowball import EnglishStemmer
import nltk

import pickle

from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the business records; lines=True reads the file as one JSON object per line.
business = pd.read_json(path_or_buf='yelp_dataset/business.json', lines=True)
# Other files of the dataset, intentionally not loaded in this cell:
#checkin = pd.read_json('yelp_dataset/checkin.json', lines=True)
#photo = pd.read_json('yelp_dataset/photo.json', lines=True)
#review =  pd.read_json('yelp_dataset/review.json', lines=True)
#tip = pd.read_json('yelp_dataset/tip.json', lines=True)
##user = pd.read_json('yelp_dataset/user.json', lines=True)

In [None]:
# Preview the first five business rows.
business.head(n=5)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [None]:
#https://github.com/pandas-dev/pandas/issues/18152

# Read the review file in fixed-size chunks instead of in one pass.
max_records = 100000  # rows per chunk; kept as an int because chunksize is a row count
df = pd.read_json('yelp_dataset/review.json', lines=True, chunksize=max_records)

# Collect the chunks and concatenate once at the end; concatenating inside the
# loop re-copies every previously loaded row on each iteration.
review_chunks = []
try:
    for df_chunk in df:
        review_chunks.append(df_chunk)
except ValueError as err:
    # Best effort: keep the chunks parsed so far, but say what went wrong.
    print('\nSome messages in the file cannot be parsed')
    print(err)

# Fall back to an empty frame when nothing could be read at all.
reviews = pd.concat(review_chunks) if review_chunks else pd.DataFrame()

In [None]:
# Size of the loaded review table, followed by its first five rows.
print(reviews.shape)
reviews.head(n=5)

In [None]:
# Attach each review to the business it was written about.
# DataFrame.join aligns on the row index, which would pair review i with
# business i regardless of which business the review belongs to; merge on the
# shared key instead.
# NOTE(review): assumes `reviews` carries a `business_id` column like `business` does — confirm.
# Columns present in both frames (e.g. `stars`) still get the `_r` / `_b`
# suffixes; the key itself stays a single `business_id` column.
joined = reviews.merge(business, on='business_id', how='inner', suffixes=('_r', '_b'))

In [None]:
# Size of the joined table, followed by its first five rows.
print(joined.shape)
joined.head(n=5)

#### Filter restaurants

In [None]:
# Keep only rows whose category string contains the literal text 'Restaurants';
# rows with a missing category string count as non-matches (na=False).
temp = joined['categories'].str.contains('Restaurants', regex=False, na=False)
joined = joined[temp]
print(joined.shape)
joined.head()

### Train/test split

In [None]:
# Model inputs: review text plus the business's city and category string.
# Target: the `stars_r` column.
temp = joined[['text', 'city', 'categories']]
X_train, X_test, y_train, y_test = train_test_split(temp, joined['stars_r'], test_size=0.2, random_state=42)

# Later cells add and overwrite columns on both splits; take explicit copies so
# those assignments act on independent frames and do not trigger pandas'
# SettingWithCopy warnings.
X_train = X_train.copy()
X_test = X_test.copy()
X_train.head()

### TF-IDF

In [None]:
# Fit the TF-IDF vocabulary on the training reviews only, then encode them.
# min_df=0.01 is passed as a fraction of documents (scikit-learn's float form).
corpus = X_train['text']
vectorizer = TfidfVectorizer(min_df=0.01).fit(corpus)

tf_train = vectorizer.transform(corpus)
tf_train

In [None]:
# Encode the held-out reviews with the vectorizer fitted on the training split.
test_corpus = X_test['text']
tf_test = vectorizer.transform(raw_documents=test_corpus)
tf_test

### One-hot encoding

In [None]:
# First five training rows before the category indicator columns are added.
X_train.head(n=5)

In [None]:
# Category keywords that each get their own 0/1 indicator column.
# Earlier candidate list, kept for reference:
#common_cats = ['Nightlife', 'Pizza', 'Burger', 'Chinese', 'Steak', 'Sandwiches', 'Fast Food']
common_cats = [
    'Pizza',
    'Mexican',
    'Chinese',
    'Italian',
    'Vietnamese',
]

In [None]:
# Create one indicator column per category in both splits, initialised to 0.
for frame in (X_train, X_test):
    for cat in common_cats:
        frame[cat] = 0
X_train.head()

In [None]:
#https://stackoverflow.com/questions/36909977/update-row-values-where-certain-condition-is-met-in-pandas
# Set an indicator to 1 wherever the category string contains the keyword
# (plain substring match, not a regular expression).
for cat in common_cats:
    for frame in (X_train, X_test):
        has_cat = frame['categories'].str.contains(cat, regex=False)
        frame.loc[has_cat, [cat]] = 1
X_train.head()

In [None]:
# Lower-case the city names in both splits so differently-cased spellings of
# the same city share one label.
for frame in (X_train, X_test):
    frame['city'] = frame['city'].str.lower()

# State is not part of the feature set; the equivalent normalisation is left disabled:
#X_train['state'] = X_train['state'].str.lower()
#X_test['state'] = X_test['state'].str.lower()

In [None]:
# One-hot encoder for the city column; cities not seen during fit are encoded
# as an all-zero row instead of raising (handle_unknown='ignore').
enc = OneHotEncoder(handle_unknown='ignore')
# The encoder needs a 2-D (n_samples, 1) input. pandas Series no longer has a
# .reshape method, so reshape the underlying NumPy array instead.
enc.fit(X_train['city'].values.reshape(-1, 1))

In [None]:
# Encode the city column of both splits with the fitted encoder.
# pandas Series no longer has a .reshape method, so the 2-D (n_samples, 1)
# input is built from the underlying NumPy array.
X_trainC = enc.transform(X_train['city'].values.reshape(-1, 1))
X_testC = enc.transform(X_test['city'].values.reshape(-1, 1))
X_trainC

In [None]:
# Flatten the encoder's per-feature category arrays into one list of city
# labels, in the same order as the encoder's output columns.
cities = [label for feature_labels in enc.categories_ for label in feature_labels]

In [None]:
# Raw NumPy view of the category indicator columns.
X_train.loc[:, common_cats].values

In [None]:
# Convert the indicator columns of each split to CSR so they can be stacked
# next to the other sparse feature blocks.
X_train_cats, X_test_cats = (
    csr_matrix(frame[common_cats].values) for frame in (X_train, X_test)
)
X_train_cats

### Join results

In [None]:
# Final design matrices, columns ordered as: TF-IDF terms | city one-hot | category flags.
X_trainJ = hstack(
    (tf_train, X_trainC, X_train_cats),
    format="csr",
)
X_testJ = hstack(
    (tf_test, X_testC, X_test_cats),
    format="csr",
)

### Grid Search

#### Ridge

In [None]:
# Regularisation strengths searched for every linear model below.
parameters = {
    "alpha": [1e-4, 1e-3, 1e-2, 1e-1],
}

In [None]:
# 10-fold cross-validated search over `parameters` for Ridge, scored by R^2.
ridge = Ridge()
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring="r2",
    cv=10,
    n_jobs=20,
)
print(ridge_grid)

In [None]:
# Print the training frame and target shapes, then run the Ridge search on the
# stacked sparse features.
for part in (X_train, y_train):
    print(part.shape)
ridge_grid.fit(X_trainJ, y_train)

In [None]:
# Report the winning alpha, then mean R^2 (+/- two standard deviations across
# folds) for every candidate in the Ridge search.
print("Best parameters set found on development set:", end="\n\n")
print(ridge_grid.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")
means = ridge_grid.cv_results_['mean_test_score']
stds = ridge_grid.cv_results_['std_test_score']
for idx, params in enumerate(ridge_grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (means[idx], stds[idx] * 2, params))
print()

#### Lasso

In [None]:
# Base Lasso estimator; alpha is spelled out at its default here and is
# overridden per candidate by the grid search.
lasso = Lasso(alpha=1.0)

In [None]:
# 10-fold cross-validated search over `parameters` for Lasso, scored by R^2.
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring="r2",
    cv=10,
    n_jobs=20,
)
print(lasso_grid)

In [None]:
# Run the Lasso search on the stacked sparse features.
lasso_grid.fit(X=X_trainJ, y=y_train)

In [None]:
# Report the winning alpha, then mean R^2 (+/- two standard deviations across
# folds) for every candidate in the Lasso search.
print("Best parameters set found on development set:", end="\n\n")
print(lasso_grid.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")
means = lasso_grid.cv_results_['mean_test_score']
stds = lasso_grid.cv_results_['std_test_score']
for idx, params in enumerate(lasso_grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (means[idx], stds[idx] * 2, params))
print()

#### Elastic Net

In [None]:
ElasticNet = ElasticNet()

In [None]:
en_grid = GridSearchCV(estimator=ElasticNet, param_grid=parameters, cv=10, scoring="r2", n_jobs=20)
print(en_grid)

In [None]:
# Run the Elastic Net search on the stacked sparse features.
en_grid.fit(X=X_trainJ, y=y_train)

In [None]:
# Report the winning alpha, then mean R^2 (+/- two standard deviations across
# folds) for every candidate in the Elastic Net search.
print("Best parameters set found on development set:", end="\n\n")
print(en_grid.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")
means = en_grid.cv_results_['mean_test_score']
stds = en_grid.cv_results_['std_test_score']
for idx, params in enumerate(en_grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (means[idx], stds[idx] * 2, params))
print()

### Predict outputs

In [None]:
# Train the final Elastic Net on the full training split.
# The class is referenced through the `linear_model` module (imported at the
# top of the notebook) so this cell works even when the bare name `ElasticNet`
# has been rebound to an estimator instance earlier in the session.
# NOTE(review): alpha=0.0001 is hard-coded — presumably the grid-search winner;
# confirm against en_grid.best_params_.
en_final = linear_model.ElasticNet(alpha=0.0001)
en_final.fit(X_trainJ, y_train)


In [None]:
# Predicted star ratings for the held-out reviews.
y_pred = en_final.predict(X=X_testJ)
y_pred

In [None]:
# Held-out R^2 of the final model.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
def genCat(cat, common_cats):
    """Build a single-row category indicator vector.

    Parameters
    ----------
    cat : str
        Category to flag.
    common_cats : sequence of str
        Category names, one per output column, in column order.

    Returns
    -------
    scipy.sparse.csr_matrix
        int64 matrix of shape ``(1, len(common_cats))`` with a 1 in every
        column whose name equals ``cat`` and 0 elsewhere. A category that is
        not in ``common_cats`` yields an all-zero row, so the width always
        matches the training features (assigning an unknown label into a
        DataFrame would silently append an extra column instead).
    """
    flags = [1 if name == cat else 0 for name in common_cats]
    return csr_matrix(np.array([flags], dtype='int64'))

In [None]:
def genCity(city):
    """Build a single-row one-hot vector for ``city``.

    Columns follow the module-level ``cities`` list (the encoder's categories).

    Parameters
    ----------
    city : str
        City label; the training labels were lower-cased, so pass it lower-cased.

    Returns
    -------
    scipy.sparse.csr_matrix
        int64 matrix of shape ``(1, len(cities))`` with a 1 in the column of
        ``city``. A city that is not in ``cities`` yields an all-zero row, the
        same encoding the fitted encoder uses for unknown values
        (``handle_unknown='ignore'``). Assigning an unknown label into a
        DataFrame would instead append a column and break the feature width.
    """
    flags = [1 if label == city else 0 for label in cities]
    return csr_matrix(np.array([flags], dtype='int64'))

In [None]:
def get_size(text, city, cat):
    """Assemble the sparse feature row(s) for the given review(s).

    ``text`` is passed to ``vectorizer.transform`` (callers in this notebook
    pass a list with one string); ``city`` and ``cat`` are encoded with
    ``genCity`` and ``genCat``. The three blocks are stacked in the same order
    as the training matrix: TF-IDF terms, city one-hot, category flags.

    Returns the stacked features as a CSR matrix.
    """
    blocks = [
        vectorizer.transform(text),
        genCity(city),
        genCat(cat, common_cats),
    ]
    return hstack(blocks, format="csr")

In [None]:
def predict_city_cat(text, city, cat):
    """Predict the star rating for a review of a ``cat`` restaurant in ``city``.

    Parameters
    ----------
    text : list of str
        Review text(s), as accepted by ``vectorizer.transform``.
    city : str
        City label (lower-cased, like the training labels).
    cat : str
        Category name from ``common_cats``.

    Returns
    -------
    The output of ``en_final.predict`` for the assembled feature row(s).
    """
    # Feature assembly lives in get_size; reuse it rather than repeating the
    # vectorize / encode / hstack steps here.
    return en_final.predict(get_size(text, city, cat))

In [None]:
# Spot-check the model: the same review text across city/category pairs,
# for a neutral, a positive and a negative wording.
city_cat_pairs = [
    ("phoenix", "Pizza"),
    ("las vegas", "Pizza"),
    ("phoenix", "Chinese"),
    ("las vegas", "Chinese"),
]

for city, cat in city_cat_pairs:
    print(predict_city_cat(["this is a review"], city, cat))
print()
for city, cat in city_cat_pairs:
    print(predict_city_cat(["This is a great restaurant!"], city, cat))
print(predict_city_cat(["This is a good review"], "las vegas", "Chinese"))
print()
for city, cat in city_cat_pairs:
    print(predict_city_cat(["This is a bad restaurant!"], city, cat))


### pickle

In [None]:
# Persist the fitted city encoder, the TF-IDF vectorizer and the final model,
# each to its own pickle file.
for filename, artifact in (
    ('enc.pkl', enc),
    ('vec.pkl', vectorizer),
    ('model.pkl', en_final),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

In [None]:
# Round-trip the final model through pickle in memory: `s` holds the serialized
# bytes and `p` is the model reconstructed from them.
s = pickle.dumps(en_final)
p = pickle.loads(s)

In [None]:
def pickle_predict(text, city, cat):
    """Predict with the pickle-round-tripped model ``p``.

    Takes the same arguments as ``predict_city_cat``: ``text`` is a list of
    review strings, ``city`` a lower-cased city label and ``cat`` a category
    name from ``common_cats``.

    Returns the output of ``p.predict`` for the assembled feature row(s).
    """
    # Feature assembly lives in get_size; reuse it rather than repeating the
    # vectorize / encode / hstack steps here.
    return p.predict(get_size(text, city, cat))

In [None]:
# Repeat the spot checks against the reloaded model.
city_cat_pairs = [
    ("phoenix", "Pizza"),
    ("las vegas", "Pizza"),
    ("phoenix", "Chinese"),
    ("las vegas", "Chinese"),
]

for city, cat in city_cat_pairs:
    print(pickle_predict(["this is a review"], city, cat))
print()
for city, cat in city_cat_pairs:
    print(pickle_predict(["This is a great restaurant!"], city, cat))
print(pickle_predict(["This is a good review"], "las vegas", "Chinese"))
print()
for city, cat in city_cat_pairs:
    print(pickle_predict(["This is a bad restaurant!"], city, cat))