## CSC 215 - Project 2
### Khoi Hoang
### William Dobson

In [1]:
import os
import csv
import json
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt
%matplotlib inline
import collections

In [2]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)    

# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()    
    
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)


### Convert JSON data into tabular format for Pandas

In [3]:
#reviews.json
outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('review.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

outfile.close()

In [4]:
#business.json
outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'review_count', 'categories'])
with open('business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], row['review_count'], row['categories']])

outfile.close()

In [5]:
#tip.json
outfile = open("tip.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','text'])
with open('tip.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], (row['text']).encode('utf-8')])

outfile.close()

In [3]:
df_review= pd.read_csv('review_stars.tsv', delimiter ="\t", encoding="utf-8")
df_business= pd.read_csv('business.tsv', delimiter ="\t", encoding="utf-8")
df_tip= pd.read_csv('tip.tsv', delimiter ="\t", encoding="utf-8")
df_tip[0:5]

Unnamed: 0,business_id,text
0,VaKXUpmWTTWDKbpJ3aQdMw,"b'Great for watching games, ufc, and whatever ..."
1,OPiPeoJiv92rENwbq76orA,b'Happy Hour 2-4 daily with 1/2 price drinks a...
2,5KheTjYPu1HcQzQFtm4_vw,b'Good chips and salsa. Loud at times. Good se...
3,TkoyGi8J7YFjA6SbaRzrxg,b'The setting and decoration here is amazing. ...
4,AkL6Ous6A1atZejfZXn1Bg,b'Molly is definately taking a picture with Sa...


In [4]:
df_review.head()

Unnamed: 0,business_id,stars,text
0,ujmEBvifdJM6h6RLv4wQIg,1.0,b'Total bill for this horrible service? Over $...
1,NZnhc2sEQy3RmzKTZnqtwQ,5.0,"b""I *adore* Travis at the Hard Rock's new Kell..."
2,WTqjgwHlXbSFevF32_DJVw,5.0,"b""I have to say that this office really has it..."
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,"b""Went in for a lunch. Steak sandwich was deli..."
4,b1b1eb3uo-w561D0ZfCEiQ,1.0,b'Today was my second out of three sessions I ...


In [5]:
df_review = df_review[df_review['text'].str.len() >= 50]
df_review.shape

(6678404, 3)

In [6]:
df_tip = df_tip[df_tip['text'].str.len() >= 20]
df_tip.shape

(1094522, 2)

In [7]:
#group all reviews by each business and create a new dataframe, where each line is a business with all its reviews
df_review_agg = df_review.groupby('business_id')['text'].sum()

df_all_reviews = pd.DataFrame({'business_id': df_review_agg.index, 'all_reviews': df_review_agg.values})

In [8]:
#group all tip by each business and create a new dataframe, where each line is a business with all its tips
df_tip_agg = df_tip.groupby('business_id')['text'].sum()

df_all_tips = pd.DataFrame({'business_id': df_tip_agg.index, 'all_tips': df_tip_agg.values})

In [9]:
df_all_tips.head()

Unnamed: 0,business_id,all_tips
0,--1UhMGODdWsrMastO9DZw,"b""Delicious! One of the best burritos, salsa a..."
1,--6MefnULPED_I942VcFNA,b'BBQ pork is sold out early on Saturday'b'The...
2,--7zmmkVg-IMGaXbuVd0SQ,"b""Good place to stop when traffic don't go.""b'..."
3,--9QQLMTbFzLJ_oT-ON3Xw,"b""Tip? Don't get your hair cut here. If they'r..."
4,--9e1ONYQuAa-CB_Rrw7Tw,b'Wine pairings for my prix fixe meal. Mmmm'b...


#### Only consider businesses which have more than 20 reviews

In [10]:
df_business = df_business[df_business['review_count'] >= 20]
df_business.shape

(57644, 4)

In [11]:
#merge df_business, df_all_reviews, and df_all_tips by business_id
df_ready_for_sklearn = pd.merge(pd.merge(df_all_reviews, df_business, on='business_id', how='inner'), df_all_tips, on='business_id', how='inner')
df_ready_for_sklearn.shape

(54657, 6)

In [12]:
df_ready_for_sklearn['review_count_zscore'] = zscore(df_ready_for_sklearn['review_count'])
df_ready_for_sklearn[0:5]

Unnamed: 0,business_id,all_reviews,stars,review_count,categories,all_tips,review_count_zscore
0,--1UhMGODdWsrMastO9DZw,b'If you are looking for authentic Mexican str...,4.0,24,"Restaurants, Mexican","b""Delicious! One of the best burritos, salsa a...",-0.390434
1,--6MefnULPED_I942VcFNA,"b""They have the best Chinese BBQ Pork (Char Si...",3.0,44,"Chinese, Restaurants",b'BBQ pork is sold out early on Saturday'b'The...,-0.286046
2,--7zmmkVg-IMGaXbuVd0SQ,"b""I'm a tad reluctant to write a review as I r...",4.0,58,"Breweries, Food","b""Good place to stop when traffic don't go.""b'...",-0.212974
3,--9e1ONYQuAa-CB_Rrw7Tw,b'Very busy and noisy restaurant.\nAsparagas w...,4.0,1613,"Cajun/Creole, Seafood, Steakhouses, Restaurants",b'Wine pairings for my prix fixe meal. Mmmm'b...,7.903234
4,--DaPTJW3-tB1vP-PfdTEg,b'My sister and in laws were in town and we wa...,3.5,49,"Restaurants, Breakfast & Brunch",b'Awesome. Huge portions definitely worth a v...,-0.259948


In [13]:
#label encode the stars
le = preprocessing.LabelEncoder()
df_ready_for_sklearn['encoded_stars'] = le.fit_transform(df_ready_for_sklearn['stars'])

In [14]:
print(df_ready_for_sklearn.shape)
df_ready_for_sklearn[0:5]

(54657, 8)


Unnamed: 0,business_id,all_reviews,stars,review_count,categories,all_tips,review_count_zscore,encoded_stars
0,--1UhMGODdWsrMastO9DZw,b'If you are looking for authentic Mexican str...,4.0,24,"Restaurants, Mexican","b""Delicious! One of the best burritos, salsa a...",-0.390434,6
1,--6MefnULPED_I942VcFNA,"b""They have the best Chinese BBQ Pork (Char Si...",3.0,44,"Chinese, Restaurants",b'BBQ pork is sold out early on Saturday'b'The...,-0.286046,4
2,--7zmmkVg-IMGaXbuVd0SQ,"b""I'm a tad reluctant to write a review as I r...",4.0,58,"Breweries, Food","b""Good place to stop when traffic don't go.""b'...",-0.212974,6
3,--9e1ONYQuAa-CB_Rrw7Tw,b'Very busy and noisy restaurant.\nAsparagas w...,4.0,1613,"Cajun/Creole, Seafood, Steakhouses, Restaurants",b'Wine pairings for my prix fixe meal. Mmmm'b...,7.903234,6
4,--DaPTJW3-tB1vP-PfdTEg,b'My sister and in laws were in town and we wa...,3.5,49,"Restaurants, Breakfast & Brunch",b'Awesome. Huge portions definitely worth a v...,-0.259948,5


### Use tfidfVectorizer to obtain TFIDF representation for each business

In [18]:
import sklearn.feature_extraction.text as sk_text

#obtain tfidf score for reviews
vectorizer = sk_text.TfidfVectorizer(max_features=250, min_df=10)

reviews_matrix = vectorizer.fit_transform(df_ready_for_sklearn['all_reviews'])

reviews_tfidf = reviews_matrix.toarray()

In [19]:
print(reviews_tfidf.shape)
vectorizer.get_feature_names()

(54657, 250)


['10',
 'about',
 'after',
 'again',
 'all',
 'also',
 'always',
 'am',
 'amazing',
 'an',
 'and',
 'another',
 'any',
 'are',
 'area',
 'around',
 'as',
 'asked',
 'at',
 'atmosphere',
 'away',
 'awesome',
 'back',
 'bad',
 'bar',
 'be',
 'because',
 'been',
 'before',
 'being',
 'best',
 'better',
 'big',
 'bit',
 'both',
 'burger',
 'but',
 'by',
 'came',
 'can',
 'car',
 'check',
 'cheese',
 'chicken',
 'clean',
 'coffee',
 'come',
 'could',
 'customer',
 'day',
 'definitely',
 'delicious',
 'did',
 'didn',
 'different',
 'dinner',
 'do',
 'don',
 'down',
 'drink',
 'drinks',
 'eat',
 'enough',
 'even',
 'ever',
 'every',
 'everything',
 'excellent',
 'experience',
 'favorite',
 'feel',
 'few',
 'find',
 'first',
 'food',
 'for',
 'found',
 'free',
 'fresh',
 'friendly',
 'friends',
 'fries',
 'from',
 'get',
 'give',
 'go',
 'going',
 'good',
 'got',
 'great',
 'had',
 'happy',
 'has',
 'have',
 'he',
 'her',
 'here',
 'his',
 'home',
 'hot',
 'hour',
 'how',
 'if',
 'in',
 'into'

In [20]:
#obtain tfidf score for tips
vectorizer = sk_text.TfidfVectorizer(max_features=250, min_df=10)

tips_matrix = vectorizer.fit_transform(df_ready_for_sklearn['all_tips'])

tips_tfidf = tips_matrix.toarray()

In [21]:
print(tips_tfidf[0:5])
vectorizer.get_feature_names()

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.02237383 0.         0.01310014 ... 0.02325763 0.01319684 0.02350368]
 [0.         0.         0.         ... 0.         0.         0.        ]]


['10',
 '20',
 '30',
 'about',
 'after',
 'again',
 'all',
 'also',
 'always',
 'am',
 'amazing',
 'an',
 'and',
 'any',
 'are',
 'area',
 'around',
 'as',
 'ask',
 'at',
 'atmosphere',
 'awesome',
 'back',
 'bad',
 'bar',
 'be',
 'because',
 'beef',
 'been',
 'beer',
 'before',
 'best',
 'better',
 'big',
 'breakfast',
 'bring',
 'burger',
 'busy',
 'but',
 'by',
 'can',
 'check',
 'cheese',
 'chicken',
 'clean',
 'closed',
 'coffee',
 'come',
 'coming',
 'cool',
 'cream',
 'customer',
 'day',
 'definitely',
 'delicious',
 'did',
 'dinner',
 'do',
 'don',
 'down',
 'drink',
 'drinks',
 'early',
 'eat',
 'even',
 'ever',
 'every',
 'everything',
 'excellent',
 'experience',
 'family',
 'fantastic',
 'fast',
 'favorite',
 'find',
 'first',
 'fish',
 'food',
 'for',
 'free',
 'fresh',
 'fried',
 'friendly',
 'fries',
 'from',
 'fun',
 'get',
 'getting',
 'go',
 'going',
 'good',
 'got',
 'great',
 'had',
 'half',
 'happy',
 'has',
 'have',
 'he',
 'here',
 'highly',
 'home',
 'hot',
 'ho

# Linear Regression

Preparing data for training/testing

In [22]:
#Merge reviews_tfidf, tips_tfidf and review_count into one array for input
temp = df_ready_for_sklearn['review_count_zscore'].values
input_data = np.column_stack((reviews_tfidf, tips_tfidf))
input_data = np.column_stack((input_data, temp))
input_data.shape

(54657, 501)

In [23]:
#split into train/test data
x_train, x_test, y_train, y_test = train_test_split(input_data, df_ready_for_sklearn['stars'], test_size=0.25, random_state=43)

In [24]:
test_data = input_data[test_loc[0]]
for i in range(1, len(test_loc)):
    test_data = np.append(test_data, input_data[test_loc[i]], axis=0)
test_data.shape

NameError: name 'test_loc' is not defined

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# instantiate the model (using the default parameters)
linreg = LinearRegression()

# fit the model with data
linreg.fit(x_train, y_train)

# predict the response for new observations
y_pred_Linear = linreg.predict(test_data)

In [None]:
print("Prediction result: ")
print("Business ID                         Categories             Predicted Star Rating         True Star Rating")
for i in range(5):
    print("{:.20}             {:.23}         {:.5}                         {}".format(test_businesses[i]['business_id'], test_businesses[i]['categories'], y_pred_Linear[i], test_true_stars[i]))

### Support Vector Machine

In [None]:
#split into train/test data
#using encoded stars because classification problem
x_train, x_test, y_train, y_test = train_test_split(input_data, df_ready_for_sklearn['encoded_stars'], test_size=0.25, random_state=43)

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.svm import LinearSVC

clf = LinearSVC(dual=False)
clf.fit(x_train, y_train)

y_pred_SVM = clf.predict(test_data)

In [None]:
y_pred_SVM = le.inverse_transform(y_pred_SVM)
print("Prediction result: ")
print("Business ID                         Categories             Predicted Star Rating         True Star Rating")
for i in range(5):
    print("{:.20}             {:.23}         {:.5}                         {}".format(test_businesses[i]['business_id'], test_businesses[i]['categories'], y_pred_SVM[i], test_true_stars[i]))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train, y_train)

# predict the response for new observations
y_pred_Logistic = logreg.predict(test_data)

In [None]:
y_pred_Logistic = le.inverse_transform(y_pred_Logistic)
print("Prediction result: ")
print("Business ID                         Categories             Predicted Star Rating         True Star Rating")
for i in range(5):
    print("{:.20}             {:.23}         {:.5}                         {}".format(test_businesses[i]['business_id'], test_businesses[i]['categories'], y_pred_Logistic[i], test_true_stars[i]))

### Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model (using the value K=10)
knn = KNeighborsClassifier(n_neighbors=10)

# fit the model with data
knn.fit(x_train, y_train)

In [None]:
# predict the response for new observations
y_neighbor = knn.predict(test_data)

In [None]:
y_neighbor = le.inverse_transform(y_neighbor)
print("Prediction result: ")
print("Business ID                         Categories             Predicted Star Rating         True Star Rating")
for i in range(5):
    print("{:.20}             {:.23}         {:.5}                         {}".format(test_businesses[i]['business_id'], test_businesses[i]['categories'], y_neighbor[i], test_true_stars[i]))

### Multinomial Naive Bayes

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

temp = scaler.fit_transform(df_ready_for_sklearn['review_count'].values.reshape(1, -1))

input_data_MNB = np.column_stack((reviews_tfidf, tips_tfidf))
input_data_MNB = np.column_stack((input_data_MNB, temp.reshape(-1,1)))

input_data_MNB

In [None]:
test_data = input_data_MNB[test_loc[0]]
for i in range(1, len(test_loc)):
    test_data = np.append(test_data, input_data_MNB[test_loc[i]], axis=0)
test_data.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(input_data_MNB, df_ready_for_sklearn['encoded_stars'], test_size=0.25, random_state=43)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# instantiate the model (using the default parameters)
mnb = MultinomialNB()

# fit the model with data
mnb.fit(x_train, y_train)

# predict the response for new observations
y_mnb = mnb.predict(test_data)

In [None]:
y_mnb = le.inverse_transform(y_mnb)
print("Prediction result: ")
print("Business ID                         Categories             Predicted Star Rating         True Star Rating")
for i in range(5):
    print("{:.20}             {:.23}         {:.5}                         {}".format(test_businesses[i]['business_id'], test_businesses[i]['categories'], y_mnb[i], test_true_stars[i]))

# TensorFlow

In [15]:
%matplotlib inline
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras import optimizers

Using TensorFlow backend.


In [16]:
x_train, x_test, y_train, y_test = train_test_split(df_ready_for_sklearn, df_ready_for_sklearn['encoded_stars'], test_size=0.25, random_state=43)

In [17]:
import sklearn.feature_extraction.text as sk_text

#obtain tfidf score for reviews
vectorizer = sk_text.TfidfVectorizer(max_features=250, min_df=10)

reviews_matrix_train = vectorizer.fit_transform(x_train['all_reviews'])
reviews_tfidf_train = reviews_matrix_train.toarray()

reviews_matrix_test = vectorizer.fit_transform(x_test['all_reviews'])
reviews_tfidf_test = reviews_matrix_test.toarray()

In [18]:
#obtain tfidf score for tips
vectorizer = sk_text.TfidfVectorizer(max_features=250, min_df=10)

tips_matrix_train = vectorizer.fit_transform(x_train['all_tips'])
tips_tfidf_train = tips_matrix_train.toarray()

tips_matrix_test = vectorizer.fit_transform(x_test['all_tips'])
tips_tfidf_test = tips_matrix_test.toarray()

In [19]:
temp = x_train['review_count_zscore'].values
input_data_train = np.column_stack((reviews_tfidf_train, tips_tfidf_train))
input_data_train = np.column_stack((input_data_train, temp))
input_data_train.shape

temp = x_test['review_count_zscore'].values
input_data_test = np.column_stack((reviews_tfidf_test, tips_tfidf_test))
input_data_test = np.column_stack((input_data_test, temp))
input_data_test.shape

(13665, 501)

In [23]:
df_train = pd.DataFrame(input_data_train)
df_test = pd.DataFrame(input_data_test)
df_train['encoded_stars'] = y_train
df_test['encoded_stars'] = y_test
missing_median(df_train, 'encoded_stars')
missing_median(df_test, 'encoded_stars')

In [24]:
df_train.shape

(40992, 502)

In [25]:
y_train.shape

(40992,)

In [26]:
y_train.isnull().values.any()

False

In [28]:
x_train,y_train = to_xy(df_train,'encoded_stars')

In [None]:
# Encode to a 2D matrix for training
x,y = to_xy(df_train,'stars')

In [None]:
x.shape

In [None]:
y.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=43)

In [None]:
model = Sequential()

model.add(Dense(501, input_dim=x_train.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model.add(Dense(200, activation='relu')) # Hidden 2
model.add(Dense(100, activation='relu')) # Hidden 3
model.add(Dense(10, activation='relu')) # Hidden 4
model.add(Dense(1)) # Output

model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='loss', min_delta=1e-3, patience=3, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath=".\lowest_error.hdf5", verbose=0, save_best_only=True) # save best model


model.fit(x_train,y_train,callbacks=[monitor,checkpointer],verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.


In [None]:
pred = model.predict(x_test)
print("Shape: {}".format(pred.shape))
print(pred)

In [None]:
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

In [None]:
# Plot the chart
chart_regression(pred.flatten(),y_test)