# Predicting Yelp Review Star Ratings

In [1]:
import os
import json
import string
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
import time

# Base directory setting; not referenced by any of the cells shown in this notebook.
root_dir = "."
    
# Lemmatizer instance; not used by any of the cells shown in this notebook.
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
# Load the labelled reviews (each record carries 'text' and 'stars', among other keys).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding, which is not UTF-8 everywhere.
with open('yelp.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [3]:
data[0]

{'business_id': '0W4lkclzZThpx3V65bVgig',
 'cool': 0,
 'date': '2016-05-28',
 'funny': 0,
 'review_id': 'v0i_UHJMo_hPBq9bxWvW4w',
 'stars': 5,
 'text': "Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.",
 'useful': 0,
 'user_id': 'bv2nCi5Qv5vroFiqKGopiw'}

In [4]:
# Load the reviews whose star ratings are predicted later in the notebook.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding, which is not UTF-8 everywhere.
with open('yelpHeld.json', encoding='utf-8') as data_file:
    test_data = json.load(data_file)

In [5]:
# Parse the embeddings text file into a {word: vector} lookup table.
# Every line is read as "<token> <v1> <v2> ... <vN>", separated by single spaces.
word2vec = {}
with open('glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as file:
    # Stream the file line by line: the handle is closed when the block exits and
    # the raw text is never held in memory alongside the parsed vectors.
    for line in file:
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Split on single spaces only, so the token itself is never altered.
        items = line.split(' ')
        word2vec[items[0]] = np.asarray(items[1:], dtype='float32')

In [6]:
def get_list_of_words(line):
    """Split a piece of text into lowercase, letters-only words.

    The text is lowercased, every whitespace character (spaces, newlines,
    tabs, ...) acts as a word separator, and any other character that is not
    an ASCII letter a-z is dropped (so "can't" becomes "cant").

    Args:
        line: The text to tokenize.

    Returns:
        A list of non-empty words, in order of appearance. Returns an empty
        list when the text contains no letters.
    """
    kept = []
    for char in line.lower():
        if char in string.ascii_lowercase:
            kept.append(char)
        elif char.isspace():
            # Newlines and tabs must separate words, not glue the neighbouring
            # words together.
            kept.append(' ')
    # split() without arguments collapses runs of separators, so no empty
    # strings are returned.
    return "".join(kept).split()

def get_avg_vec(line, word2vec):
    """Average the embedding vectors of the known words in a piece of text.

    Args:
        line: The text to embed; it is tokenized with get_list_of_words.
        word2vec: Mapping from word to its embedding vector. All vectors are
            assumed to have the same length.

    Returns:
        A float64 numpy array with the mean of the vectors of every word of
        `line` found in `word2vec` (repeated words count once per occurrence).
        If no word is found, a vector of zeros is returned.

    Raises:
        ValueError: If `word2vec` is empty, since the vector size is unknown.
    """
    if not word2vec:
        raise ValueError("word2vec is empty; cannot determine the embedding size")

    # Take the dimensionality from an arbitrary entry rather than assuming a
    # particular token (such as 'a') exists in the vocabulary.
    vec_length = len(next(iter(word2vec.values())))
    avg = np.zeros(vec_length)
    num_words = 0

    for word in get_list_of_words(line):
        vec = word2vec.get(word)
        if vec is not None:
            avg += vec
            num_words += 1

    # Avoid dividing by zero when none of the words are in the vocabulary.
    if num_words > 0:
        avg /= num_words

    return avg

In [7]:
# Turn every labelled review into a (mean embedding, one-hot star rating) pair.
x_data, y_data = [], []

for count, sample in enumerate(data, start=1):
    x_data.append(get_avg_vec(sample['text'], word2vec))

    # The rating selects position (stars - 1) of a 5-wide one-hot label.
    hot_index = sample['stars'] - 1
    y_one_hot = [0] * 5
    y_one_hot[hot_index] = 1
    y_data.append(y_one_hot)

    # Report progress every 100,000 reviews.
    if count % 100000 == 0:
        print(count)

100000
200000
300000
400000
500000
600000
700000


In [8]:
# Convert the per-review lists into arrays and report their dimensions.
x_data, y_data = np.asarray(x_data), np.asarray(y_data)
for label, matrix in (("x_data_shape:", x_data), ("y_data_shape:", y_data)):
    print(label, matrix.shape)

x_data_shape: (749574, 200)
y_data_shape: (749574, 5)


In [9]:
from keras.layers.core import Dense, Activation, Dropout
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [10]:
# Network dimensions: the input width follows the feature matrix, and there is
# one output unit per position of the 5-wide one-hot labels.
vec_length = x_data.shape[1]
num_classes = 5

# Small feed-forward classifier over the averaged word vectors.
# The hidden layers need a non-linearity: Dense applies no activation by default,
# and stacked linear layers collapse into a single linear map.
model = Sequential()
model.add(Dense(80, input_dim=vec_length, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

# Train with 5% of the samples held out for validation, and time the run.
# `epochs` replaces the deprecated Keras 1 argument name `nb_epoch`.
start = time.time()
model.fit(x_data, y_data, batch_size=1024, epochs=10, validation_split=0.05)
print('training time : ', time.time() - start)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 80)                16080     
_________________________________________________________________
dense_2 (Dense)              (None, 10)                810       
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 55        
Total params: 16,945
Trainable params: 16,945
Non-trainable params: 0
_________________________________________________________________
None
Train on 712095 samples, validate on 37479 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training time :  38.592257022857666


In [11]:
# Featurise the held-out reviews, keeping their ids and raw text aligned row by row.
x_test_data, x_text, x_id = [], [], []

for count, sample in enumerate(test_data, start=1):
    text = sample['text']
    x_test_data.append(get_avg_vec(text, word2vec))
    x_id.append(sample['review_id'])
    x_text.append(text)
    # Report progress every 100,000 reviews.
    if count % 100000 == 0:
        print(count)

x_test_data = np.asarray(x_test_data)

100000
200000


In [14]:
# Per-class scores for every held-out review.
y_predict_probs = model.predict(x_test_data)
# argmax gives a 0-based class index; adding 1 turns it into a star rating.
y_predict_stars = [np.argmax(row) + 1 for row in y_predict_probs]

# Show rows 10..19 (predicted stars, review text, review id), each followed by
# two blank lines.
for idx in range(10, 20):
    print(y_predict_stars[idx], x_text[idx], x_id[idx])
    print()
    print()

In [15]:
# Number of predicted star ratings; expected to equal the number of review ids.
len(y_predict_stars)

250425

In [16]:
# Number of review ids collected from the held-out data.
len(x_id)

250425

In [18]:
# Write the predictions as CSV: a header line followed by one
# "review_id,stars" row per prediction.
# The `with` block closes the file on exit, which guarantees that every
# buffered row is flushed to disk.
with open('stars_wei_tang_xiao_sampugnaro.csv', 'w') as file:
    file.write('review_id,stars\n')
    for i, stars in enumerate(y_predict_stars):
        file.write(x_id[i] + "," + str(stars) + '\n')