In [1]:
import pymysql
import random
import nltk
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.stem import *
from nltk import *
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.metrics import r2_score

# Set up a connection with the server
conn = pymysql.connect(host='localhost', port=8889, user='root', passwd='root')
# Set up a cursor object that will serve as a virtual 'cursor'
cursor = conn.cursor()
conn.commit()

# Fetch text and stars data (list of tuples)
cursor.execute('USE YELP_DATA_PROJECT;')
cursor.execute('SELECT text, stars FROM usa_reviews WHERE length(text) > 50;') # Querying only if length of review is greater than 50 chars, otherwise review is hard to accommodate in a bag of words.
textstars=list(cursor.fetchall())   

In [2]:
# Get a sample of the entire set of reviews, to avoid excessive computational cost.
randsamp = np.asarray(random.sample(range(1,len(textstars)+1),10000))
trainsamp = np.split(randsamp,2)[0]
testsamp = np.split(randsamp,2)[1]

# Get train data
textstars_sample_train = np.asarray(textstars)[trainsamp]
text_train = [i[0] for i in textstars_sample_train]
stars_train = [float(i[1]) for i in textstars_sample_train]

# A few statistics about the data being used
print('Train data')
print('Data size =', len(textstars_sample_train))
print('Average review length (characters) =',sum([len(r) for r in text_train])/len(text_train))
print('Average # of stars =',sum(stars_train)/len(stars_train))
print('Star values (1 to 5) =',[stars_train.count(i) for i in range(1,6)])

# Read test data
textstars_sample_test = np.asarray(textstars)[testsamp]
text_test = [i[0] for i in textstars_sample_test]
stars_test = [float(i[1]) for i in textstars_sample_test]

# A few statistics about the test data being used
print('\nTest data')
print('Data size =', len(textstars_sample_test))
print('Average review length (characters) =',sum([len(r) for r in text_test])/len(text_test))
print('Average # of stars =',sum(stars_test)/len(stars_test))
print('Star values (1 to 5) =',[stars_test.count(i) for i in range(1,6)])

Train data
Data size = 5000
Average review length (characters) = 678.4928
Average # of stars = 3.7216
Star values (1 to 5) = [449, 489, 751, 1627, 1684]

Test data
Data size = 5000
Average review length (characters) = 668.4844
Average # of stars = 3.6792
Star values (1 to 5) = [479, 513, 776, 1597, 1635]


In [3]:
# Stem
def stem(word):
    wnl = WordNetLemmatizer()
    temp_review=wnl.lemmatize(word,'v')
    return wnl.lemmatize(temp_review,'n')

# Tokenize
def tokenize(review):
    tok_review = TreebankWordTokenizer().tokenize(review)
    return tok_review

# Remove stopwords
def remove_stop(review):
    stopset = set(stopwords.words('english'))
    result_text= [w for w in review if not w in stopset]
    return result_text

# Define a comprehensive preprocessing function
def preprocess(rev_text): # list of strings
    tokenized_reviews = [[] for _ in range(len(rev_text))]
    stemmed_reviews = [[] for _ in range(len(rev_text))]
    tok_stem_reviews = [[] for _ in range(len(rev_text))]
    tok_stem_stop_reviews = [[] for _ in range(len(rev_text))]
    preproc_reviews = [[] for _ in range(len(rev_text))]

    rev_text = [i.lower() for i in rev_text] # Lower case
    for i in range(len(rev_text)):
        tokenized_reviews[i] = tokenize(rev_text[i]) # Tokenize the reviews -- Takes as input a string
        for j in range(len(tokenized_reviews[i])): 
            stemmed_reviews[i].append(stem(tokenized_reviews[i][j])) # Stem reviews
        tok_stem_stop_reviews[i]=remove_stop(stemmed_reviews[i]) # Remove stop words
        for j in range(len(tok_stem_stop_reviews[i])): 
            if(tok_stem_stop_reviews[i][j].isalpha()): # Only keep words with only letters
                preproc_reviews[i].append(tok_stem_stop_reviews[i][j])
    for i in range(len(preproc_reviews)):
        preproc_reviews[i] = " ".join(preproc_reviews[i]) # Ready to generate bag of words
    
    return preproc_reviews # list of strings

In [9]:
# Preprocess train set of reviews
preprocessed_text_train = preprocess(text_train)

# Generate bag of words (train)
vectorizer = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None, 
                             stop_words = None, 
                             max_features = 500) 
bow_train = vectorizer.fit_transform(preprocessed_text_train) # Learn the vocabulary, create feature vectors
bow_train = bow_train.toarray() # Numpy arrays are easy to work with, so convert the result to an array

# Create and sort a dict vocabulary, with all expressions included in the bag of words and their counts
vocab = vectorizer.get_feature_names()
vocabulary = dict()
dist = np.sum(bow_train, axis=0)
for expression, count in zip(vocab, dist):
    vocabulary.update({expression: count})
vocabulary = sorted(vocabulary.items(), key=lambda x:x[1])

stars_train = np.asarray(stars_train).reshape(-1,1)

In [10]:
# Preprocess test set of reviews
preprocessed_text_test = preprocess(text_test)

# Get a bag of words for the test set, and convert to a numpy array
bow_test = vectorizer.transform(preprocessed_text_test)
bow_test = bow_test.toarray()

stars_test = np.asarray(stars_test).reshape(-1,1)

In [11]:
# Create linear regression object, train the model
linreg = LinearRegression()
linreg.fit(bow_train, stars_train.ravel())

# Create a random forest regressor with 100 trees, train the model
#forest = RandomForestRegressor(n_estimators = 200) 
#forest = forest.fit(bow_train, stars_train.ravel())

# Create a SVR object, train the model
#supvec = svm.SVR(kernel = 'linear')
#supvec.fit(bow_train, stars_train.ravel())

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Generate predictions based on the bag of words (test).
stars_pred_linreg = np.asarray(linreg.predict(bow_test))
#stars_pred_rf = np.asarray(forest.predict(bow_test))
#stars_pred_supvec = np.asarray(supvec.predict(bow_test))

# Bound predictions to [1,5]
for i in range(len(stars_pred_linreg)): 
    if stars_pred_linreg[i]<1:
        stars_pred_linreg[i] = 1
    elif stars_pred_linreg[i]>5:
        stars_pred_linreg[i] = 5
        
#for i in range(len(stars_pred_rf)): 
#    if stars_pred_rf[i]<1:
#        stars_pred_rf[i] = 1
#    elif stars_pred_rf[i]>5:
#        stars_pred_rf[i] = 5

#for i in range(len(stars_pred_supvec)): 
#    if stars_pred_supvec[i]<1:
#        stars_pred_supvec[i] = 1
#    elif stars_pred_supvec[i]>5:
#        stars_pred_supvec[i] = 5

In [13]:
# Compute RMSE and MAE for each of the models.
def rmse(pred, actual):
    return np.sqrt(np.mean((pred-actual)**2))

rmse_linreg = rmse(stars_pred_linreg, stars_test)
mae_linreg = np.mean(abs(stars_pred_linreg - stars_test))
r2_linreg = r2_score(stars_test, stars_pred_linreg)
adjr2_linreg = 1-((len(bow_test)-1)/(len(bow_test)-len(vocab)-1))*(1-r2_linreg)

print('LinReg')
print('RMSE = %.4f' % rmse_linreg)
print('MAE = %.4f' % mae_linreg)
print('AdjR2 = %.4f' % adjr2_linreg)

#rmse_rf = rmse(stars_pred_rf, stars_test)
#mae_rf = np.mean(abs(stars_pred_rf - stars_test))
#print('\nRF')
#print('RMSE = %.4f' % rmse_rf)
#print('MAE = %.4f' % mae_rf)

#rmse_supvec = rmse(stars_pred_supvec, stars_test)
#mae_supvec = np.mean(abs(stars_pred_supvec - stars_test))
#print('\nPoly-2 SVM')
#print('RMSE = %.4f' % rmse_supvec)
#print('MAE = %.4f' % mae_supvec)

LinReg
RMSE = 1.4344
MAE = 1.1571
AdjR2 = 0.2683


In [None]:
importance = dict()
for i in range(len(vocab)):
    importance.update({vocab[i]: linreg.coef_[i]})
importance = sorted(importance.items(), key=lambda x:x[1])

mostpos = importance[-10:]
mostneg = importance[:10]

words_pos = [i[0] for i in mostpos]
coeff_pos = [i[1] for i in mostpos]
y_pos = np.arange(len(mostpos))

plt.barh(y_pos, coeff_pos, align='center', color='green')
plt.yticks(y_pos, words_pos)
plt.xlabel('Coefficient')
plt.title('Which words drive positive ratings the most?')
plt.show()

words_neg = [i[0] for i in mostneg]
coeff_neg = [-i[1] for i in mostneg]
y_neg = np.arange(len(mostneg))

plt.barh(y_neg, coeff_neg, align='center', color='red')
plt.yticks(y_neg, words_neg)
plt.xlabel('Coefficient (-1)')
plt.title('Which words drive negative ratings the most?')
plt.show()