In [0]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [0]:
pip install vaderSentiment

In [0]:
import string
import math 
import re
import gensim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import matplotlib.colors as mcolors
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gensim import corpora
from collections import defaultdict
from wordcloud import WordCloud
from pprint import pprint
from collections import Counter

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
from google.colab import drive
df = pd.read_csv('/content/gdrive/My Drive/yelp.csv')
df

In [0]:
print("Count according to the cool level")
Cool_count = df['cool'].value_counts()
print(Cool_count)

print("Count according to the useful level")
Useful_count = df['useful'].value_counts()
print(Useful_count)

print("Count according to the funny level")
Funny_count = df['funny'].value_counts()
print(Funny_count)

In [0]:
#remove newlines in text
def preprocess(text):
    return text.replace('\n\n',' ')

In [0]:
#create list of all reviews with the new line symbols removed
all_reviews = []
for i in df.index:
    all_reviews.append(preprocess(df.text[i]))

In [0]:
#calculate tf-idf
tfidf = TfidfVectorizer()
tfidf_table = tfidf.fit_transform(all_reviews)

In [0]:
n_features = len(tfidf.get_feature_names())

In [0]:
#save average tf-idf for each review
avg_non_zero_tf_idf =[]
avg_tf_idf =[]
for i in range(len(all_reviews)):
    
    non_zero_doc_tf_idf = tfidf_table[i].tocoo().data.tolist()
    
    if len(non_zero_doc_tf_idf) ==0:
        avg_non_zero_tf_idf.append(0)
        
    else: avg_non_zero_tf_idf.append(sum(non_zero_doc_tf_idf)/len(non_zero_doc_tf_idf))
    
    avg_tf_idf.append(10000*tfidf_table[i].tocoo().data.sum()/n_features)

In [0]:
#calculate other features for the model

stop_words=stopwords.words("english")

def is_not_only_punctuation(word):
    return not all([x  in string.punctuation for x in word  ]) 

n_words =[] #number of words
n_sent =[] #number of sentences
for review in all_reviews:
    word_list = nltk.word_tokenize(review)
    #word_list_no_puncuation = [word for word in word_list if is_not_only_punctuation(word)]
    n_words.append(len(n_words))
    n_sent.append( len(nltk.sent_tokenize(review)))
    
n_paras = []  #number of paragraphs
for i in df.index:
     n_paras.append(df.text[i].count("\n\n")+1)
    

print("Total number of words are:", sum(n_words))
print("Total number of sentences are:", sum(n_sent))
print("Total number of paragraphs are:", sum(n_paras))

In [0]:
#create binary variables for cool, funny, and useful
df['is_cool']= df.cool>1
df['is_funny']= df.funny>1
df['is_useful']= df.useful>1

#create binary variables for stars
df['star_1']= df.stars == 1
df['star_2']= df.stars == 2
df['star_3']= df.stars == 3
df['star_4']= df.stars == 4
df['star_5']= df.stars == 5

#save other features
df['avg_non_zero_tf_idf'] = avg_non_zero_tf_idf 
df['avg_tf_idf'] = avg_tf_idf
df['n_words']= n_words
df['n_sent']= n_sent
df['n_paras']= n_paras

df['exclaim'] = df.text.str.contains('!')

In [0]:
lexical_diversity =[]

for review in all_reviews:
    word_list = nltk.word_tokenize(review.lower())
    unique_word_list = set(word_list)

    lexical_diversity.append(len(unique_word_list)/len(word_list))    
df['LD']= lexical_diversity

In [0]:
analyzer = SentimentIntensityAnalyzer()

## to account for sentiment, we can include the results from sentinment analysis 
#into the model as features

## try both numerical (positive, negative, neutral scores) or binary
# a review can have a high amount of both positive and negative sentiment,
# so keep both positive and negative aspects seperate instead of using compound score


neg = []
pos =[]
neu = []
for review in all_reviews:
    sent=analyzer.polarity_scores(review)

    neg.append(sent['neg'])
    pos.append(sent['pos'])
    neu.append(sent['neu'])
    
df['neg']= neg
df['pos']= pos
df['neu']= neu

df['is_neg']= df['neg']>.5
df['is_pos']= df['pos']>.5
df['is_neu']= df['neu']>.5

# Average TF IDF over non zero values (i.e, only over words in the doc)
through trail and error this is the best model I found, based of R2 of test set

coefs for cool and useful are (-) and funny is (+)

In [0]:
#split into train and test set 

X = df[['is_pos','is_neg','is_cool','is_funny','is_useful','star_1','star_2','star_3','star_4','star_5','n_sent','n_paras']]

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#fit model
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train, y_train)


for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

pred=regression_model.predict(X_test) #make prediction on test set
error = math.sqrt(metrics.mean_squared_error(y_test,pred)) #calculate rmse

print('Test RMSE:: ',error)
print('Test score::',regression_model.score(X_test,y_test)) #R2 score

The coefficient for is_pos is 0.10993827359513889
The coefficient for is_neg is 0.19990059373579422
The coefficient for is_cool is -0.00963089254682753
The coefficient for is_funny is 0.007797862753552836
The coefficient for is_useful is -0.012095779065884747
The coefficient for star_1 is 0.003408518468594225
The coefficient for star_2 is -0.0038517214683838714
The coefficient for star_3 is -0.008031035867474617
The coefficient for star_4 is 0.0012319870904246993
The coefficient for star_5 is 0.007242251776838842
The coefficient for n_sent is -0.01168984898140521
The coefficient for n_paras is -0.0032775392523697947
Test RMSE::  0.07880016174460247
Test score:: 0.5940129842958047


# Average tf-idf over all values (all words)
Cool and useful (+) and funny is (-)

In [0]:
#split into train and test set 

X = df[['pos','neg','is_cool','is_funny','is_useful','star_1','star_2','star_3','star_4','star_5','n_sent','n_paras','exclaim']]

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#fit model
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train, y_train)


for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

pred=regression_model.predict(X_test) #make prediction on test set
error = math.sqrt(metrics.mean_squared_error(y_test,pred)) #calculate rmse

print('Test RMSE:: ',error)
print('Test score::',regression_model.score(X_test,y_test)) #R2 score

The coefficient for pos is 0.2779607247212962
The coefficient for neg is 0.2047841913198902
The coefficient for is_cool is -0.00478267725038981
The coefficient for is_funny is 0.002917193024210482
The coefficient for is_useful is -0.006722879637917491
The coefficient for star_1 is 0.014561080832083864
The coefficient for star_2 is 0.005010500996677312
The coefficient for star_3 is -0.004204690558490066
The coefficient for star_4 is -0.008615791124996032
The coefficient for star_5 is -0.006751100145273945
The coefficient for n_sent is -0.003843231827473398
The coefficient for n_paras is -0.0006349143239790661
The coefficient for exclaim is -0.00891881551923842
Test RMSE::  0.04901618012917836
Test score:: 0.5001241830050258


# Lexical diversity
Cool and useful (-) and funny is (+)

In [0]:
#split into train and test set 

X = df[['pos','neg','is_cool','is_funny','is_useful','star_1','star_2','star_3','star_4','star_5','n_sent','n_paras']]

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#fit model
regression_model = LinearRegression(fit_intercept=True)
regression_model.fit(X_train, y_train)


for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

pred=regression_model.predict(X_test) #make prediction on test set
error = math.sqrt(metrics.mean_squared_error(y_test,pred)) #calculate rmse

print('Test RMSE:: ',error)
print('Test score::',regression_model.score(X_test,y_test)) #R2 score

The coefficient for pos is 0.23369753575272212
The coefficient for neg is 0.17962682803630117
The coefficient for is_cool is -0.007988674394703932
The coefficient for is_funny is 0.006936270814023489
The coefficient for is_useful is -0.010729049113249272
The coefficient for star_1 is 0.012491592738215372
The coefficient for star_2 is 0.0015502839771365734
The coefficient for star_3 is -0.007128739912120648
The coefficient for star_4 is -0.005138536324285967
The coefficient for star_5 is -0.0017746004789447236
The coefficient for n_sent is -0.011477299480761649
The coefficient for n_paras is -0.00265164041493271
Test RMSE::  0.07729747946928243
Test score:: 0.6093493137907626
