In [7]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ana58/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import data using pandas and create two dataframes with only stars and text.
all_columns = ['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'useful', 'funny', 'cool']
good_columns = ['stars', 'text']
# Can change good stars to include all 1-5 or select certain stars only
good_stars = ['1', '3', '5']
sample_size = 10000

data = pd.read_json('data/review-0-0.json')
data = data[data['stars'].isin(good_stars)]
# Selecting only the first 10000 rows for computational time and memory error
# MemoryError: Unable to allocate array with shape (100000, 113946) and data type float64
x = data.head(sample_size)['text']
y = data.head(sample_size)['stars']

In [3]:
# Cleaning the data to remove puncutation, stop words, then returns the cleaned text
def clean_text(text):
    not_punc = [char for char in text if char not in string.punctuation]
    not_punc = ''.join(not_punc)
    return [word.lower() for word in not_punc.split() if word.lower() not in stopwords.words('english')]

In [4]:
# Converts the data into a vector
tfidf_vectorizer = TfidfVectorizer(analyzer=clean_text, use_idf=False)
X = tfidf_vectorizer.fit_transform(x).toarray()

In [5]:
# Split the dataset into train and test data
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=101)

In [6]:
# Using an multilayer perceptron classifier to score the baseline Tfidf score
mlp = MLPClassifier()
mlp.fit(x_train, y_train)
predict_mlp = mlp.predict(x_test)

In [7]:
print(classification_report(y_test, predict_mlp))
print("Score:",round(accuracy_score(y_test,predict_mlp)*100,2))

              precision    recall  f1-score   support

           0       0.85      0.81      0.83       453
           1       0.89      0.92      0.91       797

    accuracy                           0.88      1250
   macro avg       0.87      0.86      0.87      1250
weighted avg       0.88      0.88      0.88      1250

Score: 87.84


In [1]:
print("Score with stars 1/2/3/4/5: 58.56")
print("Score with stars 1/2/4/5: 65.27")
print("Score with stars 1/5: 95.07")
print("Score with stars 2/4: 87.13")
print("Score with stars 1/3/5: 84.47")

Score with stars 1/2/3/4/5: 58.56
Score with stars 1/2/4/5: 65.27
Score with stars 1/5: 95.07
Score with stars 2/4: 87.13
Score with stars 1/3/5: 84.47


In [9]:
table_data = pd.read_json('data/review-0-0.json')
table_data

Unnamed: 0,stars,text
0,1,Total bill for this horrible service? Over $8G...
1,5,I *adore* Travis at the Hard Rock's new Kelly ...
2,5,I have to say that this office really has it t...
3,5,Went in for a lunch. Steak sandwich was delici...
4,1,Today was my second out of three sessions I ha...
...,...,...
99995,3,I love the clothes in Aritzia. The service I c...
99996,1,We went here to have a small dinner with famil...
99997,5,Amazing!!! Our 2 boys never had this much fun ...
99998,5,Wanted a short hair cut but didn't have a regu...
