In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import re
import contractions

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jakeisrael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jakeisrael/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Read Data Keeping only Reviews and Ratings

In [3]:
# Read data, keeping only the star_rating and review_body columns
input = pd.read_csv("data.tsv", sep='\t', usecols = ['star_rating','review_body'])

# Drop rows with null values
input = input.dropna()

# Cast ratings to int
input["star_rating"] = input["star_rating"].astype(int)

  input = pd.read_csv("data.tsv", sep='\t', usecols = ['star_rating','review_body'])


 ## We select 20000 reviews randomly from each rating class.



In [4]:
n = 20000

onestar = input[input["star_rating"] == 1].sample(n)
twostar = input[input["star_rating"] == 2].sample(n)
threestar = input[input["star_rating"] == 3].sample(n)
fourstar = input[input["star_rating"] == 4].sample(n)
fivestar = input[input["star_rating"] == 5].sample(n)

df = pd.concat([onestar, twostar, threestar, fourstar, fivestar]).sample(frac=1)

# Data Cleaning



In [5]:
preCleaningAvgLength = df['review_body'].str.len().mean()

# Lowercase all reviews
df["review_body"] = df["review_body"].str.lower()

# Remove html tags
def remove_html(s):
    res = re.sub('<.*?>','',s)
    return res

df["review_body"] = df["review_body"].apply(lambda s : remove_html(s))

# Remove URLs
def remove_urls(s):
    res = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', s, flags=re.MULTILINE)
    return res


df["review_body"] = df["review_body"].apply(lambda s : remove_urls(s))

# Expand contractions
df["review_body"] = df["review_body"].apply(lambda s : contractions.fix(s))

# Remove all nonalphabetic characters
def remove_nonalpha(s):
    res = re.sub('[^a-zA-Z ]+', '', s)
    return res

df["review_body"] = df["review_body"].apply(lambda s : remove_nonalpha(s))

# Strip out extra whitespace
df["review_body"] = df["review_body"].apply(lambda s : s.strip())

In [20]:
postCleaningAvgLength = df['review_body'].str.len().mean()
print(preCleaningAvgLength, postCleaningAvgLength, sep=',')

189.7216,108.8351


# Pre-processing

## remove the stop words 

In [8]:
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

# Remove stopwords
df['review_body'] = df['review_body'].apply(lambda s : ' '.join([word for word in s.split() if word not in stop_words]))

## perform lemmatization  

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

def lemmatize_string(s):
    return ' '.join([lemmatizer.lemmatize(word) for word in tokenizer.tokenize(s)])

# Perform lemmatization on review words
df['review_body'] = df['review_body'].apply(lambda s : lemmatize_string(s))

postProcessingAvgLength = df['review_body'].str.len().mean()
print(postCleaningAvgLength, postProcessingAvgLength, sep=',')


183.50953 108.8351


In [None]:
# TF-IDF Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
vectorizer = TfidfVectorizer()

# Extract TF-IDF weights
weights = vectorizer.fit_transform(df["review_body"])

train_x, test_x, train_y, test_y = train_test_split(weights, df["star_rating"], test_size=0.2, random_state=1)

# Perceptron

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

p = Perceptron()
p.fit(train_x, train_y)

In [None]:
test_predictions = p.predict(test_x)

stats = precision_recall_fscore_support(test_y, test_predictions)

precision = stats[0]
recall = stats[1]
fscore = stats[2]

for i in range(5):
    print(precision[i], ",", recall[i], ",", fscore[i], sep='')
print(np.average(precision), "," , np.average(recall), "," , np.average(fscore), sep='')

# SVM

In [None]:
from sklearn.svm import LinearSVC
svm = LinearSVC()
svm.fit(train_x, train_y)

In [None]:
test_predictions = svm.predict(test_x)

stats = precision_recall_fscore_support(test_y, test_predictions)

precision = stats[0]
recall = stats[1]
fscore = stats[2]

for i in range(5):
    print(precision[i], ",", recall[i], ",", fscore[i], sep='')
print(np.average(precision), "," , np.average(recall), "," , np.average(fscore), sep='')

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(max_iter=1000000).fit(train_x, train_y)

In [None]:
test_predictions = log.predict(test_x)

stats = precision_recall_fscore_support(test_y, test_predictions)

precision = stats[0]
recall = stats[1]
fscore = stats[2]

for i in range(5):
    print(precision[i], ",", recall[i], ",", fscore[i], sep='')
print(np.average(precision), "," , np.average(recall), "," , np.average(fscore), sep='')

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
bayes = MultinomialNB().fit(train_x, train_y)

In [None]:
test_predictions = bayes.predict(test_x)

stats = precision_recall_fscore_support(test_y, test_predictions)

precision = stats[0]
recall = stats[1]
fscore = stats[2]

for i in range(5):
    print(precision[i], ",", recall[i], ",", fscore[i], sep='')
print(np.average(precision), "," , np.average(recall), "," , np.average(fscore), sep='')