# RQ3
> Can we predict the number of stars given to a review from the sentiment of that review?
- independent: sentiment of review
- dependent: # of stars given for review

## todo
1. build a linear regression model
2. build a logistic regression model
3. build a polynomial regression model (if possible)
4. graph all the results

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.metrics import confusion_matrix

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import nltk
nltk.data.path.append('/usr/share/nltk_data/')
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import string

# matplotlib things
# Open a 3x6 inch figure at 60 dpi (180x360 px); nothing is drawn on it in this cell,
# so the notebook only reports an empty figure with 0 Axes.
plt.figure(figsize=(3, 6), dpi=60)
# Switch to the 'seaborn-v0_8' style sheet.
# NOTE(review): this runs after the figure above has already been created —
# confirm that ordering is intended.
plt.style.use('seaborn-v0_8')
# plt.rcParams['font.family'] = ['Times New Roman', 'serif']

<Figure size 180x360 with 0 Axes>

In [2]:
# load the combined sentiment dataset: comma-separated, first row is the header,
# and rows that cannot be parsed are skipped instead of raising
df = pd.read_csv(
    './data/combined_sentiments.csv',
    sep=',',
    header=0,
    on_bad_lines='skip',
)

# lemmatise
def get_wordnet_pos(pos_tag):
    """Translate a part-of-speech tag string into a WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag falls back to
    wordnet.NOUN. (Presumably the tags are the ones produced by nltk.pos_tag;
    verify against the caller.)
    """
    # checked in order; the first matching prefix wins
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # unknown tag: treat the word as a noun
    return wordnet.NOUN

# check whether there is a digit or not
def check_digits(text):
    """Return True if at least one element of *text* is a digit, else False."""
    for char in text:
        if char.isdigit():
            # one digit is enough; stop scanning
            return True
    return False

# tokenise
def clean_review(review):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Processing steps, in order: lowercase the text, split it on whitespace,
    strip leading/trailing punctuation from each token, drop tokens that
    contain a digit, drop English stop words and empty tokens, lemmatise each
    token using its part-of-speech tag, and finally drop one-character tokens.

    Parameters:
        review: the raw review text. Non-string values are converted with
            ``str``; missing values (``None`` or a float NaN) produce an
            empty string instead of the literal words "none" / "nan".

    Returns:
        str: the cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # a missing value must not leak into the corpus as the token "nan"/"none";
    # NaN is the only float that is not equal to itself
    if review is None or (isinstance(review, float) and review != review):
        return ''

    text = str(review).lower()

    # split() without an argument splits on any run of whitespace, so tabs and
    # newlines separate tokens too (split(' ') left them glued inside tokens)
    tokens = [word.strip(string.punctuation) for word in text.split()]
    tokens = [token for token in tokens if not check_digits(token)]

    # a set gives constant-time membership tests for the stop-word filter
    stop = set(stopwords.words('english'))
    # drop stop words and tokens that became empty after stripping punctuation
    tokens = [token for token in tokens if token and token not in stop]

    # tag each token with its part of speech (pos), then lemmatise with one
    # shared lemmatiser instead of building a new one per token
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # remove words with only one letter
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 1)

# add a 'reviews.clean' column holding the cleaned, tokenised and lemmatised
# text of every entry in 'reviews.text'
df['reviews.clean'] = df['reviews.text'].apply(clean_review)