# Movie Review Sentiment Analysis

## Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from my_measures import BinaryClassificationPerformance

## Importing dataset

In [2]:
dataset = pd.read_csv('moviereviews_train.tsv', sep='\t', quoting = 3)

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tqdm import tqdm
import pandas as pd

nltk.download('stopwords')

total_reviews = len(dataset['review'])  # This dynamically gets the total number of reviews

corpus = []
word_counts = []  # List to store word counts
ps = PorterStemmer()  # Instantiate PorterStemmer once, before the loop
stop_words = set(stopwords.words('english'))  # Call this once before the loop

# Words to keep (negations and specific domain words)
words_to_keep = {'not', "isn't", "doesn't", "couldn't", "could not", "is not", "does not", "movie", "film", "cinema"}

# Remove these words from the stop_words set
stop_words = stop_words.difference(words_to_keep)

for i in tqdm(range(total_reviews), desc="Cleaning reviews: "):
    review = dataset['review'][i]
    review = re.sub('[^a-zA-Z]', ' ', review)  # Keep only alphabetic characters
    review = review.lower()  # Convert to lower case
    review_words = review.split()  # Split into words
    # Stem words not in stop_words
    cleaned_review = [ps.stem(word) for word in review_words if word not in stop_words]
    word_counts.append(len(cleaned_review))  # Count words after cleaning and stemming
    cleaned_review = ' '.join(cleaned_review)  # Join words back into a single string
    corpus.append(cleaned_review)


[nltk_data] Downloading package stopwords to /Users/main/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Cleaning reviews: 100%|██████████████████| 25000/25000 [00:44<00:00, 565.58it/s]


## Creating the sparse matrix | matrix of token counts | Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer
cv = CountVectorizer(ngram_range=(1, 2))
X_cv = cv.fit_transform(corpus)
# hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
# X_cv = hv.fit_transform(corpus)

## Creating a TF-IDF weighted document-term matrix

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_cv)

## Creating additional features

In [6]:
import string
from scipy.sparse import csr_matrix, hstack

# 1. Get the count of tokens in the corpus
# num_tokens = len(cv.vocabulary_)
# print(f"Number of tokens in the corpus: {num_tokens}")

word_counts_sparse = csr_matrix(word_counts).T  # Convert to CSR and transpose to a column vector

punctuation_counts = dataset['review'].apply(lambda x: sum(c in string.punctuation for c in x))

# Convert to a sparse matrix format
punctuation_counts_sparse = csr_matrix(punctuation_counts).T  # Ensure it's a column

# 2. Calculate token count per document
# Summing across columns for each document (axis=1) gives the total token count per document
# Ensure it remains a sparse matrix and transpose to column matrix
token_counts_per_doc = csr_matrix(X_cv.sum(axis=1))

# 3. Stack this feature with X_tfidf
X_combined = hstack([X_tfidf]) #, punctuation_counts_sparse, word_counts_sparse])

## Creating the (scaled) matrix of features and the dependent variable vector

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_combined)
y = dataset['sentiment']

## Splitting the dataset into 'train' and 'test' set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

## Training the models on the 'train' set

In [9]:
# Ordinary Least Squares

from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_error")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 5352, 'TN': 4816, 'FP': 5196, 'FN': 4636, 'Accuracy': 0.5084, 'Precision': 0.5073947667804323, 'Recall': 0.5358430116139368, 'desc': 'ols_train'}


In [10]:
# SVM, linear

from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'svm_train'}


In [11]:
# Logistic Regression

from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log_loss')
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'lgs_train'}


In [12]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'nbs_train'}


In [13]:
# Perceptron

from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'prc_train'}


In [14]:
# Ridge Regression Classifier

from sklearn import linear_model
rdg = linear_model.RidgeClassifier()
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'rdg_train')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'rdg_train'}


In [15]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(max_depth=2, random_state=90)
rdf.fit(X_train, y_train)

rdf_performance_train = BinaryClassificationPerformance(rdf.predict(X_train), y_train, 'rdf_train')
rdf_performance_train.compute_measures()
print(rdf_performance_train.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 7474, 'TN': 7653, 'FP': 2359, 'FN': 2514, 'Accuracy': 0.75635, 'Precision': 0.7600935624936439, 'Recall': 0.7482979575490589, 'desc': 'rdf_train'}


## Predicting the 'test' set results

In [16]:
# Ordinary Least Squares

ols_performance_test = BinaryClassificationPerformance(ols.predict(X_test), y_test, 'ols_test')
ols_performance_test.compute_measures()
print(ols_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 1346, 'TN': 1184, 'FP': 1304, 'FN': 1166, 'Accuracy': 0.506, 'Precision': 0.5079245283018868, 'Recall': 0.535828025477707, 'desc': 'ols_test'}


In [17]:
# SVM, linear

svm_performance_test = BinaryClassificationPerformance(svm.predict(X_test), y_test, 'svm_test')
svm_performance_test.compute_measures()
print(svm_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 2105, 'TN': 2025, 'FP': 463, 'FN': 407, 'Accuracy': 0.826, 'Precision': 0.8197040498442367, 'Recall': 0.8379777070063694, 'desc': 'svm_test'}


In [18]:
# Logistic Regression

lgs_performance_test = BinaryClassificationPerformance(lgs.predict(X_test), y_test, 'lgs_test')
lgs_performance_test.compute_measures()
print(lgs_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 2108, 'TN': 2041, 'FP': 447, 'FN': 404, 'Accuracy': 0.8298, 'Precision': 0.8250489236790607, 'Recall': 0.839171974522293, 'desc': 'lgs_test'}


In [19]:
# Naive Bayes

nbs_performance_test = BinaryClassificationPerformance(nbs.predict(X_test), y_test, 'nbs_test')
nbs_performance_test.compute_measures()
print(nbs_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 1968, 'TN': 2099, 'FP': 389, 'FN': 544, 'Accuracy': 0.8134, 'Precision': 0.8349596945269411, 'Recall': 0.7834394904458599, 'desc': 'nbs_test'}


In [20]:
# Perceptron

prc_performance_test = BinaryClassificationPerformance(prc.predict(X_test), y_test, 'prc_test')
prc_performance_test.compute_measures()
print(prc_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 2073, 'TN': 2020, 'FP': 468, 'FN': 439, 'Accuracy': 0.8186, 'Precision': 0.8158205430932703, 'Recall': 0.8252388535031847, 'desc': 'prc_test'}


In [21]:
# Ridge Regression Classifier

rdg_performance_test = BinaryClassificationPerformance(rdg.predict(X_test), y_test, 'rdg_test')
rdg_performance_test.compute_measures()
print(rdg_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 2300, 'TN': 2132, 'FP': 356, 'FN': 212, 'Accuracy': 0.8864, 'Precision': 0.8659638554216867, 'Recall': 0.9156050955414012, 'desc': 'rdg_test'}


In [22]:
# Random Forest Classifier

rdf_performance_test = BinaryClassificationPerformance(rdf.predict(X_test), y_test, 'rdf_test')
rdf_performance_test.compute_measures()
print(rdf_performance_test.performance_measures)

{'Pos': 2512, 'Neg': 2488, 'TP': 1838, 'TN': 1877, 'FP': 611, 'FN': 674, 'Accuracy': 0.743, 'Precision': 0.7505104124132299, 'Recall': 0.731687898089172, 'desc': 'rdf_test'}


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
print(X_matrix)