In [41]:
# Import necessary libraries
import pandas as pd

import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from bs4 import BeautifulSoup
import re
import os

In [42]:
#Import training data
# imdb_data=pd.read_csv('IMDB_dataset.csv')
# imdb_data_raw = pd.read_csv('IMDB_dataset.csv')

movie_data = pd.read_csv('IMDB_dataset.csv')

In [43]:
#Preprocessing function

def preprocess(text):
    # Check if the input looks like a file name
    if os.path.isfile(text):
        with open(text, 'r') as file:
            soup = BeautifulSoup(file, "html.parser")
    else:
        soup = BeautifulSoup(text, "html.parser")

    cleaned_text = soup.get_text()

    # Replace [] with nothing
    cleaned_text = re.sub(r'\[[^]]*\]', '', cleaned_text)

    # Remove excessive white spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # Convert to lowercase
    cleaned_text = cleaned_text.lower()

    # Preserve negation
    cleaned_text = re.sub(r'\b(n\'t|not|no|never)\s+(\w+)', r'not_\1 \2', cleaned_text)

    return cleaned_text

#Adapted code from Lakshmipathi N 2020 with substantial changes

In [44]:
# Apply preprossing to the dataframe
movie_data['review'] = movie_data['review'].apply(preprocess)

  soup = BeautifulSoup(text, "html.parser")


In [45]:
movie_data.head(15)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [46]:
# Convert labels to binary
movie_data['sentiment'] = movie_data.sentiment.map({'positive':1, 'negative':0})

movie_data.head(15)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1
5,"probably my all-time favorite movie, a story o...",1
6,i sure would like to see a resurrection of a u...,1
7,"this show was an amazing, fresh & innovative i...",0
8,encouraged by the positive comments about this...,0
9,if you like original gut wrenching laughter yo...,1


In [34]:
# Slit data into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(movie_data['review'], movie_data['sentiment'], random_state=1)

print('Number of rows in the total set: {}'.format(movie_data.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

# Code from Roman, 2019

Number of rows in the total set: 50000
Number of rows in the training set: 37500
Number of rows in the test set: 12500


In [35]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [36]:
# Naive Bayes implementation
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()

naive_bayes.fit(training_data, y_train)

In [37]:
# Make predictions
predictions = naive_bayes.predict(testing_data)

In [38]:
# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.85136
Precision score:  0.8718300205620287
Recall score:  0.8209099709583737
F1 score:  0.8456041216553098


In [40]:
# Compile a classification report
from sklearn.metrics import classification_report

naive_bayes_report = classification_report(y_test, predictions, target_names = ['Positive', 'Negative'])
print(naive_bayes_report)

              precision    recall  f1-score   support

    Positive       0.83      0.88      0.86      6302
    Negative       0.87      0.82      0.85      6198

    accuracy                           0.85     12500
   macro avg       0.85      0.85      0.85     12500
weighted avg       0.85      0.85      0.85     12500

