The following libraries are required to build TF-IDF and bag-of-words (BOW)
features from the reviews and to fit linear and logistic regression models on
those features.


In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

The following function loads the dataset from disk so that the rest of the
program can use it. It reads the review files and returns the training and
testing data as two separate pandas DataFrames.


In [None]:
def load_train_test_imdb_data(data_dir):
  """Load the review files under *data_dir* into train and test DataFrames.

  The directory must contain ``train`` and ``test`` folders, each with ``neg``
  and ``pos`` sub-folders. Every file in those sub-folders is read as one
  review.

  Args:
    data_dir: Root directory of the dataset.

  Returns:
    A ``(train, test)`` tuple of DataFrames with the columns ``text`` (the
    file contents) and ``sentiment`` (1 for ``pos``, 0 for ``neg``).

  Raises:
    OSError: If one of the expected folders or files cannot be read.
  """
  data = {}
  for dataset in ["train", "test"]:
    rows = []
    for sentiment in ["neg", "pos"]:
      score = 1 if sentiment == "pos" else 0
      path = os.path.join(data_dir, dataset, sentiment)
      # os.listdir returns entries in arbitrary order; sorting keeps the row
      # order identical from run to run and machine to machine.
      for f_name in sorted(os.listdir(path)):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
          review = f.read()
        rows.append([review, score])
    data[dataset] = rows
  data["train"] = pd.DataFrame(data["train"], columns=['text', 'sentiment'])
  print("Training Data: \n")
  print(data["train"])
  print("\n")
  data["test"] = pd.DataFrame(data["test"], columns=['text', 'sentiment'])
  print("Testing Data: \n")
  print(data["test"])
  return data["train"], data["test"]

# Load both splits once for the cells below. The path is relative, so this
# assumes the dataset folder "aclImdb/" sits in the current working directory.
train_data, test_data = load_train_test_imdb_data(data_dir="aclImdb/")


This function cleans the review text.
It performs the following operations:
- Remove HTML tags
- Remove punctuation
- Convert all characters to lowercase
- Remove words that are irrelevant to the sentiment of the text (stopwords)

In [5]:
# Stopwords exactly as originally listed (some contain apostrophes).
_RAW_STOPWORDS = (
  'i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll",
  "you'd",'your','yours','yourself','yourselves','he','him','his','himself','she',
  "she's",'her','hers','herself','it',"it's",'its','itself','they','them','their','theirs',
  'themselves','what','which','who','whom','this','that',"that'll",'these','those','am',
  'is','are','was','were','be','been','being','have','has','had','having','do','does',
  'did','doing','a','an','the','and','but','if','or','because','as','until','while',
  'of','at','by','for','with','about','against','between','into','through','during',
  'before','after','above','below','to','from','up','down','in','out','on','off',
  'over','under','again','further','then','once','here','there','when','where',
  'why','how','all','any','both','each','few','more','most','other','some','such',
  'no','nor','not','only','own','same','so','than','too','very','s','t','can',
  'will','just','don',"don't",'should',"should've",'now','d','ll','m','o','re',
  've','y','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',
  "doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",
  'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",
  'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't",
)

# clean_text() deletes apostrophes before it compares words, so a contraction
# such as "don't" reaches the comparison as "dont". Storing the stopwords in
# the same apostrophe-free form lets those entries actually match. A frozenset
# also gives constant-time membership tests and is built only once.
_STOPWORDS = frozenset(word.replace("'", "") for word in _RAW_STOPWORDS)

# Non-greedy match of anything between angle brackets, e.g. "<br />".
_HTML_TAG = re.compile(r'<.*?>')

# Punctuation and whitespace characters that act as word separators.
_PUNCTUATION = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

# One translation table for the whole character clean-up: backslash, single
# quote and double quote are deleted outright (so "don't" -> "dont"), every
# other punctuation character becomes a space.
_CHAR_TABLE = str.maketrans({
  **{char: " " for char in _PUNCTUATION},
  "\\": None,
  "'": None,
  '"': None,
})


def clean_text(text):
  """Normalize a review into lowercase words with stopwords removed.

  Steps, in order: HTML tags are replaced by a space, backslashes and quote
  characters are deleted, the text is lowercased, remaining punctuation is
  turned into spaces, and stopwords are dropped.

  Args:
    text: The raw review text.

  Returns:
    The remaining words joined by single spaces, with no leading or trailing
    whitespace. An empty string if nothing remains.
  """
  # Replace tags with a space rather than nothing so the words on either side
  # of a tag ("good<br />bad") are not fused into one token.
  text = _HTML_TAG.sub(" ", text)
  text = text.lower().translate(_CHAR_TABLE)
  return " ".join(word for word in text.split() if word not in _STOPWORDS)


Linear Regression on TF-IDF

In [None]:
# Turn the movie reviews into TF-IDF features; clean_text pre-processes every review first
vectorizer = TfidfVectorizer(preprocessor=clean_text, stop_words="english")
# The vectorizer is fitted on the training reviews only and then reused, unchanged, for the test reviews
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])
# Fit a linear regression that maps the TF-IDF features to the 0/1 sentiment label
model = LinearRegression()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)
# Decision boundary: predictions below 0.5 become 0 (negative), everything else becomes 1 (positive).
# The assignment writes into the existing array, so its dtype is unchanged.
y_pred[:] = np.where(y_pred < 0.5, 0, 1)
# Evaluation: share of test reviews whose thresholded prediction equals the true sentiment, in percent
acc = accuracy_score(test_data["sentiment"], y_pred)
print(acc*100)
# Confusion matrix of true vs. predicted sentiment
matrix = confusion_matrix(test_data["sentiment"], y_pred)
# Build one annotation per cell: (empty) name, raw count, and share of all test reviews
group_names = [''] * 4
group_counts = [f"{value:0.0f}" for value in matrix.flatten()]
group_percentages = [f"{value:.2%}" for value in matrix.flatten() / np.sum(matrix)]
labels = np.asarray([
  f"{v1}\n{v2}\n{v3}"
  for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')


Logistic Regression on TF-IDF


In [None]:
# Fit a logistic regression on the TF-IDF features produced by the previous cell
model = LogisticRegression()
model.fit(training_features, train_data["sentiment"])
# predict() already returns class labels, so no manual threshold is needed here
y_pred = model.predict(test_features)
# Evaluation: accuracy on the test reviews, in percent
acc = accuracy_score(test_data["sentiment"], y_pred)
print(acc*100)
# Confusion matrix of true vs. predicted sentiment
matrix = confusion_matrix(test_data["sentiment"], y_pred)
# Annotation per cell: (empty) name, raw count, and share of all test reviews
group_names = [''] * 4
group_counts = [f"{value:0.0f}" for value in matrix.flatten()]
group_percentages = [f"{value:.2%}" for value in matrix.flatten() / np.sum(matrix)]
labels = np.asarray([
  f"{v1}\n{v2}\n{v3}"
  for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')


Linear Regression on BOW


In [None]:
# Switch from TF-IDF to bag-of-words: CountVectorizer turns each review into raw word counts
vectorizer = CountVectorizer(preprocessor=clean_text, stop_words="english")
# Learn the vocabulary on the training reviews, then apply the same vocabulary to the test reviews
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

In [None]:
# Fit a linear regression on the bag-of-words counts
model = LinearRegression()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)
# Same decision boundary as before: below 0.5 is negative (0), otherwise positive (1).
# The assignment writes into the existing array, so its dtype is unchanged.
y_pred[:] = np.where(y_pred < 0.5, 0, 1)
# Evaluation: accuracy on the test reviews, in percent
acc = accuracy_score(test_data["sentiment"], y_pred)
print(acc*100)
# Confusion matrix of true vs. predicted sentiment
matrix = confusion_matrix(test_data["sentiment"], y_pred)
# Annotation per cell: (empty) name, raw count, and share of all test reviews
group_names = [''] * 4
group_counts = [f"{value:0.0f}" for value in matrix.flatten()]
group_percentages = [f"{value:.2%}" for value in matrix.flatten() / np.sum(matrix)]
labels = np.asarray([
  f"{v1}\n{v2}\n{v3}"
  for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')

Logistic Regression on BOW


In [None]:
# Fit a logistic regression on the bag-of-words counts
model = LogisticRegression()
model.fit(training_features, train_data["sentiment"])
# predict() already returns class labels, so no manual threshold is needed here
y_pred = model.predict(test_features)

# Evaluation: accuracy on the test reviews, in percent
acc = accuracy_score(test_data["sentiment"], y_pred)
print(acc*100)
# Confusion matrix of true vs. predicted sentiment
matrix = confusion_matrix(test_data["sentiment"], y_pred)
# Annotation per cell: (empty) name, raw count, and share of all test reviews
group_names = [''] * 4
group_counts = [f"{value:0.0f}" for value in matrix.flatten()]
group_percentages = [f"{value:.2%}" for value in matrix.flatten() / np.sum(matrix)]
labels = np.asarray([
  f"{v1}\n{v2}\n{v3}"
  for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')