In [None]:
# Installing the required dependencies
# Each line below is an IPython shell escape (`!`), so it runs `pip` in a subshell.
# NOTE(review): no versions are pinned, so every run installs whatever release is
# current at that moment — consider pinning if results must be reproducible.
# NOTE(review): `!pip` uses whichever `pip` is first on PATH; presumably that is the
# kernel's environment — verify if an import fails right after installing.
!pip install datasets
!pip install nltk
!pip install bs4
!pip install gensim

In [None]:
# Loading in the required libraries
from datasets import load_dataset
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tqdm as tqdm
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import statistics

In [None]:
# Fetch IMDb from the Hugging Face hub, then carve a label-stratified 80/20
# train/development split out of its training portion (fixed seed for repeatability).
imdb_dataset = load_dataset("imdb")
imdb_train_full = imdb_dataset["train"]
imdb_train_dev = imdb_train_full.train_test_split(
  test_size = 0.2,
  stratify_by_column = "label",
  seed = 123,
)

# Handles to the two halves of the split; texts stay as returned by `datasets`,
# labels are converted to numpy arrays for scikit-learn.
train_part, dev_part = imdb_train_dev["train"], imdb_train_dev["test"]
imdb_train_text, imdb_train_y = train_part["text"], np.array(train_part["label"])
imdb_dev_text, imdb_dev_y = dev_part["text"], np.array(dev_part["label"])

In [None]:
# Preprocessing: one shared helper applied to the three corpora
stopwords_list = stopwords.words("english")
# Set membership is O(1); testing every token against the list was a linear scan.
_stopword_set = frozenset(stopwords_list)


def preprocess_reviews(reviews):
  """Strip HTML, tokenize and remove English stop words from each review.

  Args:
    reviews: iterable of raw review strings.

  Returns:
    A list with one entry per review, each entry being the list of remaining
    tokens (as produced by gensim's `simple_preprocess`) in original order.
  """
  processed = []
  for review in tqdm.tqdm(reviews):
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so output could differ between machines.
    soup = BeautifulSoup(review, "html.parser")
    # Join text nodes with a space so words on either side of a stripped tag
    # (e.g. "great<br />movie") are not fused into a single token.
    plain_text = soup.get_text(separator = " ")
    tokens = simple_preprocess(plain_text)
    processed.append([token for token in tokens if token not in _stopword_set])
  return processed


# Preprocessing our training data (pre-split)
processed_reviews_train = preprocess_reviews(imdb_train_full["text"])

# Preprocessing our training data (post-split)
processed_reviews_train_post = preprocess_reviews(imdb_train_text)

# Preprocessing our development data
processed_reviews_dev = preprocess_reviews(imdb_dev_text)

# Hyperparameter grid for the word2vec search (every combination is evaluated)
vec_size = [100, 300, 600]                    # embedding dimensionality
window_size = [3, 7, 12, 15]                  # context window width
model_architecture = [0, 1]                   # passed to Word2Vec as `sg`
subsample = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]    # passed to Word2Vec as `sample`

# Size of the full grid = product of the lengths of the four axes.
num_models = 1
for grid_axis in (vec_size, window_size, model_architecture, subsample):
  num_models *= len(grid_axis)

# Result table, one row per configuration:
# [iteration, vector size, window, architecture, subsample, median F1]
models = np.zeros((num_models, 6))

# word2vec hyperparameter tuning
def _train_word2vec(sentences, dim, window, sg, sample):
  """Train a Word2Vec model, accepting both the gensim 3.x and 4.x keyword names.

  gensim 4.0 renamed the `size` keyword to `vector_size`. The new name is tried
  first; older releases reject it with a TypeError at call time (before any
  training starts), in which case the legacy keyword is used instead.

  Args:
    sentences: list of token lists to train on.
    dim: dimensionality of the word vectors.
    window: context window size.
    sg: architecture flag forwarded to Word2Vec.
    sample: subsampling threshold forwarded to Word2Vec.

  Returns:
    The trained Word2Vec model.
  """
  shared_kwargs = dict(sentences = sentences, window = window, sg = sg, sample = sample, seed = 123)
  try:
    return Word2Vec(vector_size = dim, **shared_kwargs)
  except TypeError:
    return Word2Vec(size = dim, **shared_kwargs)


def _mean_document_vectors(w2v_model, documents, dim):
  """Represent each document by the mean of its in-vocabulary word vectors.

  Args:
    w2v_model: trained Word2Vec model.
    documents: list of token lists.
    dim: dimensionality of the model's word vectors.

  Returns:
    Array of shape (len(documents), dim). A document with no in-vocabulary
    token keeps an all-zero row.
  """
  keyed_vectors = w2v_model.wv
  # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`.
  if hasattr(keyed_vectors, "key_to_index"):
    vocabulary = keyed_vectors.key_to_index
  else:
    vocabulary = keyed_vectors.vocab

  doc_matrix = np.zeros([len(documents), dim])
  for i in tqdm.tqdm(range(len(documents))):
    word_list = [word for word in documents[i] if word in vocabulary]
    # Average once per document; recomputing the mean after every appended word
    # gave the same final row at quadratic cost.
    if word_list:
      doc_matrix[i] = np.mean(keyed_vectors[word_list], axis = 0)
  return doc_matrix


iteration = 0
for v in vec_size:
  for w in window_size:
    for m in model_architecture:
      for s in subsample:
        # Creating the document level representation using word2vec for each review in the train and development splits
        # NOTE(review): the embeddings are trained on the full pre-split training set,
        # which also contains the development reviews (text only, no labels).
        # NOTE(review): per the gensim docs, `seed` alone does not make training fully
        # deterministic unless a single worker thread is used — confirm if exact
        # repeatability matters.
        w2v_model = _train_word2vec(processed_reviews_train, v, w, m, s)

        imdb_train_post = _mean_document_vectors(w2v_model, processed_reviews_train_post, v)
        imdb_dev = _mean_document_vectors(w2v_model, processed_reviews_dev, v)

        # Training a Naive Bayes model on the datasets
        imdb_nb = GaussianNB()
        imdb_nb_fit = imdb_nb.fit(imdb_train_post, imdb_train_y)

        # Training a Logistic Regression model on the datasets
        imdb_logreg = LogisticRegression(random_state = 123, solver = 'liblinear')
        imdb_logreg_fit = imdb_logreg.fit(imdb_train_post, imdb_train_y)

        # Training an SVM model on the datasets
        imdb_svm = SGDClassifier(random_state = 123)
        imdb_svm_fit = imdb_svm.fit(imdb_train_post, imdb_train_y)

        # Training a Random Forests model on the datasets
        imdb_rf = RandomForestClassifier(random_state = 123)
        imdb_rf_fit = imdb_rf.fit(imdb_train_post, imdb_train_y)

        # Using our models to obtain predictions and compute the F1-score
        imdb_nb_preds = imdb_nb_fit.predict(imdb_dev)
        imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_preds)

        imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
        imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

        imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
        imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

        imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
        imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

        # The configuration is scored by the median F1 across the four classifiers
        med_f1 = statistics.median([imdb_nb_f1, imdb_logreg_f1, imdb_svm_f1, imdb_rf_f1])

        # Updating our model matrix
        models[iteration] = [iteration, v, w, m, s, med_f1]
        print(models[iteration])
        iteration = iteration + 1