In [1]:
# Installing the required dependencies
!pip install datasets
!pip install nltk
!pip install bs4
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 14.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 87.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 76.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 66.3 MB/s 
Collecting multiprocess
  

In [2]:
# Loading in the required libraries
from datasets import load_dataset
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tqdm as tqdm
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import statistics

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Importing the IMDb dataset from Hugging Face and creating the training and development splits
imdb_dataset = load_dataset("imdb")
imdb_train_full = imdb_dataset["train"]
imdb_train_dev = imdb_train_full.train_test_split(test_size = 0.2, stratify_by_column = "label", seed = 123)
imdb_train_text = imdb_train_dev["train"]["text"]
imdb_train_y = np.array(imdb_train_dev["train"]["label"])
imdb_dev_text = imdb_train_dev["test"]["text"]
imdb_dev_y = np.array(imdb_train_dev["test"]["label"])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Preprocessing our training data (pre-split)
stopwords_list = stopwords.words("english")
processed_reviews_train = []
for review in tqdm.tqdm(imdb_train_full["text"]):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_train.append(temp_update)

# Preprocessing our training data (post-split)
stopwords_list = stopwords.words("english")
processed_reviews_train_post = []
for review in tqdm.tqdm(imdb_train_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_train_post.append(temp_update)

# Preprocessing our development data
processed_reviews_dev = []
for review in tqdm.tqdm(imdb_dev_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_dev.append(temp_update)

# Defining the final word2vec model parameters
vec_size = 600
window_size = 15
model_architecture = 1
subsample = 1e-2

# Creating the document level representation using the final word2vec model for each review in the training (post-split) and development splits
w2v_model = Word2Vec(sentences = processed_reviews_train, size = vec_size, window = window_size, sg = model_architecture, sample = subsample, seed = 123)

imdb_train_post = np.zeros([len(processed_reviews_train_post), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_train_post))):
  word_list = []
  for word in processed_reviews_train_post[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_train_post[i] = np.mean(w2v_model.wv[word_list], axis = 0)

imdb_dev = np.zeros([len(processed_reviews_dev), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_dev))):
  word_list = []
  for word in processed_reviews_dev[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_dev[i] = np.mean(w2v_model.wv[word_list], axis = 0)      

100%|██████████| 25000/25000 [00:24<00:00, 1009.55it/s]
100%|██████████| 20000/20000 [00:18<00:00, 1101.73it/s]
100%|██████████| 5000/5000 [00:04<00:00, 1116.12it/s]
100%|██████████| 20000/20000 [09:08<00:00, 36.47it/s]
100%|██████████| 5000/5000 [02:13<00:00, 37.48it/s]


In [5]:
# Defining the hyperparameter configurations
penalty_term = ["l1", "l2"]
tolerance = [1e-6, 1e-4, 1e-2]
regularisation = [1/10, 1/8, 1/6, 1/4, 1/2, 1]
max_iterations = [100, 300, 500]
num_models = len(penalty_term)*len(tolerance)*len(regularisation)*len(max_iterations)
models = np.zeros(shape=(num_models, 5))

# Logistic regression hyperparameter tuning
iteration = 0
for p in penalty_term:
  for t in tolerance:
    for r in regularisation:
      for m in max_iterations:
        # Training a Logistic Regression model on the datasets
        imdb_logreg = LogisticRegression(penalty = p, tol = t, C = r, max_iter = m, random_state = 123, solver = 'liblinear')
        imdb_logreg_fit = imdb_logreg.fit(imdb_train_post, imdb_train_y)

        # Using our models to obtain predictions and compute the F1-score
        imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
        imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

        # Updating our model matrix
        models[iteration] = [iteration, t, r, m, imdb_logreg_f1]
        print(models[iteration])
        iteration = iteration + 1

[0.00000000e+00 1.00000000e-06 1.00000000e-01 1.00000000e+02
 8.38824226e-01]
[1.00000000e+00 1.00000000e-06 1.00000000e-01 3.00000000e+02
 8.38824226e-01]
[2.00000000e+00 1.00000000e-06 1.00000000e-01 5.00000000e+02
 8.38824226e-01]
[3.00000000e+00 1.00000000e-06 1.25000000e-01 1.00000000e+02
 8.44207197e-01]
[4.00000000e+00 1.00000000e-06 1.25000000e-01 3.00000000e+02
 8.44207197e-01]
[5.00000000e+00 1.00000000e-06 1.25000000e-01 5.00000000e+02
 8.44207197e-01]
[6.00000000e+00 1.00000000e-06 1.66666667e-01 1.00000000e+02
 8.51308485e-01]
[7.00000000e+00 1.00000000e-06 1.66666667e-01 3.00000000e+02
 8.51308485e-01]
[8.00000000e+00 1.00000000e-06 1.66666667e-01 5.00000000e+02
 8.51308485e-01]
[9.00000000e+00 1.00000000e-06 2.50000000e-01 1.00000000e+02
 8.58725762e-01]
[1.00000000e+01 1.00000000e-06 2.50000000e-01 3.00000000e+02
 8.58725762e-01]
[1.10000000e+01 1.00000000e-06 2.50000000e-01 5.00000000e+02
 8.58725762e-01]
[1.20000000e+01 1.00000000e-06 5.00000000e-01 1.00000000e+02
 8.

In [6]:
# Defining the hyperparameter configurations
regularisation = [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
num_models = len(regularisation)
models = np.zeros(shape=(num_models, 3))

# SVM hyperparameter tuning
iteration = 0
for r in regularisation:
  # Training an SVM model on the datasets
  imdb_svm = SGDClassifier(alpha = r, random_state = 123)
  imdb_svm_fit = imdb_svm.fit(imdb_train_post, imdb_train_y)

  # Using our models to obtain predictions and compute the F1-score
  imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
  imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

  # Updating our model matrix
  models[iteration] = [iteration, r, imdb_svm_f1]
  print(models[iteration])
  iteration = iteration + 1

[0.         0.03125    0.78409977]
[1.         0.0625     0.77240685]
[2.         0.125      0.67928124]
[3.         0.25       0.66675557]
[4.         0.5        0.41557632]
[5. 1. 0.]
[6.         2.         0.66666667]
[7.         4.         0.66666667]
[8.         8.         0.66666667]
[ 9.         16.          0.66666667]
[10.         32.          0.66666667]
[11.         64.          0.66666667]
[ 12.         128.           0.66666667]
[ 13.         256.           0.66666667]
[ 14.         512.           0.66666667]
[1.50000000e+01 1.02400000e+03 6.66666667e-01]
[1.60000000e+01 2.04800000e+03 6.66666667e-01]
[1.70000000e+01 4.09600000e+03 6.66666667e-01]
[1.80000000e+01 8.19200000e+03 6.66666667e-01]
[1.90000000e+01 1.63840000e+04 6.66666667e-01]
[2.00000000e+01 3.27680000e+04 6.66666667e-01]
[2.10000000e+01 6.55360000e+04 6.66666667e-01]
[2.20000000e+01 1.31072000e+05 6.66666667e-01]


In [7]:
# Defining the hyperparameter configurations
num_trees = [100, 300, 500]
min_samples = [2, 5, 10, 20]
num_features = [1, 5, 17]
num_samples = [0.5, 0.75]
num_models = len(num_trees)*len(min_samples)*len(num_features)*len(num_samples)
models = np.zeros(shape=(num_models, 6))

# Random forests hyperparameter tuning
iteration = 0
for t in num_trees:
  for ns in num_samples:
    for f in num_features:
      for ms in min_samples:
        # Training a Random Forests model on the datasets
        imdb_rf = RandomForestClassifier(n_estimators = t, min_samples_leaf = ms, max_features = f, max_samples = ns, random_state = 123)
        imdb_rf_fit = imdb_rf.fit(imdb_train_post, imdb_train_y)

        # Using our models to obtain predictions and compute the F1-score
        imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
        imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

        # Updating our model matrix
        models[iteration] = [iteration, t, ns, f, ms, imdb_rf_f1]
        print(models[iteration])
        iteration = iteration + 1

[  0.         100.           0.5          1.           2.
   0.80726851]
[  1.         100.           0.5          1.           5.
   0.81395349]
[  2.         100.           0.5          1.          10.
   0.80814288]
[  3.         100.           0.5          1.          20.
   0.80358847]
[  4.         100.           0.5          5.           2.
   0.83064516]
[  5.        100.          0.5         5.          5.          0.8287037]
[  6.         100.           0.5          5.          10.
   0.82328083]
[  7.         100.           0.5          5.          20.
   0.82457496]
[  8.         100.           0.5         17.           2.
   0.82941976]
[  9.         100.           0.5         17.           5.
   0.83259145]
[ 10.         100.           0.5         17.          10.
   0.82923047]
[ 11.         100.           0.5         17.          20.
   0.82832202]
[ 12.         100.           0.75         1.           2.
   0.81045499]
[ 13.         100.           0.75         1.      