In [1]:
# Installing the required dependencies
!pip install datasets
!pip install nltk
!pip install bs4
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 27.2 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 62.0 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 78.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 61.4 MB/s 
Collecting multiprocess
  

In [2]:
# Loading in the required libraries
from datasets import load_dataset
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tqdm as tqdm
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import f1_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Importing the IMDb dataset from Hugging Face and creating the different splits
imdb_dataset = load_dataset("imdb")
imdb_train_text = imdb_dataset["train"]["text"]
imdb_train_y = np.array(imdb_dataset["train"]["label"])
imdb_test_text = imdb_dataset["test"]["text"]
imdb_test_y = np.array(imdb_dataset["test"]["label"])



  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Preprocessing our training data
stopwords_list = stopwords.words("english")
processed_reviews_train = []
for review in tqdm.tqdm(imdb_train_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_train.append(temp_update)

# Preprocessing our testing data
processed_reviews_test = []
for review in tqdm.tqdm(imdb_test_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_test.append(temp_update)

# Defining the final word2vec model parameters
vec_size = 600
window_size = 15
model_architecture = 1
subsample = 1e-2

# Creating the document level representation using the final word2vec model for each review in the different data sets
w2v_model = Word2Vec(sentences = processed_reviews_train, size = vec_size, window = window_size, sg = model_architecture, sample = subsample, seed = 123)

imdb_train = np.zeros([len(processed_reviews_train), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_train))):
  word_list = []
  for word in processed_reviews_train[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_train[i] = np.mean(w2v_model.wv[word_list], axis = 0)

imdb_test = np.zeros([len(processed_reviews_test), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_test))):
  word_list = []
  for word in processed_reviews_test[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_test[i] = np.mean(w2v_model.wv[word_list], axis = 0)

# Training the final Naive Bayes model on the dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_train, imdb_train_y)

# Training the final Logistic Regression model on the dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_train, imdb_train_y)

# Training the final SVM model on the dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_train, imdb_train_y)

# Training the final Random Forests model on the dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_train, imdb_train_y)

# Using our final models to obtain predictions on the test set and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

100%|██████████| 25000/25000 [00:21<00:00, 1138.27it/s]
100%|██████████| 25000/25000 [00:21<00:00, 1156.48it/s]
100%|██████████| 25000/25000 [09:43<00:00, 42.83it/s]
100%|██████████| 25000/25000 [09:06<00:00, 45.76it/s]


Naive Bayes F1-score: 0.6767478764118361
Logistic Regression F1-score: 0.8665623871976897
SVM F1-score: 0.786436110487312
Random Forest F1-score: 0.8191777499292557
