In [1]:
# Installing the required dependencies
!pip install datasets
!pip install nltk
!pip install bs4
!pip install gensim
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 4.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 45.1 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 50.2 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 70.1 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 45.3 MB/s 
Collecting multiproce

In [6]:
# Loading in the required libraries
from datasets import load_dataset
import random
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tqdm as tqdm
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
import numpy as np
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks  
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc
from nlpaug.util import Action
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import f1_score
import statistics
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from numpy import vstack

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Importing the IMDb dataset from Hugging Face and creating the different splits
imdb_dataset = load_dataset("imdb")
imdb_train_text = imdb_dataset["train"]["text"]
imdb_train_y = np.array(imdb_dataset["train"]["label"])
imdb_test_text = imdb_dataset["test"]["text"]
imdb_test_y = np.array(imdb_dataset["test"]["label"])
semi_size = random.sample(range(0, 50000), 10000)
imdb_unlabeled = imdb_dataset["unsupervised"].select(semi_size)["text"]
imdb_unlabeled_y = np.array(imdb_dataset["unsupervised"].select(semi_size)["label"])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Simulating the moderate imbalanced data scenario
num_majority = int((len(imdb_train_text))/2)
num_minority = int((25/75)*num_majority)
imdb_text_array = np.asarray(imdb_train_text).reshape(-1, 1)
imdb_text_moderate, imdb_y_moderate = make_imbalance(imdb_text_array, imdb_train_y, sampling_strategy = {0: num_minority, 1: num_majority}, random_state = 123)
imdb_text_moderate = imdb_text_moderate.tolist()
imdb_text_moderate = [''.join(ele) for ele in imdb_text_moderate]

In [7]:
# Preprocessing our training data
stopwords_list = stopwords.words("english")
processed_reviews_train = []
for review in tqdm.tqdm(imdb_train_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_train.append(temp_update)

# Preprocessing our testing data
processed_reviews_test = []
for review in tqdm.tqdm(imdb_test_text):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_test.append(temp_update)

# Preprocessing our imbalanced data set
processed_reviews_imbalanced = []
for review in tqdm.tqdm(imdb_text_moderate):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_imbalanced.append(temp_update)

# Preprocessing our unlabeled data
processed_reviews_unlabeled = []
for review in tqdm.tqdm(imdb_unlabeled):
  raw = BeautifulSoup(review)
  html_remove = raw.get_text()
  temp = simple_preprocess(html_remove)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_unlabeled.append(temp_update)

# Defining the final word2vec model parameters
vec_size = 600
window_size = 15
model_architecture = 1
subsample = 1e-2

# Creating the document level representation using the final word2vec model for each review in the different data sets
w2v_model = Word2Vec(sentences = processed_reviews_train, size = vec_size, window = window_size, sg = model_architecture, sample = subsample, seed = 123)

imdb_imbalanced = np.zeros([len(processed_reviews_imbalanced), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_imbalanced))):
  word_list = []
  for word in processed_reviews_imbalanced[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_imbalanced[i] = np.mean(w2v_model.wv[word_list], axis = 0)

imdb_test = np.zeros([len(processed_reviews_test), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_test))):
  word_list = []
  for word in processed_reviews_test[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_test[i] = np.mean(w2v_model.wv[word_list], axis = 0)

imdb_unlabeled = np.zeros([len(processed_reviews_unlabeled), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_unlabeled))):
  word_list = []
  for word in processed_reviews_unlabeled[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_unlabeled[i] = np.mean(w2v_model.wv[word_list], axis = 0)

# Training the final Naive Bayes model on the imbalanced dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_imbalanced, imdb_y_moderate)

# Training the final Logistic Regression model on the imbalanced dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_imbalanced, imdb_y_moderate)

# Training the final SVM model on the imbalanced dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_imbalanced, imdb_y_moderate)

# Training the final Random Forests model on the imbalanced dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_imbalanced, imdb_y_moderate)

# Using our final models to obtain predictions on the test and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

100%|██████████| 25000/25000 [00:30<00:00, 814.77it/s]
100%|██████████| 25000/25000 [00:28<00:00, 869.11it/s]
100%|██████████| 16666/16666 [00:21<00:00, 781.76it/s]
100%|██████████| 10000/10000 [00:11<00:00, 860.52it/s]
100%|██████████| 16666/16666 [10:19<00:00, 26.89it/s]
100%|██████████| 25000/25000 [24:29<00:00, 17.01it/s]
100%|██████████| 10000/10000 [14:32<00:00, 11.46it/s]


Naive Bayes F1-score: 0.6848442906574395
Logistic Regression F1-score: 0.8314451027266744
SVM F1-score: 0.6666666666666666
Random Forest F1-score: 0.7808632078496762


In [8]:
# Random oversampling
imdb_ros = RandomOverSampler(random_state = 123)
imdb_text_resampled, imdb_label_resampled = imdb_ros.fit_resample(imdb_imbalanced, imdb_y_moderate)

# Training the final Naive Bayes model on the balanced dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Logistic Regression model on the balanced dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final SVM model on the balanced dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Random Forest model on the balanced dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_text_resampled, imdb_label_resampled)

# Using our final models to obtain predictions on the test set and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

Naive Bayes F1-score: 0.6770779917020184
Logistic Regression F1-score: 0.8656668666266748
SVM F1-score: 0.7840485074626866
Random Forest F1-score: 0.8127850652335324


In [9]:
# Defining the hyperparameter configurations
neighbours = 10

# Oversampling using Synthetic Minority Over-sampling Technique (SMOTE)
imdb_smote = SMOTE(k_neighbors = neighbours, random_state = 123)
imdb_text_resampled, imdb_label_resampled = imdb_smote.fit_resample(imdb_imbalanced, imdb_y_moderate)

# Training a Naive Bayes model on the balanced dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_text_resampled, imdb_label_resampled)

# Training a Logistic Regression model on the balanced dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_text_resampled, imdb_label_resampled)

# Training an SVM model on the balanced dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_text_resampled, imdb_label_resampled)

# Training a Random Forest model on the balanced dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_text_resampled, imdb_label_resampled)

# Using our final models to obtain predictions on the test set and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

Naive Bayes F1-score: 0.650101688920329
Logistic Regression F1-score: 0.8645779696597615
SVM F1-score: 0.781500219097319
Random Forest F1-score: 0.8105642610166998


In [10]:
# Random undersampling
imdb_rus = RandomUnderSampler(random_state = 123)
imdb_text_resampled, imdb_label_resampled = imdb_rus.fit_resample(imdb_imbalanced, imdb_y_moderate)

# Training the final Naive Bayes model on the balanced dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Logistic Regression model on the balanced dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final SVM model on the balanced dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Random Forest model on the balanced dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_text_resampled, imdb_label_resampled)

# Using our final models to obtain predictions on the test set and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

Naive Bayes F1-score: 0.6801190254788916
Logistic Regression F1-score: 0.8564671930738708
SVM F1-score: 0.7786677399879252
Random Forest F1-score: 0.8145213454075032


In [11]:
# Undersampling using Tomek's Links
imdb_tl = TomekLinks()
imdb_text_resampled, imdb_label_resampled = imdb_tl.fit_resample(imdb_imbalanced, imdb_y_moderate)

# Training the final Naive Bayes model on the balanced dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Logistic Regression model on the balanced dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final SVM model on the balanced dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_text_resampled, imdb_label_resampled)

# Training the final Random Forest model on the balanced dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_text_resampled, imdb_label_resampled)

# Using our final models to obtain predictions on the test set and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

Naive Bayes F1-score: 0.6831930441217279
Logistic Regression F1-score: 0.8319972236682284
SVM F1-score: 0.6666666666666666
Random Forest F1-score: 0.7818988236806582


In [12]:
# Preparing our data for augmentation (random substitution)
augment_index = np.array(imdb_text_moderate)[imdb_y_moderate == 0]
augment_index = augment_index.tolist()
augment_index = [''.join(ele) for ele in augment_index]

# Preprocessing our data
processed_reviews_aug = []
for review in tqdm.tqdm(augment_index):
    raw = BeautifulSoup(review)
    html_remove = raw.get_text()
    processed_reviews_aug.append(html_remove)

# Defining the hyperparameter configuration
aug_perc = 0.1
num_augment = 2

# Data augmentation (random subsitution)
aug = naw.SynonymAug(aug_p = aug_perc, stopwords = stopwords_list)
augmented = []
for i in [1, num_augment]:
  for review in augment_index:
    temp = aug.augment(review)
    augmented.append(temp)
    augmented = [''.join(ele) for ele in augmented]

# Preprocessing our augmented data
processed_reviews_aug_post = []
for review in tqdm.tqdm(augmented):
  temp = simple_preprocess(review)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_aug_post.append(temp_update)

# Creating the document level representation using the final word2vec model for each review in the augmented data set
imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_aug_post))):
  word_list = []
  for word in processed_reviews_aug_post[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

# Combining the imbalanced and augmented data sets
imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

# Training the final Naive Bayes model on the dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Logistic Regression model on the dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final SVM model on the dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Random Forests model on the dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Using our models to obtain predictions and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

100%|██████████| 4166/4166 [00:01<00:00, 3338.28it/s]
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
100%|██████████| 8332/8332 [00:06<00:00, 1243.66it/s]
100%|██████████| 8332/8332 [04:46<00:00, 29.11it/s]


Naive Bayes F1-score: 0.6569418386491558
Logistic Regression F1-score: 0.8647331415787792
SVM F1-score: 0.7563314270972181
Random Forest F1-score: 0.8218415251943232


In [13]:
# Defining the hyperparameter configuration
aug_perc = 0.2
num_augment = 2

# Data augmentation (random swap)
aug = naw.RandomWordAug(action="swap", aug_p = aug_perc, stopwords = stopwords_list)
augmented = []
for i in [1, num_augment]:
  for review in augment_index:
    temp = aug.augment(review)
    augmented.append(temp)
    augmented = [''.join(ele) for ele in augmented]

# Preprocessing our augmented data
processed_reviews_aug_post = []
for review in tqdm.tqdm(augmented):
  temp = simple_preprocess(review)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_aug_post.append(temp_update)

# Creating the document level representation using the final word2vec model for each review in the augmented data set
imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_aug_post))):
  word_list = []
  for word in processed_reviews_aug_post[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

# Combining the imbalanced and augmented data sets
imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

# Training the final Naive Bayes model on the dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Logistic Regression model on the dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final SVM model on the dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Random Forests model on the dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Using our models to obtain predictions and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

100%|██████████| 8332/8332 [00:06<00:00, 1291.29it/s]
100%|██████████| 8332/8332 [04:36<00:00, 30.10it/s]


Naive Bayes F1-score: 0.6782503377276751
Logistic Regression F1-score: 0.865236325862
SVM F1-score: 0.7688084840923269
Random Forest F1-score: 0.8178879310344828


In [14]:
# Defining the hyperparameter configuration
aug_perc = 0.2
num_augment = 2

# Data augmentation (random swap)
aug = naw.RandomWordAug(action="delete", aug_p = aug_perc, stopwords = stopwords_list)
augmented = []
for i in [1, num_augment]:
  for review in augment_index:
    temp = aug.augment(review)
    augmented.append(temp)
    augmented = [''.join(ele) for ele in augmented]

# Preprocessing our augmented data
processed_reviews_aug_post = []
for review in tqdm.tqdm(augmented):
  temp = simple_preprocess(review)
  temp_update = [x for x in temp if x not in stopwords_list]
  processed_reviews_aug_post.append(temp_update)

# Creating the document level representation using the final word2vec model for each review in the augmented data set
imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
for i in tqdm.tqdm(range(len(processed_reviews_aug_post))):
  word_list = []
  for word in processed_reviews_aug_post[i]:
    if word in w2v_model.wv.vocab:
      word_list.append(word)
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

# Combining the imbalanced and augmented data sets
imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

# Training the final Naive Bayes model on the dataset
imdb_nb = GaussianNB()
imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Logistic Regression model on the dataset
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final SVM model on the dataset
imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Training the final Random Forests model on the dataset
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

# Using our models to obtain predictions and compute the F1-score
imdb_nb_preds = imdb_nb_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_preds)

imdb_logreg_preds = imdb_logreg_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_preds)

imdb_svm_preds = imdb_svm_fit.predict(imdb_test)
imdb_svm_f1 = f1_score(imdb_test_y, imdb_svm_preds)

imdb_rf_preds = imdb_rf_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"SVM F1-score: {imdb_svm_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

100%|██████████| 8332/8332 [00:06<00:00, 1341.17it/s]
100%|██████████| 8332/8332 [04:08<00:00, 33.59it/s]


Naive Bayes F1-score: 0.6997009431792041
Logistic Regression F1-score: 0.8657325860688551
SVM F1-score: 0.7742119875877838
Random Forest F1-score: 0.8279475314543577


In [15]:
# Preparing our data
imdb_text_semi = vstack((imdb_imbalanced, imdb_unlabeled))
imdb_y_semi = np.concatenate((imdb_y_moderate, imdb_unlabeled_y))

# Defining the hyperparameter configuration
thresh = 0.99

# Implementing the self-training algorithm using the final Naive Bayes model as the base classifer
imdb_nb = GaussianNB()
imdb_nb_semi = SelfTrainingClassifier(base_estimator = imdb_nb, threshold = thresh)
imdb_nb_semi_fit = imdb_nb_semi.fit(imdb_text_semi, imdb_y_semi)

# Implementing the self-training algorithm using the final Logistic Regression model as the base classifer
imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
imdb_logreg_semi = SelfTrainingClassifier(base_estimator = imdb_logreg, threshold = thresh)
imdb_logreg_semi_fit = imdb_logreg_semi.fit(imdb_text_semi, imdb_y_semi)

# Implementing the self-training algorithm using the final Random Forests model as the base classifer
imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
imdb_rf_semi = SelfTrainingClassifier(base_estimator = imdb_rf, threshold = thresh)
imdb_rf_semi_fit = imdb_rf_semi.fit(imdb_text_semi, imdb_y_semi)

# Using our models to obtain predictions and compute the F1-score
imdb_nb_semi_preds = imdb_nb_semi_fit.predict(imdb_test)
imdb_nb_f1 = f1_score(imdb_test_y, imdb_nb_semi_preds)

imdb_logreg_semi_preds = imdb_logreg_semi_fit.predict(imdb_test)
imdb_logreg_f1 = f1_score(imdb_test_y, imdb_logreg_semi_preds)

imdb_rf_semi_preds = imdb_rf_semi_fit.predict(imdb_test)
imdb_rf_f1 = f1_score(imdb_test_y, imdb_rf_semi_preds)

# Outputting the F1-score for the final models
print(f"Naive Bayes F1-score: {imdb_nb_f1}")
print(f"Logistic Regression F1-score: {imdb_logreg_f1}")
print(f"Random Forest F1-score: {imdb_rf_f1}")

Naive Bayes F1-score: 0.6312066638745405
Logistic Regression F1-score: 0.8315198558458661
Random Forest F1-score: 0.7808601391873257


In [16]:
# Defining the hyperparameter configurations
nu_values = 0.9

# Training the final one-class SVM model
imdb_oneclass_svm = OneClassSVM(kernel = "linear", nu = nu_values)
imdb_oneclass_train = imdb_imbalanced[imdb_y_moderate == 1]
imdb_oneclass_model = imdb_oneclass_svm.fit(imdb_oneclass_train)

# Using our model to obtain predictions and compute the F1-score
imdb_oneclass_preds = imdb_oneclass_model.predict(imdb_test)
imdb_oneclass_test_y = imdb_test_y
imdb_oneclass_test_y[imdb_oneclass_test_y == 0] = -1
imdb_oneclass_f1 = f1_score(imdb_oneclass_test_y, imdb_oneclass_preds, pos_label = -1)

# Outputting the F1-score for the final model
print(f"One-class SVM F1-score: {imdb_oneclass_f1}")

One-class SVM F1-score: 0.6713620230700976


In [17]:
# Defining the hyperparameter configurations
num_samples = 32

# Training the final Isolation Forest model
imdb_iforest = IsolationForest(max_samples = num_samples, contamination = 0.25, random_state = 123)
imdb_iforest_model = imdb_iforest.fit(imdb_oneclass_train)

# Using our model to obtain predictions and compute the F1-score
imdb_iforest_preds = imdb_iforest_model.predict(imdb_test)
imdb_iforest_f1 = f1_score(imdb_oneclass_test_y, imdb_iforest_preds, pos_label = -1)

# Outputting the F1-score for the final model
print(f"Isolation Forest F1-score: {imdb_iforest_f1}")

Isolation Forest F1-score: 0.1921611674307484


In [19]:
# Defining the hyperparameter configurations
neighbours = 11

# Training the final Local Outlier Factor model
imdb_lof = LocalOutlierFactor(n_neighbors = neighbours, contamination = 0.25)
imdb_combined = vstack((imdb_oneclass_train, imdb_test))

# Using our model to obtain predictions and compute the F1-score
imdb_lof_preds = imdb_lof.fit_predict(imdb_combined)
imdb_lof_preds = imdb_lof_preds[len(imdb_oneclass_train):]
imdb_lof_f1 = f1_score(imdb_oneclass_test_y, imdb_lof_preds, pos_label = -1)

# Outputting the F1-score for the final model
print(f"Local Outlier Factor F1-score: {imdb_lof_f1}")

Local Outlier Factor F1-score: 0.34761775115709226
