In [None]:
# Installing the required dependencies
!pip install datasets
!pip install nltk
!pip install bs4
!pip install gensim
!pip install nlpaug

In [None]:
# Loading in the required libraries
# Hugging Face `datasets` is used to load the IMDb corpus
from datasets import load_dataset
# `random` draws the subset of unlabeled reviews
import random
import nltk
# Downloading the NLTK stopwords corpus so that stopwords.words("english") can be loaded later
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: tqdm is bound to the module here, so progress bars are created with tqdm.tqdm(...)
import tqdm as tqdm
# Text cleaning, tokenisation and word embeddings
from bs4 import BeautifulSoup
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
import numpy as np
# Simulating and correcting class imbalance
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import SMOTE 
# Text augmentation
# NOTE(review): `nafc` and `Action` are not referenced in the cells visible here — confirm whether they are still needed
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc
from nlpaug.util import Action
# Classifiers, the self-training wrapper and the evaluation metric
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import f1_score
# `statistics.median` aggregates the per-classifier F1-scores
import statistics
# Anomaly-detection models
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from numpy import vstack

In [None]:
# Importing the IMDb dataset from Hugging Face and creating the different splits
imdb_dataset = load_dataset("imdb")
imdb_train_full = imdb_dataset["train"]

# Stratified 80/20 split of the labelled training data into training and development sets
imdb_train_dev = imdb_train_full.train_test_split(test_size = 0.2, stratify_by_column = "label", seed = 123)
imdb_train_text = imdb_train_dev["train"]["text"]
imdb_train_y = np.array(imdb_train_dev["train"]["label"])
imdb_dev_text = imdb_train_dev["test"]["text"]
imdb_dev_y = np.array(imdb_train_dev["test"]["label"])

# Drawing 10000 random rows of the unsupervised split for the semi-supervised experiments.
# A dedicated generator seeded with 123 (the seed used for the split above) makes the draw
# reproducible after a kernel restart without touching the global `random` state, and the
# index range follows the actual split size instead of a hard-coded 50000.
imdb_unsupervised = imdb_dataset["unsupervised"]
semi_size = random.Random(123).sample(range(len(imdb_unsupervised)), 10000)
# Selecting the subset once and reading both columns from it
imdb_unlabeled_subset = imdb_unsupervised.select(semi_size)
imdb_unlabeled = imdb_unlabeled_subset["text"]
imdb_unlabeled_y = np.array(imdb_unlabeled_subset["label"])

In [4]:
# Simulating the moderate imbalanced data scenario (minority : majority = 25 : 75)
num_majority = len(imdb_train_text) // 2
num_minority = int((25/75)*num_majority)

# The reviews are reshaped into a single text column before being handed to make_imbalance
imdb_text_array = np.asarray(imdb_train_text).reshape(-1, 1)
class_counts = {0: num_minority, 1: num_majority}
imdb_text_moderate, imdb_y_moderate = make_imbalance(
    imdb_text_array,
    imdb_train_y,
    sampling_strategy = class_counts,
    random_state = 123,
)

# Collapsing each single-element row back into a plain review string
imdb_text_moderate = [''.join(row) for row in imdb_text_moderate.tolist()]

In [None]:
# Preprocessing every data split with one shared helper
stopwords_list = stopwords.words("english")
# Set copy for constant-time membership tests; `stopwords_list` itself stays a list because
# later cells pass it to the nlpaug augmenters.
stopwords_set = set(stopwords_list)

def preprocess_reviews(reviews):
  """Strip HTML, tokenise and drop English stopwords from each review.

  Args:
    reviews: iterable of raw review strings.

  Returns:
    A list with one entry per review, each entry being the list of remaining tokens
    (possibly empty) in their original order.
  """
  processed = []
  for review in tqdm.tqdm(reviews):
    # Naming the parser explicitly keeps the extracted text independent of which optional
    # parsers happen to be installed, and avoids bs4's "no parser specified" warning.
    html_remove = BeautifulSoup(review, "html.parser").get_text()
    tokens = simple_preprocess(html_remove)
    processed.append([x for x in tokens if x not in stopwords_set])
  return processed

# Preprocessing our training data (pre-split)
processed_reviews_train = preprocess_reviews(imdb_train_full["text"])

# Preprocessing our training data (post-split)
processed_reviews_train_post = preprocess_reviews(imdb_train_text)

# Preprocessing our imbalanced data set
processed_reviews_imbalanced = preprocess_reviews(imdb_text_moderate)

# Preprocessing our development data
processed_reviews_dev = preprocess_reviews(imdb_dev_text)

# Preprocessing our unlabeled data
# NOTE: `imdb_unlabeled` holds raw text at this point but is rebound to the embedding matrix
# further down, so re-running this step afterwards requires re-running the data-loading cell first.
processed_reviews_unlabeled = preprocess_reviews(imdb_unlabeled)

# Final word2vec hyperparameters
vec_size = 600            # passed as Word2Vec `size`; also the width of every document matrix built below
window_size = 15          # passed as Word2Vec `window`
model_architecture = 1    # passed as Word2Vec `sg`
subsample = 1e-2          # passed as Word2Vec `sample`

# Fitting the final word2vec model that supplies the word vectors for every document representation
# NOTE(review): processed_reviews_train is built from imdb_train_full, which still contains the
# development reviews — confirm that letting the embeddings see the dev text is intended.
w2v_model = Word2Vec(
    sentences = processed_reviews_train,
    size = vec_size,
    window = window_size,
    sg = model_architecture,
    sample = subsample,
    seed = 123,
)

def embed_reviews(token_lists):
  """Build one document vector per review by averaging its in-vocabulary word2vec vectors.

  Args:
    token_lists: list of token lists, one per review.

  Returns:
    Array of shape (len(token_lists), vec_size). A review without any token in the
    word2vec vocabulary keeps an all-zero row.
  """
  doc_vectors = np.zeros([len(token_lists), vec_size])
  for i, tokens in enumerate(tqdm.tqdm(token_lists)):
    # Repeated tokens are kept so that frequent words weigh more in the average
    word_list = [word for word in tokens if word in w2v_model.wv.vocab]
    # Averaging once per review; the earlier version recomputed the running mean after every
    # in-vocabulary word, which is quadratic in review length and yields the same final row.
    if word_list:
      doc_vectors[i] = np.mean(w2v_model.wv[word_list], axis = 0)
  return doc_vectors

# Creating the document level representation for each review in the different data sets
imdb_imbalanced = embed_reviews(processed_reviews_imbalanced)
imdb_dev = embed_reviews(processed_reviews_dev)
# NOTE: this rebinds `imdb_unlabeled` from the raw review texts to their embedding matrix
imdb_unlabeled = embed_reviews(processed_reviews_unlabeled)

In [None]:
# Hyperparameter grid: candidate values for SMOTE's k_neighbors
neighbours = list(range(3, 21))
num_models = len(neighbours)
# One row per configuration: [iteration index, k_neighbors, median development F1-score]
models = np.zeros(shape=(num_models, 3))

# SMOTE hyperparameter tuning
iteration = 0
for n in neighbours:
  # Over-sampling the imbalanced training set with SMOTE
  imdb_smote = SMOTE(k_neighbors = n, random_state = 123)
  imdb_text_resampled, imdb_label_resampled = imdb_smote.fit_resample(imdb_imbalanced, imdb_y_moderate)

  # Naive Bayes: fit on the resampled data, then score on the development set
  imdb_nb = GaussianNB()
  imdb_nb_fit = imdb_nb.fit(imdb_text_resampled, imdb_label_resampled)
  imdb_nb_preds = imdb_nb_fit.predict(imdb_dev)
  imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_preds)

  # Logistic Regression: fit on the resampled data, then score on the development set
  imdb_logreg = LogisticRegression(
      penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear'
  )
  imdb_logreg_fit = imdb_logreg.fit(imdb_text_resampled, imdb_label_resampled)
  imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
  imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

  # SVM (SGDClassifier): fit on the resampled data, then score on the development set
  imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
  imdb_svm_fit = imdb_svm.fit(imdb_text_resampled, imdb_label_resampled)
  imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
  imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

  # Random Forests: fit on the resampled data, then score on the development set
  imdb_rf = RandomForestClassifier(
      n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123
  )
  imdb_rf_fit = imdb_rf.fit(imdb_text_resampled, imdb_label_resampled)
  imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
  imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

  # The configuration is summarised by the median F1-score over the four classifiers
  classifier_f1_scores = [imdb_nb_f1, imdb_logreg_f1, imdb_svm_f1, imdb_rf_f1]
  med_f1 = statistics.median(classifier_f1_scores)

  # Recording and reporting the result for this configuration
  models[iteration] = [iteration, n, med_f1]
  print(models[iteration])
  iteration += 1

In [None]:
# Preparing our data for augmentation (random substitution): only the label-0 (minority) reviews are augmented
augment_index = np.array(imdb_text_moderate)[imdb_y_moderate == 0].tolist()

# Preprocessing our data: stripping HTML so the augmenters operate on plain review text
processed_reviews_aug = []
for review in tqdm.tqdm(augment_index):
    # Naming the parser explicitly keeps the extracted text independent of which optional
    # parsers are installed and avoids bs4's "no parser specified" warning.
    html_remove = BeautifulSoup(review, "html.parser").get_text()
    processed_reviews_aug.append(html_remove)

# Set copy of the stopwords for constant-time membership tests (nlpaug still receives the list)
stopwords_lookup = set(stopwords_list)

# Defining the hyperparameter configuration
aug_perc = [0.1, 0.2, 0.3, 0.4, 0.5]
num_augment = 2
num_models = len(aug_perc)
# One row per configuration: [iteration index, aug_p, median development F1-score]
models = np.zeros(shape=(num_models, 3))

# Data augmentation (random substitution hyperparameter tuning)
iteration = 0
for a in aug_perc:
  # Data augmentation (random substitution via synonym replacement)
  aug = naw.SynonymAug(aug_p = a, stopwords = stopwords_list)
  augmented = []
  # range(num_augment) yields exactly num_augment augmented copies of every review; the earlier
  # `[1, num_augment]` literal always looped twice, whatever num_augment was set to.
  for _ in range(num_augment):
    # Augmenting the HTML-stripped text built above rather than the raw reviews, so that markup
    # is never augmented or tokenised into the document vectors.
    for review in processed_reviews_aug:
      augmented.append(aug.augment(review))
  # ''.join leaves a plain string unchanged and flattens a list of strings into one string
  augmented = [''.join(ele) for ele in augmented]

  # Preprocessing our augmented data
  processed_reviews_aug_post = []
  for review in tqdm.tqdm(augmented):
    temp = simple_preprocess(review)
    processed_reviews_aug_post.append([x for x in temp if x not in stopwords_lookup])

  # Creating the document level representation using the final word2vec model for each review in the augmented data set
  imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
  for i, tokens in enumerate(tqdm.tqdm(processed_reviews_aug_post)):
    word_list = [word for word in tokens if word in w2v_model.wv.vocab]
    # Averaging once per review instead of after every in-vocabulary word (same final row);
    # reviews without any in-vocabulary word keep an all-zero row.
    if word_list:
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

  # Combining the imbalanced and augmented data sets (every augmented review carries label 0)
  imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
  imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
  imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

  # Training a Naive Bayes model on the dataset
  imdb_nb = GaussianNB()
  imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Logistic Regression model on the dataset
  imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
  imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training an SVM model on the dataset
  imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
  imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Random Forests model on the dataset
  imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
  imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Using our models to obtain predictions and compute the F1-score on the development set
  imdb_nb_preds = imdb_nb_fit.predict(imdb_dev)
  imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_preds)

  imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
  imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

  imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
  imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

  imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
  imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

  # The configuration is summarised by the median F1-score over the four classifiers
  med_f1 = statistics.median([imdb_nb_f1, imdb_logreg_f1, imdb_svm_f1, imdb_rf_f1])

  # Updating our model matrix
  models[iteration] = [iteration, a, med_f1]
  print(models[iteration])
  iteration = iteration + 1

In [None]:
# Defining the hyperparameter configuration
aug_perc = [0.1, 0.2, 0.3, 0.4, 0.5]
num_augment = 2
num_models = len(aug_perc)
# One row per configuration: [iteration index, aug_p, median development F1-score]
models = np.zeros(shape=(num_models, 3))

# Set copy of the stopwords for constant-time membership tests (nlpaug still receives the list)
stopwords_lookup = set(stopwords_list)

# Data augmentation (random swap hyperparameter tuning)
iteration = 0
for a in aug_perc:
  # Data augmentation (random swap)
  aug = naw.RandomWordAug(action="swap", aug_p = a, stopwords = stopwords_list)
  augmented = []
  # range(num_augment) yields exactly num_augment augmented copies of every review; the earlier
  # `[1, num_augment]` literal always looped twice, whatever num_augment was set to.
  for _ in range(num_augment):
    # Augmenting processed_reviews_aug (the HTML-stripped minority reviews built in the
    # random-substitution cell) rather than the raw reviews, so markup is never swapped
    # around or tokenised into the document vectors.
    for review in processed_reviews_aug:
      augmented.append(aug.augment(review))
  # ''.join leaves a plain string unchanged and flattens a list of strings into one string
  augmented = [''.join(ele) for ele in augmented]

  # Preprocessing our augmented data
  processed_reviews_aug_post = []
  for review in tqdm.tqdm(augmented):
    temp = simple_preprocess(review)
    processed_reviews_aug_post.append([x for x in temp if x not in stopwords_lookup])

  # Creating the document level representation using the final word2vec model for each review in the augmented data set
  imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
  for i, tokens in enumerate(tqdm.tqdm(processed_reviews_aug_post)):
    word_list = [word for word in tokens if word in w2v_model.wv.vocab]
    # Averaging once per review instead of after every in-vocabulary word (same final row);
    # reviews without any in-vocabulary word keep an all-zero row.
    if word_list:
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

  # Combining the imbalanced and augmented data sets (every augmented review carries label 0)
  imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
  imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
  imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

  # Training a Naive Bayes model on the dataset
  imdb_nb = GaussianNB()
  imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Logistic Regression model on the dataset
  imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
  imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training an SVM model on the dataset
  imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
  imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Random Forests model on the dataset
  imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
  imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Using our models to obtain predictions and compute the F1-score on the development set
  imdb_nb_preds = imdb_nb_fit.predict(imdb_dev)
  imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_preds)

  imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
  imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

  imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
  imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

  imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
  imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

  # The configuration is summarised by the median F1-score over the four classifiers
  med_f1 = statistics.median([imdb_nb_f1, imdb_logreg_f1, imdb_svm_f1, imdb_rf_f1])

  # Updating our model matrix
  models[iteration] = [iteration, a, med_f1]
  print(models[iteration])
  iteration = iteration + 1

In [None]:
# Defining the hyperparameter configuration
aug_perc = [0.1, 0.2, 0.3, 0.4, 0.5]
num_augment = 2
num_models = len(aug_perc)
# One row per configuration: [iteration index, aug_p, median development F1-score]
models = np.zeros(shape=(num_models, 3))

# Set copy of the stopwords for constant-time membership tests (nlpaug still receives the list)
stopwords_lookup = set(stopwords_list)

# Data augmentation (random deletion hyperparameter tuning)
iteration = 0
for a in aug_perc:
  # Data augmentation (random deletion)
  aug = naw.RandomWordAug(action="delete", aug_p = a, stopwords = stopwords_list)
  augmented = []
  # range(num_augment) yields exactly num_augment augmented copies of every review; the earlier
  # `[1, num_augment]` literal always looped twice, whatever num_augment was set to.
  for _ in range(num_augment):
    # Augmenting processed_reviews_aug (the HTML-stripped minority reviews built in the
    # random-substitution cell) rather than the raw reviews, so markup never takes part in
    # the deletion or gets tokenised into the document vectors.
    for review in processed_reviews_aug:
      augmented.append(aug.augment(review))
  # ''.join leaves a plain string unchanged and flattens a list of strings into one string
  augmented = [''.join(ele) for ele in augmented]

  # Preprocessing our augmented data
  processed_reviews_aug_post = []
  for review in tqdm.tqdm(augmented):
    temp = simple_preprocess(review)
    processed_reviews_aug_post.append([x for x in temp if x not in stopwords_lookup])

  # Creating the document level representation using the final word2vec model for each review in the augmented data set
  imdb_train_aug = np.zeros([len(processed_reviews_aug_post), vec_size])
  for i, tokens in enumerate(tqdm.tqdm(processed_reviews_aug_post)):
    word_list = [word for word in tokens if word in w2v_model.wv.vocab]
    # Averaging once per review instead of after every in-vocabulary word (same final row);
    # reviews without any in-vocabulary word keep an all-zero row.
    if word_list:
      imdb_train_aug[i] = np.mean(w2v_model.wv[word_list], axis = 0)

  # Combining the imbalanced and augmented data sets (every augmented review carries label 0)
  imdb_train_aug_combined = vstack((imdb_imbalanced, imdb_train_aug))
  imdb_y_aug = np.zeros(shape=(len(processed_reviews_aug_post), ))
  imdb_y_aug_combined = np.concatenate((imdb_y_moderate, imdb_y_aug))

  # Training a Naive Bayes model on the dataset
  imdb_nb = GaussianNB()
  imdb_nb_fit = imdb_nb.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Logistic Regression model on the dataset
  imdb_logreg = LogisticRegression(penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear')
  imdb_logreg_fit = imdb_logreg.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training an SVM model on the dataset
  imdb_svm = SGDClassifier(alpha = 0.03125, random_state = 123)
  imdb_svm_fit = imdb_svm.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Training a Random Forests model on the dataset
  imdb_rf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123)
  imdb_rf_fit = imdb_rf.fit(imdb_train_aug_combined, imdb_y_aug_combined)

  # Using our models to obtain predictions and compute the F1-score on the development set
  imdb_nb_preds = imdb_nb_fit.predict(imdb_dev)
  imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_preds)

  imdb_logreg_preds = imdb_logreg_fit.predict(imdb_dev)
  imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_preds)

  imdb_svm_preds = imdb_svm_fit.predict(imdb_dev)
  imdb_svm_f1 = f1_score(imdb_dev_y, imdb_svm_preds)

  imdb_rf_preds = imdb_rf_fit.predict(imdb_dev)
  imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_preds)

  # The configuration is summarised by the median F1-score over the four classifiers
  med_f1 = statistics.median([imdb_nb_f1, imdb_logreg_f1, imdb_svm_f1, imdb_rf_f1])

  # Updating our model matrix
  models[iteration] = [iteration, a, med_f1]
  print(models[iteration])
  iteration = iteration + 1

In [None]:
# Stacking the imbalanced training documents on top of the unlabeled documents, with their labels alongside
imdb_text_semi = np.vstack((imdb_imbalanced, imdb_unlabeled))
imdb_y_semi = np.concatenate((imdb_y_moderate, imdb_unlabeled_y))

# Hyperparameter grid: values passed as the SelfTrainingClassifier `threshold`
thresh = [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
num_models = len(thresh)
# One row per configuration: [iteration index, threshold, median development F1-score]
models = np.zeros(shape=(num_models, 3))

# Self-training hyperparameter tuning
iteration = 0
for t in thresh:
  # Naive Bayes as the base classifier: self-train, then score on the development set
  imdb_nb = GaussianNB()
  imdb_nb_semi = SelfTrainingClassifier(base_estimator = imdb_nb, threshold = t)
  imdb_nb_semi_fit = imdb_nb_semi.fit(imdb_text_semi, imdb_y_semi)
  imdb_nb_semi_preds = imdb_nb_semi_fit.predict(imdb_dev)
  imdb_nb_f1 = f1_score(imdb_dev_y, imdb_nb_semi_preds)

  # Logistic Regression as the base classifier: self-train, then score on the development set
  imdb_logreg = LogisticRegression(
      penalty = "l2", tol = 0.0001, C = 1, max_iter = 500, random_state = 123, solver = 'liblinear'
  )
  imdb_logreg_semi = SelfTrainingClassifier(base_estimator = imdb_logreg, threshold = t)
  imdb_logreg_semi_fit = imdb_logreg_semi.fit(imdb_text_semi, imdb_y_semi)
  imdb_logreg_semi_preds = imdb_logreg_semi_fit.predict(imdb_dev)
  imdb_logreg_f1 = f1_score(imdb_dev_y, imdb_logreg_semi_preds)

  # Random Forests as the base classifier: self-train, then score on the development set
  imdb_rf = RandomForestClassifier(
      n_estimators = 500, min_samples_leaf = 2, max_features = 17, max_samples = 0.75, random_state = 123
  )
  imdb_rf_semi = SelfTrainingClassifier(base_estimator = imdb_rf, threshold = t)
  imdb_rf_semi_fit = imdb_rf_semi.fit(imdb_text_semi, imdb_y_semi)
  imdb_rf_semi_preds = imdb_rf_semi_fit.predict(imdb_dev)
  imdb_rf_f1 = f1_score(imdb_dev_y, imdb_rf_semi_preds)

  # The configuration is summarised by the median F1-score over the three base classifiers
  classifier_f1_scores = [imdb_nb_f1, imdb_logreg_f1, imdb_rf_f1]
  med_f1 = statistics.median(classifier_f1_scores)

  # Recording and reporting the result for this configuration
  models[iteration] = [iteration, t, med_f1]
  print(models[iteration])
  iteration += 1

In [None]:
# Defining the hyperparameter configurations
nu_values = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
num_models = len(nu_values)
# One row per configuration: [iteration index, nu, development F1-score]
models = np.zeros(shape=(num_models, 3))

# The one-class models are fitted on the label-1 training documents only; this selection does not
# depend on nu, so it is built once instead of on every iteration.
imdb_oneclass_train = imdb_imbalanced[imdb_y_moderate == 1]

# Development labels with class 0 relabelled as -1, so they can be compared with the model
# predictions using pos_label = -1.
# np.where builds a new array: the earlier `imdb_oneclass_dev_y = imdb_dev_y` only aliased the
# original labels, so the relabelling overwrote imdb_dev_y itself and silently changed the labels
# seen by any earlier cell that was re-run afterwards.
imdb_oneclass_dev_y = np.where(imdb_dev_y == 0, -1, imdb_dev_y)

# One-class SVM hyperparameter tuning
iteration = 0
for n in nu_values:
  # Training the one-class SVM model
  imdb_oneclass_svm = OneClassSVM(kernel = "linear", nu = n)
  imdb_oneclass_model = imdb_oneclass_svm.fit(imdb_oneclass_train)

  # Using our model to obtain predictions and compute the F1-score of the -1 class
  imdb_oneclass_preds = imdb_oneclass_model.predict(imdb_dev)
  imdb_oneclass_f1 = f1_score(imdb_oneclass_dev_y, imdb_oneclass_preds, pos_label = -1)

  # Updating our model matrix
  models[iteration] = [iteration, n, imdb_oneclass_f1]
  print(models[iteration])
  iteration = iteration + 1

In [None]:
# Hyperparameter grid: max_samples candidates, the powers of two from 2 up to 8192
num_samples = [2 ** exponent for exponent in range(1, 14)]
num_models = len(num_samples)
# One row per configuration: [iteration index, max_samples, development F1-score]
models = np.zeros(shape=(num_models, 3))

# Isolation Forest hyperparameter tuning
iteration = 0
for n in num_samples:
  # Fitting the Isolation Forest on the one-class training documents
  imdb_iforest = IsolationForest(
      max_samples = n,
      contamination = 0.25,
      random_state = 123,
  )
  imdb_iforest_model = imdb_iforest.fit(imdb_oneclass_train)

  # Predicting on the development set and scoring the -1 class
  imdb_iforest_preds = imdb_iforest_model.predict(imdb_dev)
  imdb_iforest_f1 = f1_score(
      imdb_oneclass_dev_y,
      imdb_iforest_preds,
      pos_label = -1,
  )

  # Recording and reporting the result for this configuration
  models[iteration] = [iteration, n, imdb_iforest_f1]
  print(models[iteration])
  iteration += 1

In [None]:
# Defining the hyperparameter configurations
neighbours = list(range(10, 36))
num_models = len(neighbours)
# One row per configuration: [iteration index, n_neighbors, development F1-score]
models = np.zeros(shape=(num_models, 3))

# fit_predict scores the rows it is fitted on, so the one-class training documents and the
# development documents are stacked into a single matrix. The stack does not depend on
# n_neighbors, so it is built once instead of being re-allocated on every iteration.
imdb_combined = vstack((imdb_oneclass_train, imdb_dev))
num_oneclass_train = len(imdb_oneclass_train)

# Local Outlier Factor hyperparameter tuning
iteration = 0
for n in neighbours:
  # Training the Local Outlier Factor model
  imdb_lof = LocalOutlierFactor(n_neighbors = n, contamination = 0.25)

  # Using our model to obtain predictions and compute the F1-score of the -1 class;
  # only the trailing rows, which belong to the development set, are scored.
  imdb_lof_preds = imdb_lof.fit_predict(imdb_combined)
  imdb_lof_preds = imdb_lof_preds[num_oneclass_train:]
  imdb_lof_f1 = f1_score(imdb_oneclass_dev_y, imdb_lof_preds, pos_label = -1)

  # Updating our model matrix
  models[iteration] = [iteration, n, imdb_lof_f1]
  print(models[iteration])
  iteration = iteration + 1