<a href="https://colab.research.google.com/github/Alyssa1918/ML_Final_Project/blob/main/CPTS_437_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[IMDB Dataset](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/code)

[GitHub](https://github.com/Alyssa1918/ML_Final_Project/tree/main)

In [None]:
# Mount Google Drive inside the Colab VM so files stored there can be read
# through the regular filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Path of the IMDB reviews CSV on the mounted Google Drive
file = '/content/drive/MyDrive/IMDB Dataset.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(file)

# Show the loaded frame together with its Python type
print(df, type(df))

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns] <class 'pandas.core.frame.DataFrame'>


In [None]:
!pip install transformers



In [None]:
# Build a class-balanced subset: draw 10000 reviews per sentiment label
# (positive first, then negative), each with a fixed seed.
balanced_parts = [
    df.loc[df['sentiment'] == label].sample(n=10000, random_state=42)
    for label in ('positive', 'negative')
]

# Stack the two per-class samples into a single frame
df = pd.concat(balanced_parts)
del balanced_parts

# Shuffle the rows (seeded) and renumber the index from 0
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)


In [None]:
# Tokenizing, Transformer, Pooling

from transformers import DistilBertTokenizer, DistilBertModel
import torch
from google.colab import files

# Both the tokenizer and the encoder come from the same pre-trained
# DistilBERT checkpoint.
PRETRAINED_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
model = DistilBertModel.from_pretrained(PRETRAINED_NAME)

def encode_and_pool(reviews):
    """Embed a batch of reviews as mean-pooled DistilBERT hidden states.

    Each review is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model`` without gradient tracking, and reduced to one
    vector by averaging the last hidden state over the review's real tokens.

    Parameters
    ----------
    reviews : list of str
        Raw review texts forming one batch.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per review. The result stays 2-D even for a
        single review, so ``list.extend`` on it always adds whole vectors.
    """
    # Pad only up to the longest review in the batch. Padded positions are
    # excluded from the mean below, so padding to 512 would only add compute.
    token_ids = tokenizer(reviews, truncation=True, padding=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**token_ids)
    hidden_states = outputs.last_hidden_state

    # Weight each position by the attention mask (1 = real token, 0 = padding)
    # so padding vectors do not dilute the average of short reviews.
    mask = token_ids['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled_output = (hidden_states * mask).sum(dim=1) / token_counts

    del token_ids, outputs, hidden_states, mask
    # No squeeze(): it would drop the batch dimension when len(reviews) == 1.
    return pooled_output.detach().numpy()

batch_size = 100
pooled_output_all = []

# Walk through the reviews in consecutive slices of `batch_size`
for i in range(0, len(df['review']), batch_size):
    print(f"on sample {i+1}")

    # embed the current slice and append one vector per review
    batch_reviews = df['review'].iloc[i:i+batch_size].tolist()
    batch_vectors = encode_and_pool(batch_reviews)
    pooled_output_all.extend(batch_vectors)

    print(f"pooled_output_all list length: {len(pooled_output_all)}")

# Attach the embeddings as a new column, write the frame to CSV in the
# working directory, and trigger a browser download of that file.
df['pooled_output'] = pooled_output_all
df.to_csv('pooled.csv')
files.download('pooled.csv')

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import ast

# Read the saved embeddings CSV back from Google Drive.
# NOTE(review): the embedding cell writes pooled.csv to the working directory
# and downloads it; presumably it was then uploaded to Drive by hand - confirm.
file = '/content/drive/MyDrive/pooled.csv'
pooled_df = pd.read_csv(file)

# convert pooled_output column to list of numbers (currently strings)
def string_to_nums(list_of_strings):
    """Parse the text form of a 1-D numeric array back into a list of floats.

    The input is expected to look like ``"[ 0.12 -3.4e-01\\n  5. ]"``:
    whitespace-separated numbers wrapped in square brackets, possibly spread
    over several lines.

    Parameters
    ----------
    list_of_strings : str
        One cell of the ``pooled_output`` column as read from the CSV.

    Returns
    -------
    list of float
        The parsed values, in order. An empty array string yields ``[]``.

    Raises
    ------
    ValueError
        If a token is not a valid float literal.
    """
    # Trim surrounding whitespace first so a trailing space or newline after
    # the closing bracket cannot leave the bracket glued to the last number.
    inner = list_of_strings.strip().strip('[]')
    # float() is much cheaper than ast.literal_eval per token and also
    # accepts 'nan' / 'inf', which literal_eval rejects.
    return [float(num) for num in inner.split()]

# Replace each stringified embedding with its parsed list of numbers
parsed_embeddings = pooled_df['pooled_output'].apply(string_to_nums)
pooled_df['pooled_output'] = parsed_embeddings

# Hold out 20% of the rows for testing (seeded split).
# The features are passed as a plain list of per-review number lists.
X_train, X_test, Y_train, Y_test = train_test_split(
    pooled_df['pooled_output'].tolist(),
    pooled_df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# The labels come back as pandas Series; keep only their underlying arrays
Y_train, Y_test = Y_train.values, Y_test.values

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning: choose k for KNN with a cross-validated grid search

# candidate k values: odd numbers from 1 to 299
k_values = range(1, 300, 2)

# the only parameter being tuned is n_neighbors
grid_params = { 'n_neighbors' : k_values}

# 3-fold grid search over all candidates, using every available core
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)

# fit the search on the training set only
g_res = gs.fit(X_train, Y_train)

# get the hyperparameter with the best score
opt_k_value = g_res.best_params_['n_neighbors']
print(opt_k_value)

# report the best score too; a bare `g_res.best_score_` expression in the
# middle of a cell is evaluated and discarded, so it was never shown
print(f"Best mean cross-validated score: {g_res.best_score_}")

Fitting 3 folds for each of 150 candidates, totalling 450 fits
17


In [None]:
# KNN Classifier

from sklearn.neighbors import KNeighborsClassifier

# Create and train KNN with the k chosen by the grid search (opt_k_value)
# instead of a hard-coded copy of its result, so the classifier cannot go
# stale when the data or the embeddings change.
KNN = KNeighborsClassifier(n_neighbors = opt_k_value)
KNN.fit(X_train, Y_train)

# get predictions for KNN
KNN_Y_test_pred = KNN.predict(X_test)

In [None]:
# SVM Classifier

from sklearn import svm

# Build an SVC with library defaults and fit it on the training split
# (fit returns the fitted estimator itself)
SVM = svm.SVC().fit(X_train, Y_train)

# Predict labels for the held-out test split
SVM_Y_test_pred = SVM.predict(X_test)

In [None]:
# Perceptron Classifier

from sklearn.linear_model import Perceptron

# Build a Perceptron with library defaults and fit it on the training split
# (fit returns the fitted estimator itself)
perceptron = Perceptron().fit(X_train, Y_train)

# Predict labels for the held-out test split
Perceptron_Y_test_pred = perceptron.predict(X_test)

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# create instance of Decision Tree and train
# random_state is fixed (same seed as the sampling and split cells) so the
# fitted tree, and therefore its reported F1 score, is reproducible across runs
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)

# get predictions for Decision Tree
DT_Y_test_pred = decision_tree.predict(X_test)

In [None]:
# evaluate models' performance via f1 score and bootstrapping

from sklearn.metrics import f1_score
import numpy as np

# Bootstrapping and F1 measurement: repeatedly resample the test set with
# replacement and score every model on the same resample.
N_BOOTSTRAP_ITERATIONS = 10
BOOTSTRAP_SAMPLE_SIZE = 1000

# seeded generator so the reported averages are reproducible
bootstrap_rng = np.random.default_rng(42)

# test-set predictions of each model, keyed by the name used in the report
test_predictions = {
    "KNN": KNN_Y_test_pred,
    "SVM": SVM_Y_test_pred,
    "Perceptron": Perceptron_Y_test_pred,
    "Decision Tree": DT_Y_test_pred,
}
boot_scores = {name: [] for name in test_predictions}

for _ in range(N_BOOTSTRAP_ITERATIONS):  # for each bootstrap iteration
  # sample (with replacement) a set of test indices; `high` is exclusive, so
  # it must be len(Y_test) (not len(Y_test)-1) for the last sample to be drawable
  boot_samples = bootstrap_rng.integers(low = 0, high = len(Y_test), size = BOOTSTRAP_SAMPLE_SIZE)

  # get their corresponding ground-truth value
  Y_test_boot = Y_test[boot_samples]

  # evaluate the F1 measurement of every model on this resample & store
  for name, predictions in test_predictions.items():
    boot_scores[name].append(f1_score(Y_test_boot, predictions[boot_samples], pos_label='positive'))

KNN_scores = np.array(boot_scores["KNN"])
SVM_scores = np.array(boot_scores["SVM"])
Perceptron_scores = np.array(boot_scores["Perceptron"])
DT_scores = np.array(boot_scores["Decision Tree"])

print(f"Average F1 Score for KNN: {KNN_scores.mean()}")
print(f"Average F1 Score for SVM: {SVM_scores.mean()}")
print(f"Average F1 Score for Perceptron: {Perceptron_scores.mean()}")
print(f"Average F1 Score for Decision Tree: {DT_scores.mean()}")

Average F1 Score for KNN: 0.7611507903070905
Average F1 Score for SVM: 0.8733311273376934
Average F1 Score for Perceptron: 0.7376342464208909
Average F1 Score for Decision Tree: 0.7020079642148629
