##### **Mount G-drive folder and access files**

In [None]:
# Mount Google Drive under /content/gdrive (force_remount=True is passed to drive.mount).
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
# List the project folder as a quick check that the mount worked and the expected files are visible.
!ls '/content/gdrive/My Drive/Text - A3'

Mounted at /content/gdrive
data  results  word2vec_model.bin


In [None]:
# Drive locations used by the notebook: the project folder and its data sub-folder.
model_folder_path, folder_path = (
    '/content/gdrive/My Drive/Text - A3/',
    '/content/gdrive/My Drive/Text - A3/data',
)
# File names of the two CSV splits that are looked up inside the data folder.
name_of_train_data, name_of_test_data = 'train.csv', 'test.csv'

In [None]:
# Full CSV paths in the form "<data folder>/<file name>".
path_to_train_data = '/'.join([folder_path, name_of_train_data])
path_to_test_data = '/'.join([folder_path, name_of_test_data])

##### **Installing dependencies**

In [None]:
!pip install ipython-autotime nltk gdown



##### **Importing dependencies**

In [None]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from google.colab import files
from scipy.sparse import hstack
from gensim.models import Word2Vec
import gdown
import gzip
import shutil


time: 5.06 s (started: 2025-01-01 20:29:34 +00:00)


##### **Supporting Functions**

In [None]:
def download_and_unzip_nltk_data(datasets, nltk_data_dir):
    """Download each NLTK dataset and unpack its archive next to where it was stored.

    Args:
        datasets: Iterable of NLTK package identifiers (e.g. ``'stopwords'``).
        nltk_data_dir: Directory passed to ``nltk.download`` as ``download_dir``
            and then searched recursively for ``<dataset>.zip`` /
            ``<dataset>.tar.gz`` files.

    Each matching archive is extracted into the directory that contains it,
    and a progress line is printed for every download and extraction.
    """
    for dataset in datasets:
        print(f"Downloading {dataset}...")
        nltk.download(dataset, download_dir=nltk_data_dir)

        zip_name = f"{dataset}.zip"
        tar_name = f"{dataset}.tar.gz"

        # Scan the whole data directory for this dataset's archive(s).
        for current_dir, _subdirs, filenames in os.walk(nltk_data_dir):
            for filename in filenames:
                archive_path = os.path.join(current_dir, filename)

                if filename == zip_name:
                    with zipfile.ZipFile(archive_path, 'r') as archive:
                        archive.extractall(current_dir)
                    print(f"Unzipped {filename}")
                    continue

                if filename == tar_name:
                    with tarfile.open(archive_path, 'r:gz') as archive:
                        archive.extractall(current_dir)
                    print(f"Untarred {filename}")

def remove_punctuation(text):
    """Delete every character that is neither a regex word character nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

def remove_html_tags(text):
    """Replace every ``<...>`` tag in ``text`` with a single space.

    A space (rather than the empty string) is substituted so that words
    separated only by markup, e.g. ``"good.<br /><br />Next"``, are not fused
    into one token. Any surplus whitespace is harmless for the later
    tokenisation steps.

    Args:
        text: Raw string that may contain HTML tags.

    Returns:
        The string with each tag replaced by one space character.
    """
    return re.sub(r'<[^>]+>', ' ', text)

def remove_urls(text):
    """Delete ``http://`` / ``https://`` links and ``www.``-prefixed addresses from ``text``."""
    url_pattern = r'http[s]?://\S+|www\.\S+'
    without_urls = re.sub(url_pattern, '', text)
    return without_urls

def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace; drop every other character."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

def to_lowercase(text):
    """Return the lower-cased version of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Tokenise ``text`` and re-join the tokens that are not in the global ``stop_words`` set.

    The membership test is done on the lower-cased token; the token itself is
    kept in its original casing.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def lemmatize_text(text):
    """Tokenise ``text``, lemmatise each token with the global ``lemmatizer`` and re-join with spaces."""
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

def evaluate_models_with_tfidf_configs(train_df, test_df, tfidf_configs, results_folder):
    """Score every classifier in the global ``models`` dict on TF-IDF features.

    For each TF-IDF setting the vectoriser is fitted on the training reviews,
    80% of the training rows are used to fit each model, and accuracy is
    measured on the test frame.

    Args:
        train_df: Frame with ``cleaned_review`` and ``sentiment_binary`` columns.
        test_df: Frame with the same two columns, used for scoring.
        tfidf_configs: Iterable of dicts with ``ngram_range`` and
            ``max_features`` keys.
        results_folder: Directory (created if missing) that receives the CSV
            and Excel exports.

    Returns:
        DataFrame indexed by model name with one accuracy column per setting.
    """
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)

    accuracy_table = pd.DataFrame()

    for config in tfidf_configs:
        vectorizer = TfidfVectorizer(
            ngram_range=config['ngram_range'],
            max_features=config['max_features'],
        )
        train_matrix = vectorizer.fit_transform(train_df['cleaned_review'])
        test_matrix = vectorizer.transform(test_df['cleaned_review'])

        # Only the 80% portion is used for fitting; the held-out 20% is not used.
        fit_matrix, _, fit_labels, _ = train_test_split(
            train_matrix,
            train_df['sentiment_binary'].values,
            test_size=0.2,
            random_state=42,
        )
        test_labels = test_df['sentiment_binary'].values

        accuracy_by_model = {}
        for model_name, model in models.items():
            model.fit(fit_matrix, fit_labels)
            predictions = model.predict(test_matrix)
            accuracy_by_model[model_name] = accuracy_score(test_labels, predictions)

        config_label = f"ngram={config['ngram_range']}, max_features={config['max_features']}"
        accuracy_table[config_label] = pd.Series(accuracy_by_model)

    csv_path = os.path.join(results_folder, '1_MLmodels_TFIDF_hyperparams.csv')
    xlsx_path = os.path.join(results_folder, '1_MLmodels_TFIDF_hyperparams.xlsx')
    accuracy_table.to_csv(csv_path)
    accuracy_table.to_excel(xlsx_path)

    print(f"Results saved to: {results_folder}")
    return accuracy_table

def get_word2vec_embeddings(corpus, gensim_model):
    """Build one dense vector per document by averaging its word vectors.

    Args:
        corpus: Iterable of whitespace-tokenisable strings.
        gensim_model: Either a trained model exposing its vectors under
            ``.wv`` (e.g. gensim ``Word2Vec``) or the vector table itself
            (e.g. gensim ``KeyedVectors``). It must provide ``vector_size``.

    Returns:
        2-D ``numpy`` array with one row per document and ``vector_size``
        columns. Documents without any in-vocabulary word get an all-zero
        row. An empty corpus yields an array of shape ``(0, vector_size)``
        so the result can still be stacked with other feature matrices.
    """
    # Fall back to the object itself when there is no ``.wv`` attribute, so a
    # bare vector table can be passed in directly.
    word_vectors = getattr(gensim_model, 'wv', gensim_model)
    vector_size = gensim_model.vector_size

    embeddings = []
    for doc in corpus:
        vectors = [word_vectors[word] for word in doc.split() if word in word_vectors]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            # No known word in this document: use a zero vector as placeholder.
            embeddings.append(np.zeros(vector_size))

    if not embeddings:
        # np.array([]) would be 1-D; keep the column dimension explicit.
        return np.empty((0, vector_size))
    return np.array(embeddings)

def evaluate_with_gensim(train_df, test_df, tfidf_configs, results_folder):
    """Score the global ``models`` on TF-IDF features concatenated with Word2Vec document vectors.

    A Word2Vec model is trained on the training reviews; each review is then
    represented by its TF-IDF row stacked horizontally with the mean of its
    word vectors. Models are fitted on 80% of the training rows and scored on
    the test frame.

    Args:
        train_df: Frame with ``cleaned_review`` and ``sentiment_binary`` columns.
        test_df: Frame with the same two columns, used for scoring.
        tfidf_configs: Iterable of dicts with ``ngram_range`` and
            ``max_features`` keys.
        results_folder: Directory (created if missing) that receives the CSV
            and Excel exports.

    Returns:
        DataFrame indexed by model name with one accuracy column per TF-IDF
        setting. The "Naive Bayes" entry of ``models`` is not evaluated.
    """
    # Create the output folder up front so an unusable path fails before any
    # model is trained rather than after all the work is done.
    os.makedirs(results_folder, exist_ok=True)

    results = pd.DataFrame()

    gensim_model = Word2Vec(
        sentences=[doc.split() for doc in train_df['cleaned_review']],
        vector_size=300, window=5, min_count=2, workers=4
    )

    # The document embeddings and the test labels do not depend on the TF-IDF
    # setting, so they are computed once instead of once per configuration.
    train_embeddings = get_word2vec_embeddings(train_df['cleaned_review'], gensim_model)
    test_embeddings = get_word2vec_embeddings(test_df['cleaned_review'], gensim_model)
    y_test = test_df['sentiment_binary']

    for config in tfidf_configs:
        tfidf_vectorizer = TfidfVectorizer(
            ngram_range=config['ngram_range'],
            max_features=config['max_features']
        )
        tfidf_train = tfidf_vectorizer.fit_transform(train_df['cleaned_review'])
        tfidf_test = tfidf_vectorizer.transform(test_df['cleaned_review'])

        features_train = hstack([tfidf_train, train_embeddings])
        features_test = hstack([tfidf_test, test_embeddings])

        # Only the 80% portion is used for fitting; the held-out 20% is not used.
        X_train, _, y_train, _ = train_test_split(
            features_train, train_df['sentiment_binary'], test_size=0.2, random_state=42
        )

        config_results = {}
        for model_name, model in models.items():
            # NOTE(review): presumably skipped because averaged word vectors can be
            # negative, which MultinomialNB does not accept - confirm.
            if model_name == "Naive Bayes":
                continue
            model.fit(X_train, y_train)
            y_test_pred = model.predict(features_test)
            config_results[model_name] = accuracy_score(y_test, y_test_pred)

        config_label = f"ngram={config['ngram_range']}, max_features={config['max_features']}"
        results[config_label] = pd.Series(config_results)

    results.to_csv(os.path.join(results_folder, '2_MLmodels_TFIDF_gensim.csv'))
    results.to_excel(os.path.join(results_folder, '2_MLmodels_TFIDF_gensim.xlsx'))
    return results

time: 3.45 ms (started: 2025-01-01 20:29:39 +00:00)


##### **Loading data**

In [None]:
# Read the training split from the CSV path built earlier and report how many rows it has.
train_df = pd.read_csv(path_to_train_data)
print('The length of the training data is', len(train_df))
# Last expression of the cell: display the first rows as the cell output.
train_df.head()

The length of the training data is 30000


Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative


time: 1.82 s (started: 2025-01-01 20:29:39 +00:00)


In [None]:
# Read the test split from the CSV path built earlier and report how many rows it has.
test_df = pd.read_csv(path_to_test_data)
print('The length of the testing data is', len(test_df))
# Last expression of the cell: display the first rows as the cell output.
test_df.head()

The length of the testing data is 20000


Unnamed: 0,review,sentiment
0,Steven Rea plays a forensic scientist thrust o...,positive
1,As the first of the TV specials offered on the...,positive
2,There may something poetically right in seeing...,negative
3,all i can say about this film is to read the b...,negative
4,I thought it was a pretty good movie and shoul...,positive


time: 1.25 s (started: 2025-01-01 20:29:41 +00:00)


##### **Cleaning/preprocessing corpus**

In [None]:
# Directory that receives the NLTK resources; created if it does not exist yet.
nltk_data_dir = '/root/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)
# Register the directory only once so re-running this cell does not add
# duplicate entries to NLTK's search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Resources needed by the stop-word, tokenising and lemmatising steps below.
datasets = ['stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4']

download_and_unzip_nltk_data(datasets, nltk_data_dir)

print("NLTK data directory contents after extraction:")
print(os.listdir(nltk_data_dir))

Downloading stopwords...
Unzipped stopwords.zip
Downloading punkt...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unzipped punkt.zip
Downloading punkt_tab...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unzipped punkt_tab.zip
Downloading wordnet...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unzipped wordnet.zip
Downloading omw-1.4...


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unzipped omw-1.4.zip
NLTK data directory contents after extraction:
['corpora', 'tokenizers']
time: 3.93 s (started: 2025-01-01 20:29:43 +00:00)


In [None]:
# Cleaning train data
# Globals read by remove_stopwords() and lemmatize_text().
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

train_df['cleaned_review'] = train_df['review']

# Order matters: HTML tags and URLs must be removed while their delimiters
# ('<', '>', ':', '/', '.') are still present. Stripping punctuation first
# turns "<br />" into the token "br" and leaves URL fragments in the text.
preprocessing_steps = [
    remove_html_tags,
    remove_urls,
    remove_punctuation,
    remove_special_characters,
    to_lowercase,
    remove_stopwords,
    lemmatize_text
]

for step in preprocessing_steps:
    train_df['cleaned_review'] = train_df['cleaned_review'].apply(step)

# Move the cleaned column so it sits directly after the raw 'review' column.
review_col_index = train_df.columns.get_loc('review')
train_df.insert(review_col_index + 1, 'cleaned_review', train_df.pop('cleaned_review'))

time: 1min 45s (started: 2025-01-01 20:29:47 +00:00)


In [None]:
# Cleaning test data
test_df['cleaned_review'] = test_df['review']

# Order matters: HTML tags and URLs must be removed while their delimiters
# ('<', '>', ':', '/', '.') are still present. Stripping punctuation first
# turns "<br />" into the token "br" and leaves URL fragments in the text.
preprocessing_steps = [
    remove_html_tags,
    remove_urls,
    remove_punctuation,
    remove_special_characters,
    to_lowercase,
    remove_stopwords,
    lemmatize_text
]

for step in preprocessing_steps:
    test_df['cleaned_review'] = test_df['cleaned_review'].apply(step)

# Move the cleaned column so it sits directly after the raw 'review' column.
review_col_index = test_df.columns.get_loc('review')
test_df.insert(review_col_index + 1, 'cleaned_review', test_df.pop('cleaned_review'))

time: 52.1 s (started: 2025-01-01 20:31:32 +00:00)


##### **Labels**

In [None]:
# Encode the sentiment strings as integers (negative -> 0, positive -> 1) in both splits.
for labelled_df in (train_df, test_df):
    labelled_df['sentiment_binary'] = labelled_df['sentiment'].map({'negative': 0, 'positive': 1})

time: 7.74 ms (started: 2025-01-01 20:32:24 +00:00)


In [None]:
# Preview the training frame, now including the cleaned_review and sentiment_binary columns.
train_df.head()

Unnamed: 0,review,cleaned_review,sentiment,sentiment_binary
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,sap sea br br aspect ratio 1371br br sound for...,negative,0
1,"If you want mindless action, hot chicks and a ...",want mindless action hot chick postapocalyptic...,positive,1
2,"""The Woman in Black"" is easily one of the cree...",woman black easily one creepiest british ghost...,positive,1
3,I can barely find the words to describe how mu...,barely find word describe much piece trash off...,negative,0
4,What's in here ?! Let me tell you. It's the pr...,whats let tell presence alec baldwin he great ...,negative,0


time: 592 ms (started: 2025-01-01 20:32:24 +00:00)


In [None]:
# Preview the test frame, now including the cleaned_review and sentiment_binary columns.
test_df.head()

Unnamed: 0,review,cleaned_review,sentiment,sentiment_binary
0,Steven Rea plays a forensic scientist thrust o...,steven rea play forensic scientist thrust job ...,positive,1
1,As the first of the TV specials offered on the...,first tv special offered elaborate box set bar...,positive,1
2,There may something poetically right in seeing...,may something poetically right seeing dentist ...,negative,0
3,all i can say about this film is to read the b...,say film read back video case put back shelf p...,negative,0
4,I thought it was a pretty good movie and shoul...,thought pretty good movie released theater fir...,positive,1


time: 408 ms (started: 2025-01-01 20:32:25 +00:00)


##### **TFIDF and Word Embeddings Approach 1, Model Evaluation**

In [None]:
# Classifiers compared by the evaluation functions, keyed by the display name
# that becomes the row label in the results tables.
# The tree ensembles are seeded (same seed as the train/validation split) so
# that repeated runs report the same accuracies.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'k-NN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

time: 1.07 ms (started: 2025-01-01 20:32:25 +00:00)


In [None]:
results_folder = './gensim_results/'
# One (ngram_range, max_features) pair per TF-IDF setting to evaluate.
tfidf_configs = [
    {'ngram_range': ngram_range, 'max_features': max_features}
    for ngram_range, max_features in (
        ((1, 1), None),
        ((1, 2), 2000),
        ((1, 3), 5000),
    )
]

# Run the TF-IDF + Word2Vec evaluation and show the accuracy table as the cell output.
gensim_results = evaluate_with_gensim(
    train_df=train_df,
    test_df=test_df,
    tfidf_configs=tfidf_configs,
    results_folder=results_folder,
)
gensim_results

Unnamed: 0,"ngram=(1, 1), max_features=None","ngram=(1, 2), max_features=2000","ngram=(1, 3), max_features=5000"
Logistic Regression,0.887,0.8808,0.8858
k-NN,0.79255,0.78485,0.78845
Random Forest,0.81475,0.82645,0.8244
Gradient Boosting,0.8457,0.8469,0.8462


time: 1h 10min 33s (started: 2025-01-01 13:33:53 +00:00)


In [None]:
# Send both exported result files (CSV first, then Excel) to the browser for download.
for exported_file in (
    './gensim_results/2_MLmodels_TFIDF_gensim.csv',
    './gensim_results/2_MLmodels_TFIDF_gensim.xlsx',
):
    files.download(exported_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

time: 27.3 ms (started: 2025-01-01 14:51:10 +00:00)


##### **TFIDF-weighted Word Embeddings Approach 2, Model Evaluation (work for future recommendations)**

* Not yet implemented or executed.
* Planned approach: load a pretrained word-embedding model, weight each word vector by its TF-IDF score, reduce dimensionality with PCA, then train and evaluate the models.

In [None]:
# Pretrained model (bin format)
pretrained_model_path = '/content/gdrive/My Drive/Text - A3/word2vec_model.bin'
pretrained_model_zip_path = '/content/gdrive/My Drive/Text - A3/word2vec_model.bin.gz'

if not os.path.exists(pretrained_model_path):
    print(f"Model not found at {pretrained_model_path}, downloading...")
    model_url = "https://drive.google.com/uc?export=download&id=1ETEzH8X7uM_xXtIEuNLgz9VL7eQEeE_V"
    gdown.download(model_url, pretrained_model_zip_path, quiet=False)

    # Fail loudly if the archive never arrived instead of silently doing nothing.
    if not os.path.exists(pretrained_model_zip_path):
        raise RuntimeError(
            f"Download failed: {pretrained_model_zip_path} was not created"
        )

    print(f"Unzipping the model...")
    # Decompress into a temporary file and rename it at the end, so an
    # interrupted run never leaves a truncated file at the final path (which
    # the existence check above would later mistake for a complete model).
    partial_model_path = pretrained_model_path + '.part'
    try:
        with gzip.open(pretrained_model_zip_path, 'rb') as f_in:
            with open(partial_model_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(partial_model_path, pretrained_model_path)
    except BaseException:
        # Clean up the incomplete output, then let the original error propagate.
        if os.path.exists(partial_model_path):
            os.remove(partial_model_path)
        raise
    os.remove(pretrained_model_zip_path)
    print(f"Model unzipped successfully to {pretrained_model_path}")
else:
    print(f"Model already exists at {pretrained_model_path}")

Model not found at /content/gdrive/My Drive/Text - A3/word2vec_model.bin, downloading...


Downloading...
From (original): https://drive.google.com/uc?export=download&id=1ETEzH8X7uM_xXtIEuNLgz9VL7eQEeE_V
From (redirected): https://drive.google.com/uc?export=download&id=1ETEzH8X7uM_xXtIEuNLgz9VL7eQEeE_V&confirm=t&uuid=632d89ed-e506-416b-a2b9-bda1c44d4715
To: /content/gdrive/My Drive/Text - A3/word2vec_model.bin.gz
100%|██████████| 1.65G/1.65G [00:52<00:00, 31.6MB/s]


Unzipping the model...
Model unzipped successfully to /content/gdrive/My Drive/Text - A3/word2vec_model.bin
time: 2min 24s (started: 2025-01-01 18:31:35 +00:00)
