##### **Mount G-drive folder and access files**

In [None]:
# Mount Google Drive under /content/gdrive (force_remount=True asks Colab for a fresh mount).
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
# List the project folder to confirm the mount succeeded and its subfolders are visible.
!ls '/content/gdrive/My Drive/Text - A3'

Mounted at /content/gdrive
data  results


In [None]:
# Folder on the mounted Drive that holds the input CSVs, and the file name of each split.
folder_path = '/content/gdrive/My Drive/Text - A3/data'
name_of_train_data = 'train.csv'
name_of_test_data = 'test.csv'

In [None]:
# Full paths to the two CSV files: "<folder_path>/<file name>".
path_to_train_data = f"{folder_path}/{name_of_train_data}"
path_to_test_data = f"{folder_path}/{name_of_test_data}"

##### **Installing dependencies**

In [None]:
!pip install ipython-autotime nltk

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.2 jedi-0.19.2


##### **Importing dependencies**

In [None]:
%load_ext autotime
import pandas as pd
import nltk
import os
import zipfile
import tarfile
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from google.colab import files

time: 610 ms (started: 2025-01-01 08:52:38 +00:00)


##### **Supporting Functions**

In [None]:
def download_and_unzip_nltk_data(datasets, nltk_data_dir):
    """Download NLTK packages into ``nltk_data_dir`` and unpack their archives.

    For every name in ``datasets`` the package is fetched with
    ``nltk.download`` and the directory tree under ``nltk_data_dir`` is then
    searched for ``<name>.zip`` / ``<name>.tar.gz``; each match is extracted
    into the directory that contains it.

    Args:
        datasets: Iterable of NLTK package identifiers (e.g. ``'stopwords'``).
        nltk_data_dir: Directory passed to ``nltk.download`` as ``download_dir``
            and used as the root of the archive search.

    Returns:
        None. Progress and warnings are reported with ``print``.
    """
    for dataset in datasets:
        print(f"Downloading {dataset}...")
        # nltk.download signals failure through its return value instead of
        # raising, so report it rather than silently carrying on.
        if not nltk.download(dataset, download_dir=nltk_data_dir):
            print(f"Warning: nltk.download did not succeed for {dataset}")

        zip_name = f"{dataset}.zip"
        tar_name = f"{dataset}.tar.gz"

        # 'filenames' (not 'files') so the google.colab 'files' module imported
        # by this notebook is not shadowed inside the function.
        for root, _dirs, filenames in os.walk(nltk_data_dir):
            for filename in filenames:
                archive_path = os.path.join(root, filename)
                if filename == zip_name:
                    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                        zip_ref.extractall(root)
                    print(f"Unzipped {filename}")
                elif filename == tar_name:
                    with tarfile.open(archive_path, 'r:gz') as tar_ref:
                        # Use the 'data' extraction filter where the running
                        # Python provides it: it rejects unsafe members
                        # (absolute paths, links escaping the target) and
                        # avoids the deprecation warning for unfiltered
                        # extraction on newer interpreters.
                        if hasattr(tarfile, 'data_filter'):
                            tar_ref.extractall(root, filter='data')
                        else:
                            tar_ref.extractall(root)
                    print(f"Untarred {filename}")

def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    (``\\w``) nor whitespace removed."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    A tag is replaced by the empty string, so text on either side of a tag
    ends up directly adjacent.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

def remove_urls(text):
    """Return ``text`` with URLs deleted.

    A URL is anything starting with ``http://`` / ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_special_characters(text):
    """Return ``text`` keeping only ASCII letters, digits and whitespace."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

def to_lowercase(text):
    """Return the lower-cased form of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens whose lower-cased form is a stop word.

    Relies on the notebook-level ``stop_words`` collection. The surviving
    tokens are returned joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def lemmatize_text(text):
    """Tokenize ``text`` and lemmatize each token.

    Relies on the notebook-level ``lemmatizer`` object. The lemmatized tokens
    are returned joined by single spaces.
    """
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

def evaluate_models_with_tfidf_configs(train_df, test_df, tfidf_configs, results_folder, model_registry=None):
    """Fit every classifier on each TF-IDF configuration and record test accuracy.

    For each entry of ``tfidf_configs`` a ``TfidfVectorizer`` is fitted on
    ``train_df['cleaned_review']`` and applied to ``test_df['cleaned_review']``.
    Each classifier is trained on the 80% portion of a fixed-seed 80/20 split
    of the training features and scored on the full test set. The accuracy
    table is written to ``results_folder`` as CSV and XLSX.

    Args:
        train_df: DataFrame with ``cleaned_review`` and ``sentiment_binary`` columns.
        test_df: DataFrame with ``cleaned_review`` and ``sentiment_binary`` columns.
        tfidf_configs: Iterable of dicts with keys ``'ngram_range'`` and
            ``'max_features'``, forwarded to ``TfidfVectorizer``.
        results_folder: Directory for the output files; created if missing.
        model_registry: Optional mapping of display name -> estimator exposing
            ``fit`` / ``predict``. When omitted, the notebook-level ``models``
            dict is used.

    Returns:
        DataFrame indexed by model name with one column per TF-IDF
        configuration; each cell is the accuracy on ``test_df``.
    """
    os.makedirs(results_folder, exist_ok=True)

    if model_registry is None:
        # Fall back to the notebook-level dict so existing four-argument calls keep working.
        model_registry = models

    # The test labels do not depend on the vectorizer settings, so read them once.
    y_test = test_df['sentiment_binary'].values

    results = pd.DataFrame()

    for config in tfidf_configs:
        tfidf_vectorizer = TfidfVectorizer(
            ngram_range=config['ngram_range'],
            max_features=config['max_features']
        )

        # The vocabulary is learned from the training reviews only and then applied to the test reviews.
        features_train = tfidf_vectorizer.fit_transform(train_df['cleaned_review'])
        features_test = tfidf_vectorizer.transform(test_df['cleaned_review'])

        # Only the 80% training portion of the split is used for fitting;
        # the 20% hold-out portion is not scored by this function.
        X_train, _, y_train, _ = train_test_split(
            features_train,
            train_df['sentiment_binary'].values,
            test_size=0.2,
            random_state=42
        )

        config_results = {}

        for model_name, model in model_registry.items():
            # The same estimator object is refitted for every configuration.
            model.fit(X_train, y_train)
            y_test_pred = model.predict(features_test)
            config_results[model_name] = accuracy_score(y_test, y_test_pred)

        config_label = f"ngram={config['ngram_range']}, max_features={config['max_features']}"
        results[config_label] = pd.Series(config_results)

    results.to_csv(os.path.join(results_folder, '1_MLmodels_TFIDF_hyperparams.csv'))
    results.to_excel(os.path.join(results_folder, '1_MLmodels_TFIDF_hyperparams.xlsx'))

    print(f"Results saved to: {results_folder}")
    return results

##### **Loading data**

In [None]:
# Read the training split, report how many rows it has, and preview the first rows.
train_df = pd.read_csv(path_to_train_data)
print(f'The length of the training data is {len(train_df)}')
train_df.head()

The length of the training data is 30000


Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative


time: 6.46 s (started: 2025-01-01 08:52:39 +00:00)


In [None]:
# Read the test split, report how many rows it has, and preview the first rows.
test_df = pd.read_csv(path_to_test_data)
print(f'The length of the testing data is {len(test_df)}')
test_df.head()

The length of the testing data is 20000


Unnamed: 0,review,sentiment
0,Steven Rea plays a forensic scientist thrust o...,positive
1,As the first of the TV specials offered on the...,positive
2,There may something poetically right in seeing...,negative
3,all i can say about this film is to read the b...,negative
4,I thought it was a pretty good movie and shoul...,positive


time: 2.33 s (started: 2025-01-01 08:52:45 +00:00)


##### **Cleaning/preprocessing corpus**

In [None]:
nltk_data_dir = '/root/nltk_data'
# exist_ok=True keeps the cell safe to re-run when the directory is already there.
os.makedirs(nltk_data_dir, exist_ok=True)
# Register the directory with NLTK's search path only once, so re-running the
# cell does not keep appending duplicate entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Packages fetched for the preprocessing cells (stop-word list, tokenizer, lemmatizer).
datasets = ['stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4']

download_and_unzip_nltk_data(datasets, nltk_data_dir)

print("NLTK data directory contents after extraction:")
print(os.listdir(nltk_data_dir))

Downloading stopwords...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...


Unzipped stopwords.zip
Downloading punkt...


[nltk_data]   Unzipping tokenizers/punkt.zip.


Unzipped punkt.zip
Downloading punkt_tab...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unzipped punkt_tab.zip
Downloading wordnet...


[nltk_data] Downloading package wordnet to /root/nltk_data...


Unzipped wordnet.zip
Downloading omw-1.4...


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unzipped omw-1.4.zip
NLTK data directory contents after extraction:
['corpora', 'tokenizers']
time: 14.6 s (started: 2025-01-01 08:52:48 +00:00)


In [None]:
# Cleaning train data
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

# The raw reviews contain '<br />' line breaks sitting directly between words
# (e.g. '...1.37:1<br /><br />Sound...'). Turn them into spaces up front so the
# neighbouring words are not fused into one token once the markup is stripped.
train_df['cleaned_review'] = train_df['review'].str.replace('<br />', ' ', regex=False)

# Order matters: HTML tags and URLs have to be removed while '<', '>', ':' and
# '/' are still in the text. If remove_punctuation runs first it strips those
# characters, remove_html_tags / remove_urls can no longer match anything, and
# fragments such as 'br' or 'httpwww...' leak into the cleaned reviews.
preprocessing_steps = [
    remove_html_tags,
    remove_urls,
    remove_punctuation,
    remove_special_characters,
    to_lowercase,
    remove_stopwords,
    lemmatize_text
]

for step in preprocessing_steps:
    train_df['cleaned_review'] = train_df['cleaned_review'].apply(step)

# Move the cleaned text so that it sits immediately to the right of the raw 'review' column.
review_col_index = train_df.columns.get_loc('review')
train_df.insert(review_col_index + 1, 'cleaned_review', train_df.pop('cleaned_review'))

time: 1min 31s (started: 2025-01-01 08:53:02 +00:00)


In [None]:
# Cleaning test data
# Reuses the stop_words set and lemmatizer created when the training data was cleaned.

# The raw reviews contain '<br />' line breaks sitting directly between words.
# Turn them into spaces up front so the neighbouring words are not fused into
# one token once the markup is stripped.
test_df['cleaned_review'] = test_df['review'].str.replace('<br />', ' ', regex=False)

# Order matters: HTML tags and URLs have to be removed while '<', '>', ':' and
# '/' are still in the text. If remove_punctuation runs first it strips those
# characters, remove_html_tags / remove_urls can no longer match anything, and
# fragments such as 'br' or 'httpwww...' leak into the cleaned reviews.
preprocessing_steps = [
    remove_html_tags,
    remove_urls,
    remove_punctuation,
    remove_special_characters,
    to_lowercase,
    remove_stopwords,
    lemmatize_text
]

for step in preprocessing_steps:
    test_df['cleaned_review'] = test_df['cleaned_review'].apply(step)

# Move the cleaned text so that it sits immediately to the right of the raw 'review' column.
review_col_index = test_df.columns.get_loc('review')
test_df.insert(review_col_index + 1, 'cleaned_review', test_df.pop('cleaned_review'))

time: 52.5 s (started: 2025-01-01 08:54:34 +00:00)


##### **Labels**

In [None]:
# Encode the text labels as integers (negative -> 0, positive -> 1) on both splits.
# Any label other than these two is mapped to NaN by Series.map.
sentiment_to_int = {'negative': 0, 'positive': 1}
for labelled_df in (train_df, test_df):
    labelled_df['sentiment_binary'] = labelled_df['sentiment'].map(sentiment_to_int)

time: 8.48 ms (started: 2025-01-01 08:55:26 +00:00)


In [None]:
# Preview the training frame to check the cleaned_review and sentiment_binary columns.
train_df.head()

Unnamed: 0,review,cleaned_review,sentiment,sentiment_binary
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,sap sea br br aspect ratio 1371br br sound for...,negative,0
1,"If you want mindless action, hot chicks and a ...",want mindless action hot chick postapocalyptic...,positive,1
2,"""The Woman in Black"" is easily one of the cree...",woman black easily one creepiest british ghost...,positive,1
3,I can barely find the words to describe how mu...,barely find word describe much piece trash off...,negative,0
4,What's in here ?! Let me tell you. It's the pr...,whats let tell presence alec baldwin he great ...,negative,0


time: 632 ms (started: 2025-01-01 08:55:26 +00:00)


In [None]:
# Preview the test frame to check the cleaned_review and sentiment_binary columns.
test_df.head()

Unnamed: 0,review,cleaned_review,sentiment,sentiment_binary
0,Steven Rea plays a forensic scientist thrust o...,steven rea play forensic scientist thrust job ...,positive,1
1,As the first of the TV specials offered on the...,first tv special offered elaborate box set bar...,positive,1
2,There may something poetically right in seeing...,may something poetically right seeing dentist ...,negative,0
3,all i can say about this film is to read the b...,say film read back video case put back shelf p...,negative,0
4,I thought it was a pretty good movie and shoul...,thought pretty good movie released theater fir...,positive,1


time: 419 ms (started: 2025-01-01 08:55:27 +00:00)


##### **Vectorization and Model Evaluation**

In [None]:
!ls '/content/drive/My Drive/Text - A3/results/'

tfidf_model_evaluation_results_dummy.csv  tfidf_model_evaluation_results_dummy.xlsx
time: 126 ms (started: 2024-12-31 20:03:11 +00:00)


In [None]:
# Classifiers to compare. The tree ensembles are randomised, so they get a fixed
# seed (the same 42 used for the train/validation split) to make the accuracy
# table reproducible from one run to the next.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'k-NN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# TF-IDF settings to sweep: n-gram range and vocabulary size cap (None = no cap).
tfidf_configs = [
    {'ngram_range': (1, 1), 'max_features': None},
    {'ngram_range': (1, 2), 'max_features': 100},
    {'ngram_range': (1, 2), 'max_features': 200},
    {'ngram_range': (1, 2), 'max_features': 500},
    {'ngram_range': (1, 2), 'max_features': 2000},
    {'ngram_range': (1, 2), 'max_features': 5000},
    {'ngram_range': (1, 3), 'max_features': 5000},
    {'ngram_range': (1, 5), 'max_features': 2000}
]

# NOTE(review): the first cell mounts Drive at /content/gdrive, but this path is
# under /content/drive. Unless Drive is also mounted there, the results land on
# the Colab VM's local disk rather than on Drive. Confirm the intended location
# and keep it in step with the download cell.
results_folder = '/content/drive/My Drive/Text - A3/results/'
results_df = evaluate_models_with_tfidf_configs(train_df, test_df, tfidf_configs, results_folder)

results_df

Results saved to: /content/drive/My Drive/Text - A3/results/


Unnamed: 0,"ngram=(1, 1), max_features=None","ngram=(1, 2), max_features=100","ngram=(1, 2), max_features=200","ngram=(1, 2), max_features=500","ngram=(1, 2), max_features=2000","ngram=(1, 2), max_features=5000","ngram=(1, 3), max_features=5000","ngram=(1, 5), max_features=2000"
Naive Bayes,0.86325,0.7262,0.761,0.8196,0.848,0.86175,0.8612,0.84835
Logistic Regression,0.889,0.73325,0.77575,0.8395,0.87635,0.88465,0.88485,0.8756
k-NN,0.76455,0.65275,0.66985,0.6882,0.70325,0.7328,0.73335,0.7027
Random Forest,0.85275,0.72075,0.7546,0.8123,0.8397,0.8476,0.8464,0.83785
Gradient Boosting,0.8106,0.7235,0.74935,0.79645,0.8081,0.8129,0.8126,0.8083


time: 41min 18s (started: 2025-01-01 08:55:35 +00:00)


In [None]:
# Build the download paths from results_folder so they always point at the
# directory evaluate_models_with_tfidf_configs wrote to, instead of repeating
# the path as a separate hard-coded string.
for results_filename in ('1_MLmodels_TFIDF_hyperparams.csv', '1_MLmodels_TFIDF_hyperparams.xlsx'):
    files.download(os.path.join(results_folder, results_filename))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

time: 16.5 ms (started: 2025-01-01 09:36:54 +00:00)
