# DSC 253 - Homework 1

Code documentation and organization received LLM assistance for better presentation. LLM tools were also used to help fix bugs and brainstorm fine tuning the BERT training model.

In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## 1. Bag of Words

In [4]:
# Import libraries and read data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
import nltk
nltk.download('punkt_tab', quiet=True)  # Download punkt tokenizer if needed

# Read data file
file_path = '/content/drive/MyDrive/DSC253/nyt.csv'

# Load the file into 'data' so your following code recognizes it
data = pd.read_csv(file_path)

print("Data shape:", data.shape)
print("\nFirst few rows:")
display(data.head())

Data shape: (11519, 2)

First few rows:


Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [5]:
# Create bag of words
from sklearn.feature_extraction.text import CountVectorizer
texts = data['text']
labels = data['label']

In [6]:
# Set seed and split data 80% training, 10% test, 10% validation
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, train_size = 0.8, random_state = 42, stratify = labels)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 42, stratify = y_temp)

### a. Binary-valued Technique

In [7]:
# Binary-valued Technique (1 if word is present, 0 otherwise)
binary_vec = CountVectorizer(binary=True, tokenizer=word_tokenize, lowercase=True, stop_words='english')

# Fit model on training data
X_train_binary = binary_vec.fit_transform(X_train)
X_test_binary = binary_vec.transform(X_test)

# Train model
logistic_binary = LogisticRegression(random_state=42, max_iter=1000)
logistic_binary.fit(X_train_binary, y_train)

# Evaluate model
y_pred_binary = logistic_binary.predict(X_test_binary)
accuracy_binary = accuracy_score(y_test, y_pred_binary)
f1_binary = f1_score(y_test, y_pred_binary, average='macro')

print(f"Accuracy for Binary-valued Technique: {accuracy_binary}")
print(f"F1 Score for Binary-valued Technique: {f1_binary}")



Accuracy for Binary-valued Technique: 0.9826388888888888
F1 Score for Binary-valued Technique: 0.9582453920540095


### b. Frequency-based Technique

In [8]:
#Frequency technique
frequency_vec = CountVectorizer(tokenizer=word_tokenize, lowercase=True, stop_words='english')

# Fit model on training data
X_train_frequency = frequency_vec.fit_transform(X_train)
X_test_frequency = frequency_vec.transform(X_test)

# Train model
logistic_frequency = LogisticRegression(random_state=42, max_iter=1000)
logistic_frequency.fit(X_train_frequency, y_train)

# Evaluate model
y_pred_frequency = logistic_frequency.predict(X_test_frequency)
accuracy_frequency = accuracy_score(y_test, y_pred_frequency)
f1_frequency = f1_score(y_test, y_pred_frequency, average='macro')

print(f"Accuracy for Frequency Technique: {accuracy_frequency}")
print(f"F1 Score for Frequency Technique: {f1_frequency}")




Accuracy for Frequency Technique: 0.9861111111111112
F1 Score for Frequency Technique: 0.9674621722873823


### c. TF-IDF Value Technique

In [9]:
#TF-IDF Value Technique
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True, stop_words='english')

# Fit model on training data
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

# Train model
logistic_tfidf = LogisticRegression(random_state=42, max_iter=1000)
logistic_tfidf.fit(X_train_tfidf, y_train)

# Evaluate model
y_pred_tfidf = logistic_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='macro')

print(f"Accuracy for TF-IDF Value Technique: {accuracy_tfidf}")
print(f"F1 Score for TF-IDF Value Technique: {f1_tfidf}")



Accuracy for TF-IDF Value Technique: 0.9817708333333334
F1 Score for TF-IDF Value Technique: 0.9614840602175233


Analysis: All 3 technique have identical Accuracy and F1 score. The frequency method performs best with 98.61% accuracy.

## 2. Word2Vec

### a. Pre-trained Glove embeddings

In [10]:
# Import libraries
import numpy as np
from gensim.models import KeyedVectors
import os

In [11]:
# Load Glove embeddings
glove_path = '/content/drive/MyDrive/DSC253/glove.6B.100d.txt'

# Load Glove embeddings
glove_vectors = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_vectors[word] = vector

print(f"Loaded {len(glove_vectors)} words from GloVe file")

Loaded 400000 words from GloVe file


In [12]:
# Convert text data to document vectors using tokenization

def get_word_vectors(text, glove_vectors, dim = 100):
    words = word_tokenize(text.lower())
    word_vectors = [glove_vectors[word] for word in words if word in glove_vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(dim)

# Convert training and test data to document vectors
X_train_word2vec = np.array([get_word_vectors(text, glove_vectors) for text in X_train])
X_test_word2vec = np.array([get_word_vectors(text, glove_vectors) for text in X_test])

print(f'Training data shape: {X_train_word2vec.shape}')
print(f'Test data shape: {X_test_word2vec.shape}')


Training data shape: (9215, 100)
Test data shape: (1152, 100)


In [13]:
# Train Logistic Regression model
logistic_word2vec = LogisticRegression(random_state=42, max_iter=1000)
logistic_word2vec.fit(X_train_word2vec, y_train)

# Evaluate model
y_pred_word2vec = logistic_word2vec.predict(X_test_word2vec)
accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)
f1_word2vec = f1_score(y_test, y_pred_word2vec, average='macro')

print(f"Accuracy for Word2Vec: {accuracy_word2vec}")
print(f"F1 Score for Word2Vec: {f1_word2vec}")

Accuracy for Word2Vec: 0.9774305555555556
F1 Score for Word2Vec: 0.9480499015125788


### b. Train Word2Vec on AGNews


In [14]:
# Import libraries
from gensim.models import Word2Vec

In [15]:
# Load AGNews data
data_ag = pd.read_csv('/content/drive/MyDrive/DSC253/ag.csv')
print(data_ag.head())
print(f'Data shape: {data_ag.shape}')

                                                text
0  wall st. bears claw back into the black (reute...
1  carlyle looks toward commercial aerospace (reu...
2  oil and economy cloud stocks' outlook (reuters...
3  iraq halts oil exports from main southern pipe...
4  oil prices soar to all-time record, posing new...
Data shape: (90000, 1)


In [16]:
# Train Word2Vec model
ag_text = [word_tokenize(text.lower()) for text in data_ag['text']]
w2v_ag = Word2Vec(ag_text, vector_size=100, window=5, min_count=1, workers=4)
print(f'Vocab size: {len(w2v_ag.wv)}')
print(f'Vector size: {w2v_ag.vector_size}')

Vocab size: 72991
Vector size: 100


In [17]:
# Get document vectors by averaging word vectors
def get_doc_vector_w2v(text, w2v_model, dim = 100):
    words = word_tokenize(text.lower())
    word_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(dim)

# Convert data to document vectors
X_train_w2v_ag = np.array([get_doc_vector_w2v(text, w2v_ag) for text in X_train])
X_test_w2v_ag = np.array([get_doc_vector_w2v(text, w2v_ag) for text in X_test])

# Train Logistic Regression model
lr_w2v_ag = LogisticRegression(random_state=42, max_iter=1000)
lr_w2v_ag.fit(X_train_w2v_ag, y_train)

# Evaluate model
y_pred_w2v_ag = lr_w2v_ag.predict(X_test_w2v_ag)
accuracy_w2v_ag = accuracy_score(y_test, y_pred_w2v_ag)
f1_w2v_ag = f1_score(y_test, y_pred_w2v_ag, average='macro')

# Print results
print(f"Accuracy for Word2Vec on AGNews: {accuracy_w2v_ag}")
print(f"F1 Score for Word2Vec on AGNews: {f1_w2v_ag}")

Accuracy for Word2Vec on AGNews: 0.9704861111111112
F1 Score for Word2Vec on AGNews: 0.9343608758142402


### c. Train Word2Vec on NYT text data

In [18]:
# Word2Vec model on NYT training data
nyt_text = [word_tokenize(text.lower()) for text in X_train]

# Train Word2Vec model
w2v_nyt = Word2Vec(nyt_text, vector_size=100, window=5, min_count=1, workers=4)
print(f'Vocab size: {len(w2v_nyt.wv)}')
print(f'Vector size: {w2v_nyt.vector_size}')

# Convert data to document vectors
X_train_w2v_nyt = np.array([get_doc_vector_w2v(text, w2v_nyt) for text in X_train])
X_test_w2v_nyt = np.array([get_doc_vector_w2v(text, w2v_nyt) for text in X_test])

# Train Logistic Regression model
lr_w2v_nyt = LogisticRegression(random_state=42, max_iter=1000)
lr_w2v_nyt.fit(X_train_w2v_nyt, y_train)

# Evaluate model
y_pred_w2v_nyt = lr_w2v_nyt.predict(X_test_w2v_nyt)
accuracy_w2v_nyt = accuracy_score(y_test, y_pred_w2v_nyt)
f1_w2v_nyt = f1_score(y_test, y_pred_w2v_nyt, average='macro')

# Print results
print(f"Accuracy for Word2Vec on NYT: {accuracy_w2v_nyt}")
print(f"F1 Score for Word2Vec on NYT: {f1_w2v_nyt}")

Vocab size: 127606
Vector size: 100
Accuracy for Word2Vec on NYT: 0.9748263888888888
F1 Score for Word2Vec on NYT: 0.9423179967453382


Pre-trained GloVe embeddings achieved the best Word2Vec performance (97.74%). Word2Vec trained on NYT data outperformed the AG News-trained version (97.40% vs. 96.88%), indicating the importance of domain-matched training data.

## 3. Pre-trained Neural Models

### a. Fine-tune BERT

In [19]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels.unique()))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=64)

# Label mapping to convert labels to integers
label_to_id = {label: idx for idx, label in enumerate(sorted(labels.unique()))}
id_to_label = {idx: label for label, idx in label_to_id.items()}
print(f"Label mapping: {label_to_id}")
print(f"Inverse label mapping: {id_to_label}")


Label mapping: {'business': 0, 'politics': 1, 'sports': 2}
Inverse label mapping: {0: 'business', 1: 'politics', 2: 'sports'}


In [21]:
# Prepare datasets and tokenize
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(tokenize_function, batched=True)

test_df = pd.DataFrame({'text': X_test, 'label': y_test})
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9215 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

In [22]:
# Encode labels to convert strings to numbers
def encode_labels(examples):
    examples['label'] = [label_to_id[label] for label in examples['label']]
    return examples

train_dataset = train_dataset.map(encode_labels, batched=True)
test_dataset = test_dataset.map(encode_labels, batched=True)

# Convert to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Map:   0%|          | 0/9215 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Train dataset size: 9215
Test dataset size: 1152


In [23]:
# Training BERT model
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Ensure model and tokenizer are defined
training_args = TrainingArguments(
    output_dir='./bert_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.223
200,0.1451
300,0.1111
400,0.1172
500,0.1301
600,0.0938
700,0.0337
800,0.0526
900,0.0497
1000,0.0753


TrainOutput(global_step=1728, training_loss=0.06769087169043443, metrics={'train_runtime': 410.5107, 'train_samples_per_second': 67.343, 'train_steps_per_second': 4.209, 'total_flos': 909221304136320.0, 'train_loss': 0.06769087169043443, 'epoch': 3.0})

In [24]:
# Evaluate BERT on test set
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

print("Evaluating BERT on test set...")
predictions_bert = trainer.predict(test_dataset)
y_pred_bert = np.argmax(predictions_bert.predictions, axis=-1)
y_true_bert = predictions_bert.label_ids

# Calculate accuracy and macro-f1
accuracy_bert = accuracy_score(y_true_bert, y_pred_bert)
f1_bert = f1_score(y_true_bert, y_pred_bert, average='macro')

print(f"Accuracy for BERT: {accuracy_bert}")
print(f"F1 Score for BERT: {f1_bert}")

Evaluating BERT on test set...


Accuracy for BERT: 0.9765625
F1 Score for BERT: 0.9492303144051643


### b. ModernBERT Training

In [25]:
# Load ModernBERT model and tokenizer
model_name_modern = "answerdotai/ModernBERT-base"
tokenizer_modern = AutoTokenizer.from_pretrained(model_name_modern)
model_modern = AutoModelForSequenceClassification.from_pretrained(model_name_modern, num_labels=len(labels.unique()))

# Tokenize function for ModernBERT
def tokenize_function_modern(examples):
    return tokenizer_modern(examples['text'], padding='max_length', truncation=True, max_length=64)

# Prepare datasets for ModernBERT
train_df_modern = pd.DataFrame({'text': X_train, 'label': y_train})
test_df_modern = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset_modern = Dataset.from_pandas(train_df_modern)
test_dataset_modern = Dataset.from_pandas(test_df_modern)

# Tokenize data
train_dataset_modern = train_dataset_modern.map(tokenize_function_modern, batched=True)
test_dataset_modern = test_dataset_modern.map(tokenize_function_modern, batched=True)

# Encode labels
train_dataset_modern = train_dataset_modern.map(encode_labels, batched=True)
test_dataset_modern = test_dataset_modern.map(encode_labels, batched=True)

# Convert to PyTorch tensors
train_dataset_modern.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset_modern.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"Train dataset size: {len(train_dataset_modern)}")
print(f"Test dataset size: {len(test_dataset_modern)}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9215 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/9215 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Train dataset size: 9215
Test dataset size: 1152


In [26]:
# Training ModernBERT model
training_args_modern = TrainingArguments(
    output_dir='./modernbert_results',
    num_train_epochs=3,  # As specified in assignment
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    save_strategy='no',
)

trainer_modern = Trainer(
    model=model_modern,
    args=training_args_modern,
    train_dataset=train_dataset_modern,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_modern),
)

# Train the model
trainer_modern.train()

  return torch._C._get_cublas_allow_tf32()
W0128 04:53:31.255000 847 torch/_inductor/utils.py:1558] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss
100,0.2906
200,0.1208
300,0.1223
400,0.0977
500,0.1159
600,0.0885
700,0.0246
800,0.0387
900,0.0306
1000,0.0363


TrainOutput(global_step=1728, training_loss=0.060534633931065736, metrics={'train_runtime': 527.7458, 'train_samples_per_second': 52.383, 'train_steps_per_second': 3.274, 'total_flos': 1177539495096960.0, 'train_loss': 0.060534633931065736, 'epoch': 3.0})

In [27]:
# Evaluate ModernBERT on test set
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

print("Evaluating ModernBERT on test set...")
predictions_modern = trainer_modern.predict(test_dataset_modern)
y_pred_modern = np.argmax(predictions_modern.predictions, axis=-1)
y_true_modern = predictions_modern.label_ids

# Calculate accuracy and macro-f1
accuracy_modern = accuracy_score(y_true_modern, y_pred_modern)
f1_modern = f1_score(y_true_modern, y_pred_modern, average='macro')

print(f"Accuracy for ModernBERT: {accuracy_modern}")
print(f"F1 Score for ModernBERT: {f1_modern}")

Evaluating ModernBERT on test set...


Accuracy for ModernBERT: 0.9704861111111112
F1 Score for ModernBERT: 0.937501238669685


BERT achieved 97.48% accuracy, performing comparably to GloVe but slightly below traditional Bag of Words methods.