# Preprocessing
For Random Forest and "From Scratch" Transformers - Remove special characters and stopwords

For BERT model - Remover special characters only--keep punctuatuations and capitalization

In [1]:
import re
import pandas as pd
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from torch.utils.data import WeightedRandomSampler
from collections import Counter
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')

# Use a simplified stopwords list
stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", 
    "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", 
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", 
    "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", 
    "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", 
    "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", 
    "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", 
    "can", "will", "just", "don", "should", "now"
}

def clean_text(text):
    # Normalize smart quotes & dashes
    text = text.replace('’', "'").replace('‘', "'").replace('–', '-').replace('—', '-')

    # Remove possessive 's (e.g., Verizon's -> Verizon)
    text = re.sub(r"'s\b", "", text)

    # Handle ampersands in proper nouns (e.g., "Moody's & Fitch" → "Moody_and_Fitch")
    text = re.sub(r'(\b[A-Z][a-zA-Z]*?)\s*&\s*([A-Z][a-zA-Z]*\b)', r'\1_and_\2', text)

    # Remove special characters except dash for compound words
    text = re.sub(r"[^a-zA-Z0-9\- ]+", " ", text)

    # Tokenize text into words
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Lemmatize and remove stopwords
    lemmatized = [
        TextBlob(word).words[0].lemmatize() 
        for word in tokens if word not in stop_words and len(word) > 2
    ]

    return " ".join(lemmatized)



# Read data
train_df = pd.read_json("train.json1", lines=True)
test_df = pd.read_csv("dev.csv")

# Apply the text cleaning function
train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabrielramos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Function that removes only special characters (preserves punctuation, capitalization, and stopwords)
def clean_for_bert(text):
    # Normalize smart quotes & dashes
    text = text.replace('’', "'").replace('‘', "'").replace('–', '-').replace('—', '-')

    # Remove possessive 's (e.g., Verizon's → Verizon)
    text = re.sub(r"'s\b", "", text)

    # Replace ampersands in proper nouns with "and"
    text = re.sub(r'(\b[A-Z][a-zA-Z]*?)\s*&\s*([A-Z][a-zA-Z]*\b)', r'\1_and_\2', text)

    # Remove special characters except basic punctuation
    text = re.sub(r"[^\w\s.,!?;:'\"()\-]", "", text)

    return text.strip()

# Apply the new BERT-style cleaner
train_df["bert_text"] = train_df["text"].apply(clean_for_bert)
test_df["bert_text"] = test_df["text"].apply(clean_for_bert)

# Random Forest 
Use TFID Vectorization then use Grid search to find the best n_estimators for the random forest

In [3]:
# TF-IDF + Random Forest
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=3000)
X_train_vec = vectorizer.fit_transform(train_df["clean_text"])
X_test_vec = vectorizer.transform(test_df["text"])
y_train = train_df["label"]
y_test = test_df["label"]

# Define the parameter grid
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500]
}

# Set up the grid search
grid_search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid,
    cv=3,
    scoring='f1_macro',  # or 'accuracy', depending on your needs
    n_jobs=-1
)

# Fit grid search
grid_search.fit(X_train_vec, y_train)

print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best cross-validated score:", grid_search.best_score_)

# Evaluate on test set with the best estimator
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

Best n_estimators: 50
Best cross-validated score: 0.4647503809307542
              precision    recall  f1-score   support

           0       0.50      0.86      0.63        29
           1       0.62      0.64      0.63        25
           2       0.75      0.83      0.79       103
           3       0.75      0.22      0.34        27
           4       1.00      0.19      0.32        16

    accuracy                           0.68       200
   macro avg       0.72      0.55      0.54       200
weighted avg       0.72      0.68      0.65       200



In [4]:
import joblib

# Save the trained Random Forest model
joblib.dump(best_rf, "rf_model.joblib")

['rf_model.joblib']

# "From Scratch" Encoder-only Transformer

Use general achitecture for Encoder-only Transformer:
- Word2Vec Encoding
- Positional Embeddings
- Multi-headed Self-Attention
- Multi-layered Transformer blocks
- Dropout regularization

No Pre-training. Straight to fine tuning. Hyperparamters were adjusted to reach best results possible.

In [5]:
# ── 1) Prepare device ───────────────────────────────────────────────────────────
device = torch.device("mps" if torch.mps.is_available() else "cpu")
print("Using device:", device)

X_train = train_df["clean_text"]
X_test = test_df["text"]

y_train = train_df["label"]
y_test = test_df["label"]

Using device: mps


In [6]:
from collections import Counter

counts = Counter(y_train.tolist())
total = len(y_train)

for label, cnt in sorted(counts.items()):
    print(f"Class {label:>2}: {cnt:>4} samples, {cnt/total:.1%}")

Class  0:  300 samples, 23.1%
Class  1:  255 samples, 19.6%
Class  2:  519 samples, 39.9%
Class  3:  164 samples, 12.6%
Class  4:   62 samples, 4.8%


In [7]:
counts = Counter(y_train)
probs  = torch.tensor([counts[c]/len(y_train) for c in sorted(counts)], device=device)
entropy = - (probs * probs.log()).sum().item()
print(f"Label‐distribution entropy ≈ {entropy:.4f}")

Label‐distribution entropy ≈ 1.4308


In [8]:
# ── 2) Word2Vec on train+test ──────────────────────────────────────────────────
train_texts = X_train.tolist()
test_texts  = X_test.tolist()

train_labels = y_train.tolist()
test_labels  = y_test.tolist()

tokenized_train = [t.split() for t in train_texts]
tokenized_test  = [t.split() for t in test_texts]
all_tokenized   = tokenized_train + tokenized_test

dimensions = 16
attn_heads = 2
ffn_layer_dimensions = 32
num_layers = 2

w2v = Word2Vec(
    sentences=all_tokenized,
    vector_size=dimensions,
    window=5,
    min_count=3,
    workers=8
)

def get_embedding(word):
    # fallback to zero-vector if OOV
    return w2v.wv[word] if word in w2v.wv else np.zeros(dimensions)


In [9]:
# Get max_length
# Tokenize
tok_train = [txt.split() for txt in X_train]
tok_test  = [txt.split() for txt in X_test]

# Compute lengths
train_lens = np.array([len(s) for s in tok_train])
test_lens  = np.array([len(s) for s in tok_test])

# Decide your percentile, e.g. 95%
for p in range(95, 100):
    max_len_train = int(np.percentile(train_lens, p))
    max_len_test  = int(np.percentile(test_lens,  p))

    print(f"{p}th percentile train length: {max_len_train}")
    print(f"{p}th percentile test length:  {max_len_test}")

max_len = 128  # Set this to the maximum length you want to use


95th percentile train length: 87
95th percentile test length:  151
96th percentile train length: 93
96th percentile test length:  164
97th percentile train length: 100
97th percentile test length:  178
98th percentile train length: 110
98th percentile test length:  208
99th percentile train length: 151
99th percentile test length:  240


In [10]:
# ── 3) Positional encoding helper ──────────────────────────────────────────────
def positional_encoding(seq_len, d_model):
    pos = np.arange(seq_len)[:, None]
    i   = np.arange(d_model)[None, :]
    angle_rates = 1 / np.power(10000, (2*(i//2)) / np.float32(d_model))
    angles = pos * angle_rates
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    return torch.from_numpy(angles).float()  # [seq_len, d_model]


In [11]:
# ── 4) Dataset + Dataloader ───────────────────────────────────────────────────
class TCfDDataset(Dataset):
    def __init__(self, texts, labels, max_len=max_len):
        self.texts   = texts
        self.labels  = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx].split()[:self.max_len]
        embs   = [get_embedding(t) for t in tokens]
        # pad to max_len
        if len(embs) < self.max_len:
            pad = [np.zeros(dimensions)] * (self.max_len - len(embs))
            embs.extend(pad)
        x = torch.tensor(np.stack(embs), dtype=torch.float32)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [12]:
# 1) Prepare your two datasets *without* splitting train_df
train_ds = TCfDDataset(
    texts = train_texts,
    labels= train_labels,
    max_len = max_len
)

test_ds  = TCfDDataset(
    texts = test_texts,
    labels= test_labels,
    max_len = max_len
)

# 1) Count labels
counts = Counter(train_labels)            # e.g. {0:50, 1:10, 2:5, ...}

# 2) Compute per-example weight = 1 / count[label]
example_weights = [1.0 / counts[label] for label in train_labels]
example_weights = torch.tensor(example_weights, dtype=torch.double)

# 3) Create the sampler
sampler = WeightedRandomSampler(
    weights=example_weights,    # a list/1D tensor of length N_train
    num_samples=len(example_weights),  # draw this many samples per epoch
    replacement=True            # sample with replacement
)

# 4) DataLoader with sampler (no shuffle!)
train_loader = DataLoader(
    train_ds,
    batch_size=512,
    sampler=sampler,
    drop_last=False
)

test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False)


In [13]:
# ── 5) Model ───────────────────────────────────────────────────────────────────
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_hidden_dim, dropout):
        super().__init__()
        self.attn   = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.norm1  = nn.LayerNorm(embed_dim)
        self.norm2  = nn.LayerNorm(embed_dim)
        self.ff     = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden_dim),
            nn.ReLU(),
            nn.Linear(ff_hidden_dim, embed_dim)
        )
        self.drop   = nn.Dropout(dropout)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + self.drop(attn_out))
        ff = self.ff(x)
        return self.norm2(x + self.drop(ff))

class TextClassifier(nn.Module):
    def __init__(self, embed_dim, heads, ff_hidden, num_classes, dropout, max_len, num_layers):
        super().__init__()
        self.max_len   = max_len
        self.embed_dim = embed_dim
        pe = positional_encoding(max_len, embed_dim)  # [max_len, embed_dim]
        self.register_buffer("pos_enc", pe)           # moves with model.to(device)
        # create N identical blocks
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(embed_dim, heads, ff_hidden, dropout)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(embed_dim, num_classes)


    def forward(self, x):
        # x: [B, L, D]
        L = x.size(1)
        x = x + self.pos_enc[:L, :].unsqueeze(0)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(1)
        return self.classifier(x)


In [14]:
model = TextClassifier(
    embed_dim=dimensions,
    heads=attn_heads,
    ff_hidden=ffn_layer_dimensions,
    num_classes=len(np.unique(y_train)),
    dropout=0.5,
    max_len=max_len,
    num_layers=num_layers
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=11e-3)
loss_fn = nn.CrossEntropyLoss()

In [15]:
# ── 7) Training loop ───────────────────────────────────────────────────────────
for epoch in range(1, 601):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss   = loss_fn(logits, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)

    avg_loss = running_loss / len(train_ds)
    print(f"Epoch {epoch:02d} — train loss: {avg_loss:.4f}")


Epoch 01 — train loss: 1.6669
Epoch 02 — train loss: 1.6591
Epoch 03 — train loss: 1.6134
Epoch 04 — train loss: 1.6128
Epoch 05 — train loss: 1.6099
Epoch 06 — train loss: 1.6031
Epoch 07 — train loss: 1.5986
Epoch 08 — train loss: 1.6041
Epoch 09 — train loss: 1.6015
Epoch 10 — train loss: 1.5969
Epoch 11 — train loss: 1.5871
Epoch 12 — train loss: 1.5974
Epoch 13 — train loss: 1.5771
Epoch 14 — train loss: 1.5721
Epoch 15 — train loss: 1.5761
Epoch 16 — train loss: 1.5612
Epoch 17 — train loss: 1.5714
Epoch 18 — train loss: 1.5720
Epoch 19 — train loss: 1.5389
Epoch 20 — train loss: 1.5615
Epoch 21 — train loss: 1.5084
Epoch 22 — train loss: 1.4703
Epoch 23 — train loss: 1.4573
Epoch 24 — train loss: 1.4504
Epoch 25 — train loss: 1.4441
Epoch 26 — train loss: 1.4212
Epoch 27 — train loss: 1.4342
Epoch 28 — train loss: 1.4388
Epoch 29 — train loss: 1.3934
Epoch 30 — train loss: 1.4287
Epoch 31 — train loss: 1.4444
Epoch 32 — train loss: 1.4360
Epoch 33 — train loss: 1.3878
Epoch 34 —

In [16]:
# ── 8) Evaluation ──────────────────────────────────────────────────────────────
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        logits = model(xb)
        preds  = torch.argmax(logits, dim=1).cpu()
        all_preds.extend(preds.tolist())
        all_labels.extend(yb.tolist())

print(classification_report(
    all_labels,
    all_preds,
    digits=2,
    zero_division=0
))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.17      0.64      0.27        25
           2       0.57      0.59      0.58       103
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        16

    accuracy                           0.39       200
   macro avg       0.15      0.25      0.17       200
weighted avg       0.32      0.39      0.33       200



# BERT Text Classifier
Use Pre-trained BERT then fine tune using training data.  Hyperparamters were adjusted to reach best results possible.

In [17]:
# ── 1) Device ──────────────────────────────────────────────────────────────────
device = torch.device("mps" if torch.mps.is_available() else "cpu")
print("Using device:", device)


X_train = train_df["bert_text"]
X_test = test_df["text"]

y_train = train_df["label"]
y_test = test_df["label"]

train_texts = X_train.tolist()
test_texts  = X_test.tolist()

train_labels = y_train.tolist()
test_labels  = y_test.tolist()

Using device: mps


In [18]:
# Get max_length
# Tokenize
tok_train = [txt.split() for txt in X_train]
tok_test  = [txt.split() for txt in X_test]

# Compute lengths
train_lens = np.array([len(s) for s in tok_train])
test_lens  = np.array([len(s) for s in tok_test])

# Decide your percentile, e.g. 95%
for p in range(95, 100):
    max_len_train = int(np.percentile(train_lens, p))
    max_len_test  = int(np.percentile(test_lens,  p))

    print(f"{p}th percentile train length: {max_len_train}")
    print(f"{p}th percentile test length:  {max_len_test}")

max_len = 175  # Set this to the maximum length you want to use

95th percentile train length: 138
95th percentile test length:  151
96th percentile train length: 144
96th percentile test length:  164
97th percentile train length: 162
97th percentile test length:  178
98th percentile train length: 177
98th percentile test length:  208
99th percentile train length: 246
99th percentile test length:  240


In [19]:

# ── 2) Tokenizer & Model ───────────────────────────────────────────────────────
PRETRAINED = "bert-base-cased"
NUM_LABELS = len(train_df["label"].unique())

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
model     = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED,
    num_labels=NUM_LABELS
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# ── 3) Dataset wrapper ─────────────────────────────────────────────────────────
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        txt  = self.texts[idx]
        lab  = self.labels[idx]
        enc  = self.tokenizer(
            txt,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # squeeze out the batch dimension
        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(lab, dtype=torch.long)
        }


In [21]:
# ── 4) DataLoaders ─────────────────────────────────────────────────────────────
BATCH_SIZE = 32
MAX_LEN    = 128

train_ds = TextDataset(X_train, y_train, tokenizer, max_len=MAX_LEN)
test_ds  = TextDataset(X_test,  y_test,  tokenizer, max_len=MAX_LEN)

# 1) Count labels
counts = Counter(train_labels)            # e.g. {0:50, 1:10, 2:5, ...}

# 2) Compute per-example weight = 1 / count[label]
example_weights = [1.0 / counts[label] for label in train_labels]
example_weights = torch.tensor(example_weights, dtype=torch.double)

# 3) Create the sampler
sampler = WeightedRandomSampler(
    weights=example_weights,    # a list/1D tensor of length N_train
    num_samples=len(example_weights),  # draw this many samples per epoch
    replacement=True            # sample with replacement
)

# 4) DataLoader with sampler (no shuffle!)
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    drop_last=False
)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

In [22]:
# ── 5) Optimizer & Scheduler ──────────────────────────────────────────────────
EPOCHS     = 6
TOTAL_STEPS = len(train_loader) * EPOCHS

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=TOTAL_STEPS
)

In [23]:
# ── 6) Training Loop ──────────────────────────────────────────────────────────
loss_fn = nn.CrossEntropyLoss()

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # move inputs to GPU
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} — avg train loss: {avg_loss:.4f}")

Epoch 1 — avg train loss: 1.5598
Epoch 2 — avg train loss: 1.0733
Epoch 3 — avg train loss: 0.5886
Epoch 4 — avg train loss: 0.3821
Epoch 5 — avg train loss: 0.2900
Epoch 6 — avg train loss: 0.2622


In [24]:
# ── 7) Evaluation ─────────────────────────────────────────────────────────────
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["labels"].to(device)

        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).logits

        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

print("\nTest-set classification report:")
print(classification_report(all_labels, all_preds, digits=2, zero_division=0))


Test-set classification report:
              precision    recall  f1-score   support

           0       0.77      0.83      0.80        29
           1       0.56      0.72      0.63        25
           2       0.82      0.79      0.80       103
           3       0.57      0.59      0.58        27
           4       0.80      0.50      0.62        16

    accuracy                           0.73       200
   macro avg       0.71      0.69      0.69       200
weighted avg       0.75      0.73      0.74       200



In [25]:
# Save model and tokenizer
model.save_pretrained("bert_model/")
tokenizer.save_pretrained("bert_model/")

('bert_model/tokenizer_config.json',
 'bert_model/special_tokens_map.json',
 'bert_model/vocab.txt',
 'bert_model/added_tokens.json',
 'bert_model/tokenizer.json')