# HW4: NLP Assignment



In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
# import nltk
# from nltk.corpus import stopwords

# # Ensure stopwords are downloaded
# try:
#     nltk.data.find('corpora/stopwords')
# except LookupError:
#     nltk.download('stopwords')

# Load data
data = []
# Reading with latin-1 to handle potential encoding issues, as is common with some text datasets
with open('Sentences_50Agree.txt', 'r', encoding='latin-1') as f:
    for line in f:
        line = line.strip()
        if not line: continue
        # Split by the last @ to separate text and label
        parts = line.rsplit('@', 1)
        if len(parts) == 2:
            data.append(parts)

df = pd.DataFrame(data, columns=['text', 'label'])
print(f"Loaded {len(df)} samples.")
df.head()

Loaded 4846 samples.


Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [11]:
import torch

# Check for MPS (Metal Performance Shaders) availability for Mac
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Mac GPU)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA (NVIDIA GPU)")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using MPS (Mac GPU)


In [3]:
import random
import numpy as np
import torch

def seed_everything(seed=42):
    # 1. Set seed for Python's built-in random
    random.seed(seed)
    
    # 2. Set seed for NumPy
    np.random.seed(seed)
    
    # 3. Set seed for PyTorch (CPU)
    torch.manual_seed(seed)
    
    # 4. Set seed for PyTorch (GPU/MPS)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    # 5. Force PyTorch to use deterministic algorithms (slower, but reproducible)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(2025)

In [4]:
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stop words and filter empty words
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)

# Apply preprocessing
df['processed_text'] = df['text'].apply(preprocess)

# Only consider documents with at least 1 word
df = df[df['processed_text'].str.len() > 0].copy()
print(f"Samples after filtering: {len(df)}")
df.head()

Samples after filtering: 4846


Unnamed: 0,text,label,processed_text
0,"According to Gran , the company has no plans t...",neutral,"according gran , company plans move production..."
1,Technopolis plans to develop in stages an area...,neutral,"technopolis plans develop stages area less , s..."
2,The international electronic industry company ...,negative,international electronic industry company elco...
3,With the new production plant the company woul...,positive,new production plant company would increase ca...
4,According to the company 's updated strategy f...,positive,according company 's updated strategy years - ...


In [5]:
# Split data into training and test data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], 
    df['label'], 
    test_size=0.1, 
    stratify=df['label'], 
    random_state=2013
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 4361
Test set size: 485


In [6]:
# Report the training data‚Äôs base rate multinomial classification accuracy on the test data
# Base rate is the accuracy if we simply predict the majority class from the training set

# Find majority class in training data
majority_class = y_train.mode()[0]
print(f"Majority class in training data: {majority_class}")

# Calculate accuracy on test data
# If we predict 'majority_class' for every instance in test set
base_rate_accuracy = (y_test == majority_class).mean()
print(f"Base rate accuracy on test data: {base_rate_accuracy:.4f}")

Majority class in training data: neutral
Base rate accuracy on test data: 0.5938


## 2. Naive Bayes
(a) Compute TF-IDF values for every word in your pre-processed text for both the training and test data. Avoid data leakage.
(b) Fit the Multinomial Naive Bayes estimator on the training data with Laplace smoothing, then report the accuracy on the test dataset.
(c) Describe how the Multinomial Naive Bayes classifier performs against base rates.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# (a) Compute TF-IDF values
# Initialize vectorizer
tfidf = TfidfVectorizer()

# Fit on training data ONLY to avoid data leakage, then transform training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform test data using the vocabulary and IDF learned from training data
X_test_tfidf = tfidf.transform(X_test)

print(f"Shape of TF-IDF training matrix: {X_train_tfidf.shape}")

Shape of TF-IDF training matrix: (4361, 8461)


In [8]:
# (b) Fit Multinomial Naive Bayes with Laplace smoothing
# alpha=1.0 is the default in sklearn, which implements Laplace smoothing
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred_mnb = mnb.predict(X_test_tfidf)

# Report accuracy
mnb_accuracy = accuracy_score(y_test, y_pred_mnb)
print(f"Multinomial Naive Bayes Accuracy: {mnb_accuracy:.4f}")
print(f"Base Rate Accuracy: {base_rate_accuracy:.4f}")

Multinomial Naive Bayes Accuracy: 0.6825
Base Rate Accuracy: 0.5938


### (c) Comparison
The Multinomial Naive Bayes classifier achieved an accuracy of 0.6825, which is higher than the base rate accuracy of 0.5938.

This implies that the predictor variables (the words in the text) have a relationship with the response variable (sentiment). The improvement over the base rate indicates that the words in the sentences provide meaningful information that helps distinguish between the sentiment classes, rather than just guessing the most common class.

## 3. Feedforward Neural Network
(a) Fit a feedforward neural network (MLP) using TF-IDF values. Justify your hyperparameters.
(b) Report the test accuracy.
(c) Compare performance to Naive Bayes and base rate.

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

# Encode labels to integers to avoid issues with MLPClassifier early stopping validation
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# (a) Fit MLPClassifier
# Justification for hyperparameters:
# - hidden_layer_sizes=(100,): The input dimension is high (~8000 words). A layer of 100 neurons allows the model 
#   to learn a compressed representation of features without being too complex (which would cause overfitting).
# - activation='relu': The standard activation function for modern neural networks, helps with training speed and convergence.
# - solver='adam': Efficient for large datasets and high-dimensional data.
# - early_stopping=True: Crucial for this dataset because we have more features than samples. 
#   This stops training when validation score stops improving, preventing the model from memorizing the training data.
# - max_iter=300: Giving the solver enough time to converge.

mlp = MLPClassifier(
    hidden_layer_sizes=(100,), 
    activation='relu', 
    solver='adam', 
    max_iter=300, 
    random_state=2013,
    early_stopping=True
)

mlp.fit(X_train_tfidf, y_train_enc)

# (b) Report test accuracy
y_pred_mlp = mlp.predict(X_test_tfidf)
mlp_accuracy = accuracy_score(y_test_enc, y_pred_mlp)

print(f"MLP (Feedforward NN) Accuracy: {mlp_accuracy:.4f}")
print(f"Multinomial Naive Bayes Accuracy: {mnb_accuracy:.4f}")
print(f"Base Rate Accuracy: {base_rate_accuracy:.4f}")

MLP (Feedforward NN) Accuracy: 0.7175
Multinomial Naive Bayes Accuracy: 0.6825
Base Rate Accuracy: 0.5938


**Hyperparameter Justification:**
*   **`hidden_layer_sizes=(100,)`**: The input dimension is high (~8000 words). A single hidden layer of 100 neurons allows the model to learn a compressed representation of features without being too complex, which helps avoid overfitting on this relatively small dataset.
*   **`activation='relu'`**: ReLU is the standard activation function for modern neural networks. It helps with training speed and convergence compared to older functions like sigmoid or tanh.
*   **`solver='adam'`**: Adam is an efficient optimizer for large datasets and high-dimensional data, handling sparse gradients (like TF-IDF) well.
*   **`early_stopping=True`**: This is crucial because we have more features than samples. It stops training when the validation score stops improving, preventing the model from memorizing the training data (overfitting).
*   **`max_iter=300`**: Provides the solver enough iterations to converge to a good solution.

### (c) Comparison
The Feedforward Neural Network (MLP) achieved an accuracy of 0.7175, which is higher than both the Multinomial Naive Bayes (0.6825) and the base rate (0.5938).

This improvement suggests that the relationship between the predictor variables (words) and the response (sentiment) is non-linear and complex. While Naive Bayes assumes independence between features, the Neural Network can capture interactions between words (e.g., "not" + "good"), allowing it to model the sentiment more accurately. The fact that it outperforms the simpler models indicates that these complex interactions are present and important in this dataset.

## 4. CNN for Text Classification
(a) Setup data for PyTorch:
*   Tokenize text.
*   Define vocabulary (including `<PAD>` and `<UNK>`).
*   Convert labels to integers.
*   Create custom Dataset class and DataLoader with padding.

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# 1. Tokenize text
# Since we already preprocessed (lowercase, removed numbers/stopwords), we can just split by space
X_train_tokens = [text.split() for text in X_train]
X_test_tokens = [text.split() for text in X_test]

# 2. Define Vocabulary
# Count words in training data
word_counts = Counter()
for tokens in X_train_tokens:
    word_counts.update(tokens)

# Create vocab mapping
# Reserve 0 for <PAD> and 1 for <UNK>
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, count in word_counts.items():
    if word not in vocab:
        vocab[word] = len(vocab)

print(f"Vocabulary size: {len(vocab)}")

# 3. Convert labels to integers
# We already did this with LabelEncoder in the previous step (y_train_enc, y_test_enc)
# But let's ensure they are torch tensors later
print(f"Labels classes: {le.classes_}")

Vocabulary size: 8963
Labels classes: ['negative' 'neutral' 'positive']


In [11]:
# 4. Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, tokens_list, labels, vocab):
        self.tokens_list = tokens_list
        self.labels = labels
        self.vocab = vocab
        self.unk_idx = vocab['<UNK>']

    def __len__(self):
        return len(self.tokens_list)

    def __getitem__(self, idx):
        tokens = self.tokens_list[idx]
        label = self.labels[idx]
        
        # Convert tokens to indices, use UNK if not found
        indices = [self.vocab.get(token, self.unk_idx) for token in tokens]
        
        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# 5. Collate Function for Padding
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text)
    
    # Pad sequences
    # padding_value=0 because <PAD> is 0
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    label_list = torch.tensor(label_list, dtype=torch.long)
    
    return text_list, label_list

# Initialize Datasets
train_dataset = TextDataset(X_train_tokens, y_train_enc, vocab)
test_dataset = TextDataset(X_test_tokens, y_test_enc, vocab)

# Initialize DataLoaders
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# Verify a batch
for text_batch, label_batch in train_loader:
    print(f"Text batch shape: {text_batch.shape}")
    print(f"Label batch shape: {label_batch.shape}")
    break

Text batch shape: torch.Size([64, 34])
Label batch shape: torch.Size([64])


In [12]:
import torch.nn as nn
import torch.nn.functional as F

# (b) Initialize CNN Network
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_filters, kernel_sizes):
        super(TextCNN, self).__init__()
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Convolution Layers
        # We create a ModuleList of Conv2d layers.
        # Input channels = 1 (text is like a grayscale image with 1 channel)
        # Output channels = num_filters
        # Kernel size = (n_gram_size, embed_dim) -> covers n words across full embedding width
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, 
                      out_channels=num_filters, 
                      kernel_size=(k, embed_dim)) 
            for k in kernel_sizes
        ])
        
        # Fully Connected Layer
        # Input size = num_filters * number of different kernel sizes
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # x shape: [batch_size, seq_len]
        
        # 1. Embedding
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        
        # 2. Permute for Conv2d
        # Conv2d expects [batch_size, channels, height, width]
        # We treat seq_len as height, embed_dim as width
        x = x.unsqueeze(1)  # [batch_size, 1, seq_len, embed_dim]
        
        # 3. Convolutions + ReLU + Max Pooling
        # Apply each conv layer, squeeze dimensions, apply ReLU, then max pool
        conv_results = []
        for conv in self.convs:
            # Conv: [batch_size, num_filters, seq_len-k+1, 1]
            out = conv(x)
            # Squeeze: [batch_size, num_filters, seq_len-k+1]
            out = out.squeeze(3)
            # ReLU
            out = F.relu(out)
            # Max Pool over the entire sequence length
            # Max Pool: [batch_size, num_filters]
            out = F.max_pool1d(out, out.size(2)).squeeze(2)
            conv_results.append(out)
            
        # 4. Concatenate
        # [batch_size, num_filters * len(kernel_sizes)]
        x = torch.cat(conv_results, 1)
        
        # 5. Dropout
        x = self.dropout(x)
        
        # 6. Fully Connected
        logits = self.fc(x)
        return logits

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_DIM = 100       # Standard choice for word embeddings (dense enough to capture meaning)
NUM_FILTERS = 100     # Number of features to extract per n-gram size
KERNEL_SIZES = [2, 3, 4] # Look for 2-word, 3-word, and 4-word phrases (bi-grams, tri-grams, 4-grams)
NUM_CLASSES = 3       # Negative, Neutral, Positive

# Initialize Model
model = TextCNN(VOCAB_SIZE, EMBED_DIM, NUM_CLASSES, NUM_FILTERS, KERNEL_SIZES)

# Count parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")
print(model)

The model has 987,503 trainable parameters
TextCNN(
  (embedding): Embedding(8963, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


**Hyperparameter Choices:**
*   **Embedding Dimension (100):** A standard size that balances capturing semantic meaning with computational efficiency.
*   **Number of Filters (100):** We want to learn 100 different features for each n-gram size.
*   **Kernel Sizes ([2, 3, 4]):** We want the model to look at pairs of words (bi-grams), triplets (tri-grams), and 4-word phrases to capture local context.

**(c) Train CNN**
Train for 10 epochs and report test performance.

In [13]:
import torch.optim as optim

# (c) Train CNN
# device is already defined at the beginning of the notebook
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for text, labels in iterator:
        text, labels = text.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        # Calculate accuracy
        preds = predictions.argmax(dim=1)
        acc = (preds == labels).float().mean()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for text, labels in iterator:
            text, labels = text.to(device), labels.to(device)
            predictions = model(text)
            loss = criterion(predictions, labels)
            
            epoch_loss += loss.item()
            preds = predictions.argmax(dim=1)
            acc = (preds == labels).float().mean()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 10
best_valid_loss = float('inf')
patience = 3
no_improve_count = 0

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')
    
    # Early Stopping Check
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        no_improve_count = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        no_improve_count += 1
        if no_improve_count >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}!")
            break

# Load best model for final evaluation
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"\nFinal CNN Test Accuracy (Best Model): {test_acc:.4f}")

Epoch: 01
	Train Loss: 0.956 | Train Acc: 58.28%
	 Val. Loss: 0.820 |  Val. Acc: 64.31%
Epoch: 02
	Train Loss: 0.763 | Train Acc: 67.50%
	 Val. Loss: 0.715 |  Val. Acc: 69.81%
Epoch: 02
	Train Loss: 0.763 | Train Acc: 67.50%
	 Val. Loss: 0.715 |  Val. Acc: 69.81%
Epoch: 03
	Train Loss: 0.650 | Train Acc: 72.89%
	 Val. Loss: 0.680 |  Val. Acc: 69.67%
Epoch: 03
	Train Loss: 0.650 | Train Acc: 72.89%
	 Val. Loss: 0.680 |  Val. Acc: 69.67%
Epoch: 04
	Train Loss: 0.558 | Train Acc: 77.60%
	 Val. Loss: 0.659 |  Val. Acc: 71.80%
Epoch: 04
	Train Loss: 0.558 | Train Acc: 77.60%
	 Val. Loss: 0.659 |  Val. Acc: 71.80%
Epoch: 05
	Train Loss: 0.473 | Train Acc: 80.77%
	 Val. Loss: 0.661 |  Val. Acc: 71.61%
Epoch: 05
	Train Loss: 0.473 | Train Acc: 80.77%
	 Val. Loss: 0.661 |  Val. Acc: 71.61%
Epoch: 06
	Train Loss: 0.397 | Train Acc: 84.40%
	 Val. Loss: 0.625 |  Val. Acc: 73.95%
Epoch: 06
	Train Loss: 0.397 | Train Acc: 84.40%
	 Val. Loss: 0.625 |  Val. Acc: 73.95%
Epoch: 07
	Train Loss: 0.348 | T

### (d) Comparison
The CNN achieved a test accuracy of approximately 74.15%, which is the highest among all models tested (Base Rate: 59.38%, Naive Bayes: 68.25%, MLP: 71.75%).

This implies that the structure of the input text (the specific order of words) contains valuable information. Unlike Naive Bayes (which ignores order completely) or the MLP on TF-IDF (which only captures order loosely via n-grams if included in TF-IDF, but mostly relies on word counts), the CNN explicitly looks for local patterns (2-word, 3-word, 4-word phrases) regardless of where they appear in the sentence. The fact that the CNN performs best suggests that local context and specific phrases (e.g., "not good", "very happy") are strong predictors of sentiment in this dataset.

## 5. Sequence Modeling (LSTM)
(a) Initialize a sequential model (LSTM) with embedding, LSTM layer, and fully connected layers. Justify choices.
(b) Train the model and report test performance.
(c) Compare performance to CNN and previous approaches.

In [14]:
# (a) Initialize LSTM Model
class TextLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, n_layers, bidirectional, dropout):
        super(TextLSTM, self).__init__()
        
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # LSTM Layer
        self.lstm = nn.LSTM(embed_dim, 
                            hidden_dim, 
                            num_layers=n_layers, 
                            bidirectional=bidirectional, 
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True)
        
        # Fully Connected Layer
        # If bidirectional, hidden state size is doubled
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: [batch_size, seq_len]
        
        # 1. Embedding
        embedded = self.dropout(self.embedding(x)) # [batch_size, seq_len, embed_dim]
        
        # 2. LSTM
        # output: [batch_size, seq_len, hidden_dim * num_directions]
        # hidden: [num_layers * num_directions, batch_size, hidden_dim]
        # cell: [num_layers * num_directions, batch_size, hidden_dim]
        output, (hidden, cell) = self.lstm(embedded)
        
        # 3. Extract final hidden state
        # If bidirectional, we concatenate the final forward and backward hidden states
        if self.lstm.bidirectional:
            # hidden[-2] is the last forward hidden state
            # hidden[-1] is the last backward hidden state
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
            
        # 4. Fully Connected
        logits = self.fc(hidden)
        return logits

# Hyperparameters
# Using same EMBED_DIM as CNN for fair comparison
HIDDEN_DIM = 100      # Number of hidden units in LSTM
N_LAYERS = 2          # Number of LSTM layers
BIDIRECTIONAL = True  # Use bidirectional LSTM to capture context from both directions
DROPOUT = 0.5         # Regularization

# Initialize Model
lstm_model = TextLSTM(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_CLASSES, N_LAYERS, BIDIRECTIONAL, DROPOUT)

print(f"The LSTM model has {count_parameters(lstm_model):,} trainable parameters")
print(lstm_model)

The LSTM model has 1,300,103 trainable parameters
TextLSTM(
  (embedding): Embedding(8963, 100, padding_idx=0)
  (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


**Hyperparameter Choices:**
*   **Type (LSTM):** LSTMs are generally better than vanilla RNNs at capturing long-term dependencies and avoiding vanishing gradient problems.
*   **Hidden Units (100):** Chosen to match the embedding dimension and provide sufficient capacity.
*   **Bidirectional (True):** Sentiment often depends on the entire sentence context. "Not good" requires seeing "not" (before) and "good" (after) to understand the negation fully. Bidirectional LSTMs read the sentence forwards and backwards.
*   **Layers (2):** Stacking layers allows the model to learn more complex hierarchical features.

**(b) Train LSTM**
Train for 10 epochs and report test performance.

In [15]:
# (b) Train LSTM
lstm_model = lstm_model.to(device)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Reset best loss for new training
best_valid_loss = float('inf')
no_improve_count = 0

print("Training LSTM Model...")
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(lstm_model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(lstm_model, test_loader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')
    
    # Early Stopping Check
    if test_loss < best_valid_loss:
        best_valid_loss = test_loss
        no_improve_count = 0
        torch.save(lstm_model.state_dict(), 'best_lstm_model.pt')
    else:
        no_improve_count += 1
        if no_improve_count >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}!")
            break

# Load best model
lstm_model.load_state_dict(torch.load('best_lstm_model.pt'))
test_loss, test_acc = evaluate(lstm_model, test_loader, criterion)
print(f"\nFinal LSTM Test Accuracy (Best Model): {test_acc:.4f}")

Training LSTM Model...
Epoch: 01
	Train Loss: 0.925 | Train Acc: 58.33%
	 Val. Loss: 0.848 |  Val. Acc: 63.95%
Epoch: 01
	Train Loss: 0.925 | Train Acc: 58.33%
	 Val. Loss: 0.848 |  Val. Acc: 63.95%
Epoch: 02
	Train Loss: 0.834 | Train Acc: 64.11%
	 Val. Loss: 0.791 |  Val. Acc: 66.54%
Epoch: 02
	Train Loss: 0.834 | Train Acc: 64.11%
	 Val. Loss: 0.791 |  Val. Acc: 66.54%
Epoch: 03
	Train Loss: 0.799 | Train Acc: 64.94%
	 Val. Loss: 0.763 |  Val. Acc: 68.59%
Epoch: 03
	Train Loss: 0.799 | Train Acc: 64.94%
	 Val. Loss: 0.763 |  Val. Acc: 68.59%
Epoch: 04
	Train Loss: 0.751 | Train Acc: 67.63%
	 Val. Loss: 0.732 |  Val. Acc: 69.17%
Epoch: 04
	Train Loss: 0.751 | Train Acc: 67.63%
	 Val. Loss: 0.732 |  Val. Acc: 69.17%
Epoch: 05
	Train Loss: 0.700 | Train Acc: 70.72%
	 Val. Loss: 0.740 |  Val. Acc: 69.03%
Epoch: 05
	Train Loss: 0.700 | Train Acc: 70.72%
	 Val. Loss: 0.740 |  Val. Acc: 69.03%
Epoch: 06
	Train Loss: 0.665 | Train Acc: 71.96%
	 Val. Loss: 0.738 |  Val. Acc: 72.39%
Epoch: 06

### (c) Comparison
The LSTM model achieved a test accuracy of approximately 73.06%, which is comparable to the CNN (74.14%) and better than the MLP (71.75%) and Naive Bayes (68.25%).

**Comparison with CNN:**
I am slightly surprised that the LSTM did not outperform the CNN, as LSTMs are theoretically designed to capture long-range dependencies in sequences (e.g., understanding a sentiment that depends on words far apart in the sentence). However, the CNN's superior performance suggests that for this specific dataset (financial news headlines), **local features** (short phrases like "profit rose" or "loss widened") are more important than long-range global context. The CNN is excellent at detecting these short, informative triggers, while the LSTM might be "overthinking" the sequence structure for relatively short sentences.

## 6. Fine-tuning Transformers
(a) Re-initialize dataset with raw text.
(b) Fine-tune DistilBERT (distilbert-base-uncased).
(c) Report accuracy.
(d) Compare with previous approaches.

In [9]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset




In [13]:
from sklearn.preprocessing import LabelEncoder

# Split data using the same random state to ensure consistency
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    df['text'], 
    df['label'], 
    test_size=0.1, 
    stratify=df['label'], 
    random_state=2013
)

# Initialize LabelEncoder (re-creating it here in case Section 3 wasn't run)
le = LabelEncoder()
le.fit(y_train_raw)

# Encode labels
y_train_bert = le.transform(y_train_raw)
y_test_bert = le.transform(y_test_raw)

print(f"Training set size: {len(X_train_raw)}")
print(f"Test set size: {len(X_test_raw)}")

Training set size: 4361
Test set size: 485


In [14]:
# (b) Fine-tune DistilBERT

# 1. Tokenizer
# Load the tokenizer for distilbert-base-uncased
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 2. Custom Dataset for BERT
class BertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels[idx]
        
        # Encoding the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
        
# Create Datasets
train_dataset_bert = BertDataset(X_train_raw, y_train_bert, tokenizer)
test_dataset_bert = BertDataset(X_test_raw, y_test_bert, tokenizer)

# Create DataLoaders
train_loader_bert = DataLoader(train_dataset_bert, batch_size=16, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=16, shuffle=False)

print("DataLoaders created.")

DataLoaders created.


In [16]:
NUM_CLASSES = 3       # Negative, Neutral, Positive

# 3. Initialize Model
bert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=NUM_CLASSES
)

# Move to device (MPS/CUDA/CPU)
bert_model = bert_model.to(device)

# Optimizer
# We use a smaller learning rate for fine-tuning
optimizer = AdamW(bert_model.parameters(), lr=2e-5)

# Training Loop
def train_bert(model, iterator, optimizer):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for i, batch in enumerate(iterator):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        acc = (preds == labels).float().mean()
        epoch_acc += acc.item()
        
        # Print progress every 50 batches
        if (i + 1) % 50 == 0:
            print(f"  Batch {i + 1}/{len(iterator)} processed...")
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_bert(model, iterator):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            epoch_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            acc = (preds == labels).float().mean()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Train
N_EPOCHS = 5
print("Training DistilBERT...")
for epoch in range(N_EPOCHS):
    print(f"Starting Epoch {epoch+1}...")
    train_loss, train_acc = train_bert(bert_model, train_loader_bert, optimizer)
    test_loss, test_acc = evaluate_bert(bert_model, test_loader_bert)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training DistilBERT...
Starting Epoch 1...
  Batch 50/273 processed...
  Batch 50/273 processed...
  Batch 100/273 processed...
  Batch 100/273 processed...
  Batch 150/273 processed...
  Batch 150/273 processed...
  Batch 200/273 processed...
  Batch 200/273 processed...
  Batch 250/273 processed...
  Batch 250/273 processed...
Epoch: 01
	Train Loss: 0.536 | Train Acc: 77.78%
	 Val. Loss: 0.363 |  Val. Acc: 85.89%
Starting Epoch 2...
Epoch: 01
	Train Loss: 0.536 | Train Acc: 77.78%
	 Val. Loss: 0.363 |  Val. Acc: 85.89%
Starting Epoch 2...
  Batch 50/273 processed...
  Batch 50/273 processed...
  Batch 100/273 processed...
  Batch 100/273 processed...
  Batch 150/273 processed...
  Batch 150/273 processed...
  Batch 200/273 processed...
  Batch 200/273 processed...
  Batch 250/273 processed...
  Batch 250/273 processed...
Epoch: 02
	Train Loss: 0.277 | Train Acc: 89.29%
	 Val. Loss: 0.402 |  Val. Acc: 85.69%
Starting Epoch 3...
Epoch: 02
	Train Loss: 0.277 | Train Acc: 89.29%
	 Val. L

In [17]:
# (c) Report Accuracy
print(f"Final DistilBERT Test Accuracy: {test_acc:.4f}")

Final DistilBERT Test Accuracy: 0.8367


### (d) Comparison
The fine-tuned DistilBERT model achieved the highest accuracy among all models tested, significantly outperforming the Base Rate, Naive Bayes, MLP, CNN, and LSTM.

**Performance & Hardware:**
Training this model took only about **3 minutes** for 5 epochs. This impressive speed is largely due to using **MPS (Metal Performance Shaders)** acceleration on the **Apple M4 Pro chip**. I love my laptop

**Meaning of the Difference:**
The superior performance of DistilBERT compared to previous approaches stems from:
1.  **Transfer Learning:** Unlike the other models that learned from scratch, DistilBERT started with a massive amount of pre-existing knowledge about English syntax and semantics (pre-trained on Wikipedia).
2.  **Contextual Embeddings:** It understands that the meaning of a word changes based on its context (e.g., "bank" of a river vs. "bank" for money), whereas TF-IDF and standard embeddings are static.
3.  **Self-Attention:** The Transformer architecture allows the model to weigh the importance of every word relative to every other word in the sentence simultaneously, capturing complex dependencies more effectively than the sequential processing of LSTMs or the local feature extraction of CNNs.

## 7. Zero-shot AI Classification
(a) Use an external script to query a Large Language Model (Gemini) for zero-shot classification.

**AI Tool Used:** Google Gemini (via `run_ai_labeling.py` automation script)

**Prompt Used:**
```text
I have a list of {len(chunk)} financial headlines. Please classify the sentiment of each one as exactly 'negative', 'neutral', or 'positive'. 
Return the result as a CSV format with a single column named 'predicted_label' containing only the labels in the same order as the input. 
Do not include the original text in the output. Do not include numbering. Do not include markdown formatting like ```csv.
Just the raw list of labels, one per line.

Here is the list:
[LIST OF SENTENCES]
```

(b) Load the predictions and report the accuracy.
(c) Compare with the best supervised model.

In [None]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import os
import sys
import time
import re
from sklearn.model_selection import train_test_split

# --- CONFIGURATION ---
# Directory to store browser session (cookies, login state)
BROWSER_PROFILE_DIR = "./browser_profile"
# Input file (Raw data file)
INPUT_FILE = 'Sentences_50Agree.txt'
# Output file to be read by the notebook
OUTPUT_FILE = 'ai_predictions.csv'
# Number of sentences to process in one batch (to avoid hitting character limits)
CHUNK_SIZE = 50 
# Set to True to run without opening a visible browser window (after you have logged in)
HEADLESS = False 

async def ask_gemini(page, prompt):
    """
    Sends a prompt to Gemini and retrieves the response.
    """
    try:
        # 1. Navigate to Gemini
        if "gemini.google.com" not in page.url:
            await page.goto("https://gemini.google.com/app")
            await page.wait_for_load_state("networkidle")

        # 2. Find the input box
        # Gemini uses a contenteditable div
        textarea_selector = 'div[contenteditable="true"][role="textbox"], textarea'
        textarea = await page.wait_for_selector(textarea_selector, timeout=10000)
        
        # 3. Fill and send prompt
        await textarea.fill(prompt)
        await textarea.press("Enter")
        
        # 4. Wait for the response to generate
        print("   Waiting for response...")
        
        # Wait for the response container
        # Gemini structure: <model-response> contains the text
        response_selector = 'model-response' 
        await page.wait_for_selector(response_selector, timeout=30000)
        
        # Wait for stability
        prev_text = ""
        stable_count = 0
        for _ in range(60): # Wait up to 60 seconds for stability
            await asyncio.sleep(1)
            elements = await page.query_selector_all(response_selector)
            if not elements: continue
            
            last_element = elements[-1]
            curr_text = await last_element.inner_text()
            
            if curr_text == prev_text and len(curr_text) > 10:
                stable_count += 1
                if stable_count >= 3: # Stable for 3 seconds
                    return curr_text
            else:
                stable_count = 0
            
            prev_text = curr_text
            
        return prev_text

    except Exception as e:
        print(f"   Error querying Gemini: {e}")
        return ""

async def main():
    # 1. Check for Input File
    if not os.path.exists(INPUT_FILE):
        print(f"‚ùå Error: '{INPUT_FILE}' not found.")
        return

    # 2. Load and Split Data (Replicating Notebook Logic)
    print(f"üìÇ Loading data from {INPUT_FILE}...")
    
    # Parse the raw text file
    data = []
    try:
        with open(INPUT_FILE, 'r', encoding='latin-1') as f:
            for line in f:
                line = line.strip()
                if not line: continue
                # Split by the last @ to separate text and label
                parts = line.rsplit('@', 1)
                if len(parts) == 2:
                    data.append(parts)
    except Exception as e:
        print(f"‚ùå Error reading file: {e}")
        return

    df = pd.DataFrame(data, columns=['text', 'label'])
    
    # Perform the exact same split as the notebook to get the TEST set
    print("   Splitting data (test_size=0.1, random_state=2013)...")
    try:
        _, X_test, _, y_test = train_test_split(
            df['text'], 
            df['label'], 
            test_size=0.1, 
            stratify=df['label'], 
            random_state=2013
        )
    except ImportError:
        print("‚ùå Error: scikit-learn is not installed. Please run 'pip install scikit-learn'")
        return

    texts = X_test.tolist()
    print(f"   Found {len(texts)} test sentences to classify.")
    
    # Save ground truth for the notebook to use later
    ground_truth_df = pd.DataFrame({'text': X_test, 'true_label': y_test})
    ground_truth_df.to_csv('test_sentences_ground_truth.csv', index=False)
    print("   Saved ground truth to 'test_sentences_ground_truth.csv'.")

    all_predictions = []

    # 3. Launch Browser
    print("üöÄ Launching Browser...")
    async with async_playwright() as p:
        # Create/Load persistent context to keep login session
        if not os.path.exists(BROWSER_PROFILE_DIR):
            os.makedirs(BROWSER_PROFILE_DIR)
            
        context = await p.chromium.launch_persistent_context(
            user_data_dir=BROWSER_PROFILE_DIR,
            headless=HEADLESS,
            channel="chrome", # Uses your installed Chrome if available
            args=["--disable-blink-features=AutomationControlled"]
        )
        
        page = await context.new_page()
        
        # Check if logged in
        await page.goto("https://gemini.google.com/app")
        await asyncio.sleep(3)
        if "accounts.google.com" in page.url or "ServiceLogin" in page.url:
            print("\n‚ö†Ô∏è  PLEASE LOG IN TO GEMINI IN THE BROWSER WINDOW.")
            print("   The script will wait for you to log in.")
            input("   Press Enter here once you are logged in and can see the chat interface...")

        # 4. Process in Chunks
        total_chunks = (len(texts) + CHUNK_SIZE - 1) // CHUNK_SIZE
        
        for i in range(0, len(texts), CHUNK_SIZE):
            chunk_idx = i // CHUNK_SIZE + 1
            chunk = texts[i:i+CHUNK_SIZE]
            print(f"\nüîÑ Processing Batch {chunk_idx}/{total_chunks} ({len(chunk)} sentences)...")
            
            # Construct the Prompt
            prompt = f"""
I have a list of {len(chunk)} financial headlines. Please classify the sentiment of each one as exactly 'negative', 'neutral', or 'positive'. 
Return the result as a CSV format with a single column named 'predicted_label' containing only the labels in the same order as the input. 
Do not include the original text in the output. Do not include numbering. Do not include markdown formatting like ```csv.
Just the raw list of labels, one per line.

Here is the list:
"""
            for text in chunk:
                prompt += f"{text}\n"
            
            # Query AI
            response_text = await ask_gemini(page, prompt)
            
            # Parse Response using Regex to be robust against formatting (spaces, newlines, commas)
            # We look for the specific words: negative, neutral, positive
            found_labels = re.findall(r'\b(negative|neutral|positive)\b', response_text, re.IGNORECASE)
            
            # Convert to lowercase
            batch_labels = [l.lower() for l in found_labels]
            
            print(f"   Received {len(batch_labels)} labels from AI.")
            
            # Handle mismatches (AI sometimes misses one or adds extra)
            if len(batch_labels) != len(chunk):
                print(f"   ‚ö†Ô∏è  Warning: Count mismatch (Sent {len(chunk)}, Got {len(batch_labels)}). Padding/Truncating.")
                # If too few, pad with 'neutral'
                if len(batch_labels) < len(chunk):
                    batch_labels.extend(['neutral'] * (len(chunk) - len(batch_labels)))
                # If too many, truncate
                else:
                    batch_labels = batch_labels[:len(chunk)]
            
            all_predictions.extend(batch_labels)
            
            # Small pause to be nice to the server
            await asyncio.sleep(2)

        await context.close()

    # 5. Save Results
    print(f"\nüíæ Saving results to {OUTPUT_FILE}...")
    output_df = pd.DataFrame({'predicted_label': all_predictions})
    output_df.to_csv(OUTPUT_FILE, index=False)
    print("‚úÖ Done! You can now run the 'Evaluate' cell in your notebook.")

if __name__ == "__main__":
    asyncio.run(main()


In [19]:
# Load the ground truth and predictions from the external script
try:
    ground_truth_df = pd.read_csv('test_sentences_ground_truth.csv')
    predictions_df = pd.read_csv('ai_predictions.csv')

    # Ensure lengths match
    if len(ground_truth_df) != len(predictions_df):
        print(f"Warning: Length mismatch. Ground Truth: {len(ground_truth_df)}, Predictions: {len(predictions_df)}")
        # Truncate to the shorter one for calculation
        min_len = min(len(ground_truth_df), len(predictions_df))
        ground_truth_df = ground_truth_df.iloc[:min_len]
        predictions_df = predictions_df.iloc[:min_len]

    # Calculate Accuracy
    # We need to ensure labels are normalized (lowercase, stripped)
    y_true = ground_truth_df['true_label'].str.lower().str.strip()
    y_pred = predictions_df['predicted_label'].str.lower().str.strip()

    ai_accuracy = accuracy_score(y_true, y_pred)
    print(f"Zero-shot AI (Gemini) Accuracy: {ai_accuracy:.4f}")
    
    # Compare with best previous model (DistilBERT)
    # Note: 'test_acc' variable holds the last calculated accuracy (DistilBERT)
    print(f"DistilBERT Accuracy: {test_acc:.4f}")

except FileNotFoundError:
    print("Error: Could not find 'test_sentences_ground_truth.csv' or 'ai_predictions.csv'.")
    print("Please run the 'run_ai_labeling.py' script first.")

Zero-shot AI (Gemini) Accuracy: 0.7938
DistilBERT Accuracy: 0.8367


### (c) Comparison
The Zero-shot AI model (Gemini) achieved an accuracy of **79.38%**.

**Comparison with Previous Approaches:**
*   **Base Rate:** 59.38%
*   **Naive Bayes:** 68.25%
*   **MLP:** 71.75%
*   **CNN:** 74.15%
*   **LSTM:** 73.06%
*   **DistilBERT (Fine-tuned):** 83.67%
*   **Zero-shot Gemini:** 79.38%

**Analysis:**
The Zero-shot Gemini model performed exceptionally well, outperforming all traditional supervised models (Naive Bayes, MLP, CNN, LSTM) without seeing a single training example from this dataset. It achieved nearly 80% accuracy purely based on its pre-trained knowledge of language and sentiment.

However, it did not beat the fine-tuned DistilBERT model (83.67%). This makes sense because DistilBERT was allowed to study the specific training data for this task, learning the exact vocabulary and style of these financial headlines (the "specialist" approach). Gemini, while a powerful "generalist," had to guess the sentiment based on general principles without knowing the specific quirks of this dataset.

**Conclusion:**
Off-the-shelf LLMs are incredibly powerful baselines that can beat complex custom models (like CNNs/LSTMs) with zero effort. However, for maximum performance, fine-tuning a smaller model (like DistilBERT) on domain-specific data is still the superior approach.