## Loading 20NG dataset with 5 categories

In [1]:
from sklearn.datasets import fetch_20newsgroups
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter


In [4]:
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball', 'sci.med', 'talk.politics.mideast']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'\W+', ' ', text)
    return text.split()

tokenized_texts = [preprocess_text(doc) for doc in newsgroups.data]
print(tokenized_texts[0])

['do', 'you', 'really', 'have', 'that', 'much', 'information', 'on', 'him', 'really', 'i', 'don', 't', 'know', 'you', 'tell', 'me', 'what', 'percentage', 'of', 'players', 'reach', 'or', 'exceed', 'their', 'mle', 's', 'in', 'their', 'rookie', 'season', 'we', 're', 'talking', 'about', '1993', 'you', 'know', 'if', 'that', 'were', 'your', 'purpose', 'maybe', 'offerman', 'spent', '1992', 'getting', 'acclimated', 'if', 'you', 'will', 'the', 'dodgers', 'as', 'a', 'team', 'paid', 'a', 'big', 'price', 'that', 'season', 'perhaps', 'they', 'will', 'reap', 'the', 'benefits', 'down', 'the', 'road', 'do', 'you', 'really', 'think', 'they', 'would', 'have', 'done', 'what', 'they', 'did', 'if', 'they', 'were', 'competing', 'for', 'a', 'pennant', 'for', 'a', 'stat', 'head', 'i', 'm', 'amazed', 'that', 'you', 'put', 'any', 'credence', 'in', 'spring', 'training', 'did', 'you', 'notice', 'who', 'he', 'got', 'those', '10', 'hits', 'off', 'of', 'or', 'are', 'you', 'going', 'to', 'tell', 'me', 'that', 'it', '

## Loading Glove Embeddings

In [5]:
import numpy as np

glove_path = "glove.6B.100d.txt"

glove_embeddings = {}
embedding_dim = 100

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype=np.float32)
        glove_embeddings[word] = vector

print("Loaded GloVe embeddings:", len(glove_embeddings), "words.")
print(glove_embeddings['hello'])


Loaded GloVe embeddings: 400000 words.
[ 0.26688    0.39632    0.6169    -0.77451   -0.1039     0.26697
  0.2788     0.30992    0.0054685 -0.085256   0.73602   -0.098432
  0.5479    -0.030305   0.33479    0.14094   -0.0070003  0.32569
  0.22902    0.46557   -0.19531    0.37491   -0.7139    -0.51775
  0.77039    1.0881    -0.66011   -0.16234    0.9119     0.21046
  0.047494   1.0019     1.1133     0.70094   -0.08696    0.47571
  0.1636    -0.44469    0.4469    -0.93817    0.013101   0.085964
 -0.67456    0.49662   -0.037827  -0.11038   -0.28612    0.074606
 -0.31527   -0.093774  -0.57069    0.66865    0.45307   -0.34154
 -0.7166    -0.75273    0.075212   0.57903   -0.1191    -0.11379
 -0.10026    0.71341   -1.1574    -0.74026    0.40452    0.18023
  0.21449    0.37638    0.11239   -0.53639   -0.025092   0.31886
 -0.25013   -0.63283   -0.011843   1.377      0.86013    0.20476
 -0.36815   -0.68874    0.53512   -0.46556    0.27389    0.4118
 -0.854     -0.046288   0.11304   -0.27326    0.1

In [None]:
# Step 1: Create a vocabulary mapping
word_to_idx = {word: i+1 for i, word in enumerate(glove_embeddings.keys())}  # +1 to reserve index 0 for padding
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Step 2: Convert tokenized text to indices
def doc_to_indices(doc, word_to_idx, max_len=50):
    return [word_to_idx[word] for word in doc if word in word_to_idx][:max_len]

indexed_texts = [doc_to_indices(doc, word_to_idx) for doc in tokenized_texts]

# Step 3: Pad sequences to have equal length
from torch.nn.utils.rnn import pad_sequence

indexed_tensors = [torch.tensor(seq, dtype=torch.long) for seq in indexed_texts]
padded_sequences = pad_sequence(indexed_tensors, batch_first=True, padding_value=0)  # Padding index 0

# Convert labels to tensor
labels = torch.tensor(newsgroups.target, dtype=torch.long)


In [6]:
from torch import nn

# Step 1: Create embedding matrix
vocab_size = len(word_to_idx) + 1  # +1 for padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_to_idx.items():
    if word in glove_embeddings:
        embedding_matrix[i] = glove_embeddings[word]

# Step 2: Create PyTorch embedding layer
embedding_layer = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True)


In [None]:
class TextClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(100, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        x = self.fc(hidden[-1])
        return self.softmax(x)

hidden_dim = 256
output_dim = len(categories)
model = TextClassifier(embedding_layer, hidden_dim, output_dim)


In [23]:
import torch.optim as optim

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to DataLoader
batch_size = 32
dataset = TensorDataset(padded_sequences, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 40
for epoch in range(num_epochs):
    for batch in dataloader:
        x_batch, y_batch = batch
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 1.3321
Epoch 2, Loss: 1.6165
Epoch 3, Loss: 1.6130
Epoch 4, Loss: 1.4380
Epoch 5, Loss: 1.3888
Epoch 6, Loss: 1.5074
Epoch 7, Loss: 1.1498
Epoch 8, Loss: 1.3211
Epoch 9, Loss: 1.3561
Epoch 10, Loss: 1.4768
Epoch 11, Loss: 1.2904
Epoch 12, Loss: 1.1543
Epoch 13, Loss: 0.9958
Epoch 14, Loss: 1.0251
Epoch 15, Loss: 0.9575
Epoch 16, Loss: 1.0024
Epoch 17, Loss: 1.0052
Epoch 18, Loss: 0.9056
Epoch 19, Loss: 0.9057
Epoch 20, Loss: 1.1000
Epoch 21, Loss: 0.9067
Epoch 22, Loss: 0.9557
Epoch 23, Loss: 1.0562
Epoch 24, Loss: 0.9051
Epoch 25, Loss: 1.1163
Epoch 26, Loss: 0.9675
Epoch 27, Loss: 1.0879
Epoch 28, Loss: 0.9560
Epoch 29, Loss: 1.0551
Epoch 30, Loss: 0.9560
Epoch 31, Loss: 0.9058
Epoch 32, Loss: 0.9054
Epoch 33, Loss: 0.9551
Epoch 34, Loss: 1.0052
Epoch 35, Loss: 1.0050
Epoch 36, Loss: 0.9551
Epoch 37, Loss: 1.0050
Epoch 38, Loss: 0.9551
Epoch 39, Loss: 1.0050
Epoch 40, Loss: 0.9050


In [27]:
model.embedding.weight.requires_grad = True  # Unfreeze embeddings

# Re-train with fine-tuned embeddings
for epoch in range(num_epochs):
    for batch in dataloader:
        x_batch, y_batch = batch
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    print(f"Fine-Tuning Epoch {epoch+1}, Loss: {loss.item():.4f}")


Fine-Tuning Epoch 1, Loss: 1.0049
Fine-Tuning Epoch 2, Loss: 0.9549
Fine-Tuning Epoch 3, Loss: 0.9549
Fine-Tuning Epoch 4, Loss: 0.9049
Fine-Tuning Epoch 5, Loss: 1.0049
Fine-Tuning Epoch 6, Loss: 0.9049
Fine-Tuning Epoch 7, Loss: 0.9549
Fine-Tuning Epoch 8, Loss: 0.9049
Fine-Tuning Epoch 9, Loss: 1.0049
Fine-Tuning Epoch 10, Loss: 0.9549
Fine-Tuning Epoch 11, Loss: 0.9549
Fine-Tuning Epoch 12, Loss: 0.9549
Fine-Tuning Epoch 13, Loss: 0.9549
Fine-Tuning Epoch 14, Loss: 1.0548
Fine-Tuning Epoch 15, Loss: 1.0049
Fine-Tuning Epoch 16, Loss: 0.9549
Fine-Tuning Epoch 17, Loss: 1.0050
Fine-Tuning Epoch 18, Loss: 0.9549
Fine-Tuning Epoch 19, Loss: 0.9549


KeyboardInterrupt: 

In [24]:
def compute_accuracy(model, dataloader):
    model.eval()  # Set model to evaluation mode
    correct, total = 0, 0
    
    with torch.no_grad():  # No need to track gradients during evaluation
        for x_batch, y_batch in dataloader:
            outputs = model(x_batch)
            predictions = torch.argmax(outputs, dim=1)  # Get predicted class
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)
    
    return correct / total  # Return accuracy as a fraction


In [25]:
# Compute accuracy before fine-tuning
initial_accuracy = compute_accuracy(model, dataloader)
print(f"Accuracy with Frozen GloVe Embeddings: {initial_accuracy:.4f}")


# Compute accuracy after fine-tuning
fine_tuned_accuracy = compute_accuracy(model, dataloader)
print(f"Accuracy After Fine-Tuning: {fine_tuned_accuracy:.4f}")


Accuracy with Frozen GloVe Embeddings: 0.9488
Accuracy After Fine-Tuning: 0.9488
