In [None]:
import numpy as np
import torch
from torch import nn
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

freezzer = False
nltk.download('punkt')

categories = ['rec.sport.hockey', 'comp.graphics', 'sci.med', 'talk.politics.guns', 'soc.religion.christian']
ng_data = fetch_20newsgroups(subset='train')
min_size = 5 
max_size = 1000
tokens = []
temp = []
print(len(ng_data.data))
for i in range(len(ng_data.data)):
    a = word_tokenize(ng_data.data[i].lower())
    if len(a) > min_size and len(a)<max_size:
        tokens.append(a)
        temp.append(ng_data.target[i])
        
print(len(tokens))

min_frequency = 5

word_counts = Counter([word for doc in tokens for word in doc])


filtered_words = [word for word, count in word_counts.items() if count >= min_frequency]

def load_glove_embeddings(glove_file_path):
    glove_dict = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_dict[word] = vector
    return glove_dict

glove_dict = load_glove_embeddings("glove.6B.100d.txt")

word_to_index = {}
index = 0

for word in filtered_words:
    if word in glove_dict:
        word_to_index[word] = index
        index += 1
word_to_index['<UNK>'] = len(word_to_index)

vocab_size = len(word_to_index)


def doc2ind(tokens, word_to_index):
    return [word_to_index.get(word, word_to_index['<UNK>']) for word in tokens]



ng_vector_idx = [doc2ind(doc, word_to_index) for doc in tokens]


def pad_sequences(sequences, padding_value=0):
    return pad_sequence([torch.tensor(seq) for seq in sequences], batch_first=True, padding_value=padding_value)


ng_padded = pad_sequences(ng_vector_idx)

# Encode labels (target categories)
label_encoder = LabelEncoder()
ng_labels = label_encoder.fit_transform(temp)

# Split 
X_train, X_val, y_train, y_val = train_test_split(ng_padded, ng_labels, test_size=0.2, random_state=42)

# Neural Network 
class TextClassifier(nn.Module):
    def __init__(self, embedding_dim, vocab_size, num_classes, glove_dict, freezzer):
        super(TextClassifier, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        embedding_matrix = np.zeros((vocab_size, embedding_dim))
        
        for word, idx in word_to_index.items():
            if word in glove_dict:
                embedding_matrix[idx] = glove_dict[word]  
        
        self.embedding_layer.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding_layer.weight.requires_grad = freezzer  # Freeze the GloVe embeddings

        # Define a fully connected layer for classification
        self.fc = nn.Linear(embedding_dim, 256)
        self.fc2 = nn.Linear(256,num_classes)

    def forward(self, x):
        x = self.embedding_layer(x)  
        x = x.mean(dim=1)  
        x = self.fc(x)
        x = torch.relu(x) 
        x = self.fc2(x)
        return x


num_classes = len(set(ng_labels))
model = TextClassifier(embedding_dim=100, vocab_size=vocab_size, num_classes=num_classes, glove_dict=glove_dict, freezzer = False)

print(model)

class NewsgroupDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

    def __len__(self):
        return len(self.data)

train_dataset = NewsgroupDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = NewsgroupDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 40

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Validation Accuracy: {accuracy}%")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


11314
10783
TextClassifier(
  (embedding_layer): Embedding(19717, 100)
  (fc): Linear(in_features=100, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=20, bias=True)
)
Epoch 1/40, Loss: 2.960785389829565
Epoch 2/40, Loss: 2.8990357866993657
Epoch 3/40, Loss: 2.8308806834397493
Epoch 4/40, Loss: 2.747389549679226
Epoch 5/40, Loss: 2.640270300264712
Epoch 6/40, Loss: 2.5472207634537307
Epoch 7/40, Loss: 2.453374895343074
Epoch 8/40, Loss: 2.3917201545503404
Epoch 9/40, Loss: 2.3275935835308497
Epoch 10/40, Loss: 2.270705758642267
Epoch 11/40, Loss: 2.2258672091696
Epoch 12/40, Loss: 2.1893404139412773
Epoch 13/40, Loss: 2.1480565631831134
Epoch 14/40, Loss: 2.115941608393634
Epoch 15/40, Loss: 2.08919463996534
Epoch 16/40, Loss: 2.062292710940043
Epoch 17/40, Loss: 2.030610822306739
Epoch 18/40, Loss: 2.0074416783120896
Epoch 19/40, Loss: 1.986762222537288
Epoch 20/40, Loss: 1.9618773504539773
Epoch 21/40, Loss: 1.9424958101025334
Epoch 22/40, Loss: 1.925094796

In [19]:
num_classes = len(set(ng_labels)) 
model = TextClassifier(embedding_dim=100, vocab_size=vocab_size, num_classes=num_classes, glove_dict=glove_dict, freezzer=True)

print(model)

class NewsgroupDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __getitem__(self, index):
        return self.data[index], self.targets[index]

    def __len__(self):
        return len(self.data)

train_dataset = NewsgroupDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = NewsgroupDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 40  

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Validation Accuracy: {accuracy}%")


TextClassifier(
  (embedding_layer): Embedding(19717, 100)
  (fc): Linear(in_features=100, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=20, bias=True)
)
Epoch 1/40, Loss: 2.9507897271050347
Epoch 2/40, Loss: 2.790778586599562
Epoch 3/40, Loss: 2.4784791451913337
Epoch 4/40, Loss: 2.1663900909600433
Epoch 5/40, Loss: 1.91250788503223
Epoch 6/40, Loss: 1.7049969200734738
Epoch 7/40, Loss: 1.523323197276504
Epoch 8/40, Loss: 1.3679495286058496
Epoch 9/40, Loss: 1.241154784405673
Epoch 10/40, Loss: 1.1174088992454387
Epoch 11/40, Loss: 1.009323047487824
Epoch 12/40, Loss: 0.9145931952529484
Epoch 13/40, Loss: 0.8213542667803941
Epoch 14/40, Loss: 0.732907067294474
Epoch 15/40, Loss: 0.6696249450798388
Epoch 16/40, Loss: 0.5875544992861924
Epoch 17/40, Loss: 0.52528087121469
Epoch 18/40, Loss: 0.4707951945839105
Epoch 19/40, Loss: 0.42061749200026194
Epoch 20/40, Loss: 0.37564025118395133
Epoch 21/40, Loss: 0.33956517877954023
Epoch 22/40, Loss: 0.2996701166309