In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

def load_glove_embeddings(file_path, word_to_index, embedding_dim):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    num_words = len(word_to_index) + 1  # Add 1 for the padding token
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, index in word_to_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
        else:
            embedding_matrix[index] = embeddings_index.get("<unk>", np.zeros(embedding_dim))

    return embedding_matrix


with open("processed_train_data.json", "r") as f_train, open("processed_val_data.json", "r") as f_val:
    processed_train_data = json.load(f_train)
    processed_val_data = json.load(f_val)


texts_train = [entry["text"] for entry in processed_train_data.values()]
labels_train = [entry["labels"] for entry in processed_train_data.values()]

texts_val = [entry["text"] for entry in processed_val_data.values()]
labels_val = [entry["labels"] for entry in processed_val_data.values()]


word_to_index = {}
index = 1 
for entry in processed_train_data.values():
    tokens = entry["text"].split() 
    for token in tokens:
        if token not in word_to_index:
            word_to_index[token] = index
            index += 1

word_to_index["<unk>"] = 0
embedding_dim = 100

glove_embeddings_path = "glove.6B.100d.txt"  
embedding_matrix = load_glove_embeddings(glove_embeddings_path, word_to_index, embedding_dim)

label_to_index = {'O': 0, 'B': 1, 'I': 2}

X_train = [[word_to_index.get(token, word_to_index["<unk>"]) for token in text.split()] for text in texts_train]
y_train = [[label_to_index[label] for label in entry] for entry in labels_train] 
X_val = [[word_to_index.get(token, word_to_index["<unk>"]) for token in text.split()] for text in texts_val]
y_val = [[label_to_index[label] for label in entry] for entry in labels_val] 



class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.y[idx])


def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs, targets


train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

class SimpleGRU(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, output_size):
        super(SimpleGRU, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), padding_idx=0)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.gru(x)
        output = self.fc(output)
        return output

hidden_size = 100
output_size = len(label_to_index)
model = SimpleGRU(embedding_matrix, hidden_size, output_size)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 60

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, output_size), targets.view(-1))
        loss.backward()
        optimizer.step()

    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        val_loss = 0.0
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, output_size), targets.view(-1))
            val_loss += loss.item()

 
            preds = torch.argmax(outputs, dim=2).cpu().numpy()
            targets = targets.cpu().numpy()

            all_preds.extend(preds)
            all_targets.extend(targets)

        val_loss /= len(val_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}')


    all_preds = np.concatenate(all_preds, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)


    accuracy = accuracy_score(all_targets, all_preds)
    macro_f1 = f1_score(all_targets, all_preds, average='macro')

    print(f'Validation Accuracy: {accuracy:.4f}, Validation Macro F1: {macro_f1:.4f}')
torch.save(model.state_dict(),"models/gru_glove.pth")

Epoch 1/60, Validation Loss: 0.1888
Validation Accuracy: 0.9666, Validation Macro F1: 0.3277
Epoch 2/60, Validation Loss: 0.1538
Validation Accuracy: 0.9670, Validation Macro F1: 0.3410
Epoch 3/60, Validation Loss: 0.1344
Validation Accuracy: 0.9683, Validation Macro F1: 0.3769
Epoch 4/60, Validation Loss: 0.1196
Validation Accuracy: 0.9687, Validation Macro F1: 0.4225
Epoch 5/60, Validation Loss: 0.1101
Validation Accuracy: 0.9697, Validation Macro F1: 0.4396
Epoch 6/60, Validation Loss: 0.1025
Validation Accuracy: 0.9722, Validation Macro F1: 0.4735
Epoch 7/60, Validation Loss: 0.0958
Validation Accuracy: 0.9744, Validation Macro F1: 0.5413
Epoch 8/60, Validation Loss: 0.0908
Validation Accuracy: 0.9745, Validation Macro F1: 0.5698
Epoch 9/60, Validation Loss: 0.0866
Validation Accuracy: 0.9755, Validation Macro F1: 0.6249
Epoch 10/60, Validation Loss: 0.0837
Validation Accuracy: 0.9759, Validation Macro F1: 0.6451
Epoch 11/60, Validation Loss: 0.0810
Validation Accuracy: 0.9769, Val

In [2]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load processed test data from JSON
with open("processed_test_data.json", "r") as f_test:
    processed_test_data = json.load(f_test)

# Extract text and labels from the dictionary
texts_test = [entry["text"] for entry in processed_test_data.values()]
labels_test = [entry["labels"] for entry in processed_test_data.values()]

# Convert texts and labels to numerical format
X_test = [[word_to_index.get(token, word_to_index["<unk>"]) for token in text.split()] for text in texts_test]
y_test = [[label_to_index[label] for label in entry] for entry in labels_test]

# Create a custom dataset for the test data
test_dataset = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Run the model on the test data after 60 epochs
num_epochs = 60

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, output_size), targets.view(-1))
        loss.backward()
        optimizer.step()

# Evaluate the model on the test data
model.eval()
all_preds_test = []
all_targets_test = []

with torch.no_grad():
    for inputs_test, targets_test in test_loader:
        outputs_test = model(inputs_test)
        preds_test = torch.argmax(outputs_test, dim=2).cpu().numpy()
        targets_test = targets_test.cpu().numpy()

        all_preds_test.extend(preds_test)
        all_targets_test.extend(targets_test)

# Flatten predictions and targets for evaluation
all_preds_test = np.concatenate(all_preds_test, axis=0)
all_targets_test = np.concatenate(all_targets_test, axis=0)

# Calculate accuracy and macro F1 score on test data
test_accuracy = accuracy_score(all_targets_test, all_preds_test)
test_macro_f1 = f1_score(all_targets_test, all_preds_test, average='macro')

print(f'Final Test Accuracy: {test_accuracy:.4f}, Final Test Macro F1: {test_macro_f1:.4f}')


Final Test Accuracy: 0.9655, Final Test Macro F1: 0.6604
