In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import gensim.downloader as api
import pandas as pd
from torch import flatten
from sklearn.metrics import accuracy_score, f1_score

In [49]:
train_data_0_path = "/home3/luharj/DLNLP/Assignment1/data/train/ClassificationDataset-train0.xlsx"
valid_data_0_path = "/home3/luharj/DLNLP/Assignment1/data/valid/ClassificationDataset-valid0.xlsx"

In [50]:
glove_model = api.load("glove-wiki-gigaword-300")

In [51]:
train_df_0 = pd.read_excel(train_data_0_path)
valid_df_0 = pd.read_excel(valid_data_0_path)

In [78]:
class LinearLayers(nn.Module) :
    def __init__(self) :
        super(LinearLayers, self).__init__()
        self.fc1 = nn.Linear(in_features=300, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=16)
        self.fc5 = nn.Linear(in_features=16, out_features=3)
        self.softmax = nn.Softmax(dim = 1)
        self.relu = nn.ReLU()
    def forward(self, x) :
        x = x.squeeze()
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.softmax(self.fc5(x))
        return x

In [66]:
def get_labels(word) :
    if word == "negative" :
        return 0
    if word == "neutral" :
        return 1
    else:
        return 2

In [67]:
def get_embedding(sentences) :
    words = sentences.split(" ")
    embeddings = np.zeros(300)
    for word in words:
        if word in glove_model:
            embeddings+=glove_model[word]
        else :
            embeddings+=np.random.uniform(-1,1,300)
    # print(embeddings)
    return np.array(embeddings)

In [68]:
def get_data(data_frame) :
    training_data = []
    training_labels = []
    for index, row in data_frame.iterrows() :
        data = get_embedding(row[1])
        labels = get_labels(row[0])
        training_data.append(data)
        training_labels.append(labels)
    return np.array(training_data), np.array(training_labels)

In [69]:
training_data, training_labels = get_data(data_frame=train_df_0)
validation_data, validation_labels = get_data(data_frame=valid_df_0)

In [70]:
X = torch.Tensor(training_data)
y = torch.LongTensor(training_labels)

In [71]:
X_train = X
y_train = y
X_val = torch.Tensor(validation_data)
y_val = torch.LongTensor(validation_labels)

In [72]:
X_train = X_train.unsqueeze(1)
X_val = X_val.unsqueeze(1)

In [73]:
print(X_train.shape, X_val.shape)

torch.Size([2798, 1, 300]) torch.Size([1203, 1, 300])


In [74]:
batch_size = 64
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [80]:
model = LinearLayers()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [81]:
epochs = 20  # You can adjust the number of epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LinearLayers(
  (fc1): Linear(in_features=300, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=16, bias=True)
  (fc5): Linear(in_features=16, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
  (relu): ReLU()
)

In [82]:
# Training and validation loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    true_labels = []
    predicted_labels = []

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        output_labels = torch.argmax(outputs, dim=1)

        # Append true and predicted labels
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(output_labels.cpu().numpy())

        running_loss += loss.item()

    # Calculate accuracy, macro F1, and micro F1 on the training set
    accuracy = accuracy_score(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    micro_f1 = f1_score(true_labels, predicted_labels, average='micro')

    # Initialize variables for validation
    val_true_labels = []
    val_predicted_labels = []
    val_running_loss = 0.0

    model.eval()  # Switch to evaluation mode for validation
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

            val_outputs = model(val_inputs)
            val_loss = criterion(val_outputs, val_labels)

            val_output_labels = torch.argmax(val_outputs, dim=1)

            # Append true and predicted labels for validation
            val_true_labels.extend(val_labels.cpu().numpy())
            val_predicted_labels.extend(val_output_labels.cpu().numpy())

            val_running_loss += val_loss.item()

    # Calculate accuracy, macro F1, and micro F1 on the validation set
    val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)
    val_macro_f1 = f1_score(val_true_labels, val_predicted_labels, average='macro')
    val_micro_f1 = f1_score(val_true_labels, val_predicted_labels, average='micro')

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {running_loss / len(train_loader)}, Accuracy: {accuracy}, Macro F1: {macro_f1}, Micro F1: {micro_f1}, Validation Loss: {val_running_loss / len(val_loader)}, Val Accuracy: {val_accuracy}, Val Macro F1: {val_macro_f1}, Val Micro F1: {val_micro_f1}")

Epoch 1/20, Train Loss: 0.8968700820749457, Accuracy: 0.6644031451036455, Macro F1: 0.3280185527608208, Micro F1: 0.6644031451036455, Validation Loss: 0.8400415684047499, Val Accuracy: 0.7073981712385703, Val Macro F1: 0.405248522766771, Val Micro F1: 0.7073981712385703
Epoch 2/20, Train Loss: 0.8465028608387167, Accuracy: 0.6994281629735526, Macro F1: 0.4128693137202201, Micro F1: 0.6994281629735526, Validation Loss: 0.8250957915657445, Val Accuracy: 0.7281795511221946, Val Macro F1: 0.42979976177582446, Val Micro F1: 0.7281795511221947
Epoch 3/20, Train Loss: 0.8149476389993321, Accuracy: 0.7312365975696926, Macro F1: 0.44869830873173494, Micro F1: 0.7312365975696926, Validation Loss: 0.8108063277445341, Val Accuracy: 0.7381546134663342, Val Macro F1: 0.44630743442328996, Val Micro F1: 0.7381546134663342
Epoch 4/20, Train Loss: 0.7923700199885801, Accuracy: 0.7551822730521801, Macro F1: 0.4758839510540886, Micro F1: 0.7551822730521801, Validation Loss: 0.8128797160951715, Val Accurac

In [83]:
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    true_labels = []
    predicted_labels = []
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        output_labels = torch.argmax(outputs, dim=1)
        
        # Append true and predicted labels
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(output_labels.cpu().numpy())

        running_loss += loss.item()

    accuracy = accuracy_score(true_labels, predicted_labels)
    
    # Calculate macro and micro F1 scores for the epoch
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
    
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(train_loader)}, Accuracy: {accuracy}, Macro F1: {macro_f1}, Micro F1: {micro_f1}")

Epoch 1/20, Loss: 0.6909252865747972, Accuracy: 0.859542530378842, Macro F1: 0.5663971699163878, Micro F1: 0.859542530378842
Epoch 2/20, Loss: 0.7038447044112466, Accuracy: 0.8470335954253038, Macro F1: 0.5560958575786724, Micro F1: 0.8470335954253039
Epoch 3/20, Loss: 0.7163717191327702, Accuracy: 0.8327376697641172, Macro F1: 0.5455681496115501, Micro F1: 0.8327376697641172
Epoch 4/20, Loss: 0.6934417147528041, Accuracy: 0.8573981415296641, Macro F1: 0.565673359069809, Micro F1: 0.8573981415296642
Epoch 5/20, Loss: 0.6861702881076119, Accuracy: 0.8652609006433166, Macro F1: 0.5719845836301533, Micro F1: 0.8652609006433166
Epoch 6/20, Loss: 0.6839803944934498, Accuracy: 0.868120085775554, Macro F1: 0.5743842163459191, Micro F1: 0.868120085775554
Epoch 7/20, Loss: 0.6882735179229216, Accuracy: 0.8631165117941386, Macro F1: 0.5702971837950866, Micro F1: 0.8631165117941386
Epoch 8/20, Loss: 0.6782289648597891, Accuracy: 0.8724088634739099, Macro F1: 0.5781680712552494, Micro F1: 0.872408