In [13]:
import pandas as pd
import re
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [40]:
### DOWNLOAD TOTAL DATASET
## [BOX SHOULD TAKE APPROX 25 SECONDS TO RUN]

# import training data
df = pd.read_excel(r'C:\Users\kailf\OneDrive\Documents\2023_Summer\NLP_project\NLPed_dataset2.xlsx')


# function to convert embeddings from string to np.array
def string_matrix_to_array(input_string):

    # Remove leading and trailing whitespace
    input_string = input_string.strip()
    
    # Remove brackets from the start and end
    input_string = input_string.lstrip('[').rstrip(']')
    
    # Replace line breaks, double spaces and triple spaces with single spaces
    input_string = input_string.replace('\n', ' ')
    input_string = input_string.replace('  ', ' ')
    input_string = input_string.replace('   ', ' ')

    # # replace '] [' with ']['
    # input_string = input_string.replace('] [', '][')
    
    # Split the string into rows based on ']' character
    rows = input_string.split(']')
    
    # Remove any empty strings from the split result
    rows = [row.strip() for row in rows if row.strip()]
    
    # Split each row into values and convert to floats
    parsed_rows = []
    for row in rows:
        row = row.strip('[').strip()
        values = [float(val) for val in row.split() if val.strip()]
        parsed_rows.append(values)
    
    # Convert the list of rows into a 2D NumPy array
    result_array = np.array(parsed_rows)
    
    return result_array


# change word embeddings from type string to type np.array
df['word embeddings'] = df['word embeddings'].apply(string_matrix_to_array)


# create test dataframe from df taking one in every 8 rows and remove from df
test_df = df.iloc[::8, :]
df = df.drop(test_df.index)
df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)



In [66]:
### CREATE DATASET AND DATALOADERS

class MyDataset(Dataset):
    def __init__(self, df):
        self.data = df['word embeddings'].to_numpy()
        self.labels = df['airline_sentiment'].to_numpy()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


# Custom collate function for handling variable-length sequences
def custom_collate(batch):
    # Get maximum sequence length in the batch
    max_seq_len = max([len(seq) for seq, _ in batch])
    
    # Pad sequences to the maximum length
    padded_batch = []
    labels = []
    
    for seq, label in batch:
        pad_length = max_seq_len - len(seq)
        padded_seq = np.pad(seq, ((0, pad_length), (0, 0)), 'constant')
        padded_batch.append(padded_seq)
        labels.append(label)
    
    #### DONT KNOW WHICH ONE TO RETURN
    # return np.array(padded_batch), np.array(labels)
    return torch.tensor(padded_batch), torch.tensor(labels)



batch_size = 32

train_dataset = MyDataset(df)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)



In [73]:
### CREATE MODEL

class LSTMModel(nn.Module):
    def __init__(self, input_d, hidden_d, layer_d, output_d):
        super(LSTMModel, self).__init__()
        
        self.hidden_dim = hidden_d
        self.layer_dim = layer_d

        # LSTM model 
        self.lstm = nn.LSTM(input_d, hidden_d, layer_d, batch_first=True) 

        self.fc = nn.Linear(hidden_d, output_d)

    def forward(self, x):
    
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        out, _ = self.lstm(x, (h0.detach(), c0.detach()))

        out = self.fc(out[:, -1, :]) 
        return out


input_dim = 96 # because each word embedding is of length 96
output_dim = 3 # because we are classifying into one of 3

hidden_dim = 128 # start at 128, explore more later
layer_dim = 3 # start with 1, increase more later


model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)

error = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)





### TRAIN MODEL

num_epochs = 10

# change model to float dtype
model = model.float()

for epoch in range(num_epochs):

    # iterating through batches
    for inputs, labels in train_loader:

        # Clear gradients
        optimizer.zero_grad()
        
        # Forward pass with float dtype
        outputs = model(inputs.float())
        
        # Compute loss
        loss = error(outputs, labels)
        
        # Backpropagation
        loss.backward()
        
        # Update weights
        optimizer.step()
    
    # Print or log the loss for monitoring training progress
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/10], Loss: 1.2470204830169678
Epoch [2/10], Loss: 0.8078147768974304
Epoch [3/10], Loss: 0.7380357384681702
Epoch [4/10], Loss: 1.2568457126617432
Epoch [5/10], Loss: 0.6585753560066223
Epoch [6/10], Loss: 0.660285472869873
Epoch [7/10], Loss: 1.257710576057434
Epoch [8/10], Loss: 1.5855109691619873
Epoch [9/10], Loss: 0.6147798895835876
Epoch [10/10], Loss: 1.3316705226898193


In [74]:
# test model
test_dataset = MyDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

# change model to eval mode
model.eval()

# keep track of correct predictions
correct = 0
total = 0

# iterate through test dataset
for inputs, labels in test_loader:

    # Forward pass
    outputs = model(inputs.float())
    
    # Get predictions from the maximum value
    _, predicted_labels = torch.max(outputs, 1)
    
    # Total number of labels in the current batch
    total += labels.size(0)
    
    # Total correct predictions
    for label, prediction in zip(labels, predicted_labels):
        if label == prediction:
            correct += 1

# Accuracy
accuracy = 100 * correct / total
print(f'Accuracy: {round(accuracy, 2)}%')
print(f'Correct: {correct} out of {total}')

correct = 0
total = 0

for inputs, labels in train_loader:

    # Forward pass
    outputs = model(inputs.float())
    
    # Get predictions from the maximum value
    _, predicted_labels = torch.max(outputs, 1)
    
    # Total number of labels in the current batch
    total += labels.size(0)
    
    # Total correct predictions
    for label, prediction in zip(labels, predicted_labels):
        if label == prediction:
            correct += 1

accuracy = 100 * correct / total
print(f'Accuracy: {round(accuracy, 2)}%')
print(f'Correct: {correct} out of {total}')

# 1 epoch gives 54.86

Accuracy: 54.86%
Correct: 553 out of 1008
Accuracy: 53.55%
Correct: 3775 out of 7049
