In [1]:
EPOCHS = 25
BATCH_SIZE = 32
N_EVAL = 100
MAX_SENT_LENGTH = 50 # maximum word length of sentence inputs

In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# method used to import constants
import sys
import os

sys.path.insert(1, os.path.join(sys.path[0], '..'))
import constants

class StartingDataset(torch.utils.data.Dataset):
    """
    Dataset
    """

    def __init__(self, data_path="/Users/Terru/Desktop/UCLA/ACM AI/Projects/train.csv"):
        '''
        data_path (str): path for the csv file that contains the data that you want to use
        '''

        # Import data
        self.df = pd.read_csv(data_path)

        # Generates embeddings
        self.words = []
        self.word2idx = {}
        self.embedding = {}
        self.idx = 0

        with open("/Users/Terru/Desktop/UCLA/ACM AI/Projects/glove.6B/glove.6B.300d.txt") as f:
            for l in f:
                line = l.split()
                word = line[0]
                vector = np.array([float(number) for number in line[1:]])
                self.embedding[word] = vector
                self.words.append(word)

        # TO-DO: possibly stem embeddings!

    # Returns an instance from the dataset
    def __getitem__(self, i):
        '''
        i (int): the desired instance of the dataset
        '''
        # return the ith sample's list of embeddings for each word and label

        text = self.df.iloc[i, 1]

        # basic preprocessing——case, removing punctuation
        text = text.lower().split()
        text = [word.translate(str.maketrans('', '', string.punctuation)) for word in text]

        # lemmatizing
        lemma = WordNetLemmatizer()
        text = [lemma.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]

        # TO-DO: Using a map/dict to remove contractions?

        # Generating embeddings
        embeddings = [self.embedding[i] for i in text[:constants.MAX_SENT_LENGTH] if i in self.embedding]
        if len(embeddings) < constants.MAX_SENT_LENGTH:
            add = [np.zeros(300) for i in range(constants.MAX_SENT_LENGTH - len(embeddings))]
            embeddings.extend(add)

        # changes to PyTorch tensors
        embeddings = torch.from_numpy(np.asarray(embeddings))
        label = torch.from_numpy(np.asarray(self.df.iloc[i, 2]))


        return embeddings, label

    # Returns the size of the dataset
    def __len__(self):
        return len(self.df)


# data = StartingDataset()
# # print(data.df.head())
# # print(len(data))
# # print(data.embedding["the"])
# print(data[2])
# print(len(data[2][0]))

In [41]:
import torch
import torch.nn as nn

class StartingNetwork(torch.nn.Module):
    """
    Network
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size, num_layers):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.num_layers = num_layers

        # LSTM taking word embeddings as inputs and outputting hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True)
        # optional: dropout

        self.linear = nn.Linear(self.hidden_dim*2, 64)
        # Linear layer maps from hidden state space to output space
        self.fc = nn.Linear(64, output_size)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        output, _ = self.lstm(x)
        output = output[:, -1, :]
        intermed = self.dropout(torch.sigmoid(self.linear(output)))
        logits = self.fc(intermed)
        logits = logits.squeeze()
        return logits

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

def starting_train(train_dataset, val_dataset, model, hyperparameters, n_eval, device):
    """
    Trains and evaluates a model.
    Args:
        train_dataset:   PyTorch dataset containing training data.
        val_dataset:     PyTorch dataset containing validation data.
        model:           PyTorch model to be trained.
        hyperparameters: Dictionary containing hyperparameters.
        n_eval:          Interval at which we evaluate our model.
    """

    # Get keyword arguments
    batch_size, epochs = hyperparameters["batch_size"], hyperparameters["epochs"]

    # Initialize dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=True
    )

    # Initalize optimizer (for gradient descent) and loss function
    optimizer = optim.AdamW(model.parameters())
    loss_fn = nn.BCEWithLogitsLoss()

    # change model parameters to float
    model = model.float()

    step = 0
    writer = SummaryWriter() # tensorboard
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1} of {epochs}")

        # Loop over each batch in the dataset
        for batch in tqdm(train_loader):

            texts, labels = batch

            # Move to GPU if available
            texts = texts.to(device)
            labels = labels.to(device)

            # TODO: Forward propagate
            outputs = model(texts.float())
            
            # converts labels to float32 so it's compatible with model output
            labels = labels.float()
            
            # TODO: Backpropagation and gradient descent
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Periodically evaluate our model + log to Tensorboard
            if step % n_eval == 0 and step > 0:
                # TODO:
                writer.add_scalar("Training loss: ", loss.item(), epoch+1)
                # Compute training loss and accuracy.
                # Log the results to Tensorboard.

                # TODO:
                # Compute validation loss and accuracy.
                # Log the results to Tensorboard.
                # Don't forget to turn off gradient calculations!
                writer.add_scalar("Validation accuracy: ", evaluate(val_loader, model, loss_fn, device), epoch+1)
                # NOT DONE. ^ depending on what our evaluate() function actually returns we may need to subset it
                # such as [0]
                evaluate(val_loader, model, loss_fn, device) # testing it here

            writer.flush() # sends output
            step += 1

        print("Epoch: ", epoch+1, "Loss: ", loss.item()) # displays loss of last batch for every epoch
    writer.close()


# def compute_accuracy(outputs, labels):
#     """
#     Computes the accuracy of a model's predictions.
#     Example input:
#         outputs: [0.7, 0.9, 0.3, 0.2]
#         labels:  [1, 1, 0, 1]
#     Example output:
#         0.75
#     """
#
#     n_correct = (torch.round(outputs) == labels).sum().item()
#     n_total = len(outputs)
#     return n_correct / n_total


def evaluate(val_loader, model, loss_fn, device):
    """
    Computes the loss and accuracy of a model on the validation dataset.
    TODO!
    """
    model.eval()

    model = model.to(device)

    correct = 0
    total = 0
    for batch in val_loader:
        texts, labels = batch

        # pass to GPU if available
        texts = texts.to(device)
        labels = labels.to(device)

        # classical accuracy:
        predictions = model(texts).argmax(axis=1)
        correct += (predictions == labels).sum().item()
        total += len(labels)

    print("\n Accuracy: ", 100*(correct/total), "%")

    # TO-DO: calculate ROC accuracy. or separate accuracies for sincere and insincere.
    # sk_learn roc_curve
    # further: try all types like F-1 score and look for anomalies/things to note
    pass

## THIS IS THE CELL YOU ONLY RUN ONCE. (change data_path of course to your data!)

In [6]:
data_path = "/Users/Terru/Desktop/UCLA/ACM AI/Projects/train.csv"

train_dataset = StartingDataset(data_path)
val_dataset = StartingDataset(data_path)

In [43]:
import os
import torch

def main():
    # Get command line arguments
    hyperparameters = {"epochs": constants.EPOCHS, "batch_size": constants.BATCH_SIZE}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("Epochs:", constants.EPOCHS)
    print("Batch size:", constants.BATCH_SIZE)

    # Initalize dataset and model. Then train the model!
    # data_path = "/Users/Terru/Desktop/UCLA/ACM AI/Projects/train.csv"

    # TO-DO: train, test, (val) split ofc

    # train_dataset = StartingDataset(data_path)
    # val_dataset = StartingDataset(data_path)
    model = StartingNetwork(300, 1024, 400000, 1, 1)
    # hyperparameters more or less arbitrary. At least they are for now, I just randomly set them
    # vocab size of glove.6B that we're using is 400K

    starting_train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        model=model,
        hyperparameters=hyperparameters,
        n_eval=constants.N_EVAL,
        device=device,
    )

if __name__ == "__main__":
    main()

Epochs: 25
Batch size: 32
Epoch 1 of 25


  0%|                                     | 48/40817 [01:57<27:36:26,  2.44s/it]


KeyboardInterrupt: 