In [1]:
import torch.nn as nn
import torch
from torch import nn, optim
import argparse
import random
import time
import pickle
from torch.utils.data import DataLoader
from tqdm import tqdm

In [3]:
# Reproducibility: feed one shared seed to Python's RNG and to PyTorch's
# CPU / current-CUDA-device / all-CUDA-devices generators, in that order.
seed = 5
for _seed_rng in (random.seed, torch.manual_seed,
                  torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

# cuDNN is switched off altogether; the deterministic/benchmark flags are
# still pinned so behaviour stays the same if it is ever re-enabled.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False

In [4]:
class SarcasmDetectionModel(nn.Module):
    """
    Bidirectional (optionally stacked) LSTM sentence classifier.

    The final hidden states of the last layer's two directions are
    concatenated, passed through dropout and a small MLP head, and returned
    as one raw logit per sentence (no sigmoid is applied here).

    Args:
        input_dim: size of each word vector.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used between LSTM layers and on the
            sentence representation before the classifier head.
    """

    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int, dropout: float):
        super().__init__()
        self.hidden_dim = hidden_dim
        # nn.LSTM only applies dropout *between* stacked layers; with a single
        # layer a non-zero value has no effect and merely triggers a warning.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True,
                            bidirectional=True, num_layers=num_layers,
                            dropout=dropout if num_layers > 1 else 0.0)
        # Classifier head: (2 * hidden_dim) -> 36 -> 1 logit.
        self.linear = nn.Sequential(
            nn.Linear(2 * hidden_dim, 36),
            nn.ReLU(),
            nn.Linear(36, 1)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """
        Args:
            text: padded batch of word vectors, shape (batch, seq_len, input_dim).
            text_lengths: true (unpadded) length of every sentence in the
                batch, as a list of ints or a 1-D CPU tensor.

        Returns:
            Tensor of shape (batch, 1) holding one raw logit per sentence.
        """
        # Pack directly in batch-first layout (matching how the LSTM is
        # declared) so padded positions never reach the recurrence.
        packed = nn.utils.rnn.pack_padded_sequence(
            text, text_lengths, batch_first=True, enforce_sorted=False)
        # Only the final hidden state is needed; the per-token outputs are
        # discarded, so the packed output is not unpacked.
        _, (hidden, _) = self.lstm(packed)
        # hidden[-2] / hidden[-1]: forward / backward direction of the last layer.
        sentence_repr = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.linear(self.dropout(sentence_repr))

In [5]:
import torch
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import os


def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 8 right out of 10 gives
    0.8, not 8).

    `preds` are raw logits: they are squashed with a sigmoid, rounded to
    0/1 and compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # float mask so the sum/division below yields a fraction
    hits = torch.eq(hard_labels, y).float()
    return hits.sum() / len(hits)


def epoch_time(start_time, end_time):
    """
    Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds, returned as `(minutes, seconds)`.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


def load_file(path: str):
    """
    Read the pickle file at `path` and return the object stored in it.
    """
    # NOTE(review): unpickling can execute arbitrary code - only point this
    # at files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def save_file_pickle(data: object, path: str):
    """
    Serialise `data` with pickle and write it to `path`, replacing any
    existing file at that location.
    """
    with open(path, "wb") as handle:
        pickle.dump(data, handle)


def get_max_sentence_length(d: dict) -> int:
    """
    Return the length of the longest value in `d`, or 0 when `d` is empty.
    """
    return max((len(sentence) for sentence in d.values()), default=0)


def pad_sentences(d: dict) -> tuple:
    """
    Zero-pad every sentence in `d` to the length of the longest one.

    Args:
        d: maps a sentence id to a sequence of per-word numpy vectors.

    Returns:
        A pair `(tensor_dict, len_dict)`:
          * tensor_dict - sentence id -> tensor with one row per position
            (max sentence length rows in total); rows past the real sentence
            are all zeros.
          * len_dict - sentence id -> original (unpadded) number of words.
    """
    max_len = get_max_sentence_length(d)

    # Build one reusable all-zero row whose width and dtype follow the data
    # rather than a hard-coded 200-d float32 vector. torch.stack copies its
    # inputs, so sharing the same row across positions/sentences is safe.
    # If no sentence contains any word, max_len is 0 and the row is never used.
    pad_vec = None
    for sen_vecs in d.values():
        if len(sen_vecs) > 0:
            first_vec = torch.from_numpy(sen_vecs[0])
            pad_vec = torch.zeros(first_vec.shape, dtype=first_vec.dtype)
            break

    tensor_dict = {}
    len_dict = {}
    for sen_id, sen_vecs in tqdm(d.items()):
        n_words = len(sen_vecs)
        len_dict[sen_id] = n_words
        rows = [torch.from_numpy(vec) for vec in sen_vecs]
        rows.extend([pad_vec] * (max_len - n_words))
        tensor_dict[sen_id] = torch.stack(rows)
    return tensor_dict, len_dict


def plot(x, y, plot_type, save_path):
    """
    Draw `y` against `x` as a line chart and save it as
    `train_<plot_type>.png` inside `save_path`.

    Args:
        x: x-axis values (labelled "Epoch").
        y: y-axis values, one per entry of `x`.
        plot_type: metric name; used for the y-axis label, the title and
            the output file name.
        save_path: directory to write into; created if missing. An empty
            string means the current working directory.
    """
    if save_path:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, f"train_{plot_type}.png")

    # Use a dedicated figure instead of pyplot's implicit current figure so
    # nothing drawn elsewhere is affected.
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(plot_type)
    ax.set_title(f"Training {plot_type} per Epoch")
    ax.grid()
    fig.savefig(path)
    # Close rather than clear: a cleared figure stays open, is rendered by the
    # notebook as an empty "Figure ... with 0 Axes" and keeps its memory.
    plt.close(fig)

In [6]:
from torch.utils.data import Dataset
import torch
import numpy as np


class SarcasmDataset(Dataset):
    """
    Map-style PyTorch dataset over three parallel, index-aligned sequences:
    sentences, their labels and their lengths.
    """

    def __init__(self, data, labels, lengths):
        """
        Args:
            data: indexable collection of sentences (one entry per sample).
            labels: indexable collection with one label per sample.
            lengths: indexable collection with one length per sample.
        """
        self.data, self.labels, self.lengths = data, labels, lengths

    def __len__(self):
        """Number of samples, taken from `data`."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Return `(sentence, label, length)` for `index`; the label is
        converted to an int64 tensor, the other two are returned as stored.
        """
        # Tensor indices are converted to plain Python values first.
        position = index.tolist() if torch.is_tensor(index) else index

        label = torch.from_numpy(np.array(self.labels[position])).long()
        sentence = self.data[position]
        length = self.lengths[position]
        return sentence, label, length


In [7]:
def train(model, iterator, optimizer, criterion, device):
    """
    Run one optimisation pass over `iterator`.

    Each batch is a `(sentences, labels, lengths)` triple. Sentences and
    labels are moved to `device`; lengths are handed to the model untouched.

    Returns:
        `(loss, accuracy)`, each the plain average of the per-batch values.
    """
    model.train()
    running_loss, running_acc = 0.0, 0.0
    for sen, labels, lengths in iterator:
        optimizer.zero_grad()

        sen = sen.to(device)
        targets = labels.to(device).float()

        logits = model(sen, lengths).squeeze(1)
        loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def evaluate(model, iterator, criterion, device):
    """
    Score `model` on `iterator` without updating its weights.

    The model is put in eval mode and gradients are disabled. Each batch is
    a `(sentences, labels, lengths)` triple; sentences and labels are moved
    to `device` and cast to float.

    Returns:
        `(loss, accuracy)`, each the plain average of the per-batch values.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0

    model.eval()
    with torch.no_grad():
        for sen, labels, lengths in iterator:
            sen = sen.to(device).float()
            labels = labels.to(device).float()
            # `lengths` are integer word counts and are passed through as-is,
            # exactly as in train(); casting them to float served no purpose.
            predictions = model(sen, lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# Filesystem locations.
# DATA_SAVE_PATH is prepended verbatim to the pickle file names (plain string
# concatenation), so a non-empty value needs its own trailing separator.
DATA_SAVE_PATH = ""
# Directory handed to plot() for the training-curve images.
PLOT_PATH = "plots/"

# Experiment settings (hard-coded here rather than parsed from the command line).

# Suffix of the projected-vector pickles; reassigned by the evaluation loop.
ATTRIBUTE = "NN"
bz = 64            # DataLoader batch size
dropout = 0.2      # dropout probability passed to the model
num_epochs = 10    # number of training passes over the data
num_layers = 2     # stacked LSTM layers
hidden_dim = 256   # LSTM hidden size per direction
lr = 1e-3          # Adam learning rate
input_dim = 200    # LSTM input size; presumably the word-vector width in the pickles - TODO confirm

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)





# Load the pickled inputs. The *_vec dicts map a sentence id to its sequence
# of word vectors; the *_label dicts are expected to use the same ids.
train_sen_vec = load_file(f"{DATA_SAVE_PATH}train_sen_2_vec.pkl")
train_sen_labels = load_file(f"{DATA_SAVE_PATH}train_sen_2_label.pkl")
test_sen_vec = load_file(f"{DATA_SAVE_PATH}test_sen_2_vec.pkl")
test_sen_labels = load_file(f"{DATA_SAVE_PATH}test_sen_2_label.pkl")


# Unpadded sentence lengths (pad_sentences() below also returns them, keyed by id).
train_lengths = [len(val) for val in train_sen_vec.values()]
test_lengths = [len(val) for val in test_sen_vec.values()]

train_data, train_length = pad_sentences(train_sen_vec)
test_data, test_length = pad_sentences(test_sen_vec)

# save_file_pickle(train_data, f"{DATA_SAVE_PATH}train_sen_tensor.pkl")
# save_file_pickle(train_length, f"{DATA_SAVE_PATH}train_len_tensor.pkl")
# save_file_pickle(test_data, f"{DATA_SAVE_PATH}test_sen_tensor.pkl")
# save_file_pickle(test_length, f"{DATA_SAVE_PATH}test_len_tensor.pkl")

# Pair vectors, labels and lengths by sentence id rather than by the
# insertion order of independently loaded dicts: a differing key order would
# otherwise silently attach labels to the wrong sentences. A sentence without
# a label now fails loudly with KeyError.
train_ids = list(train_data)
train_dataset = SarcasmDataset(
    [train_data[sen_id] for sen_id in train_ids],
    [train_sen_labels[sen_id] for sen_id in train_ids],
    [train_length[sen_id] for sen_id in train_ids])
test_ids = list(test_data)
test_dataset = SarcasmDataset(
    [test_data[sen_id] for sen_id in test_ids],
    [test_sen_labels[sen_id] for sen_id in test_ids],
    [test_length[sen_id] for sen_id in test_ids])

# Only the training batches are shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=bz, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=bz, shuffle=False)

model = SarcasmDetectionModel(input_dim, hidden_dim, num_layers, dropout)
print(model)

# Move the model to its device *before* building the optimizer, so the
# optimizer is guaranteed to hold references to the on-device parameters.
model = model.to(device)
# BCEWithLogitsLoss applies the sigmoid itself, so the model emits raw logits.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)


# Train for num_epochs passes, timing each one and keeping the per-epoch
# loss/accuracy history for the plotting cell further down.
print("start training on glove representation")
train_losses, train_accs = [], []
for epoch in range(num_epochs):
    start_time = time.time()
    # One full pass over the training data; train() returns the loss and
    # accuracy averaged over the batches of that pass.
    train_loss, train_acc = train(
        model, train_dataloader, optimizer, criterion, device)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # `epoch` is 0-based; report it 1-based. Accuracy is a fraction, shown as %.
    print(f"Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.3f}%")
print("finished training on glove representation")

cuda:0


100%|██████████| 22891/22891 [00:16<00:00, 1351.50it/s]
100%|██████████| 5722/5722 [00:00<00:00, 13777.54it/s]


SarcasmDetectionModel(
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (linear): Sequential(
    (0): Linear(in_features=512, out_features=36, bias=True)
    (1): ReLU()
    (2): Linear(in_features=36, out_features=1, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
)
start training on glove representation
Epoch: 1 | Epoch Time: 0m 15s
Train Loss: 0.418 | Train Acc: 80.369%
Epoch: 2 | Epoch Time: 0m 15s
Train Loss: 0.322 | Train Acc: 86.060%
Epoch: 3 | Epoch Time: 0m 15s
Train Loss: 0.265 | Train Acc: 88.777%
Epoch: 4 | Epoch Time: 0m 15s
Train Loss: 0.212 | Train Acc: 91.077%
Epoch: 5 | Epoch Time: 0m 16s
Train Loss: 0.152 | Train Acc: 93.979%
Epoch: 6 | Epoch Time: 0m 15s
Train Loss: 0.097 | Train Acc: 96.349%
Epoch: 7 | Epoch Time: 0m 15s
Train Loss: 0.061 | Train Acc: 97.789%
Epoch: 8 | Epoch Time: 0m 15s
Train Loss: 0.044 | Train Acc: 98.429%
Epoch: 9 | Epoch Time: 0m 15s
Train Loss: 0.038 | Train Acc: 98.669%
Epoch: 10 | Epoch 

In [8]:
# Save the training curves: one image for the loss history and one for the
# accuracy history, both plotted against the 0-based epoch index.
plot(list(range(num_epochs)), train_losses, "Loss", PLOT_PATH)
plot(list(range(num_epochs)), train_accs, "Accuracy", PLOT_PATH)


<Figure size 432x288 with 0 Axes>

In [10]:
# Baseline: score the trained model on the original test vectors.
print("evaluating the model on the glove representation")
test_loss, test_acc = evaluate(model, test_dataloader, criterion, device)
print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.3f}%")

# Re-score the same model on each projected variant of the test set.
# test_loss / test_acc are still overwritten on every pass (as before), so
# the per-attribute results are additionally kept in projected_results.
print("testing projected vectors")
projected_results = {}
for ATTRIBUTE in ["NN", "JJ"]:
    print(f"running on {ATTRIBUTE}")
    projected_sen_vec = load_file(
        f"{DATA_SAVE_PATH}test_sen_2_vec_{ATTRIBUTE}.pkl")
    projected_sen_labels = load_file(
        f"{DATA_SAVE_PATH}test_sen_2_label_{ATTRIBUTE}.pkl")

    # Lengths of the *projected* sentences (previously taken from the
    # unprojected test set by mistake).
    projected_lengths = [len(val) for val in projected_sen_vec.values()]

    projected_data, projected_length = pad_sentences(projected_sen_vec)

    # Pair vectors, labels and lengths by sentence id instead of relying on
    # two separately loaded dicts sharing the same insertion order.
    projected_ids = list(projected_data)
    projected_dataset = SarcasmDataset(
        [projected_data[sen_id] for sen_id in projected_ids],
        [projected_sen_labels[sen_id] for sen_id in projected_ids],
        [projected_length[sen_id] for sen_id in projected_ids])

    projected_dataloader = DataLoader(
        projected_dataset, batch_size=bz, shuffle=False)

    print("evaluating the model on the projected representation")
    test_loss, test_acc = evaluate(
        model, projected_dataloader, criterion, device)
    print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.3f}%")
    projected_results[ATTRIBUTE] = (test_loss, test_acc)


evaluating the model on the glove representation
Test Loss: 0.674 | Test Acc: 87.086%
testing projected vectors
running on NN


100%|██████████| 5722/5722 [00:00<00:00, 7079.00it/s]


evaluating the model on the projected representation
Test Loss: 0.796 | Test Acc: 84.396%
running on JJ


100%|██████████| 5722/5722 [00:00<00:00, 7700.18it/s]


evaluating the model on the projected representation
Test Loss: 1.180 | Test Acc: 80.413%
