In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Embedding
from torch.utils.data import Dataset, DataLoader
import torchtext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Preprocessing Helper Functions

In [2]:
def convert_to_one_hot(Y, num_emojis):
    """
    One-hot encodes an array of integer class labels.

    Arguments:
    Y -- numpy array of integer class indices; it is flattened before encoding
    num_emojis -- number of classes, i.e. the width of every one-hot row

    Returns:
    numpy array of shape (Y.size, num_emojis) holding a single 1.0 per row
    """
    identity = np.eye(num_emojis)
    flat_labels = Y.reshape(-1)
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return identity[flat_labels]


def read_emoji_csv(path):
    """
    Loads a header-less CSV file and returns its first two columns.

    Arguments:
    path -- location of the CSV file (handed straight to pandas.read_csv)

    Returns:
    X -- values of column 0 as a numpy array
    y -- values of column 1 as a numpy array
    """
    frame = pd.read_csv(path, header=None)
    X, y = (frame.iloc[:, column].values for column in (0, 1))
    return X, y


def read_glove_vecs(glove_file):
    """
    Parses a GloVe text file into vocabulary lookups and word vectors.

    Every non-empty line is expected to hold a word followed by the
    whitespace-separated components of its vector.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based rank in sorted order
    index_to_words -- dictionary mapping each index back to its word
    word_to_vec_map -- dictionary mapping each word to its vector (numpy float64 array)
    """
    word_to_vec_map = {}
    # Be explicit about the encoding: the vocabulary contains non-ASCII tokens,
    # which fail to decode on platforms whose default encoding is not UTF-8.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no word.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Indices start at 1, so index 0 is never assigned to a word.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map


def sentences_to_indices(X, word_to_index, max_len, unknown_index=None):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is such that it can be given to an embedding layer after conversion to an integer tensor.

    Each sentence is split on whitespace and lower-cased. Rows shorter than
    `max_len` are right-padded with 0; words beyond `max_len` are dropped.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of columns in the output; longer sentences are truncated to this many words
    unknown_index -- index to use for words missing from `word_to_index`.
                     When None (the default) a missing word raises KeyError.

    Returns:
    X_indices -- float32 array of indices corresponding to words in the sentences from X, of shape (m, max_len)

    Raises:
    KeyError -- if a word is not in `word_to_index` and `unknown_index` is None
    """

    m = X.shape[0]
    X_indices = np.zeros((m, max_len)).astype(np.float32)
    for row in range(m):
        # Truncate so an over-long sentence cannot index past the row.
        tokens = [word.lower() for word in X[row].split()][:max_len]
        for col, token in enumerate(tokens):
            if unknown_index is None:
                X_indices[row, col] = word_to_index[token]
            else:
                X_indices[row, col] = word_to_index.get(token, unknown_index)

    return X_indices


def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a frozen PyTorch Embedding layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (1-D numpy arrays of equal length)
    word_to_index -- dictionary mapping from words to their integer indices in the vocabulary

    Returns:
    embedding_layer -- torch.nn.Embedding whose row i holds the vector of the word with index i.
                       Rows whose index is not assigned to any word stay all-zero.
                       The weights are float32 and do not require gradients.

    Raises:
    ValueError -- if `word_to_vec_map` is empty (the embedding size cannot be inferred)
    KeyError -- if a word in `word_to_index` has no entry in `word_to_vec_map`
    """

    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    emb_dim = next(iter(word_to_vec_map.values())).shape[0]
    # Size the table by the largest index rather than the word count, so that
    # 1-based vocabularies (index 0 unused) fit as well as 0-based ones.
    vocab_size = max(word_to_index.values(), default=-1) + 1
    # float32 matches the default dtype of the other torch layers.
    emb_matrix = np.zeros((vocab_size, emb_dim), dtype=np.float32)

    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # from_pretrained copies the matrix into the layer; freeze=True disables gradients.
    embedding_layer = Embedding.from_pretrained(torch.from_numpy(emb_matrix), freeze=True)

    return embedding_layer


def torch_pretrained_embedding(name='6B', dim=50, freeze=True):
    """
    Builds an nn.Embedding layer from torchtext's pre-trained GloVe vectors.

    Arguments:
    name -- GloVe corpus identifier passed to torchtext.vocab.GloVe (default '6B')
    dim -- dimensionality of the vectors to load (default 50)
    freeze -- when True (default) the embedding weights are excluded from training

    Returns:
    embedding_layer -- nn.Embedding initialised from the GloVe vectors
    word2idx -- the GloVe object's `stoi` mapping (word -> index)
    idx2word -- the GloVe object's `itos` sequence (index -> word)
    """
    glove = torchtext.vocab.GloVe(name=name, dim=dim)
    embedding_layer = nn.Embedding.from_pretrained(glove.vectors, freeze=freeze)
    return embedding_layer, glove.stoi, glove.itos

# Model

In [9]:
class EmojiDataset(Dataset):
    """
    Map-style dataset pairing encoded sentences with their labels.

    Arguments:
    X_features -- numpy array of per-sentence features; first axis is the sample axis
    y_labels -- numpy array of labels with the same number of rows; stored as float32

    Raises:
    ValueError -- if the two arrays do not contain the same number of samples
    """

    def __init__(self, X_features, y_labels):
        # Fail at construction rather than with an IndexError halfway through an epoch.
        if len(X_features) != len(y_labels):
            raise ValueError(
                "X_features and y_labels must have the same length, got {} and {}".format(
                    len(X_features), len(y_labels)))
        self.x_features = torch.from_numpy(X_features)
        self.y_labels = torch.from_numpy(y_labels).type(torch.float32)
        self.len = len(X_features)

    def __len__(self):
        """Number of samples in the dataset."""
        return self.len

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        return self.x_features[index], self.y_labels[index]


def accuracy(outputs, labels):
    """
    Fraction of rows whose highest-scoring class matches the label.

    Arguments:
    outputs -- 2-D tensor of per-class scores, one row per sample
    labels -- 2-D tensor of the same layout; the arg-max of each row is the true class

    Returns:
    0-dimensional tensor holding correct / number of rows
    """
    predicted = torch.max(outputs, dim=1).indices
    expected = torch.argmax(labels, dim=1)
    n_correct = (predicted == expected).sum().item()
    return torch.tensor(n_correct / len(predicted))


class ModelHelperFunctions(nn.Module):
    """
    Training / evaluation helpers shared by the models in this notebook.

    Subclasses provide `forward`; batches are (inputs, labels) pairs where the
    labels have one row per sample and one column per class (they are fed to
    F.cross_entropy and arg-maxed by `accuracy`).
    """

    def _device(self):
        """Return the device holding the model's parameters (CPU if it has none)."""
        first_param = next(self.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def _to_device(self, batch):
        """Unpack a batch and move both tensors to the model's device."""
        inputs, labels = batch
        device = self._device()
        return inputs.to(device), labels.to(device)

    def training_step(self, batch):
        """
        Runs the forward pass on one batch and returns its cross-entropy loss.

        The batch follows the model to whatever device it lives on, instead of
        being pinned to one backend, and is prepared the same way as in `validate`.
        """
        lines, labels = self._to_device(batch)
        output = self(lines)
        loss = F.cross_entropy(output, labels)
        return loss

    def validate(self, batch):
        """
        Evaluates one batch.

        Returns:
        dictionary with 'test_loss' (detached loss tensor) and 'test_acc' (accuracy tensor)
        """
        line, labels = self._to_device(batch)
        output = self(line)
        loss = F.cross_entropy(output, labels)
        acc = accuracy(output, labels)
        return {'test_loss': loss.detach(), 'test_acc': acc}

    def validation_results(self, outputs):
        """
        Averages the per-batch dictionaries produced by `validate`.

        Returns:
        dictionary with 'test_loss' and 'test_acc' as Python floats (unweighted mean over batches)
        """
        batch_losses = [x['test_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['test_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'test_loss': epoch_loss.item(),
                'test_acc': epoch_acc.item()}

    def result_per_epoch(self, epoch, result):
        """Prints the train loss, test loss and test accuracy recorded for `epoch`."""
        print("Epoch [{}], train_loss: {:.4f}, test_loss: {:.4f}, test_acc: {:.4f}".format(
            epoch, result['train_loss'], result['test_loss'], result['test_acc']))


class GRUNet(ModelHelperFunctions):
    """
    Sentence classifier: embedding -> multi-layer GRU -> linear -> log-softmax.

    Arguments:
    input_dim -- accepted for interface compatibility; not used by the network
    hidden_dim -- size of the GRU hidden state
    num_classes -- number of output classes
    num_layers -- number of stacked GRU layers
    embedding_layer -- module mapping int64 word indices to vectors of size `embedding_dim`
    embedding_dim -- size of each embedded word vector (GRU input size)
    dropout -- accepted for interface compatibility; not currently applied to any layer
    """

    def __init__(self,
                 input_dim,
                 hidden_dim,
                 num_classes,
                 num_layers,
                 embedding_layer,
                 embedding_dim,
                 dropout=0.1):

        super(GRUNet, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.embeddings = embedding_layer
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True) # x must have shape: (batch_size, seq_len, input_size)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """
        Arguments:
        x -- tensor of word indices, one row per sentence; cast to int64 before the embedding lookup

        Returns:
        tensor of per-class log-probabilities, one row per sentence
        """
        embedded = self.embeddings(x.to(torch.int64))
        # No initial hidden state is passed: nn.GRU then starts from zeros sized
        # for the actual batch and placed on the input's device, so the model no
        # longer depends on a global batch size or on one specific backend.
        out, _ = self.gru(embedded) # shape: (batch_size, seq_length, hidden_size)
        # Classify from the output at the final sequence position.
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out
        

In [10]:
X_train, y_train = read_emoji_csv('data/train_emoji.csv')
X_test, y_test = read_emoji_csv('data/test_emoji.csv')

embedding_layer, word2idx, idx2word = torch_pretrained_embedding()

# Pad to the largest *word* count. The sentence with the most characters is not
# necessarily the one with the most words, and both splits are considered so
# that no test sentence is longer than the padded width.
max_length = max(len(sentence.split())
                 for split in (X_train, X_test)
                 for sentence in split)
X_train_indices = sentences_to_indices(X_train, word2idx, max_length)
X_test_indices = sentences_to_indices(X_test, word2idx, max_length)
y_train_oh = convert_to_one_hot(y_train, num_emojis = 30)
y_test_oh = convert_to_one_hot(y_test, num_emojis = 30)

In [11]:
# Sanity check of the sentence encoder on three short sentences padded to 7 words.
# Note: in the recorded output below, "the" is encoded as 0, the same value used for padding.
X1 = np.array([
    "how are you doing",
    "I am really busy at the moment",
    "we can take it from here",
])
X1_indices = sentences_to_indices(X1, word2idx, 7)
print("X1 =", X1)
print("X1_indices =\n", X1_indices)

X1 = ['how are you doing' 'I am really busy at the moment'
 'we can take it from here']
X1_indices =
 [[ 197.   32.   81.  914.    0.    0.    0.]
 [  41.  913.  588. 4259.   22.    0. 1600.]
 [  53.   86.  190.   20.   25.  187.    0.]]


In [19]:
# Hyper-parameters
embedding_dim = 50            # must match the dimensionality of the loaded embedding vectors
sequence_length = max_length
hidden_size = 128
num_classes = 30
batch_size = 3
num_epochs = 25
learning_rate = 0.001
num_layers = 2

# Use Apple's Metal backend when it is available, otherwise fall back to the CPU
# so that creating the model does not fail on other machines.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = GRUNet(input_dim=sequence_length,
               hidden_dim=hidden_size,
               num_classes=num_classes,
               num_layers=num_layers,
               embedding_layer=embedding_layer,
               embedding_dim=embedding_dim).to(device)

# Kept for any cell that refers to it; `fit` itself obtains the loss from model.training_step.
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


@torch.no_grad()
def evaluate(model, test_loader):
    """
    Scores `model` on every batch of `test_loader` with gradients disabled.

    Arguments:
    model -- object exposing eval(), validate(batch) and validation_results(outputs)
    test_loader -- iterable of batches

    Returns:
    whatever model.validation_results returns for the list of per-batch results
    """
    model.eval()
    per_batch = []
    for batch in test_loader:
        per_batch.append(model.validate(batch))
    return model.validation_results(per_batch)


def fit(model, num_epochs, train_loader, test_loader, lr, optimizer):
    """
    Trains `model` for `num_epochs` epochs, evaluating on `test_loader` after each one.

    Arguments:
    model -- model exposing train(), training_step(batch) and result_per_epoch(epoch, result)
    num_epochs -- number of passes over `train_loader`
    train_loader -- iterable of training batches
    test_loader -- iterable of evaluation batches, passed to `evaluate`
    lr -- not used inside this function; the optimizer passed in already carries its learning rate
    optimizer -- optimizer stepping the model's parameters

    Returns:
    history -- list with one result dictionary per epoch ('train_loss' plus the keys returned by `evaluate`)
    epoch_list -- list of the epoch numbers, starting at 0
    """
    history = []
    epoch_list = []

    for epoch in range(num_epochs):
        epoch_list.append(epoch)

        # Training Phase
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Keep only the value: storing the graph-attached tensor would hold
            # autograd references for every batch until the epoch ends.
            train_losses.append(loss.detach())

        # Testing Phase
        result = evaluate(model, test_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.result_per_epoch(epoch, result)
        history.append(result)

    return history, epoch_list


def plot_results(history, epoch_lst):
    """
    Plots test accuracy (top panel) and training / test loss (bottom panel) per epoch.

    Arguments:
    history -- list of per-epoch dictionaries with 'test_acc', 'test_loss' and, optionally, 'train_loss'
    epoch_lst -- x-axis values, one per entry of `history`
    """
    test_accuracy = [entry['test_acc'] for entry in history]
    test_loss = [entry['test_loss'] for entry in history]
    train_loss = [entry.get('train_loss') for entry in history]

    fig, (acc_ax, loss_ax) = plt.subplots(2, sharex=True)

    acc_ax.plot(epoch_lst, test_accuracy)
    acc_ax.set_title('Validation Accuracy vs. No. of epochs')
    acc_ax.set(ylabel='Accuracy')

    # Plot order must match the legend entries: training first, then validation.
    loss_ax.plot(epoch_lst, train_loss)
    loss_ax.plot(epoch_lst, test_loss)
    loss_ax.set_title('Losses vs. No. of epochs')
    loss_ax.set(xlabel='Epochs', ylabel='Loss')
    loss_ax.legend(['Training', 'Validation'])

    plt.show()

In [20]:
# Wrap the encoded sentences and one-hot labels, then batch them.
# Only the training loader shuffles; the test loader keeps the file order.
train_data = EmojiDataset(X_train_indices, y_train_oh)
test_data = EmojiDataset(X_test_indices, y_test_oh)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
print("Lines in training data:", len(train_data))
print("Lines in testing data:", len(test_data))

Lines in training data: 189
Lines in testing data: 54


In [21]:
# Train the model; `history` holds one result dictionary per epoch.
history, epoch_lst = fit(model, num_epochs, train_loader, test_loader,
                         learning_rate, optimizer)

Epoch [0], train_loss: 3.4234, test_loss: 3.3385, test_acc: 0.1296
Epoch [1], train_loss: 3.3517, test_loss: 3.3798, test_acc: 0.0370
Epoch [2], train_loss: 3.2723, test_loss: 3.3528, test_acc: 0.0926
Epoch [3], train_loss: 3.1368, test_loss: 3.1412, test_acc: 0.1852
Epoch [4], train_loss: 2.8907, test_loss: 2.8723, test_acc: 0.2407
Epoch [5], train_loss: 2.4894, test_loss: 2.8212, test_acc: 0.2778
Epoch [6], train_loss: 2.2429, test_loss: 2.7424, test_acc: 0.2963
Epoch [7], train_loss: 1.9578, test_loss: 2.7710, test_acc: 0.2407
Epoch [8], train_loss: 1.7511, test_loss: 3.0040, test_acc: 0.2778
Epoch [9], train_loss: 1.6186, test_loss: 2.9047, test_acc: 0.2593
Epoch [10], train_loss: 1.3700, test_loss: 2.8181, test_acc: 0.2963
Epoch [11], train_loss: 1.1644, test_loss: 2.7615, test_acc: 0.3333
Epoch [12], train_loss: 0.9918, test_loss: 2.9186, test_acc: 0.2778
Epoch [13], train_loss: 0.7648, test_loss: 3.0563, test_acc: 0.2593
Epoch [14], train_loss: 0.5746, test_loss: 3.1308, test_ac