In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from gensim.models import FastText
from gensim.models.fasttext import load_facebook_vectors

import fasttext
import io

import numpy as np
import pandas as pd
from collections import Counter
from itertools import chain

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

def long_tensor(arr):
    return(torch.tensor(arr, dtype=torch.long).to(device))

In [39]:
# Dataset with text prepoc and presentation code
class Textsequencer(torch.utils.data.Dataset):
    def __init__(self, words):
        # TO DO pre-processing here
        self.sequence_length = None
        self.words = words
        word_counts = Counter(words)
        self.vocab = sorted(word_counts, key=word_counts.get, reverse=True)
        self.vocab_size = len(self.vocab)

        self.index_to_word = {index: word for index, word in enumerate(self.vocab)}
        self.word_to_index = {word: index for index, word in enumerate(self.vocab)}
        self.words_indexes = [self.word_to_index[w] for w in self.words]
    
    # text processing here
    def get_skipgrams(self, context_size):

        skipgrams = []
        for i in range(context_size, len(self.words) - context_size):
            context = (
                [self.words[i - j - 1] for j in range(context_size)],
                [self.words[i + j + 1] for j in range(context_size)]
            )
            context[0].reverse()
            context = context[0] + context[1]
            target = self.words[i]
            skipgrams.append((context, target))
        return(skipgrams)

    # get the context tensors for training
    def index_vector(self, context):
        if isinstance(context, str):
            context = [context]
        return(context, [self.word_to_index[w] for w in context])
    
    # to present to the data loader
    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        if not self.sequence_length:
            raise(AttributeError("Set the sequence_length property to generate sequences of length n"))
        return (
            self.words[index:index+self.sequence_length], # word sequence x
            self.words_indexes[index:index+self.sequence_length], # word index sequence x
            self.words[index+1:index+self.sequence_length+1], # word sequence y
            self.words_indexes[index+1:index+self.sequence_length+1], # word index sequence y
        )

In [3]:
# here set up a CBOW model to learn embeddings with pytorch
class CBOW(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim, hidden_dim):
        super(CBOW, self).__init__()
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        # nn.Embedding is a matrix of learnable weights, one row per word
        self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
        # embedding vectors to be projected into a hidden layer, context size * 2 because skip gram is bef-y-aft
        self.linear1 = nn.Linear(self.context_size * 2 * self.embedding_dim, hidden_dim)
        # finally fully connect and softmax to output is the target y in bef-y-aft
        self.linear2 = nn.Linear(hidden_dim, self.vocab_size) # fully connect hidden layer to (log) softmax

    def forward(self, inputs):
        out = self.embeddings(inputs)
        out = out.view(1, -1) # multiple words in the skipgram.
        out = self.linear1(out)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.log_softmax(out, dim=1) # log probabilities
        return(out)

# here set up a CBOW model to learn embeddings with pytorch
class CBOW_conv(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim, hidden_dim):
        super(CBOW, self).__init__()
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        # nn.Embedding is a matrix of learnable weights, one row per word
        self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
        # embedding vectors pooled, elementwise
        
        
        self.linear1 = nn.Linear(self.context_size * 2 * self.embedding_dim, hidden_dim)
        # finally fully connect and softmax to output is the target y in bef-y-aft
        self.linear2 = nn.Linear(hidden_dim, self.vocab_size) # fully connect hidden layer to (log) softmax

    def forward(self, inputs):
        out = self.embeddings(inputs)
        print(out.shape)
        out = out.view(1, -1) # multiple words in the skipgram.
        print(out.shape)
        out = self.linear1(out)
        print(out.shape)
        out = F.relu(out)
        out = self.linear2(out)
        print(out.shape)
        out = F.log_softmax(out, dim=1) # log probabilities
        return(out)    
    

# traning this type of model
def learn_embeddings(model, loss_function, optimiser, text_sequencer, context_size, epochs):
    
    model.to(device)
    model.train()
    
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for batch, (context_words, target_word) in enumerate(text_sequencer.get_skipgrams(context_size)):

            # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
            # into integer indices and wrap them in tensors)
            _, context_idx = text_sequencer.index_vector(context_words)
            _, target_id = text_sequencer.index_vector(target_word)
            
            # Step 2. Recall that torch *accumulates* gradients. Before passing in a
            # new instance, you need to zero out the gradients from the old
            # instance
            model.zero_grad()

            # Step 3. Run the forward pass, getting log probabilities over next
            # words
            log_probs = model(long_tensor(context_idx))

            # Step 4. Compute your loss function. (Again, Torch wants the target
            # word wrapped in a tensor)
            loss = loss_function(log_probs, long_tensor(target_id))

            # Step 5. Do the backward pass and update the gradient
            loss.backward()
            optimiser.step()

            # Get the Python number from a 1-element Tensor by calling tensor.item()
            total_loss += loss.item()
            if batch % 10000 == 0:
                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
        losses.append(total_loss)
        
    model.eval()
    return(model, losses)

# similarity of two words using the pytorch embeddings
def test(model, word_1, word_2, word_to_ix):
    # test word similarity
    word_1_vec = model.embeddings.weight[word_to_ix[word_1]]
    word_2_vec = model.embeddings.weight[word_to_ix[word_2]]
    
    dot_product = word_1_vec.dot(word_2_vec)
    norm = torch.norm(word_1_vec) * torch.norm(word_2_vec)
    word_similarity = (dot_product / norm)
    print(f"Similarity between {word_1} & {word_2} : {word_similarity}")

In [9]:
# simple LSTM model: inject with a pre-trained embedding : lstm -> fully connected
class emb_LSTM(nn.Module):
    def __init__(self,
                 vocab_size, input_size, # number of words and embedding dimension
                 lstm_layers, lstm_size, # lstm layer parameters
                 dropout): # a further hyper-param
        super(emb_LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.input_size = input_size
        self.lstm_size = lstm_size
        self.lstm_layers = lstm_layers
        self.dropout = dropout

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.lstm_size,
            num_layers=self.lstm_layers,
            dropout=self.dropout,
        )
        self.fully_connected = nn.Linear(self.lstm_size, vocab_size)
    
    def forward(self, input_vec, prev_state):
        out, state = self.lstm(input_vec, prev_state)
        out = self.fully_connected(output)
        out = F.log_softmax(out, dim=1) # log probabilities predicting the target word
        return(out, state)

# a standalone function to generate the initial state tensors (zeros)
def get_init_state(lstm_layers, sequence_length, lstm_size):
    init_state = torch.zeros(lstm_layers, sequence_length, lstm_size)
    return(init_state.to(device), init_state.to(device))

def train_lstm(model, loss_function, optimiser, data_loader, init_state, epochs):
    
    model.to(device)
    model.train()

    losses = []
    for epoch in range(epochs):
        total_loss = 0
        state_h, state_c = init_state

        for batch, (words_x, words_idx_x, words_y, words_idx_y) in enumerate(data_loader):
            optimiser.zero_grad()

            y_pred, (state_h, state_c) = model(x.to(device), (state_h, state_c))
            loss = loss_function(y_pred.transpose(1, 2), y.to(device))

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimiser.step()
            
            total_loss += loss.item()
            if batch % 56 == 0:
                print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
        losses.append(total_loss)
    model.eval()
    return(model, losses)

In [5]:
# prepare jokes data (needs pre-processing, lower, clean of punct)
train_df = pd.read_csv('data/reddit-cleanjokes.csv')
jokes = train_df['Joke']
jokes_nested_lists = [j.split(" ") for j in train_df['Joke']]
jokes_words_list = [c for c in chain(*jokes_nested_lists)]

In [6]:
dataset = Textsequencer(jokes_words_list)
context_size = 2

embedding_dim = 64
hidden_dim = 32
embedding_learning_rate = 0.05
epochs = 20

embedding = CBOW(vocab_size=dataset.vocab_size, context_size=context_size,
             embedding_dim=embedding_dim, hidden_dim=hidden_dim)

optimiser = optim.SGD(embedding.parameters(), lr=embedding_learning_rate)
loss_function = nn.NLLLoss() # this is the loss function for multi-class with a log_softmax final layer

embedding, losses = learn_embeddings(embedding, loss_function, optimiser, dataset, context_size, epochs)
print(losses)

{'epoch': 0, 'batch': 0, 'loss': 9.17879867553711}
{'epoch': 0, 'batch': 10000, 'loss': 6.666481971740723}
{'epoch': 0, 'batch': 20000, 'loss': 3.663606643676758}
{'epoch': 1, 'batch': 0, 'loss': 3.471705913543701}
{'epoch': 1, 'batch': 10000, 'loss': 2.1564955711364746}
{'epoch': 1, 'batch': 20000, 'loss': 0.14438307285308838}
{'epoch': 2, 'batch': 0, 'loss': 3.3639581203460693}
{'epoch': 2, 'batch': 10000, 'loss': 4.9272260665893555}
{'epoch': 2, 'batch': 20000, 'loss': 10.653604507446289}
{'epoch': 3, 'batch': 0, 'loss': 2.041869878768921}
{'epoch': 3, 'batch': 10000, 'loss': 1.6359328031539917}
{'epoch': 3, 'batch': 20000, 'loss': 1.7291313409805298}
{'epoch': 4, 'batch': 0, 'loss': 1.5142778158187866}
{'epoch': 4, 'batch': 10000, 'loss': 1.0933905839920044}
{'epoch': 4, 'batch': 20000, 'loss': 3.2296481132507324}
{'epoch': 5, 'batch': 0, 'loss': 0.14421312510967255}
{'epoch': 5, 'batch': 10000, 'loss': 2.8275954723358154}
{'epoch': 5, 'batch': 20000, 'loss': 3.5698933601379395}
{'

In [31]:
embedding.embeddings.weight

TypeError: 'Parameter' object is not callable

In [14]:
lstm_size = 64
lstm_layers = 2

lstm_model = emb_LSTM(vocab_size=dataset.vocab_size, # vocab_size: num of words (output size)
                input_size=embedding_dim, # input size must match the embedding dim used
                lstm_size=64, lstm_layers=3, dropout=0.2) # lstm hyper-params

In [18]:
sequence_length = 4
dataset.sequence_length = sequence_length

batch_size = 10
data_loader = DataLoader(dataset, batch_size=batch_size)

init_state = get_init_state(lstm_layers, sequence_length, lstm_size)

lstm_learning_rate = 0.005
loss_function = nn.CrossEntropyLoss()
optimiser = optim.Adam(lstm_model.parameters(), lr=lstm_learning_rate)

lstm_epochs = 50

losses, lstm_model = train_lstm(lstm_model, loss_function, optimiser, data_loader, init_state, epochs=lstm_epochs)
print(losses)

ValueError: too many values to unpack (expected 2)

In [43]:
dataset = Textsequencer(jokes_words_list)
sequence_length = 4
dataset.sequence_length = sequence_length
data_loader = DataLoader(dataset, batch_size=2)
(words_x, words_idx_x, words_y, words_idx_y) = next(iter(data_loader))
print(words_x, words_idx_x, words_y, words_idx_y)

[('What', 'did'), ('did', 'the'), ('the', 'bartender'), ('bartender', 'say')] [tensor([2, 8]), tensor([8, 0]), tensor([  0, 248]), tensor([248,  20])] [('did', 'the'), ('the', 'bartender'), ('bartender', 'say'), ('say', 'to')] [tensor([8, 0]), tensor([  0, 248]), tensor([248,  20]), tensor([20,  4])]


In [12]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x.to(device), (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words


print(predict(dataset, model, text='Knock knock. Whos there?'))

NameError: name 'model' is not defined

In [13]:
next((i for i in range(3)))

0

In [None]:
torch.tensor([[2,3,4], [234,234,234]]).shape