In [None]:
import os
import pandas as pd
import torch
import spacy
from tqdm import tqdm_notebook as tqdm
from torchtext import data
from torchtext import datasets
import random

import numpy as np
import torchtext
import sys

#### Basic text preprocessing

We will leverage spaCy's modern text preprocessing methods to lemmatize, handle some spelling errors, create a pronoun flag, etc.

In [None]:
# Select the compute device once: 'cuda' when PyTorch reports an available GPU,
# otherwise 'cpu'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bare expression so the notebook displays which device was selected.
device

In [None]:
# Load the `en_core_web_lg` spaCy pipeline; `nlp.tokenizer` is what the custom
# `tokenizer` function in this notebook calls.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Directory holding the raw Quora download (train.csv is read from here).
# Set the QUORA_DATA_DIR environment variable to point somewhere else instead
# of editing the notebook; the default is the original location.
DATA_DIR = os.environ.get('QUORA_DATA_DIR', '/home/datawrestler/data/quora')

# Working directory for everything this notebook writes (CSV splits, the
# newline-separated language-model text files, saved models).
basepath = os.path.join(DATA_DIR, 'dataset')

# Create the output directory up front so the writes further down do not fail
# on a machine where it does not exist yet.
os.makedirs(basepath, exist_ok=True)

# Show what is available in the raw data directory.
os.listdir(DATA_DIR)

In [None]:
# Read the raw training CSV from DATA_DIR with pandas' Python parsing engine.
df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), engine='python')

In [None]:
# Display the first row to check which columns were loaded.
df.head(1)

In [None]:
# Shuffle every row before the positional splits below are carved out.
# A fixed random_state makes the shuffle, and therefore every file written
# from it, reproducible across kernel restarts.
df = df.sample(frac=1, random_state=1234)

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index (the old one is dropped).
df = df.reset_index(drop=True)

In [None]:
# Rename the question column to 'text', the column name the later cells read.
df = df.rename(columns={'question_text': 'text'})

In [None]:
# Display (rows, columns) of the shuffled, renamed frame.
df.shape

In [None]:

# Persist three consecutive, non-overlapping row ranges of the frame as CSVs.
for _csv_name, _rows in (('train.csv', slice(0, 700000)),
                         ('test.csv', slice(700000, 800000)),
                         ('valid.csv', slice(800000, 900000))):
    df.iloc[_rows].to_csv(os.path.join(basepath, _csv_name), index=False)

In [None]:
# Question text for the language model, taken from three consecutive row
# ranges: train, test and validation (in that order).
out_text, out_test_text, out_valid_text = (
    df['text'].iloc[start:stop].tolist()
    for start, stop in ((0, 400000), (400000, 500000), (500000, 600000))
)

In [None]:
# for the language modelling, we are going to write out to a text file
# that is newline separated

def write_text_data(fname, text, base_dir=None):
    """Write an iterable of strings to a file, one newline-terminated line each.

    Arguments:
        fname: Name of the file to create (or overwrite) inside ``base_dir``.
        text: Iterable of items to write; each is formatted with ``str.format``.
        base_dir: Directory to write into. Defaults to the notebook-level
            ``basepath`` when omitted.

    Every item is followed by a newline, so the file neither starts with an
    empty line nor ends on an unterminated one. The file is written as UTF-8.
    """
    if base_dir is None:
        base_dir = basepath

    # The `with` block closes the file; no explicit close() is needed.
    with open(os.path.join(base_dir, fname), 'w', encoding='utf-8') as outfile:
        outfile.writelines('{}\n'.format(line) for line in text)
        
# Write the three language-model splits, in the same order as before.
for _txt_name, _lines in (('train.txt', out_text),
                          ('test.txt', out_test_text),
                          ('valid.txt', out_valid_text)):
    write_text_data(_txt_name, _lines)

In [None]:
# Number of texts tokenized so far; updated by `tokenizer` for progress output.
tokenize_count = 0


def tokenizer(text):
    """Split `text` into token strings using the spaCy pipeline's tokenizer.

    Also keeps a running count of calls in the module-level `tokenize_count`
    and rewrites a 'Doc: N' progress line on stdout every 1000 calls.
    """
    global tokenize_count

    should_report = (tokenize_count % 1000 == 0)
    if should_report:
        sys.stdout.write('\rDoc: {}'.format(tokenize_count))
        sys.stdout.flush()
    tokenize_count += 1

    tokens = []
    for tok in nlp.tokenizer(text):
        tokens.append(tok.text)
    return tokens

In [None]:
from torchtext.data import TabularDataset
from torchtext import data

SEED = 1234

# Seed every random number generator this notebook imports, not only torch,
# so shuffling and initialisation are repeatable across runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

# Text field shared by all language-model splits; tokenization goes through
# the spaCy-backed `tokenizer` function defined in this notebook.
TEXT = data.Field(sequential=True, tokenize=tokenizer)

In [None]:
from torchtext.datasets.language_modeling import LanguageModelingDataset



class CustomLMData(LanguageModelingDataset):
    """Language-modelling dataset built from newline-separated text files on disk."""

    name = 'lm_dataset'

    @classmethod
    def splits(cls, text_field, root=None, train='lmdata.txt',
               validation=None, test=None,
               **kwargs):
        """
        Create datasets from custom text files persisted to disk. Each file
        must be newline separated and the directory containing the files must
        be given through the ``path`` keyword argument.

        Arguments:
            text_field: The field that will be used for text data.
            root: Forwarded to the parent ``splits``; only relevant when no
                ``path`` is supplied.
            train: The filename of the train data. Default: 'lmdata.txt'.
            validation: The filename of the validation data, or None to not
                load a validation set. Default: None.
            test: The filename of the test data, or None to not load a test
                set. Default: None.
            Remaining keyword arguments (e.g. ``path``): Passed to the parent
                ``splits`` method.

        Returns:
            Whatever the parent ``splits`` returns: in torchtext this is a
            tuple of the requested datasets ordered (train, validation, test),
            with splits that were not requested left out.

        Resources:
            https://github.com/pytorch/text/blob/master/torchtext/data/dataset.py
            https://github.com/pytorch/text/blob/master/torchtext/datasets/language_modeling.py
            https://torchtext.readthedocs.io/en/latest/examples.html
        """
        return super(CustomLMData, cls).splits(
            root=root, train=train, validation=validation, test=test,
            text_field=text_field, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, bptt_len=35, device=0, path=basepath,
              train='lmdata.txt', validation=None, test=None, root=basepath,
              vectors=None, **kwargs):
        """Create BPTT iterator objects for the requested splits.

        This is the simplest way to use the dataset: it creates a default
        text field, loads the splits, builds the vocabulary and wraps the
        datasets in iterators.

        Arguments:
            batch_size: Batch size.
            bptt_len: Length of sequences for backpropagation through time.
            device: Device to create batches on; forwarded unchanged to
                ``BPTTIterator.splits``.
            path: Directory containing the text files.
            train: The filename of the train data. Default: 'lmdata.txt'.
            validation: The filename of the validation data, or None.
            test: The filename of the test data, or None.
            root: Forwarded to ``splits``.
            vectors: Pretrained vectors passed to ``build_vocab``.
            Remaining keyword arguments: Passed to the splits method.

        Returns:
            The iterators produced by ``BPTTIterator.splits``, one per loaded
            dataset and in the same order as ``splits`` returned them.

        Raises:
            ValueError: If no split filename was given at all.
        """
        # Local field; deliberately not named TEXT to avoid confusion with
        # the notebook-level field of that name.
        text_field = data.Field()

        # Forward every split argument instead of silently falling back to
        # the defaults of `splits`.
        datasets = cls.splits(text_field, root=root, path=path, train=train,
                              validation=validation, test=test, **kwargs)
        if not datasets:
            raise ValueError('at least one of train/validation/test must be given')

        # The vocabulary is built from the first loaded dataset only (the
        # train split whenever one was requested).
        text_field.build_vocab(datasets[0], vectors=vectors)

        return data.BPTTIterator.splits(
            datasets,
            batch_size=batch_size, bptt_len=bptt_len,
            device=device)


In [None]:
# using our custom dataset class that inherits from the languagemodelling dataset of 
# torchtext, create our train, test, valid splits of quora questions
# torchtext's Dataset.splits returns the datasets ordered
# (train, validation, test) regardless of keyword order, so the result must be
# unpacked in that order to keep each name bound to the file it is named after.
train, valid, test = CustomLMData.splits(
    TEXT,
    path=basepath,
    train='train.txt',
    validation='valid.txt',
    test='test.txt',
)

In [None]:
# Build the vocabulary from the training split and attach pretrained vectors.
# To see the available pretrained embedding options, take a peek at the source code:
# https://github.com/pytorch/text/blob/master/torchtext/vocab.py
# `min_freq` and `max_size` are torchtext Vocab options that bound which
# tokens are kept -- see the linked source for their exact semantics.
TEXT.build_vocab(train, vectors='glove.42B.300d', min_freq=5, 
                max_size=100000)

In [None]:
# Create the batch iterators for the three splits. This will automatically
# shift our input text forward t+1 for our target data for the language model
# to predict the next word in the sequence.
# NOTE(review): batch_size is hard-coded here and again as BATCH_SIZE where the
# model is built; the two values must stay equal.
train_iter, test_iter, valid_iter = data.BPTTIterator.splits(
    (train, test, valid), 
    batch_size=128, 
    bptt_len=40, # specifying the sequence length for back prop through time
    device=device,
    repeat=False, 
    sort_key=lambda x: len(x.text)
)

In [None]:
# Pull a single batch from the training iterator to inspect in the next cells.
b = next(iter(train_iter))

In [None]:
# Numericalization has occurred: the batch holds token ids, not strings.
# Show the first 5 rows and 3 columns of the input ids.
b.text[:5, :3]

In [None]:
# We can peek into the numericalization by mapping an id back to its token.
# 1656 is an arbitrary example id.
TEXT.vocab.itos[1656]

In [None]:
# Same 5x3 window of the target ids, for comparison with the inputs above.
b.target[:5, :3]

#### Building and Training the language model

The goal here is to use the pretrained 300-dimensional GloVe vectors to warm-start our embedding layer, which will then be fine-tuned on our actual data. We are going to build a bidirectional RNN (LSTM) language model.

In [None]:

   super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
    
    def forward(self, x):
        # Set initial states
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) # 2 for bidirection 
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
   
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V

 
class BiRNN(nn.Module):
    """Bidirectional LSTM language model that carries its hidden state across batches.

    NOTE(review): the targets are the inputs shifted by one token, so the
    backward direction of a bidirectional LSTM has already read the token
    being predicted. Confirm this is intended before trusting the loss as a
    language-model loss.
    """

    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5, tie_weights=True):
        """
        Bidirectional language model

        https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py

        Arguments:
            ntoken: Vocabulary size (rows of the embedding, width of the output).
            ninp: Embedding dimension.
            nhid: Hidden units per LSTM direction.
            nlayers: Number of stacked LSTM layers.
            bsz: Batch size used to allocate the initial hidden state.
            dropout: Dropout probability for embeddings, LSTM and outputs.
            tie_weights: Stored but currently unused (see TODO below).
        """
        super(BiRNN, self).__init__()
        self.nhid = nhid
        self.nlayers = nlayers
        self.bsz = bsz
        self.tie_weights = tie_weights  # TODO: figure out tying weight with bidirectional LSTM
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout=dropout, bidirectional=True)
        self.decoder = nn.Linear(nhid*2, ntoken)  # we need *2 for bidirectional
        self.init_weights()
        # The input is a batched consecutive corpus, therefore we retain the
        # hidden state across batches.
        self.hidden = self.init_hidden(bsz)

    def init_weights(self):
        """Initialise embedding and decoder weights uniformly in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input_data):
        """Return scores of shape (seq_len, batch, ntoken) for token ids of shape (seq_len, batch)."""
        batch_size = input_data.size(1)
        state = self.hidden[0]
        # Re-create the retained state when it no longer fits: the model was
        # moved to another device after construction, or the batch size changed.
        if state.size(1) != batch_size or state.device != self.encoder.weight.device:
            self.hidden = self.init_hidden(batch_size)

        emb = self.drop(self.encoder(input_data))
        output, self.hidden = self.lstm(emb, self.hidden)
        output = self.drop(output)
        # nn.Linear is applied over the last dimension, so no flattening is needed.
        return self.decoder(output)

    def init_hidden(self, bsz):
        """Return zeroed (h, c) states on the same device and dtype as the model's parameters."""
        weight = next(self.parameters())
        # once again we need x2 for bidirectional LSTM
        shape = (self.nlayers*2, bsz, self.nhid)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

    def reset_history(self):
        """Detach the retained hidden state so backpropagation stops at the current batch."""
        self.hidden = tuple(v.detach() for v in self.hidden)

In [None]:
# we need to use our pretrained embeddings to init the RNN

# Model hyper-parameters. BATCH_SIZE must equal the batch_size given to
# BPTTIterator.splits, because the model allocates its retained hidden state
# for that batch size.
BATCH_SIZE = 128
HIDDEN_SIZE = 300
NUM_LAYERS = 4

# The pretrained vectors define both the vocabulary size and the embedding width.
weight_matrix = TEXT.vocab.vectors
model = BiRNN(weight_matrix.size(0),
              weight_matrix.size(1), HIDDEN_SIZE, NUM_LAYERS, BATCH_SIZE,
              tie_weights=True)

# Start the embedding layer from the pretrained vectors (copied, so the
# vocab's own vectors stay untouched while the model trains).
model.encoder.weight.data.copy_(weight_matrix)

# Move to the device selected at the top of the notebook rather than assuming CUDA.
model.to(device)

In [None]:
# Step size handed to the optimizer below.
learning_rate = 1e-3

# define our loss function
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, with explicit (non-default) betas.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.7, 0.99))
# Vocabulary size: the number of rows of the pretrained vector matrix.
n_tokens = weight_matrix.size(0)

In [None]:
# construct the evaluation criteria

def validation_loss(valid_iter, model, loss_fn=None):
    """Return the mean per-token loss of `model` over the batches of `valid_iter`.

    Arguments:
        valid_iter: Iterable of batches exposing `.text` and `.target`.
        model: Model to evaluate; must provide `eval()`, `reset_history()` and
            be callable on `batch.text`.
        loss_fn: Loss to apply to (flattened predictions, flattened targets).
            Defaults to the notebook-level `criterion`.

    Returns:
        Sum of per-batch losses weighted by the number of target tokens,
        divided by the total number of target tokens (0.0 if there were no
        batches). The model is left in evaluation mode.
    """
    if loss_fn is None:
        loss_fn = criterion

    total_loss = 0.0
    total_tokens = 0

    # turn on evaluation mode
    model.eval()
    # No gradients are needed for evaluation; this avoids building the graph.
    with torch.no_grad():
        for batch in valid_iter:
            model.reset_history()
            text, targets = batch.text, batch.target
            prediction = model(text)
            flat_targets = targets.reshape(-1)
            loss = loss_fn(prediction.reshape(-1, prediction.size(-1)), flat_targets)
            # The loss is a mean over tokens, so weight it by the token count
            # of this batch (same weighting as the training loop).
            n_batch_tokens = flat_targets.numel()
            total_loss += loss.item() * n_batch_tokens
            total_tokens += n_batch_tokens

    if total_tokens == 0:
        return 0.0
    return total_loss / total_tokens

In [None]:
# http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/

# and write our training loop


from tqdm import trange
from time import sleep

from tqdm import tqdm_notebook as tqdm

def clip_grads(model, clip_weight=0.25):
    """Clip the gradients of `model` in place so their total norm is at most `clip_weight`.

    Gradient clipping helps prevent the exploding gradient problem in
    RNNs / LSTMs. This function only rescales gradients; it does NOT update
    the parameters -- that is the optimizer's job. (Applying a manual SGD step
    here as well would update every parameter twice per batch.)

    Returns:
        The total gradient norm measured before clipping.
    """
    return torch.nn.utils.clip_grad_norm_(model.parameters(), clip_weight)
    

def train_model(num_epochs=10):
    """Train the notebook-level `model` for `num_epochs` epochs.

    Uses the notebook-level `train_iter`, `valid_iter`, `optimizer` and
    `criterion`. After every epoch the mean per-token training loss and the
    validation loss are printed; a final summary line is printed once
    training has finished.
    """
    epoch = None
    epoch_loss = 0.0
    valid_loss = 0.0

    for epoch in range(0, num_epochs):
        # turn on training mode (validation at the end of the previous epoch
        # switched the model to evaluation mode)
        model.train()
        total_loss = 0.0
        total_tokens = 0

        t = tqdm(train_iter)
        t.set_description('Epoch: {}'.format(epoch))
        for batch in t:
            # reset the hidden state or else the model will try to backpropagate to the
            # beginning of the dataset, requiring lots of time and a lot of memory
            model.reset_history()

            optimizer.zero_grad()

            text, targets = batch.text, batch.target
            prediction = model(text)
            # The loss expects 2-D scores, so flatten the predictions to
            # (sequence_length * batch_size, n_tokens) and the targets to
            # (sequence_length * batch_size).
            flat_targets = targets.reshape(-1)
            loss = criterion(prediction.reshape(-1, prediction.size(-1)), flat_targets)
            loss.backward()

            # clip gradients
            clip_grads(model)

            optimizer.step()

            # The loss is a per-token mean; weight it by this batch's token count.
            n_batch_tokens = flat_targets.numel()
            total_loss += loss.item() * n_batch_tokens
            total_tokens += n_batch_tokens

        # Normalise once per epoch, after all batches have been accumulated.
        epoch_loss = total_loss / max(total_tokens, 1)

        # capture validation loss for each epoch
        valid_loss = validation_loss(valid_iter, model)
        print('Epoch: {} | Training Loss: {:.4f} | Valid Loss: {:.4f}'.format(epoch,
                                                                             epoch_loss,
                                                                             valid_loss))

    # Final summary; skipped when no epoch was run, and reuses the validation
    # loss already computed for the last epoch.
    if epoch is not None:
        print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch,
                                                                                 epoch_loss,
                                                                                 valid_loss))

    
# Run the training loop for 100 epochs.
train_model(num_epochs=100)

In [None]:
# save model
# https://pytorch.org/tutorials/beginner/saving_loading_models.html

# Directory for saved models; created if missing (exist_ok avoids the
# check-then-create race of a separate os.path.exists test).
models_dir = os.path.join(basepath, 'models')
os.makedirs(models_dir, exist_ok=True)

# save entire model - if only wanting to save for inference,
# use model.state_dict()
torch.save(model, os.path.join(models_dir, 'lm_300_model.pt'))

In [None]:
# Reload the saved model onto the device selected at the top of the notebook,
# so a checkpoint written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
test_model = torch.load(os.path.join(basepath, 'models/lm_300_model.pt'),
                        map_location=device)
test_model.eval()

In [None]:
def word_ids_to_sentence(id_tensor, vocab, join=None):
    """Converts a sequence of word ids to a sentence.

    Arguments:
        id_tensor: Token ids as a torch tensor (any integer dtype, any device)
            or anything `numpy.asarray` accepts. A 2-D input is transposed
            before flattening, so the ids are read column by column.
        vocab: Object whose `itos` sequence maps an id to its token string.
        join: If None, return the list of tokens; otherwise return the tokens
            joined with this separator string.
    """
    if torch.is_tensor(id_tensor):
        # Works for CPU and CUDA tensors alike; a plain isinstance check
        # against torch.LongTensor would miss CUDA and non-int64 tensors.
        ids = id_tensor.detach().cpu().transpose(0, 1).reshape(-1).tolist()
    else:
        ids = np.asarray(id_tensor).transpose().reshape(-1).tolist()

    batch = [vocab.itos[ind] for ind in ids]  # denumericalize
    if join is None:
        return batch
    return join.join(batch)

In [None]:


# Score the sample batch, take the highest-scoring token id at every position
# and show the first 500 characters of the decoded text.
arrs = model(b.text).detach().cpu().numpy()
word_ids_to_sentence(arrs.argmax(axis=2), TEXT.vocab, join=' ')[:500]

In [None]:
# Short alias for the vocabulary built on the text field.
vocab = TEXT.vocab

# Display the integer id assigned to the token 'the'.
vocab.stoi['the']

In [None]:
# Number of distinct tokens recorded in the vocab's frequency counter.
len(vocab.freqs)

In [None]:
# https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [None]:
# pull out vocab items

# Snapshot of the fine-tuned embedding matrix as a NumPy array. The weights
# are read directly: running them through the model's dropout layer would
# randomly zero entries whenever the model is in training mode. `.copy()`
# detaches the snapshot from the live parameter memory.
embedding_matrix = model.encoder.weight.detach().cpu().numpy().copy()

# Map every counted word to its embedding as a (1, dim) row. One matrix lookup
# per word replaces a separate forward pass (and a printed line) per word.
wrd_to_embedding = {}
for wrd in vocab.freqs:
    np_array = embedding_matrix[vocab.stoi[wrd]].reshape(1, -1)
    wrd_to_embedding[wrd] = np_array

In [None]:
from scipy.spatial.distance import cosine

# scipy's `cosine` takes 1-D vectors, while the embeddings are stored as
# (1, dim) rows, so flatten them first.
cosine(wrd_to_embedding['successful'].ravel(), wrd_to_embedding['pick'].ravel())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the fine-tuned embeddings of two related words.
cosine_similarity(wrd_to_embedding['soldier'], wrd_to_embedding['war'])

In [None]:
# Exploration aid: list the attributes available on the vocab object.
dir(vocab)

In [None]:
# test if vectors actually drifted by looking at our original vectors 
# from the glove implementation
# Original pretrained vectors attached to the vocab, as a NumPy array.
pretrained_matrix = vocab.vectors.detach().cpu().numpy()

# Map every counted word to its *pretrained* vector as a (1, dim) row, so it
# can be compared with the fine-tuned embedding of the same word. Each word's
# own vector is stored, not a leftover array from an earlier cell.
old_w2v = {
    wrd: pretrained_matrix[vocab.stoi[wrd]].reshape(1, -1)
    for wrd in vocab.freqs
}


In [None]:
# Compare the fine-tuned and the original vector for 'war'; a value below 1
# means the embedding moved during training.
cosine_similarity(wrd_to_embedding['war'], old_w2v['war'])