In [1]:
# load the relevant libraries for our analysis
import os
import pandas as pd
import torch
import spacy
from tqdm import tqdm_notebook as tqdm
from torchtext import data
from torchtext import datasets
import random
import numpy as np
import torchtext
import sys

In [2]:
# specify device type
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

This tutorial serves as an introduction to torchtext, a library designed to preprocess text data so that it
can be used in tandem with PyTorch as well as with other deep learning libraries.

For this tutorial we will use the Consumer Complaint Database put together by the Consumer Financial 
Protection Bureau (CFPB). More info about the data.



#### Basic text preprocessing

We will leverage Spacy's modern text preprocessing methods to lemmatize, handle some spelling errors, create a pronoun flag, etc. The first step in our code is to load the large english language model.

In [3]:
# load spacy model
nlp = spacy.load('en_core_web_lg')

spaCy is one of the most powerful NLP libraries available for data processing (TODO: add a word on why spaCy is a good fit here). The next step is to define the data paths.

In [None]:
# load spacy model
#nlp = spacy.load('en_core_web_lg')

In [4]:
# default data dir
basepath = '/media/jlealtru/data_files/github/Tutorials'

DATA_DIR = '/media/jlealtru/data_files/github/Tutorials/datasets/text_analytics/financial'

os.listdir(DATA_DIR)

['Consumer_Complaints.csv']

Since the dataset is quite big, we are not hosting on github, you can get it from here:
link    
    

In [None]:
# read the complains data to have a better idea on how this works
df = pd.read_csv(os.path.join(DATA_DIR, 'Consumer_Complaints.csv'), engine='python')
print(f"The dataframe has {len(df):,} observations.")
df.head()

We are interested in predicting the type of issue faced by the customer. In the dataset we have 18 different
types of products.

In [None]:
df[['Product','Complaint ID']].groupby('Product').count()

In [None]:
Despite the fact that the data has more than 1 million observations, a cursory exploration reveals that the data 
has multiple missing values. We check the actual distribution of answers.

In [None]:
df[['Consumer consent provided?','Complaint ID']].groupby('Consumer consent provided?').count()

In [None]:
print(f"We have complete information for {len(df[df['Consumer consent provided?']=='Consent provided']):,} observations")

In [None]:
#df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), engine='python')

We filter the data to only have observations with a consumer narrative, select the relevant columns and shuffle 
the data. We use the pandas native sample function, which generates a random sample of length n; in this 
case n is equal to the number of observations in the filtered dataset. We also set the random state
to a predefined number so we can replicate the results.

In [None]:
df=df[df['Consumer consent provided?']=='Consent provided']
df=df[['Complaint ID','Consumer complaint narrative','Product']]
df=df.sample(n=df.shape[0], random_state=10)

In [None]:
df.head()

In [None]:
# check the distribution of cases for product
df[['Complaint ID', 'Product']].groupby('Product').count()

In [None]:
# We see that we have two classes that are very unbalanced: 'Virtual currency' and 'Other financial service'.
# To improve the performance of the model we merge both into a single class.
# .loc with a boolean mask assigns on df itself; the chained form df['Product'][mask] = ...
# can end up writing to a temporary copy (pandas SettingWithCopyWarning).
rare_products = ['Virtual currency', 'Other financial service']
df.loc[df['Product'].isin(rare_products), 'Product'] = 'Other financial services'

In [None]:
# check if this works
df[['Complaint ID', 'Product']].groupby('Product').count()

We reset the index and rename the column containing the complaint narrative to `text`; we will use
that column in the torchtext context.

In [None]:
df = df.reset_index(drop=True)
df.head()

In [None]:
df = df.rename(columns={'Consumer complaint narrative': 'text'})
df.head()

In [None]:
dict_values_target={k:v for k,v in zip(range(len(df.Product.unique())), df.Product.unique())}
product_lookup=pd.DataFrame([[key,value] for key,value in dict_values_target.items()],
     columns=["product_id","product_text"])
product_lookup.head()

In [None]:
print(len(df))
df=pd.merge(df,product_lookup, how='left', left_on='Product',right_on='product_text')
print(len(df))

In [None]:
df=df[['text', 'product_id']]

In [None]:
Next we create the training, validation and test datasets.

In [None]:
# create train, test and validation datasets (roughly 80% / 10% / 10%)
# A dedicated, seeded generator makes the split identical on every run; the seed
# matches the random_state used when shuffling the dataframe.
split_rng = np.random.RandomState(10)
msk = split_rng.rand(len(df)) < 0.8
train = df[msk]
test_ = df[~msk]
# split the held-out 20% in half: one half for test, the other for validation
msk1 = split_rng.rand(len(test_)) <= 0.5
test = test_[msk1]
val = test_[~msk1]
del test_
#df.iloc[0:700000].to_csv(os.path.join(basepath, 'train.csv'), index=False)
#df.iloc[700000:800000].to_csv(os.path.join(basepath, 'test.csv'), index=False)#
#df.iloc[800000:900000].to_csv(os.path.join(basepath, 'valid.csv'), index=False)

In [None]:
print(f"The len of train dataset is {len(train):,}, the len of test is {len(test):,} and the len of valid is "+
     f"{len(val):,}")

In [None]:
#train
#Complaint ID Product
train.to_csv(os.path.join(basepath, 'train.csv'),index=False)
test.to_csv(os.path.join(basepath, 'test.csv'),index=False)
val.to_csv(os.path.join(basepath, 'val.csv'),index=False)

In [None]:
a=pd.read_csv(os.path.join(basepath, 'test.csv'))
a.columns

In [5]:
# do the tokenizer
tokenize_count = 0

# use custom tokenizer with large spacy model
def tokenizer(text):
    """Split ``text`` into a list of token strings using the module-level spaCy pipeline ``nlp``.

    Side effect: increments the global ``tokenize_count`` on every call and, whenever
    the counter is a multiple of 1000 (including the very first call), rewrites the
    current stdout line with the number of documents seen so far.
    """
    global tokenize_count
    at_reporting_step = not (tokenize_count % 1000)
    if at_reporting_step:
        progress = '\rDoc: {}'.format(tokenize_count)
        sys.stdout.write(progress)
        sys.stdout.flush()
    tokenize_count = tokenize_count + 1
    pieces = []
    for tok in nlp.tokenizer(text):
        pieces.append(tok.text)
    return pieces

In [6]:
TEXT = data.Field(sequential=True, tokenize=tokenizer)
#LABEL = data.Field(sequential=False, 
#                   use_vocab=False, 
#                   pad_token=None, 
#                   unk_token=None)
#LABEL=data.LabelField(dtype=torch.float)
#LABEL = data.LabelField(dtype=torch.float)
LABEL = data.Field(sequential=False, use_vocab=False)

In [7]:
_datafields = [  ('text', TEXT), ('product_id', LABEL)
               # we won't be needing the id, so we pass in None as the field
               ]

In [8]:
from torchtext.data import TabularDataset
from torchtext import data

# define 
SEED = 1234

#  add a line about cudnn feterministic
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True



In [None]:
# Use the TabularDataset.splits function to load the three csv files written above.
# `path` is the directory that holds the files and train/validation/test are file names
# inside it. (Previously path="data" was passed together with absolute file paths, so
# the "data" prefix was silently discarded when the two were joined.)

trn, vld, tst = TabularDataset.splits(
    path=basepath, # the root directory where the data lies
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    format='csv',
    skip_header=True, # the csv files have a header row; skip it so it doesn't get processed as data!
    fields=_datafields)

Doc: 277000

In [None]:
# make sure the process worked fine
trn.fields.items()

In [None]:
# build the vocabulary using glove vectors of 300 dimensions. To limit the size of the vocabulary we limit the 
# vocabulary to 50,000 in size and a minumun occurence of 5 times
TEXT.build_vocab(trn, vectors='glove.42B.300d', min_freq=5, 
                max_size=50000)
LABEL.build_vocab(trn) 

In [None]:
# print the number of unique tokens and the len of the label categories
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

In [None]:
trn[0].product_id

In [None]:
# check the most frequent words in the vocabulary
print(TEXT.vocab.freqs.most_common(20))

In [None]:
print(LABEL.vocab.stoi)

In [None]:
# create the batch iterators for the three splits of the classification data.
# NOTE(review): the datasets are passed in the order (trn, tst, vld), so the iterators
# come back in that same order: train, test, validation.
# sort_key is the token count of each example's text; presumably BucketIterator uses it
# to group examples of similar length into the same batch -- confirm against the torchtext docs.
train_iter, test_iter, valid_iter = data.BucketIterator.splits(
    (trn, tst, vld), 
    batch_size=32,
    #bptt_len=35, # specifying the sequence length for back prop through time
    device=device,
    #repeat=False, 
    sort_key=lambda x: len(x.text))

In [None]:
# check the data split getting an observation from the training iterable
b=next(iter(train_iter))

In [None]:
b.product_id.shape

We will use an LSTM RNN, a pretty standard model used to classify sequential data. For a refresher on RNNs you can check out the good videos of Deep AI (they tend to be a bit theory heavy). The fast.ai course Deep Learning for Coders is another great resource if you are interested in videos. Text classification is one of the most common tasks in PyTorch (TODO: link to discussion).

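The building blocks for models in torch live in the `nn` module. We define a class that will hold the model, layers and parameters necessary for our example. You may notice the call to `super().__init__()`: it runs the constructor of `nn.Module` so that the layers we assign afterwards are registered with the model (TODO: Jason to expand on this).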

Discussion of the LSTM model 
  - add discussion of the model
  - add reference to the LSTM model and a link to a couple of tutorials.


In [None]:
# define a class that will hold the model as well as the necessary parameters for it to work
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)directional LSTM -> dropout -> linear layer.

    Parameters
    ----------
    vocab_size: number of rows of the embedding table (size of the vocabulary)
    embedding_dim: size of each embedding vector
    hidden_dim: hidden dimension of the LSTM (per direction)
    output_dim: number of classes in our data
    n_layers: number of stacked LSTM layers
    bidirectional: True for a bidirectional LSTM, False for a forward-only one
    dropout: dropout probability, applied to the embeddings, between stacked LSTM
        layers, and to the final hidden state before the linear layer
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only *between* stacked layers, so it is meaningless
        # (and triggers a warning) when there is a single layer.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # the classifier sees one final hidden vector per direction
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class scores of shape [batch size, output_dim].

        x holds token ids laid out as [sent len, batch size] (the LSTM is built
        without batch_first).
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(x))

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # last layer: final forward state (hidden[-2]) and final backward state (hidden[-1])
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # forward-only: the last layer's final state is all there is
            final_state = hidden[-1, :, :]

        # apply dropout before passing to the fully connected layer
        return self.fc(self.dropout(final_state))

In [None]:
# define the parameters for the model and instatiate the model class
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 128
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [None]:
# define the pretrained_embeddings
pretrained_embeddings = TEXT.vocab.vectors

In [None]:
# copy the pretrained embedding weights into the model and move the model to the
# device selected at the top of the notebook (GPU when available, otherwise CPU)
model.embedding.weight.data.copy_(pretrained_embeddings)
model.to(device)

In [None]:
# define our loss and optimizer
#loss_function = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=0.1)
#nn.

In [None]:
#import torch.optim as optim

#optimizer = optim.Adam(model.parameters())

In [None]:
import torch.optim as optim
learning_rate = 1e-3

# define our loss function and the parameters for updating the model.
# TALK ABOUT BETAS
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.7, 0.99))
n_tokens = pretrained_embeddings.size(0)

In [None]:
#help(nn.CrossEntropyLoss)
#import torch.nn.functional as F

In [None]:
#loss_function = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
#def accuracy(out, labels):
#    return torch.sum(labels.data == out)/float(labels.size)

In [None]:
torch.cuda.empty_cache()

In [None]:
def train(model, iterator, optimizer, criterion, epoch=None, log_every=10):
    """Run one training pass over ``iterator`` and return ``(epoch_loss, predictions)``.

    NOTE: defining this function rebinds the name ``train``, which earlier cells use
    for the training dataframe.

    Parameters
    ----------
    model: the network to optimise; batches are moved to the device of its parameters
    iterator: iterable of batches exposing ``.text`` and ``.product_id`` tensors
    optimizer: optimiser stepping the model's parameters
    criterion: loss called as ``criterion(predictions, labels)``
    epoch: zero-based epoch number used only in the progress message. When omitted,
        a module-level variable named ``epoch`` is used if one exists (this keeps
        ``for epoch in range(...): train(...)`` loops working), otherwise 0.
    log_every: print a progress line every ``log_every`` batches (0/None disables it)

    Returns
    -------
    epoch_loss: sum of the per-batch loss values
    predictions: model output for the last batch, or None if ``iterator`` was empty
    """
    if epoch is None:
        epoch = globals().get('epoch', 0)

    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    epoch_loss = 0
    running_loss = 0.0
    predictions = None  # stays None when the iterator yields nothing
    model.train()

    for batch_i, batch in enumerate(iterator):

        optimizer.zero_grad()

        text, labels = batch.text, batch.product_id
        if target_device is not None:
            text, labels = text.to(target_device), labels.to(target_device)

        predictions = model(text)
        loss = criterion(predictions, labels)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        running_loss += batch_loss

        if log_every and batch_i % log_every == log_every - 1:
            # accuracy of the current batch, using its real size (the last batch can be smaller)
            correct = (torch.max(predictions, 1)[1] == labels).sum().item()
            print('Epoch: {}, Batch: {}, Avg. Loss: {}, correct{}'.format(epoch + 1,
                                                                          batch_i + 1, running_loss / log_every,
                                                                          correct / labels.size(0)))
            # restart the window so the next message averages only the next log_every batches
            running_loss = 0.0

    return epoch_loss, predictions

In [None]:
# define test function to measure accuracy

def test_data():
    
    # iterate through the test dataset
    for i, batch in enumerate(test_iter):
        
        # move inputs to gpu
        batch.text = batch.text.cuda()
        batch.product_id=batch.product_id.cuda()
        
        predictions = model(batch.text)
        
        loss=criterion(predictions, batch.product_id)
                
        #acc = binary_accuracy(predictions, batch.product_id)
        epoch_loss += loss.item()
        running_loss += loss.item()
        correct = (torch.max(predictions, 1)[1] == batch.product_id).sum()
        #print(float(correct)/32)
        if batch_i % 10 == 9:    # print every 10 batches
            print('Epoch: {}, Batch: {}, Avg. Loss: {}, correct{}'.format(epoch + 1,
                                                                          batch_i+1, running_loss/1000,
                                                                          float(correct)/32))        
        loss.backward()
        
        optimizer.step()
        
        
        

In [None]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    
    #train_loss, train_acc = 
    train(model, train_iter, optimizer, criterion)

In [None]:
#torch.nn.functional.softmax(pr1[0,:])
print(b1)
print(torch.max(torch.nn.functional.softmax(pr1), 1)[1])
#(np.log(pr1[0,:]))

In [None]:
train

In [None]:
model

In [None]:
def validation_loss(valid_iter, model):
    """Return the validation loss of a language model over ``valid_iter``.

    Relies on the notebook-level ``criterion``, ``n_tokens`` and ``valid`` objects.
    Each batch must expose ``.text`` and ``.target``. The model is switched to
    evaluation mode and is left in that mode on return.
    """
    # monitor the loss
    val_loss = 0
    # turn on evaluation mode
    model.eval()
    # evaluation only: skip building the autograd graph to save memory and time
    with torch.no_grad():
        for batch in valid_iter:
            text, targets = batch.text, batch.target
            prediction = model(text)
            # flatten to (positions, n_tokens) vs (positions,) for the cross-entropy loss
            loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
            # weight the batch loss by the first dimension of the batch
            # (presumably the sequence length -- TODO confirm)
            val_loss += loss.item() * text.size(0)
    # normalise by the length of the first validation example's text
    # (presumably the whole corpus for a language-modelling dataset -- TODO confirm)
    val_loss /= len(valid.examples[0].text)

    return val_loss

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass and return ``(mean batch loss, mean batch accuracy)``.

    NOTE: this redefines ``train`` and shadows any earlier definition in the notebook.

    Parameters
    ----------
    model: classifier called as ``model(batch.text)`` returning one row of class
        scores per example
    iterator: sized iterable of batches exposing ``.text`` and ``.product_id``
        (the two fields declared for the tabular dataset)
    optimizer: optimiser stepping the model's parameters
    criterion: loss called as ``criterion(predictions, labels)``

    Returns
    -------
    (loss, accuracy), each averaged over the number of batches; (0.0, 0.0) for an
    empty iterator.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        labels = batch.product_id
        predictions = model(batch.text)

        loss = criterion(predictions, labels)

        # share of examples in this batch whose highest-scoring class is the label
        acc = (torch.max(predictions, 1)[1] == labels).float().mean()

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    n_batches = len(iterator)
    if n_batches == 0:
        return 0.0, 0.0
    return epoch_loss / n_batches, epoch_acc / n_batches


In [None]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)

In [None]:
from torchtext.datasets.language_modeling import LanguageModelingDataset



class CustomLMData(LanguageModelingDataset):
    """Language-modelling dataset built from newline-separated text files on disk."""

    name = 'lm_dataset'

    @classmethod
    def splits(cls, _datafields, root=None, train='lmdata.txt',
               validation=None, test=None,
               **kwargs):
        """
        Create datasets from custom data persisted to disc. Data
        must be newline separated text files and path must be designated.

        Arguments:
            _datafields: The text field that will be used for the text data. It is
                forwarded to the dataset constructor as ``text_field``.
            root: Forwarded unchanged to the parent ``splits``.
            train: The filename of the train data. Default: 'lmdata.txt'.
            validation: The filename of the validation data, or None to not
                load the validation set. Default: None.
            test: The filename of the test data, or None to not load the test
                set. Default: None.
            Remaining keyword arguments (e.g. ``path``, the directory holding the
                files): Passed to the parent ``splits``.

        Resources:
            https://github.com/pytorch/text/blob/master/torchtext/data/dataset.py
            https://github.com/pytorch/text/blob/master/torchtext/datasets/language_modeling.py
            https://torchtext.readthedocs.io/en/latest/examples.html
        """
        # LanguageModelingDataset takes the field as `text_field`, not `fields`
        return super(CustomLMData, cls).splits(
            root=root, train=train, validation=validation, test=test,
            text_field=_datafields, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, bptt_len=25, device=None, path=basepath,
              train='lmdata.txt', validation=None, test=None, root=basepath,
              vectors=None, **kwargs):
        """Create BPTT iterator objects for the requested splits.

        Arguments:
            batch_size: Batch size.
            bptt_len: Length of sequences for backpropagation through time.
            device: Device to create batches on.
            path: Directory holding the data files.
            train, validation, test: File names of the splits; a split set to
                None is not loaded.
            root: Forwarded to ``splits``.
            vectors: Pretrained vectors passed to ``build_vocab`` for the text field.
            Remaining keyword arguments: Passed to the splits method.

        Returns:
            A tuple with one iterator per loaded split, in train/validation/test order.
        """
        TEXT = data.Field()

        # `splits` returns a tuple holding only the splits that were requested
        loaded = cls.splits(TEXT, root=root, path=path, train=train,
                            validation=validation, test=test, **kwargs)

        # the vocabulary is built from the first loaded split (the training data)
        TEXT.build_vocab(loaded[0], vectors=vectors)

        return data.BPTTIterator.splits(loaded,
            batch_size=batch_size, bptt_len=bptt_len,
            device=device)


In [None]:
train, test, valid = CustomLMData.splits(
    TEXT,
    path=basepath,
    train='train.csv',
    test='test.csv',
    validation='valid.csv',

)

In [None]:
TEXT.build_vocab(train, vectors='glove.42B.300d', min_freq=5, 
                max_size=100000)

In [None]:
train, test, valid = CustomLMData.splits(
    TEXT,
    path=basepath,
    train='train.csv',
    test='test.csv',
    validation='valid.csv',

)

In [None]:
basepath

In [None]:
We now need to write the customer complaints to a file separated by new lines so we can use it in the
torch model. For the language modelling, we are going to write out to a text file
that is newline-separated.



In [None]:
import csv
def write_text_data(file_name, text_field, base_dir=None):
    """Write every item of ``text_field`` to ``file_name``, one item per line.

    Parameters
    ----------
    file_name: name of the output file, created inside the target directory
    text_field: iterable of values to write (e.g. a dataframe column)
    base_dir: directory to write into; defaults to the notebook-level ``basepath``

    The csv writer uses a newline as the field delimiter, so all items are emitted
    as one "row" whose fields are separated by newlines. Items that contain a
    newline or a double quote are quoted by the csv module.
    """
    target_dir = basepath if base_dir is None else base_dir
    # newline='' lets the csv module control line endings itself
    with open(os.path.join(target_dir, file_name), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\n')
        writer.writerow(text_field)

write_text_data('train2.csv', train['text'])

In [None]:
#len(train['text'])
train.to_csv(os.path.join(basepath, 'train.csv'), index=False)
len(train2)

In [None]:
sum(1 for line in open(os.path.join(basepath,'train.csv')))#train['text']

In [None]:
sum(1 for line in open(os.path.join(basepath,'train.csv')))

In [None]:
def write_text_data(fname, text, base_dir=None):
    """Write each item of ``text`` on its own line of a UTF-8 text file.

    Parameters
    ----------
    fname: name of the output file, created inside the target directory
    text: iterable of values to write (each is formatted with ``str.format``)
    base_dir: directory to write into; defaults to the notebook-level ``basepath``

    Every item is followed by a newline, so the file has exactly one line per
    item (no leading blank line) as long as the items contain no newlines themselves.
    """
    target_dir = basepath if base_dir is None else base_dir
    # the `with` block closes the file; no explicit close() is needed
    with open(os.path.join(target_dir, fname), 'w', encoding='utf-8') as outfile:
        for line in text:
            outfile.write('{}\n'.format(line))
        
write_text_data('train.csv', train['text'])
write_text_data('test.csv', test['text'])
write_text_data('valid.csv', val['text'])

In [None]:
len(train['text'])

Following the example from Machine Learning Explained, we define our tokenizer using spaCy. The main purpose of this function is to print to the screen how many documents have been tokenized so far.

In [None]:
tokenize_count = 0

# use custom tokenizer with large spacy model
def tokenizer(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``; returns token strings.

    Also keeps the global ``tokenize_count`` up to date and reports it on stdout
    (overwriting the current line) each time it hits a multiple of 1000.
    """
    global tokenize_count
    docs_so_far = tokenize_count
    if docs_so_far % 1000 == 0:
        sys.stdout.write('\rDoc: {}'.format(docs_so_far))
        sys.stdout.flush()
    tokenize_count = docs_so_far + 1
    return list(map(lambda tok: tok.text, nlp.tokenizer(text)))

In [None]:
# define the text
from torchtext.data import TabularDataset
from torchtext import data

SEED = 1234

#  add a line about cudnn feterministic
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

TEXT = data.Field(sequential=True, tokenize=tokenizer)

In [None]:
TEXT.build_vocab(train, vectors='glove.42B.300d', min_freq=5, 
                max_size=100000)

In [None]:
# Field layout for the test file: one (column name, Field) pair per column, in file order.
# NOTE(review): "Product" is mapped to TEXT, the tokenised text field -- confirm a
# separate label field is not intended here.
tst_datafields = [("Complaint ID", None), # we won't be needing the id, so we pass in None as the field
                  ("text", TEXT),
                 ("Product", TEXT)]
# NOTE(review): `basepath` is assigned a directory earlier in the notebook, while this
# argument is commented as the file path -- confirm which file should be loaded.
# NOTE(review): confirm that 'txt' is a format TabularDataset accepts.
tst = TabularDataset(
           path=basepath, # the file path
           format='txt',
           skip_header=False, # if your csv file has a header, set this to True so it doesn't get processed as data!
           fields=tst_datafields)

In [None]:
We now define a tokenizer function that will split each one of the files into the tokens we will use
for the model.

In [None]:
tokenize_count = 0

# use custom tokenizer with large spacy model
def tokenizer(text):
    """Return the token strings that the module-level spaCy pipeline ``nlp`` finds in ``text``.

    A running document counter (global ``tokenize_count``) is printed in place on
    stdout every 1000 calls, starting with the first one, and incremented on each call.
    """
    global tokenize_count
    if not tokenize_count % 1000:
        message = '\rDoc: {}'.format(tokenize_count)
        sys.stdout.write(message)
        sys.stdout.flush()
    tokenize_count += 1
    spacy_tokens = nlp.tokenizer(text)
    return [piece.text for piece in spacy_tokens]

Torchtext has a series of classes to handle text data. The `data.Field` class is the way torchtext describes how a column of our data should be processed. In our case we are transforming words into numeric representations, so we set the parameter `sequential` to True. If a field is already numericalized and is not sequential, you should pass `use_vocab=False` and `sequential=False`.

In [None]:
from torchtext.data import TabularDataset
from torchtext import data

SEED = 1234

#  add a line about cudnn feterministic
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

TEXT = data.Field(sequential=True, tokenize=tokenizer)

In [None]:
sum(1 for line in open(os.path.join(basepath,'train.csv')))

In [None]:
We need to talk about the splits function from torchtext.

In [None]:
from torchtext.datasets.language_modeling import LanguageModelingDataset



class CustomLMData(LanguageModelingDataset):
    """Language-modelling dataset built from newline-separated text files on disk."""

    name = 'lm_dataset'

    @classmethod
    def splits(cls, text_field, root=None, train='lmdata.txt',
               validation=None, test=None,
               **kwargs):
        """
        Create datasets from custom data persisted to disc. Data
        must be newline separated text files and path must be designated.

        Arguments:
            text_field: The field that will be used for text data.
            root: Forwarded unchanged to the parent ``splits``.
            train: The filename of the train data. Default: 'lmdata.txt'.
            validation: The filename of the validation data, or None to not
                load the validation set. Default: None.
            test: The filename of the test data, or None to not load the test
                set. Default: None.
            Remaining keyword arguments (e.g. ``path``, the directory holding the
                files): Passed to the parent ``splits``.

        Resources:
            https://github.com/pytorch/text/blob/master/torchtext/data/dataset.py
            https://github.com/pytorch/text/blob/master/torchtext/datasets/language_modeling.py
            https://torchtext.readthedocs.io/en/latest/examples.html
        """
        # forward the field under the name the dataset constructor expects; the
        # previous `fields=_` picked up IPython's "last output" variable instead
        return super(CustomLMData, cls).splits(
            root=root, train=train, validation=validation, test=test,
            text_field=text_field, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, bptt_len=25, device=0, path=basepath,
              train='lmdata.txt', validation=None, test=None, root=basepath,
              vectors=None, **kwargs):
        """Create BPTT iterator objects for the requested splits.

        Arguments:
            batch_size: Batch size.
            bptt_len: Length of sequences for backpropagation through time.
            device: Device to create batches on.
            path: Directory holding the data files.
            train, validation, test: File names of the splits; a split set to
                None is not loaded.
            root: Forwarded to ``splits``.
            vectors: Pretrained vectors passed to ``build_vocab`` for the text field.
            Remaining keyword arguments: Passed to the splits method.

        Returns:
            A tuple with one iterator per loaded split, in train/validation/test order.
        """
        TEXT = data.Field()

        # `splits` returns a tuple holding only the splits that were requested
        loaded = cls.splits(TEXT, root=root, path=path, train=train,
                            validation=validation, test=test, **kwargs)

        # the vocabulary is built from the first loaded split (the training data)
        TEXT.build_vocab(loaded[0], vectors=vectors)

        return data.BPTTIterator.splits(loaded,
            batch_size=batch_size, bptt_len=bptt_len,
            device=device)


In [None]:
# using our custom dataset class that inherits from the languagemodelling dataset of 
# torchtext, create our train, test, valid splits of quora questions
train, test, valid = CustomLMData.splits(
    TEXT,
    path=basepath,
    train='train.txt',
    test='test.txt',
    validation='valid.txt',

)

In [None]:
import os
os.getcwd()
basepath

In [None]:
# build vocab
# to see available pretrained embedding options, take a peek at the source code:
# https://github.com/pytorch/text/blob/master/torchtext/vocab.py
TEXT.build_vocab(train, vectors='glove.42B.300d', min_freq=5, 
                max_size=50000)

In [None]:
# create our batch iterator object for training. This will automatically 
# shift our input text forward t+1 for our target data for the language model 
# to predict the next word in the sequence
train_iter, test_iter, valid_iter = data.BPTTIterator.splits(
    (train, test, valid), 
    batch_size=64, 
    bptt_len=25, # specifying the sequence length for back prop through time
    device=device,
    repeat=False, 
    sort_key=lambda x: len(x.text)
)

In [None]:
b1=next(iter(train_iter))

In [None]:
b = next(iter(train_iter))

In [None]:
# numerilization occurs
b.text[:5, :3]

In [None]:
# we can peep into the numerilization with
TEXT.vocab.itos[1656]

In [None]:
b.target[:5, :3]

#### Building and Training the language model

The goal here is to use the pretrained glove 300 dimensional vectors to hot start our embedding model that will be fine tuned on our actual data. We are going to build an RNN bidirectional language model

In [None]:

   super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
    
    def forward(self, x):
        """Run the bidirectional LSTM over ``x`` and classify from its last time step.

        Zero-valued initial hidden and cell states are created on the notebook-level
        ``device``; their first dimension is doubled because there is one state per
        direction.
        """
        # (layers * 2 directions, batch, hidden) -- x is batch-first
        state_shape = (self.num_layers * 2, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(*state_shape).to(device)
        initial_cell = torch.zeros(*state_shape).to(device)

        # sequence_out: (batch_size, seq_length, hidden_size*2)
        sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))

        # keep only the output at the final time step and decode it
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V

 
class BiRNN(nn.Module):
    """Bidirectional LSTM language model that carries its hidden state across batches.

    https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py
    """

    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5, tie_weights=True):
        """
        Arguments:
            ntoken: vocabulary size (rows of the embedding, width of the decoder output)
            ninp: embedding dimension
            nhid: LSTM hidden size per direction
            nlayers: number of stacked LSTM layers
            bsz: batch size used to allocate the persistent hidden state
            dropout: dropout probability (embeddings, between LSTM layers, LSTM output)
            tie_weights: stored only; weight tying is not implemented yet
        """
        super(BiRNN, self).__init__()
        self.nhid = nhid
        self.nlayers = nlayers
        self.bsz = bsz
        self.tie_weights = tie_weights # TODO: figure out tying weight with bidirectional LSTM
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        # nn.LSTM applies dropout only between stacked layers; skip it for a single layer
        self.lstm = nn.LSTM(ninp, nhid, nlayers,
                            dropout=dropout if nlayers > 1 else 0.0,
                            bidirectional=True)
        self.decoder = nn.Linear(nhid*2, ntoken) # we need *2 for bidirectional
        self.init_weights()
        self.hidden = self.init_hidden(bsz) # the input is a batched consecutive corpus
                                            # therefore, we retain the hidden state across batches

    def init_weights(self):
        """Initialise encoder/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_data):
        """Return token scores of shape (dim0, dim1, ntoken) for ids of shape (dim0, dim1)."""
        emb = self.drop(self.encoder(input_data))
        # self.hidden is a plain tuple attribute, not a registered buffer, so
        # model.to(...) / model.cuda() does not move it; follow the input's device here.
        if self.hidden[0].device != emb.device:
            self.hidden = tuple(h.to(emb.device) for h in self.hidden)
        output, self.hidden = self.lstm(emb, self.hidden)
        output = self.drop(output)
        dim0, dim1, n_features = output.size()
        # flatten the first two dimensions for the linear decoder, then restore them
        decoded = self.decoder(output.reshape(dim0 * dim1, n_features))
        return decoded.view(dim0, dim1, decoded.size(1))

    def init_hidden(self, bsz):
        """Return zeroed (hidden, cell) states on the same device and dtype as the parameters."""
        weight = next(self.parameters())
        # once again we need x2 for bidirectional LSTM
        shape = (self.nlayers*2, bsz, self.nhid)
        return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                torch.zeros(shape, dtype=weight.dtype, device=weight.device))

    def reset_history(self):
        """Detach the stored hidden state so backprop stops at the current batch."""
        self.hidden = tuple(h.detach() for h in self.hidden)

In [None]:
# we need to use our pretrained embeddings to init the RNN

BATCH_SIZE = 64

weight_matrix = TEXT.vocab.vectors
model = BiRNN(weight_matrix.size(0), 
                 weight_matrix.size(1), 200, 4, BATCH_SIZE, 
             tie_weights=True)

model.encoder.weight.data.copy_(weight_matrix)
model.cuda()

In [None]:
learning_rate = 1e-3

# define our loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.7, 0.99))
n_tokens = weight_matrix.size(0)

In [None]:
# construct the evaluation criteria

def validation_loss(valid_iter, model, loss_fn=None):
    """Return the mean per-token loss of ``model`` over ``valid_iter``.

    Args:
        valid_iter: iterable of batches exposing ``.text`` and ``.target``.
        model: callable model that also provides ``eval()`` and
            ``reset_history()``.
        loss_fn: loss taking ``(flattened_scores, flattened_targets)`` and
            returning a mean loss. Defaults to the notebook-level
            ``criterion``.

    Returns:
        The token-weighted average loss as a float, or ``0.0`` when the
        iterator yields no batches. The model is left in evaluation mode.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level loss defined in the setup cell

    total_loss = 0.0
    total_tokens = 0
    # turn on evaluation mode (disables dropout)
    model.eval()
    # No gradients are needed for evaluation; skipping the graph saves memory.
    with torch.no_grad():
        for batch in valid_iter:
            model.reset_history()
            text, targets = batch.text, batch.target
            prediction = model(text)
            loss = loss_fn(prediction.view(-1, prediction.size(-1)), targets.view(-1))
            # The loss is a mean over every target position in the batch, so
            # weight it by that count (not just the first dimension) to get a
            # true per-token average across batches of different sizes.
            batch_tokens = targets.numel()
            total_loss += loss.item() * batch_tokens
            total_tokens += batch_tokens

    return total_loss / total_tokens if total_tokens else 0.0

In [None]:
# http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/

# and write our training loop


from tqdm import trange
from time import sleep

from tqdm import tqdm_notebook as tqdm

def clip_grads(model, clip_weight=0.25):
    """Clip the global gradient norm of ``model``'s parameters in place.

    Clipping helps prevent the exploding gradient problem in RNNs / LSTMs.

    Args:
        model: object exposing ``parameters()``; call after ``backward()``.
        clip_weight: maximum allowed total gradient norm.

    Returns:
        The total gradient norm as returned by
        ``torch.nn.utils.clip_grad_norm_``.
    """
    # Only clip here. The weight update belongs to ``optimizer.step()``; an
    # extra manual SGD step in this helper would update every parameter
    # twice per batch.
    return torch.nn.utils.clip_grad_norm_(model.parameters(), clip_weight)
    

def train_model(num_epochs=10):
    """Train the notebook-level ``model`` on ``train_iter`` for ``num_epochs`` epochs.

    After every epoch the token-weighted mean training loss and the
    validation loss (``validation_loss(valid_iter, model)``) are printed.
    A final summary line is printed once all epochs have run. Nothing is
    returned, and nothing is printed when ``num_epochs`` is not positive.
    """
    epoch_loss = None
    valid_loss = None

    for epoch in range(0, num_epochs):
        # validation_loss() leaves the model in evaluation mode, so switch
        # back to training mode at the start of every epoch.
        model.train()
        loss_sum = 0.0
        tokens_seen = 0
        progress = tqdm(train_iter, desc='Epoch: {}'.format(epoch))

        for batch in progress:
            # reset the hidden state or else the model will try to backpropagate to the
            # beginning of the dataset, requiring lots of time and a lot of memory
            model.reset_history()
            optimizer.zero_grad()

            text, targets = batch.text, batch.target
            prediction = model(text)
            # flatten predictions to (positions, n_tokens) and targets to
            # (positions,) as expected by the cross entropy loss
            loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
            loss.backward()

            # clip gradients
            clip_grads(model)

            optimizer.step()

            batch_tokens = prediction.size(0) * prediction.size(1)
            loss_sum += loss.item() * batch_tokens
            tokens_seen += batch_tokens

        # Normalise once per epoch. Dividing inside the batch loop would
        # repeatedly shrink the running sum and give a meaningless value.
        epoch_loss = loss_sum / tokens_seen if tokens_seen else 0.0

        # capture validation loss for each epoch
        valid_loss = validation_loss(valid_iter, model)
        print('Epoch: {} | Training Loss: {:.4f} | Valid Loss: {:.4f}'.format(epoch, 
                                                                             epoch_loss, 
                                                                             valid_loss))

    if epoch_loss is None:
        # No epoch was run, so there is nothing to summarise.
        return

    # The weights have not changed since the last validation pass, so its
    # result is reused for the summary.
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, 
                                                                             epoch_loss, 
                                                                             valid_loss))


train_model(num_epochs=5)

In [None]:
# save model
# https://pytorch.org/tutorials/beginner/saving_loading_models.html

# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(os.path.join(basepath, 'models'), exist_ok=True)

# save entire model (pickles the whole module object) - if only wanting to
# save for inference, use model.state_dict()
torch.save(model, os.path.join(basepath, 'models/lm_200_model.pt'))

In [None]:
# Reload the pickled model from disk and switch it to evaluation mode;
# the trailing expression also displays the module.
saved_model_path = os.path.join(basepath, 'models/lm_200_model.pt')
test_model = torch.load(saved_model_path)
test_model.eval()

In [None]:
def word_ids_to_sentence(id_tensor, vocab, join=None):
    """Converts a 2-D batch of word ids to words.

    The two axes of ``id_tensor`` are swapped before flattening, so ids are
    read along the first axis for each index of the second axis in turn.

    Args:
        id_tensor: 2-D ids as a torch tensor (any integer dtype, on any
            device), a numpy array, or anything ``np.asarray`` accepts.
        vocab: object whose ``itos`` sequence maps an id to its token.
        join: optional separator; when given, tokens are joined into one
            string.

    Returns:
        A list of tokens when ``join`` is None, otherwise the joined string.
    """
    if torch.is_tensor(id_tensor):
        # Works for CUDA and non-int64 tensors too, which an
        # isinstance(..., torch.LongTensor) check would silently skip.
        ids = id_tensor.detach().cpu().transpose(0, 1).reshape(-1).tolist()
    else:
        ids = np.asarray(id_tensor).transpose().reshape(-1).tolist()

    tokens = [vocab.itos[ind] for ind in ids]  # denumericalize
    if join is None:
        return tokens
    return join.join(tokens)

In [None]:


# Score the batch held in `b` (expected to exist from an earlier cell --
# TODO confirm), take the highest-scoring token id at every position and
# show the first 500 characters of the decoded text.
arrs = model(b.text).detach().cpu().numpy()
top_ids = arrs.argmax(axis=2)
word_ids_to_sentence(top_ids, TEXT.vocab, join=' ')[:500]

In [None]:
# Short alias for TEXT.vocab, reused by the cells below.
vocab = TEXT.vocab

# Display the integer id that the vocabulary maps the token 'the' to.
vocab.stoi['the']

In [None]:
# Number of distinct tokens recorded in the vocabulary's frequency table.
len(vocab.freqs)

In [None]:
# https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [None]:
# pull out vocab items

# Read the embedding matrix once, detached from autograd and without
# dropout, so the stored vectors are the actual learned weights whatever
# mode the model is in. The copy keeps them independent of later updates.
embedding_matrix = model.encoder.weight.detach().cpu().numpy().copy()

wrd_to_embedding = {}
for wrd in vocab.freqs:
    wrd_id = vocab.stoi[wrd]
    # Keep a (1, dim) shape, the same shape a single-id embedding lookup
    # returns, so entries can be passed directly to cosine_similarity.
    np_array = embedding_matrix[wrd_id:wrd_id + 1]
    wrd_to_embedding[wrd] = np_array

In [None]:
from scipy.spatial.distance import cosine

# scipy's cosine() works on 1-D vectors, while the stored embeddings are
# (1, dim) arrays, so flatten them first. Note that this is a distance
# (1 - cosine similarity), not a similarity.
cosine(wrd_to_embedding['successful'].ravel(), wrd_to_embedding['pick'].ravel())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the stored vectors of two words.
soldier_vec = wrd_to_embedding['soldier']
war_vec = wrd_to_embedding['war']
cosine_similarity(soldier_vec, war_vec)

In [None]:
# Exploratory: list the attribute names available on the vocabulary object.
dir(vocab)

In [None]:
# test if vectors actually drifted by looking at our original vectors 
# from the glove implementation
old_w2v = {}

# vocab.vectors still holds the pretrained values: the model received a copy
# of them via copy_(), not a reference.
pretrained_vectors = vocab.vectors.detach().cpu().numpy()
for wrd in vocab.freqs:
    wrd_id = vocab.stoi[wrd]
    # Store this word's own pretrained row (not a leftover array from an
    # earlier cell), shaped (1, dim) so it can be passed directly to
    # cosine_similarity.
    old_w2v[wrd] = pretrained_vectors[wrd_id].reshape(1, -1)


In [None]:
# Compare the vector taken from the model with the one stored in old_w2v for
# the same word; a value below 1 means the two vectors point in different
# directions.
cosine_similarity(wrd_to_embedding['war'], old_w2v['war'])