In [None]:
# load the relevant libraries for our analysis
import os
import pandas as pd
import torch
import spacy
from tqdm import tqdm_notebook as tqdm
from torchtext import data
from torchtext import datasets
import random
import numpy as np
import torchtext
import sys

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Show the selected device as the cell output.
device

This tutorial serves as an introduction to torchtext, a library designed to preprocess text data so that it
can be used in tandem with PyTorch as well as with other deep learning libraries.

For this tutorial we will use the Consumer Complaint Database put together by the Consumer Financial 
Protection Bureau (CFPB). More info about the data.



#### Basic text preprocessing

We will leverage spaCy's modern text preprocessing methods to lemmatize, handle some spelling errors, create a pronoun flag, etc. The first step in our code is to load the large English language model.

In [None]:
# Load spaCy's large English pipeline; the tokenizer function defined later
# in the notebook uses `nlp.tokenizer` to split the complaint text.
nlp = spacy.load('en_core_web_lg')

spaCy is one of the most powerful NLP libraries available for data processing (TODO: add a word on why spaCy is better). The next step is to define the data paths.

In [None]:
# load spacy model
#nlp = spacy.load('en_core_web_lg')

In [None]:
# Base directory of the tutorial repository. Set the TUTORIALS_BASEPATH
# environment variable to run the notebook on another machine without editing
# this cell; the original location is kept as the default.
basepath = os.environ.get('TUTORIALS_BASEPATH',
                          '/media/jlealtru/data_files/github/Tutorials')

# Directory holding the complaints data. Defaults to the dataset folder under
# `basepath`; set TUTORIALS_DATA_DIR to point somewhere else.
DATA_DIR = os.environ.get(
    'TUTORIALS_DATA_DIR',
    os.path.join(basepath, 'datasets/text_analytics/financial'))

# List the directory contents to confirm the data files are in place.
os.listdir(DATA_DIR)

Since the dataset is quite big, we are not hosting it on GitHub; you can get it from here:
link    
    

In [None]:
# Load the raw complaints file to get a first look at the data.
complaints_csv = os.path.join(DATA_DIR, 'Consumer_Complaints.csv')
df = pd.read_csv(complaints_csv, engine='python')
print(f"The dataframe has {len(df):,} observations.")
df.head()

We are interested in predicting the product that each complaint refers to. In the dataset we have 18 different
types of products.

In [None]:
# Number of complaints recorded for each product.
df.groupby('Product')[['Complaint ID']].count()

In [None]:
# Despite the fact that the data has more than 1 million observations, a cursory exploration reveals that the data
# has multiple missing values. We check the actual distribution of answers.

In [None]:
# Number of complaints for each answer to the consent question.
df.groupby('Consumer consent provided?')[['Complaint ID']].count()

In [None]:
# Count the rows whose consent answer is exactly 'Consent provided'.
has_consent = df['Consumer consent provided?'] == 'Consent provided'
n_consented = len(df[has_consent])
print(f"We have complete information for {n_consented:,} observations")

In [None]:
#df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), engine='python')

We filter the data to only have observations with a consumer narrative (the rows where consumer consent was provided), select the relevant columns and shuffle 
the data. We use the pandas native sample function, which generates a random sample of length n; in this 
case n is equal to the number of observations in the filtered dataset. We also set the random state
to a predefined number so we can replicate the results.

In [None]:
# Keep the rows with consent, retain the id / narrative / product columns and
# shuffle every row with a fixed random state so the order is reproducible.
consented = df['Consumer consent provided?'] == 'Consent provided'
kept_columns = ['Complaint ID', 'Consumer complaint narrative', 'Product']
df = df.loc[consented, kept_columns].sample(frac=1, random_state=10)

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Class balance: number of complaints per product.
df.groupby('Product')[['Complaint ID']].count()

In [None]:
# Two classes are very unbalanced: 'Virtual currency' and 'Other financial service'.
# To improve the performance of the model we merge both into a single class.
# A single .loc assignment is used instead of chained indexing
# (df['Product'][mask] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to write the new value back into df.
rare_products = ['Virtual currency', 'Other financial service']
df.loc[df['Product'].isin(rare_products), 'Product'] = 'Other financial services'

In [None]:
# Confirm the merge: recount the complaints per product.
df.groupby('Product')[['Complaint ID']].count()

We reset the index and rename the column containing the complaint narrative to `text`; we will use
that column as the text field in torchtext.

In [None]:
# Discard the existing row labels and renumber the rows from 0.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Rename the narrative column to 'text', the column name the torchtext field
# list refers to later in the notebook.
df = df.rename(columns={'Consumer complaint narrative': 'text'})
df.head()

In [None]:
# Give every distinct product label an integer id, numbered in the order in
# which `unique()` returns the labels.
product_names = df.Product.unique()
dict_values_target = dict(enumerate(product_names))

# Lookup table with one row per product: integer id next to the product name.
product_lookup = pd.DataFrame(
    {
        "product_id": list(dict_values_target.keys()),
        "product_text": list(dict_values_target.values()),
    }
)
product_lookup.head()

In [None]:
# Attach the integer product_id to every row by matching Product against the
# lookup table's product_text column. The row count is printed before and
# after the left join so that any change in length is visible.
print(len(df))
df=pd.merge(df,product_lookup, how='left', left_on='Product',right_on='product_text')
print(len(df))

In [None]:
# Keep only the model input (text) and the integer target (product_id).
df=df[['text', 'product_id']]

In [None]:
# Next we create the training, validation and test datasets.

In [None]:
# Create the train, test and validation datasets (roughly 80% / 10% / 10%).
# Both masks are drawn from a dedicated, seeded generator so the split is
# identical on every run; the unseeded np.random.rand calls used before gave
# a different split each time the cell was executed.
split_rng = np.random.RandomState(10)

# ~80% of the rows go to training ...
msk = split_rng.rand(len(df)) < 0.8
train = df[msk]
test_ = df[~msk]

# ... and the remainder is divided roughly in half between test and validation.
msk1 = split_rng.rand(len(test_)) <= 0.5
test = test_[msk1]
val = test_[~msk1]
del test_

In [None]:
# Report how many rows ended up in each split.
split_sizes = (
    f"The len of train dataset is {len(train):,}, the len of test is {len(test):,} "
    f"and the len of valid is {len(val):,}"
)
print(split_sizes)

In [None]:
# Write each split to its own CSV file under `basepath`, leaving out the
# DataFrame index so the files contain only the data columns.
for split_frame, file_name in ((train, 'train.csv'), (test, 'test.csv'), (val, 'val.csv')):
    split_frame.to_csv(os.path.join(basepath, file_name), index=False)

In [None]:
# Read the test split back from disk and show its column names as a quick
# check on the file that was just written.
a=pd.read_csv(os.path.join(basepath, 'test.csv'))
a.columns

In [None]:
# Number of documents passed through `tokenizer` so far; drives the progress readout.
tokenize_count = 0


def tokenizer(text):
    """Split ``text`` into a list of token strings using ``nlp.tokenizer``.

    Each call increments the module-level ``tokenize_count``. Whenever the
    count before the increment is a multiple of 1000 (including the very first
    call), a ``Doc: <count>`` line is rewritten in place on stdout.
    """
    global tokenize_count
    docs_seen = tokenize_count
    if not docs_seen % 1000:
        # '\r' moves back to the start of the line so the counter overwrites itself
        sys.stdout.write('\rDoc: {}'.format(docs_seen))
        sys.stdout.flush()
    tokenize_count = docs_seen + 1
    tokens = nlp.tokenizer(text)
    return [token.text for token in tokens]

In [None]:
# Field for the complaint text: sequential data split with the custom `tokenizer`.
TEXT = data.Field(sequential=True, tokenize=tokenizer)
# Field for the target column. use_vocab=False asks torchtext not to map the
# values through a vocabulary; presumably the product_id values are meant to
# be used directly as class indices — TODO confirm.
LABEL = data.LabelField(sequential=False, use_vocab=False, pad_token=None, 
                        unk_token=None)

In [None]:
# (column name, field) pairs telling torchtext which field processes which column.
_datafields = [  ('text', TEXT), ('product_id', LABEL)
               # both listed columns are used, so no (name, None) placeholder is present
               ]

In [None]:
from torchtext.data import TabularDataset
from torchtext import data

# Single seed shared by every random number generator the notebook touches.
SEED = 1234

# Seed Python's and NumPy's generators as well as PyTorch's: only torch was
# seeded before, so anything drawing from `random` or `np.random` after this
# cell still varied between runs.
# NOTE(review): torchtext's iterators appear to shuffle with Python's `random`
# state, which is why seeding it matters for batch order — confirm for the
# installed version.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# seeds every visible GPU; does nothing when CUDA is unavailable
torch.cuda.manual_seed_all(SEED)

# Uncomment to force deterministic cuDNN kernels (can be slower).
#torch.backends.cudnn.deterministic = True



In [None]:
# Load the three CSV splits written earlier into torchtext datasets.
# `path` is the directory the file names below are joined to. The previous
# call passed path="data" together with absolute file paths, which only
# worked because joining onto an absolute path discards the prefix; it would
# break as soon as `basepath` became relative.
trn, vld, tst = TabularDataset.splits(
    path=basepath,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    format='csv',
    skip_header=True,  # the CSV files start with a header row; skip it so it is not processed as data
    fields=_datafields)

In [None]:
# Sanity check: list the (name, field) pairs attached to the training dataset.
trn.fields.items()

In [None]:
# Build the text vocabulary from the training set only, attaching 300-dimensional
# GloVe vectors. The vocabulary is capped at 50,000 entries and a token must
# occur at least 5 times to be included.
TEXT.build_vocab(trn, vectors='glove.42B.300d', min_freq=5, 
                max_size=50000)
# Build the label vocabulary from the training set as well; its length and
# frequency counts are read further down the notebook.
LABEL.build_vocab(trn)

In [None]:
# Print the size of the text vocabulary and the number of distinct labels.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

In [None]:
# Inverse-frequency class weights stored as a tensor indexed by class id
# (weights[i] belongs to class i), the layout nn.CrossEntropyLoss expects for
# its `weight` argument. The previous version produced a plain Python list in
# the Counter's own iteration order, which is neither a tensor nor aligned
# with the class ids.
# NOTE(review): assumes the labels counted in LABEL.vocab.freqs are the
# integer product ids read from the CSV (as strings) — confirm.
label_counts = LABEL.vocab.freqs
n_classes = max((int(label) for label in label_counts), default=-1) + 1
weights = torch.zeros(n_classes)
for label, count in label_counts.items():
    weights[int(label)] = 1 / count

In [None]:
# Show the 20 most frequent tokens counted while building the text vocabulary.
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# Show the label vocabulary's string-to-index mapping.
print(LABEL.vocab.stoi)

In [None]:
# Create batch iterators for the training, test and validation datasets.
# The datasets are passed in the order (trn, tst, vld), matching the order of
# the names on the left-hand side. Batches hold 32 examples and are created
# on `device`; `sort_key` gives the iterator the text length of each example,
# which it uses to batch examples of similar length together.
train_iter, test_iter, valid_iter = data.BucketIterator.splits(
    (trn, tst, vld), 
    batch_size=32,
    device=device,
    sort_key=lambda x: len(x.text))

In [None]:
# Pull a single batch from the training iterator to inspect it.
b=next(iter(train_iter))

In [None]:
# Shape of the label tensor in the sampled batch.
b.product_id.shape

We will use an LSTM RNN, a pretty standard model used to classify sequential data. For a refresher on RNNs you can check out the good videos of Deep AI (they tend to be a bit theory heavy). Fast.ai's Deep Learning for Coders course is another great resource if you are interested in videos. One of the most common tasks in PyTorch is classification (TODO: link to discussion).

PyTorch's model building blocks live in the `nn` module. We define a class that will hold the model, layers and parameters necessary for our sample. You may notice the call to `super()` in the constructor (TODO: Jason to expand on this).

Discussion of the LSTM model 
  - add discussion of the model
  - add reference to the LSTM model and a link to a couple of tutorials.


In [None]:
# define a class that will hold the model as well as the necessary parameters for it to work
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table (size of the text vocabulary).
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Hidden size of the LSTM, per direction.
    output_dim : int
        Number of classes; size of the score vector returned per example.
    n_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool
        Whether the LSTM reads the sequence in both directions.
    dropout : float
        Dropout probability applied to the embeddings, between LSTM layers
        and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier sees one final hidden state per direction, so its
        # input width depends on the flag (it used to be fixed at hidden_dim*2).
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer it has no effect and emits a warning, so pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return unnormalised class scores of shape [batch size, output_dim].

        ``x`` holds token ids laid out as [sent len, batch size].
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(x))

        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell   = [num layers * num directions, batch size, hid dim]
        output, (hidden, cell) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # the last two entries are the top layer's final forward
            # (hidden[-2]) and backward (hidden[-1]) states
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # only one direction: the last entry is the top layer's final state
            final_hidden = hidden[-1, :, :]

        # apply dropout before the fully connected layer
        return self.fc(self.dropout(final_hidden))

In [None]:
# define the parameters for the model and instantiate the model class
INPUT_DIM = len(TEXT.vocab)      # number of entries in the text vocabulary
EMBEDDING_DIM = 300              # matches the 300-dimensional GloVe vectors loaded into TEXT.vocab
HIDDEN_DIM = 128                 # LSTM hidden size
OUTPUT_DIM = len(LABEL.vocab)    # one output per entry in the label vocabulary
N_LAYERS = 2                     # stacked LSTM layers
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [None]:
# Pretrained vectors attached to the text vocabulary when it was built.
pretrained_embeddings = TEXT.vocab.vectors

In [None]:
# Initialise the embedding layer with the pretrained vectors, then move the
# model to the device selected at the top of the notebook. model.cuda() raised
# on machines without a GPU even though `device` already falls back to the CPU.
model.embedding.weight.data.copy_(pretrained_embeddings)
model = model.to(device)

# display the architecture as the cell output
model

In [None]:
# define our loss and optimizer
#loss_function = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=0.1)
#nn.

In [None]:
#import torch.optim as optim

#optimizer = optim.Adam(model.parameters())

In [None]:
import torch.optim as optim
learning_rate = 1e-3

# Define the loss function and the optimizer that updates the model.
# TODO: explain the choice of betas; they differ from Adam's defaults of (0.9, 0.999).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.7, 0.99))
# Number of rows in the pretrained embedding matrix.
n_tokens = pretrained_embeddings.size(0)

In [None]:
#help(nn.CrossEntropyLoss)
#import torch.nn.functional as F

In [None]:
#loss_function = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
#def accuracy(out, labels):
#    return torch.sum(labels.data == out)/float(labels.size)

In [None]:
# Release GPU memory cached by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [None]:
def train(model, iterator, optimizer, criterion, epoch=None, log_every=10):
    """Train ``model`` for one pass over ``iterator``.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(batch.text)``; must return one row of class scores
        per example.
    iterator : iterable
        Yields batches exposing ``text`` and ``product_id`` tensors.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters.
    criterion : callable
        Loss called as ``criterion(predictions, targets)``. To weight the
        classes, construct it with ``nn.CrossEntropyLoss(weight=...)``; the
        loss call itself accepts no weight argument.
    epoch : int, optional
        Zero-based epoch number used in the progress line. When omitted, the
        notebook-level loop variable ``epoch`` is used if it exists (the
        original code read it implicitly), otherwise 0.
    log_every : int, optional
        Print a progress line every ``log_every`` batches; 0 disables logging.

    Returns
    -------
    (float, Tensor or None)
        Sum of the batch losses, and the predictions for the last batch
        (``None`` when the iterator yields nothing).
    """
    if epoch is None:
        epoch = globals().get('epoch', 0)

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0.0
    running_loss = 0.0
    predictions = None
    model.train()

    for batch_i, batch in enumerate(iterator):
        text = batch.text.to(target_device)
        targets = batch.product_id.to(target_device)

        optimizer.zero_grad()
        predictions = model(text)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        running_loss += batch_loss

        if log_every and batch_i % log_every == log_every - 1:
            # accuracy of the current batch, using its real size (the last
            # batch of an epoch can be smaller than the configured batch size)
            correct = (predictions.argmax(dim=1) == targets).sum().item()
            print('Epoch: {}, Batch: {}, Avg. Loss: {}, correct{}'.format(
                epoch + 1, batch_i + 1, running_loss / log_every,
                correct / targets.size(0)))
            # start a fresh window so the average covers only the last log_every batches
            running_loss = 0.0

    return epoch_loss, predictions

In [None]:
# define test function to measure accuracy

def test_data():
    
    # iterate through the test dataset
    for i, batch in enumerate(test_iter):
        
        # move inputs to gpu
        batch.text = batch.text.cuda()
        batch.product_id=batch.product_id.cuda()
        
        predictions = model(batch.text)
        
        loss=criterion(predictions, batch.product_id)
                
        #acc = binary_accuracy(predictions, batch.product_id)
        epoch_loss += loss.item()
        running_loss += loss.item()
        correct = (torch.max(predictions, 1)[1] == batch.product_id).sum()
        #print(float(correct)/32)
        if batch_i % 10 == 9:    # print every 10 batches
            print('Epoch: {}, Batch: {}, Avg. Loss: {}, correct{}'.format(epoch + 1,
                                                                          batch_i+1, running_loss/1000,
                                                                          float(correct)/32))        
        loss.backward()
        
        optimizer.step()
        
        
        

In [None]:
N_EPOCHS = 5

# Run the training loop and keep each epoch's summed batch loss; the value
# returned by train() used to be discarded, leaving no per-epoch record.
# The loop variable is named `epoch` on purpose: train()'s progress line
# reads it.
epoch_losses = []
for epoch in range(N_EPOCHS):
    epoch_loss, last_predictions = train(model, train_iter, optimizer, criterion)
    epoch_losses.append(epoch_loss)
    print(f"Epoch {epoch + 1}/{N_EPOCHS} total loss: {epoch_loss:.4f}")

In [None]:
# save model
## TODO: change the name to something unique for each new model
model_dir = os.path.join(DATA_DIR, 'savedmodels/')
model_name = 'simple_lstm_financial_data.pt'

# torch.save does not create missing directories, so make sure the target exists
os.makedirs(model_dir, exist_ok=True)

# after training, save the model parameters (state_dict) in `model_dir`
torch.save(model.state_dict(), os.path.join(model_dir, model_name))

In [None]:
# evaluate test

In [None]:
# Compare the true and predicted classes for one batch of the test set.
# `b1` and `pr1` are not defined in any cell visible here, so this cell failed
# on a fresh kernel. They are rebuilt on the assumption that b1 holds the true
# labels and pr1 the raw model outputs for the same batch — TODO confirm intent.
model.eval()
with torch.no_grad():
    sample_batch = next(iter(test_iter))
    b1 = sample_batch.product_id
    pr1 = model(sample_batch.text)

print(b1)
# softmax needs an explicit dim (calling it without one is deprecated);
# dim=1 normalises over the classes of each example
print(torch.max(torch.nn.functional.softmax(pr1, dim=1), 1)[1])

In [None]:
# NOTE: `train` was first assigned the training DataFrame and later rebound by
# `def train(...)`; run top to bottom, this displays the function, not the data.
train

In [None]:
# save model once its done.

In [None]:
# Display the model architecture.
model