In [1]:
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re
import nltk
from nltk.corpus import stopwords
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from spacy.lang.en.examples import sentences 

#### 1. Declare the model

In [2]:
class LSTM_Model(nn.Module):
    """Two-layer LSTM text classifier with a pooled-feature MLP head.

    Pipeline: embedding lookup -> dropout -> LSTM -> concatenation of
    (average-pool over time, max-pool over time, last time step) ->
    ``nl`` x [Linear -> ReLU -> BatchNorm -> dropout] -> output Linear.

    Sub-module attribute names (``embeddings``, ``rnn``, ``bn1``, ``lArr``,
    ``l1``) are kept unchanged so existing ``state_dict`` checkpoints load.

    Args:
        op_size: number of output logits.
        n_tokens: vocabulary size (rows of the embedding matrix).
        pretrained_vectors: tensor copied into the embedding weights; must be
            copy-compatible with a ``(n_tokens, emb_sz)`` tensor.
        nl: number of hidden Linear layers before the output layer. The first
            one consumes the 3-way pooled concatenation, so ``nl >= 1`` is
            required for the shapes to line up.
        bidirectional: whether the LSTM runs in both directions.
        emb_sz: embedding dimension.
        n_hiddenUnits: LSTM hidden size per direction.
    """

    def __init__(self,op_size,n_tokens,pretrained_vectors,nl=2,bidirectional=True,emb_sz=300,n_hiddenUnits=100):
        super(LSTM_Model, self).__init__()
        self.n_hidden = n_hiddenUnits
        self.n_rnn_layers = 2
        self.n_directions = 2 if bidirectional else 1

        self.embeddings = nn.Embedding(n_tokens, emb_sz)
        self.embeddings.weight.data.copy_(pretrained_vectors)

        # Honour the `bidirectional` argument: it used to be hard-coded to True
        # here while the layer sizes below depended on the argument.
        self.rnn = nn.LSTM(emb_sz, n_hiddenUnits, num_layers=self.n_rnn_layers,
                           bidirectional=bool(bidirectional), dropout=0.2)

        # Width of one LSTM output step (both directions concatenated).
        n_features = n_hiddenUnits * self.n_directions
        # A single BatchNorm is shared by every hidden layer (kept as-is so
        # checkpoint keys stay the same).
        self.bn1 = nn.BatchNorm1d(num_features=n_features)
        self.lArr = nn.ModuleList([
            # First layer sees avg-pool + max-pool + last step => 3 * n_features.
            nn.Linear(n_features * 3 if i == 0 else n_features, n_features)
            for i in range(nl)
        ])
        self.l1 = nn.Linear(n_features, op_size)

    def forward(self,data,lengths):
        """Compute logits for a batch.

        Args:
            data: LongTensor of token ids shaped ``(seq_len, batch)``.
            lengths: per-example sequence lengths; accepted for interface
                compatibility but currently unused (sequences are not packed).

        Returns:
            Tensor of shape ``(batch, op_size)`` with raw (pre-sigmoid) logits.
        """
        bs = data.shape[1]
        h0 = self.init_hidden(bs, device=data.device)

        # Functional dropout keyed on self.training, so model.eval() really
        # disables it (a freshly constructed nn.Dropout is always in train mode).
        embedded = F.dropout(self.embeddings(data), p=0.5, training=self.training)

        # Zero tensors for both the initial hidden and cell state.
        rnn_out, _ = self.rnn(embedded, (h0, h0))

        # (seq, batch, feat) -> (batch, feat, seq) so the 1d pools reduce over time.
        time_last = rnn_out.permute(1, 2, 0)
        avg_pool = F.adaptive_avg_pool1d(time_last, 1).view(bs, -1)
        max_pool = F.adaptive_max_pool1d(time_last, 1).view(bs, -1)

        features = torch.cat([avg_pool, max_pool, rnn_out[-1]], dim=1)
        for linearlayer in self.lArr:
            features = self.bn1(F.relu(linearlayer(features)))
            features = F.dropout(features, p=0.6, training=self.training)
        return self.l1(features)

    def init_hidden(self, batch_size, device=None):
        """Return a zero initial state of shape ``(layers * directions, batch_size, hidden)``.

        Args:
            batch_size: number of sequences in the batch.
            device: target device; defaults to the device of the embedding weights.
        """
        weight = self.embeddings.weight
        if device is None:
            device = weight.device
        return torch.zeros(
            (self.n_rnn_layers * self.n_directions, batch_size, self.n_hidden),
            device=device,
            dtype=weight.dtype,
        )

#### 2. Load the Model

Load the category column names from the header of the training CSV

In [3]:
# Only the header is needed: nrows=0 parses the column names without loading
# any data rows. The first column is skipped; the remaining names are used as
# the output categories.
columns = pd.read_csv("Datasets/train.csv", nrows=0).columns[1:]

Load the spaCy English tokenizer and build a combined stop-word set (English from spaCy, Spanish from NLTK)

In [4]:
# spaCy pipeline; only its tokenizer is used below.
my_tok = spacy.load('en_core_web_sm')
# Spanish stop words from NLTK.
STOPWORDS = set(stopwords.words('spanish'))
# Build a private union instead of calling .update() on spaCy's STOP_WORDS,
# which would mutate the library's module-level set for the whole process.
my_stopwords = set(spacy.lang.en.stop_words.STOP_WORDS) | STOPWORDS

In [5]:
def spacy_tok(x):
    """Tokenize a string with the spaCy tokenizer after stripping noise.

    Every character that is not an ASCII letter or whitespace is removed,
    then newline characters are removed, and the remaining text is split
    by ``my_tok.tokenizer``. Returns the token texts as a list of strings.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]','',x)
    single_line = re.sub(r'[\n]','',letters_only)
    tokens = []
    for tok in my_tok.tokenizer(single_line):
        tokens.append(tok.text)
    return tokens

Define the TEXT and LABEL fields used to parse the CSV files into torchtext TabularDatasets

In [6]:
# Text column: lower-cased, tokenized with spacy_tok, stop words removed,
# an 'EOS' token appended, and batches returned together with their lengths.
TEXT = data.Field(
    tokenize=spacy_tok,
    lower=True,
    stop_words=my_stopwords,
    eos_token='EOS',
    include_lengths=True,
)
# Label columns: already numeric, so no vocabulary, padding or unknown token.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    pad_token=None,
    unk_token=None,
)

In [7]:
# One (column name, Field) pair per CSV column, in file order: the free-text
# "Details" column first, followed by one label column per category.
category_names = [
    "Auto & Transport",
    "Bills & Utilities",
    "Entertainment",
    "Fees & Charges",
    "Food & Dining",
    "Gifts & Donations",
    "Health & Fitness",
    "Shopping",
    "Transfer",
    "Travel",
    "Withdrawal",
]
dataFields = [("Details", TEXT)] + [(name, LABEL) for name in category_names]

In [8]:
# Parse the training and validation CSVs (header row skipped) with the
# field layout declared in dataFields.
train, val = data.TabularDataset.splits(
    path="Datasets/",
    train="train.csv",
    validation="val.csv",
    format="csv",
    fields=dataFields,
    skip_header=True,
)

Build the vocabulary

In [10]:
# Build the token vocabulary from both splits and attach the
# 'fasttext.simple.300d' pretrained vectors to it.
# NOTE(review): presumably this must reproduce the vocabulary used at training
# time so token ids line up with the saved embedding weights - confirm.
TEXT.build_vocab(train,val, vectors=['fasttext.simple.300d'])

In [11]:
# Vocabulary size; this value is passed to the model as n_tokens below.
len(TEXT.vocab)

2854

Extract the pretrained embedding matrix for the vocabulary (on CPU)

In [12]:
# Embedding matrix attached to the "Details" field's vocabulary, moved to CPU.
details_field = train.fields['Details']
vectors = details_field.vocab.vectors.to("cpu")

Load the model

In [13]:
# nl=1: one hidden Linear layer ahead of the output layer.
model = LSTM_Model(len(columns), len(TEXT.vocab), vectors, nl=1)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; everything else in this notebook runs on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load("LSTM_transactions_model.pt", map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

Read csv test file

In [14]:
# The test file is read twice: once as a DataFrame that will receive the
# predictions, and once as a torchtext dataset that feeds the model.
dfTest = pd.read_csv("Datasets/test.csv")
dataFields = [("Details", TEXT)]
testDataset = data.TabularDataset(
    path='Datasets/test.csv',
    format='csv',
    fields=dataFields,
    skip_header=True,
)
# No sorting or shuffling, so batches come out in file order and the stacked
# predictions line up row-for-row with dfTest.
test_iter1 = torchtext.data.Iterator(
    testDataset,
    batch_size=32,
    device=torch.device('cpu'),
    sort=False,
    sort_within_batch=False,
    repeat=False,
    shuffle=False,
)

Process predictions

In [15]:
# Run the model over the test set and collect per-category probabilities.
myPreds = []
model.eval()
with torch.no_grad():
    for batch in test_iter1:
        # include_lengths=True makes the field a (token ids, lengths) pair.
        tokens, lengths = batch.Details
        logits = model(tokens, lengths)
        # Independent sigmoid per category (multi-label style scores).
        myPreds.append(torch.sigmoid(logits).numpy())

In [16]:
# Stack the per-batch arrays into one (rows, categories) matrix, then store
# each category's scores as a column of dfTest.
myPreds = np.vstack(myPreds)
for col, scores in zip(columns, myPreds.T):
    dfTest[col] = scores

In [17]:
# NOTE(review): this loop is an exact repeat of the column assignment in the
# previous cell; it re-writes the same values and looks safe to delete.
for i, col in enumerate(columns):
    dfTest[col] = myPreds[:, i]

In [18]:
# Map each score-column position to its category name.
dictOfCategories = dict(enumerate(columns))

In [19]:
# Pick the highest-scoring category for every row.
# The score columns are selected by name rather than by position, so extra
# columns in test.csv cannot end up inside the argmax, and the argmax is
# computed once over the whole matrix instead of row by row with iloc.
category_scores = dfTest[list(columns)].values
best = [dictOfCategories[idx] for idx in category_scores.argmax(axis=1)]

Write the predictions to a CSV file

In [20]:
# Attach the winning category name for each row.
dfTest["Prediction"] = best

In [21]:
# Keep only the input text and the predicted label.
# NOTE: dfTest is rebound here, so the per-category score columns are no
# longer available on it after this cell.
dfTest = dfTest[["Details","Prediction"]]

In [22]:
# Write the predictions without the DataFrame index.
dfTest.to_csv("Datasets/output.csv", index=False)

In [23]:
# Read the written file back and show its first rows as a spot check.
pd.read_csv("Datasets/output.csv").head()

Unnamed: 0,Details,Prediction
0,8150123131lgr 150220p98cafe baristi cosm,Food & Dining
1,8150123130fosa860123fi1rest public house,Food & Dining
2,paypal uber bv 35314369001 nl,Auto & Transport
3,paypal uber bv 35314369001 nl,Auto & Transport
4,paypal uber bv 35314369001 nl,Auto & Transport
