In [1]:
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
import spacy
from spacy.lang.en.examples import sentences

In [3]:
my_tok = spacy.load('en_core_web_sm')
my_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [4]:
def spacy_tok(x):
    x= re.sub(r'[^a-zA-Z\s]','',x)
    x= re.sub(r'[\n]','',x)
    #x = re.sub(r'[A-Za-z]+|\d+',x)
    #SPLIT_NUMBERS = re.compile(r'([+-]?\d*\.?\d+(?:[eE][-+]?\d+)?)')
    #x = ' '.join(SPLIT_NUMBERS.split(x))
    return [tok.text for tok in my_tok.tokenizer(x)]

In [5]:
TEXT = data.Field(lower=True, tokenize=spacy_tok,eos_token='EOS',stop_words=my_stopwords,include_lengths=True)
LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                            unk_token=None)

In [40]:
df = pd.read_csv("Datasets/training-data.csv", header = 0, names=["Category", "Details"])

In [7]:
df = df.join(pd.get_dummies(df["Category"]))

In [8]:
df.drop("Category", axis=1, inplace=True)

In [29]:
#df.rename(columns={"Auto & Transport":"Auto_and_Transport", "Bills & Utilities":"Bills_and_Utilities", "Fees & Charges":"Fees_and_Charges", "Food & Dining":"Food_and_Dining", "Gifts & Donations":"Gifts_and_Donations", "Health & Fitness":"Health_and_Fitness"}, inplace=True)

In [9]:
df.head()

Unnamed: 0,Details,Auto & Transport,Bills & Utilities,Entertainment,Fees & Charges,Food & Dining,Gifts & Donations,Health & Fitness,Shopping,Transfer,Travel,Withdrawal
0,REST PUBLIC HOUSE 2 TIJ,0,0,0,0,1,0,0,0,0,0,0
1,8254833557VPC 160113L21DANTES GASTROMED,0,0,0,0,1,0,0,0,0,0,0
2,7412040258EINI5408116L9REST DAS CORTEZ C,0,0,0,0,1,0,0,0,0,0,0
3,58522132620267080680112REST PUBLIC HOUSE,0,0,0,0,1,0,0,0,0,0,0
4,9497635894DCA 0105286Q2LA EUROPEA molestias,0,0,0,0,1,0,0,0,0,0,0


In [10]:
train, val = train_test_split(df, test_size=0.2)

In [11]:
train.to_csv("Datasets/train.csv", index_label=False, index=False)
val.to_csv("Datasets/val.csv", index_label=False, index=False)

In [12]:
dataFields = [("Details", TEXT),
             ("Auto & Transport", LABEL), ("Bills & Utilities", LABEL),
             ("Entertainment", LABEL), ("Fees & Charges", LABEL),  
             ("Food & Dining", LABEL), ("Gifts & Donations", LABEL),
             ("Health & Fitness", LABEL), ("Shopping", LABEL),
             ("Transfer", LABEL), ("Travel", LABEL), ("Withdrawal", LABEL)]

In [13]:
train,val = data.TabularDataset.splits(path="Datasets/",train="train.csv", validation="val.csv", format="csv", fields=dataFields, skip_header=True)

In [14]:
#vec = torchtext.vocab.Vectors('cc.es.300.vec', cache='./Datasets/')

In [15]:
#TEXT.build_vocab(train,val, vectors=['fasttext.simple.300d',vec])

In [16]:
TEXT.build_vocab(train,val, vectors=['fasttext.simple.300d'])

In [17]:
len(TEXT.vocab)

2863

In [18]:
traindl, valdl = torchtext.data.BucketIterator.splits(datasets=(train, val),
                                            batch_sizes=(64, 64),
                                            sort_key=lambda x: len(x.Details),
                                            device=torch.device('cpu'),
                                            sort_within_batch=True
                                                     )

In [19]:
vectors= train.fields['Details'].vocab.vectors.to("cpu")

In [20]:
class BatchGenerator:
    def __init__(self, dl):
        self.dl = dl
        self.yFields= df.columns[1:]
        self.x= 'Details'
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x)
            y = torch.transpose( torch.stack([getattr(batch, y) for y in self.yFields]),0,1)
            yield (X,y)

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [22]:
class MyModel(nn.Module):
    def __init__(self,op_size,n_tokens,pretrained_vectors,nl=2,bidirectional=True,emb_sz=300,n_hiddenUnits=100):
        super(MyModel, self).__init__()
        self.n_hidden= n_hiddenUnits
        self.embeddings= nn.Embedding(n_tokens,emb_sz)
        self.embeddings.weight.data.copy_(pretrained_vectors)
#         self.embeddings.weight.requires_grad = False
        self.rnn= nn.LSTM(emb_sz,n_hiddenUnits,num_layers=2,bidirectional=True,dropout=0.2)
        self.lArr=[]
        if bidirectional:
            n_hiddenUnits= 2* n_hiddenUnits
        self.bn1 = nn.BatchNorm1d(num_features=n_hiddenUnits)
        for i in range(nl):
            if i==0:
                self.lArr.append(nn.Linear(n_hiddenUnits*3,n_hiddenUnits))
            else:
                self.lArr.append(nn.Linear(n_hiddenUnits,n_hiddenUnits))
        self.lArr= nn.ModuleList(self.lArr)
        self.l1= nn.Linear(n_hiddenUnits,op_size)
        
    def forward(self,data,lengths):
        #torch.to("cpu").empty_cache()
        bs= data.shape[1]
        self.h= self.init_hidden(bs)
        embedded= self.embeddings(data)
        embedded= nn.Dropout()(embedded)
        #embedded = pack_padded_sequence(embedded, torch.as_tensor(lengths)) #
        rnn_out, self.h = self.rnn(embedded, (self.h,self.h))
        #rnn_out, lengths = pad_packed_sequence(rnn_out,padding_value=1) #
        avg_pool= F.adaptive_avg_pool1d(rnn_out.permute(1,2,0),1).view(bs,-1)
        max_pool= F.adaptive_max_pool1d(rnn_out.permute(1,2,0),1).view(bs,-1)
        ipForLinearLayer= torch.cat([avg_pool,max_pool,rnn_out[-1]],dim=1)
        for linearlayer in self.lArr:
            outp= linearlayer(ipForLinearLayer)
            ipForLinearLayer= self.bn1(F.relu(outp))
            ipForLinearLayer= nn.Dropout(p=0.6)(ipForLinearLayer)
        outp = self.l1(ipForLinearLayer)
        del embedded;del rnn_out;del self.h;
        #torch.to("cpu").empty_cache()
        return outp
        
    def init_hidden(self, batch_size):
        return torch.zeros((4,batch_size,self.n_hidden),device="cpu")

In [23]:
def getValidationLoss(valdl,model,loss_func):
    model.eval()
    runningLoss=0
    valid_batch_it = BatchGenerator(valdl)
    allPreds= []
    allActualPreds= []
    with torch.no_grad():
        for i,obj in enumerate(valid_batch_it):
            obj= ( (obj[0][0].to("cpu"),obj[0][1].to("cpu")),obj[1] )
            preds = model(obj[0][0],obj[0][1])
            loss = loss_func(preds,obj[1].float())
            runningLoss+= loss.item()
            allPreds.append(preds.detach().numpy())
            allActualPreds.append(obj[1].detach().numpy())
        rocLoss= roc_auc_score(np.vstack(allActualPreds),np.vstack(allPreds), average="micro")
        return runningLoss/len(valid_batch_it),rocLoss

In [24]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
def oneEpoch(lr):
    train_batch_it = BatchGenerator(traindl)
    opt = optim.Adam(model.parameters(),lr)
    runningLoss= 0
    allPreds=[]
    allActualPreds=[]
    for i,obj in enumerate(train_batch_it):
        obj= ( (obj[0][0].to("cpu"),obj[0][1].to("cpu")),obj[1] )
        model.train()
        opt.zero_grad()
        preds = model(obj[0][0],obj[0][1])
        loss = loss_func(preds,obj[1].float())
        runningLoss+= loss.item()
        loss.backward()
        opt.step()
        allPreds.append(preds.detach().numpy())
        allActualPreds.append(obj[1].detach().numpy())
        del obj;del preds
    trainRocLoss= roc_auc_score(np.vstack(allActualPreds),np.vstack(allPreds), average="micro")
    runningLoss= runningLoss/len(train_batch_it)
    valLoss,valRocLoss= getValidationLoss(valdl,model,loss_func)
    torch.cuda.empty_cache()
    return runningLoss,valLoss,trainRocLoss,valRocLoss

In [25]:
epochs= 10
trainLossArr=[]
valLossArr=[]
rocTrainLoss=[]
rocValLoss=[]
model= MyModel(11,len(TEXT.vocab),vectors,1)
loss_func= torch.nn.BCEWithLogitsLoss()
model = model.to("cpu")
for i in range(epochs):
    %time tLoss,vLoss,tRocLoss,vRocLoss= oneEpoch(1e-4)
    print(f"Epoch - {i}")
    print(f"Train Loss - {tLoss} vs Val Loss is {vLoss}")
    print(f"Train ROC - {tRocLoss} vs Val ROC is {vRocLoss}")
    trainLossArr.append(tLoss)
    valLossArr.append(vLoss)
    rocTrainLoss.append(tRocLoss)
    rocValLoss.append(vRocLoss)

CPU times: user 51.2 s, sys: 1.86 s, total: 53 s
Wall time: 5.19 s
Epoch - 0
Train Loss - 0.6695120593775874 vs Val Loss is 0.6276562213897705
Train ROC - 0.7693400929752067 vs Val ROC is 0.867627861570248
CPU times: user 51.7 s, sys: 1.7 s, total: 53.4 s
Wall time: 5.13 s
Epoch - 1
Train Loss - 0.5993452897106392 vs Val Loss is 0.5702847599983215
Train ROC - 0.8898218278667356 vs Val ROC is 0.9084915289256199
CPU times: user 54.2 s, sys: 1.99 s, total: 56.2 s
Wall time: 5.43 s
Epoch - 2
Train Loss - 0.5409396534812623 vs Val Loss is 0.5027064153126308
Train ROC - 0.9219483935950414 vs Val ROC is 0.935451590909091
CPU times: user 55.1 s, sys: 1.48 s, total: 56.5 s
Wall time: 5.34 s
Epoch - 3
Train Loss - 0.48127008909764496 vs Val Loss is 0.4426100790500641
Train ROC - 0.9352385847107437 vs Val ROC is 0.9336237603305786
CPU times: user 55.4 s, sys: 1.73 s, total: 57.1 s
Wall time: 5.47 s
Epoch - 4
Train Loss - 0.4192818772533666 vs Val Loss is 0.38547778044428144
Train ROC - 0.94749286

In [26]:
import matplotlib.pyplot as plt 
plt.plot(trainLossArr,color='b')
plt.plot(valLossArr,color='g')
plt.plot(rocTrainLoss,color='r')
plt.plot(rocValLoss,color='c')
plt.show()

<Figure size 640x480 with 1 Axes>

In [27]:
torch.save(model.state_dict(), "LSTM_transactions_model.pt")

In [28]:
dfTest = pd.read_csv("Datasets/test.csv")

In [29]:
dfTest.head()

Unnamed: 0,Details
0,8150123131lgr 150220p98cafe baristi cosm
1,8150123130fosa860123fi1rest public house
2,paypal uber bv 35314369001 nl
3,paypal uber bv 35314369001 nl
4,paypal uber bv 35314369001 nl


In [30]:

dataFields = [("Details", TEXT)]

testDataset= data.TabularDataset(path='Datasets/test.csv', 
                                            format='csv',
                                            fields=dataFields, 
                                            skip_header=True)

In [31]:
test_iter1 = torchtext.data.Iterator(testDataset, batch_size=32, device=torch.device('cpu'), sort=False, sort_within_batch=False, repeat=False,shuffle=False)

In [32]:
myPreds=[]
with torch.no_grad():
    model.eval()
    for obj in test_iter1:
        text= torch.transpose(obj.Details[0],0,1)[:2]
        pred= model(obj.Details[0],obj.Details[1])        
        pred= torch.sigmoid(pred)
        myPreds.append(pred.detach().numpy())
        del pred;del obj;

In [33]:
myPreds= np.vstack(myPreds)
for i, col in enumerate(df.columns[1:]):
    dfTest[col] = myPreds[:, i]

In [34]:
for i, col in enumerate(df.columns[1:]):
    dfTest[col] = myPreds[:, i]

In [35]:
dictOfCategories = { i : dfTest.columns[1:][i] for i in range(0, len(dfTest.columns[1:]) ) }

In [36]:
best = []
for i in range(dfTest.shape[0]):
    best.append(dictOfCategories[dfTest.iloc[i,1:].values.argmax()])

In [37]:
dfTest["Prediction"] = best

In [38]:
dfTest = dfTest[["Details","Prediction"]]

In [39]:
dfTest.to_csv("testPredictions.csv", index=False)