In [1]:
import pandas as pd
import torch
import torchtext

from pandarallel import pandarallel
# Initialization
pandarallel.initialize()

# string manipulation libs
import re
import string
from string import digits
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from ECB_vocab import *
from ECB_datasets import *
from ECB_dataloader import *
from ECB_classifier import *

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Relative path of the speeches CSV that the next cell reads with '|' as the separator.
data = r'data/all_ECB_speeches.csv'

In [3]:
# Read the speeches file; its fields are separated by '|'.
df = pd.read_csv(data, sep='|')
# Print the number of rows that were loaded, then preview the first five.
print(df.shape[0])
df.head()

2526


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-10-20,Frank Elderson,Overcoming the tragedy of the horizon: requiri...,"Keynote speech by Frank Elderson, Member of th...",SPEECH Overcoming the tragedy of the horiz...
1,2021-10-19,Fabio Panetta,“Hic sunt leones” – open research questions on...,"Speech by Fabio Panetta, Member of the Executi...",SPEECH “Hic sunt leones” – open research q...
2,2021-10-19,Frank Elderson,The role of supervisors and central banks in t...,"Keynote speech by Frank Elderson, Member of th...",SPEECH The role of supervisors and central...
3,2021-10-16,Christine Lagarde,Globalisation after the pandemic,2021 Per Jacobsson Lecture by Christine Lagard...,SPEECH Globalisation after the pandemic ...
4,2021-10-14,Christine Lagarde,IMFC Statement,"Statement by Christine Lagarde, President of t...",SPEECH IMFC Statement Statement by Chri...


In [4]:
# Speaker values whose rows are removed from the frame.
excluded_speakers = [
    'Willem F. Duisenberg,Eugenio Domingo Solans',
    'Alexandre Lamfalussy',
    'Frank Elderson',
]
# Keep every row whose `speakers` value is not one of the excluded names.
df = df.loc[~df['speakers'].isin(excluded_speakers)]
print(len(df.index))

2511


In [5]:
# Give every distinct speaker an integer label (its position in `unique()`),
# written into a new `targets` column.
targets = []
df['targets'] = ''
for label, speaker in enumerate(df.speakers.unique().tolist()):
    speaker_rows = df['speakers'].isin([speaker])
    df.loc[speaker_rows, 'targets'] = label

# Keep only the label and text columns, then drop rows with missing values
# and exact duplicates.
df = df[['targets', 'contents']].dropna().drop_duplicates()
print(len(df.index))

2477


In [6]:
# Delete apostrophes and lower-case every speech.
df.contents = df.contents.parallel_apply(lambda text: text.replace("'", '').lower())
# Set of all characters in string.punctuation.
exclude = set(string.punctuation)
# Translation table mapping each punctuation code point to None, i.e. "delete".
punctuation_table = dict.fromkeys(map(ord, exclude))
# Strip every punctuation character from each speech.
df.contents = df.contents.parallel_apply(lambda text: text.translate(punctuation_table))

In [7]:
# Translation table that deletes every character in string.digits.
remove_digits = str.maketrans('', '', digits)

# One pass per speech: delete digits, trim leading/trailing whitespace,
# then collapse each run of spaces into a single space.
df.contents = df.contents.parallel_apply(
    lambda text: re.sub(" +", " ", text.translate(remove_digits).strip())
)

In [8]:
#######################################################
#               Create Train and Valid sets
#######################################################

SPLIT_SEED = 42  # fixed seed so the split is identical on every Restart & Run All
val_frac = 0.1  # fraction of the rows held out for validation
val_split_idx = int(len(df) * val_frac)  # number of rows that go to validation
data_idx = list(range(len(df)))  # positional row indices 0 .. len(df) - 1
# Shuffle with a dedicated seeded generator rather than the unseeded global
# NumPy RNG; this keeps the split reproducible without touching global RNG state.
np.random.default_rng(SPLIT_SEED).shuffle(data_idx)

# The first `val_split_idx` shuffled positions form validation, the rest train.
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))

# Materialise both frames with a fresh 0..n-1 index (the old index is discarded).
train = df.iloc[train_idx].reset_index(drop=True)
val = df.iloc[val_idx].reset_index(drop=True)

len of train:  2230
len of val:  247


In [9]:
# Display the training frame produced by the split.
train

Unnamed: 0,targets,contents
0,15,does the euro area need an economic government...
1,12,reformen in der eurozone rede von jörg asmusse...
2,19,the euro area first experience and perspective...
3,9,hearing at the committee on economic and monet...
4,7,the known unknowns of financial regulation pan...
...,...,...
2225,7,la situation économique dans la zone euro et l...
2226,17,wirtschafts und finanzpolitische herausforderu...
2227,15,financial globalisation economic policies in a...
2228,11,presentation of the ecb annual report to the c...


In [10]:
# Wrap the training frame in the project's Train_Dataset, passing the
# 'contents' and 'targets' column names.
train_dataset = Train_Dataset(train, 'contents', 'targets')
# Show raw row 1 of the frame, then the dataset's item 1, for comparison.
print(train.loc[1])
train_dataset[1]

targets                                                    12
contents    reformen in der eurozone rede von jörg asmusse...
Name: 1, dtype: object


(12, tensor([   1, 3297,    8,  ..., 2190,    3,    2]))

In [11]:
# Build the training loader. NOTE(review): the second argument is presumably the
# batch size (the batch inspected below holds 32 items) -- confirm in ECB_dataloader.
train_loader = get_train_loader(train_dataset, 32)
# Fetch ONE batch and unpack it. Batches are (target, source) pairs; calling
# next(iter(...)) once per element would fetch two separate batches, so the
# inspected source and target could come from different samples if the loader shuffles.
target, source = next(iter(train_loader))

print('source: \n', source)

print('source shape: ',source.shape)
print('target shape: ', len(target))

source: 
 tensor([[    1,     1,     1,  ...,     1,     1,     1],
        [ 8236,   163,   413,  ...,   269,    68,   269],
        [    5,    52,   860,  ..., 13984,   525,   682],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0]])
source shape:  torch.Size([10127, 32])
target shape:  32


  return torch.tensor(target), torch.tensor(source)


In [12]:
sentence_list = train.contents.tolist()
# Create the vocabulary with arguments (2, 100000).
# NOTE(review): presumably freq_threshold=2 and max_size=100000 -- confirm in ECB_vocab.
voc = Vocabulary(2, 100000)
# Build the vocabulary from the training texts.
voc.build_vocabulary(sentence_list)
vocab_size = len(voc)
# Labels are integer codes assigned from 0 upwards, so the classifier needs
# (largest label + 1) outputs. Counting the distinct labels in `train` undercounts
# whenever the random split leaves labels out of the training set, so take the
# maximum over both splits instead.
num_classes = int(pd.concat([train.targets, val.targets]).max()) + 1
print(num_classes)

25


In [13]:
# List the distinct label values present in the training split.
train.targets.unique()

array([15, 12, 19, 9, 7, 11, 16, 10, 6, 22, 13, 14, 18, 3, 20, 8, 0, 1,
       23, 17, 21, 4, 2, 24], dtype=object)

In [14]:
# Device the model is moved to and that batches are sent to during training.
device = 'cpu'

In [16]:
# Instantiate the project's LSTMClassifier and move it to `device`.
# NOTE(review): 64 and 32 are presumably the embedding and hidden sizes -- confirm in ECB_classifier.
model = LSTMClassifier(vocab_size ,64,32,num_classes).to(device)

In [17]:
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,epochs=100):
    """Train `net` on `dataloader` for `epochs` passes and print running metrics.

    Args:
        net: Model to optimise; called as ``net(features)``. Its output is
            passed to `loss_fn` and arg-maxed over dim 1 to get predictions.
        dataloader: Iterable yielding ``(labels, features)`` batches. Dims 0
            and 1 of `features` are swapped before the forward pass.
        lr: Learning rate of the default Adam optimizer; ignored when
            `optimizer` is supplied.
        optimizer: Optimizer to use. Defaults to Adam over ``net.parameters()``.
        loss_fn: Loss module called as ``loss_fn(out, labels)``.
        epoch_size: Unused; kept so existing callers keep working.
        report_freq: Unused; kept so existing callers keep working.
        epochs: Number of full passes over `dataloader`.

    Returns:
        ``(loss, accuracy)`` as Python floats: the sum of all batch losses
        divided by the number of samples seen, and the fraction of correct
        predictions. Both are accumulated over ALL epochs (they are not reset
        per epoch). ``(0.0, 0.0)`` when no sample was seen.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    # Run on the device the model already lives on, so the function does not
    # depend on a notebook-level `device` variable staying in sync with the model.
    first_param = next(iter(net.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')
    loss_fn = loss_fn.to(run_device)
    net.train()
    # Plain Python numbers: summing the loss tensor itself would keep autograd
    # history attached to the running total for the whole training run.
    total_loss, correct, count = 0.0, 0, 0
    for epoch in range(epochs):
        for labels, features in dataloader:
            features = torch.transpose(features, 0, 1)
            optimizer.zero_grad()
            features, labels = features.to(run_device), labels.to(run_device)
            out = net(features)
            loss = loss_fn(out, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(out, 1)
            correct += (predicted == labels).sum().item()
            count += len(labels)
        # Guard the division so an empty dataloader reports 0.0 instead of crashing.
        running_acc = correct / count if count else 0.0
        print(f"Epoch:{epoch}, acc={running_acc}, loss={total_loss}")
    if count == 0:
        return 0.0, 0.0
    return total_loss / count, correct / count

In [None]:
# Train with the function's defaults (100 epochs, Adam optimizer) at a learning rate of 0.001.
train_epoch(model,train_loader, lr=0.001)