In [1]:
import pandas as pd
import torch
import torchtext

from pandarallel import pandarallel
# Initialization
pandarallel.initialize()

#string manupulation libs
import re
import string
from string import digits
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from ECB_vocab import *
from ECB_datasets import *
from ECB_dataloader import *
from ECB_classifier import *

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
data = r'data/all_ECB_speeches.csv'

In [3]:
df = pd.read_csv(data,sep='|')
print(len(df.index))
df.head()

2526


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-10-20,Frank Elderson,Overcoming the tragedy of the horizon: requiri...,"Keynote speech by Frank Elderson, Member of th...",SPEECH Overcoming the tragedy of the horiz...
1,2021-10-19,Fabio Panetta,“Hic sunt leones” – open research questions on...,"Speech by Fabio Panetta, Member of the Executi...",SPEECH “Hic sunt leones” – open research q...
2,2021-10-19,Frank Elderson,The role of supervisors and central banks in t...,"Keynote speech by Frank Elderson, Member of th...",SPEECH The role of supervisors and central...
3,2021-10-16,Christine Lagarde,Globalisation after the pandemic,2021 Per Jacobsson Lecture by Christine Lagard...,SPEECH Globalisation after the pandemic ...
4,2021-10-14,Christine Lagarde,IMFC Statement,"Statement by Christine Lagarde, President of t...",SPEECH IMFC Statement Statement by Chri...


In [4]:
df = df.loc[(df['speakers'] != 'Willem F. Duisenberg,Eugenio Domingo Solans')&
            (df['speakers'] != 'Alexandre Lamfalussy')&
            (df['speakers'] != 'Frank Elderson')]
print(len(df.index))

2511


In [5]:
targets = list()
df['targets'] = ''
for i in enumerate(df.speakers.unique().tolist()):
    index = df['speakers'].isin([i[1]])
    df.loc[index,'targets'] = i[0]

df = df[['targets','contents']]

df = df.dropna().drop_duplicates()
print(len(df.index))

2477


In [6]:
df.contents = df.contents.parallel_apply(lambda x: re.sub("'", '',x).lower())
#remove special chars
exclude = set(string.punctuation)#set of all special chars
#remove all the special chars
df.contents = df.contents.parallel_apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [7]:
remove_digits = str.maketrans('','',digits)
df.contents  = df.contents.parallel_apply(lambda x: x.translate(remove_digits))

# Remove extra spaces
df.contents =df.contents .parallel_apply(lambda x: x.strip())
df.contents =df.contents .parallel_apply(lambda x: re.sub(" +", " ", x))

In [8]:
#######################################################
#               Create Train and Valid sets
#######################################################

val_frac = 0.1 #precentage data in val
val_split_idx = int(len(df)*val_frac) #index on which to split
data_idx = list(range(len(df))) #create a list of ints till len of data
np.random.shuffle(data_idx)

#get indexes for validation and train
val_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:]
print('len of train: ', len(train_idx))
print('len of val: ', len(val_idx))

#create the sets
train = df.iloc[train_idx].reset_index().drop('index',axis=1)
val = df.iloc[val_idx].reset_index().drop('index',axis=1)

len of train:  2230
len of val:  247


In [9]:
train

Unnamed: 0,targets,contents
0,1,laudatory speech laudatory speech by christine...
1,19,topic of the year professor otmar issing mitgl...
2,16,remise du prix francoallemand de la culture mo...
3,9,euro area economic outlook the ecb’s monetary ...
4,13,la situación actual de la economía europea dis...
...,...,...
2225,19,der euro – eine stabile währung für europa red...
2226,6,remarks at the ‘challenges in understanding th...
2227,11,implications of the ssm on the esfs speech by ...
2228,16,monetary policy and private expectations zolot...


In [10]:
train_dataset = Train_Dataset(train, 'contents', 'targets')
print(train.loc[1])
train_dataset[1]

targets                                                    19
contents    topic of the year professor otmar issing mitgl...
Name: 1, dtype: object


(19, tensor([   1, 1411,    5,  ..., 2470, 4480,    2]))

In [11]:
train_loader = get_train_loader(train_dataset, 32)
source = next(iter(train_loader))[1]
target = next(iter(train_loader))[0]

print('source: \n', source)

print('source shape: ',source.shape)
print('target shape: ', len(target))

source: 
 tensor([[    1,     1,     1,  ...,     1,     1,     1],
        [  270,  3124, 25023,  ...,     4,  1235,    80],
        [14202,    13,     5,  ...,   483,  1184,  1212],
        ...,
        [    0,     0,     4,  ...,     0,     0,     0],
        [    0,     0,    70,  ...,     0,     0,     0],
        [    0,     0,     2,  ...,     0,     0,     0]])
source shape:  torch.Size([6862, 32])
target shape:  32


  return torch.tensor(target), torch.tensor(source)


In [12]:
sentence_list=train.contents.tolist()
#create a vocab class with freq_threshold=0 and max_size=100
voc = Vocabulary(2, 100000)
#build vocab
voc.build_vocabulary(sentence_list)
vocab_size = len(voc)
num_classes = len(train.targets.unique()) + 1
print(num_classes)

25


In [13]:
train.targets.unique()

array([1, 19, 16, 9, 13, 12, 14, 20, 15, 11, 17, 7, 22, 6, 8, 0, 4, 24,
       23, 21, 18, 3, 10, 2], dtype=object)

In [14]:
device = 'cpu'

In [15]:
model = EmbedClassifier(vocab_size ,64,num_classes).to(device)

In [16]:
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,epochs=100):
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss,acc,count,i = 0,0,0,0
    for epoch in range(epochs):
        for labels,features in dataloader:
            features = torch.transpose(features, 0, 1)
            #print(labels,features)
            #print(labels.shape,features.shape)
            optimizer.zero_grad()
            features, labels = features.to(device), labels.to(device)
            out = net(features)
            loss = loss_fn(out,labels) #cross_entropy(out,labels)
            loss.backward()
            optimizer.step()
            total_loss+=loss
            _,predicted = torch.max(out,1)
            acc+=(predicted==labels).sum()
            count+=len(labels)
            i+=1
        print(f"Epoch:{epoch}, acc={acc.item()/count}, loss={total_loss}")
    return total_loss.item()/count, acc.item()/count

In [17]:
train_epoch(model,train_loader, lr=0.001)

Epoch:0, acc=0.10089686098654709, loss=214.8420867919922
Epoch:1, acc=0.11614349775784753, loss=423.82000732421875
Epoch:2, acc=0.12257100149476831, loss=631.9033813476562
Epoch:3, acc=0.12556053811659193, loss=839.083984375
Epoch:4, acc=0.12771300448430492, loss=1044.8726806640625
Epoch:5, acc=0.12937219730941704, loss=1249.8751220703125
Epoch:6, acc=0.13087764253683537, loss=1453.5308837890625
Epoch:7, acc=0.1320627802690583, loss=1656.346923828125
Epoch:8, acc=0.1334828101644245, loss=1858.5841064453125
Epoch:9, acc=0.13452914798206278, loss=2059.676513671875
Epoch:10, acc=0.13562984101100692, loss=2259.665283203125
Epoch:11, acc=0.13643497757847534, loss=2458.6142578125
Epoch:12, acc=0.1374956881683339, loss=2656.971923828125
Epoch:13, acc=0.13869314541960281, loss=2854.116455078125
Epoch:14, acc=0.14002989536621824, loss=3050.561767578125
Epoch:15, acc=0.14111547085201795, loss=3246.419921875
Epoch:16, acc=0.14220522289633342, loss=3440.5517578125
Epoch:17, acc=0.14342301943198804

(0.0665612913396861, 0.3983408071748879)