In [8]:
import pandas as pd
from pandarallel import pandarallel
# Initialization
pandarallel.initialize()

# String manipulation libraries
import re
import string
from string import digits
import spacy

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
# Relative path of the speeches CSV that the next cell reads.
data = "data/all_ECB_speeches.csv"

In [12]:
# Read the pipe-delimited file, print how many rows it has, and preview the first rows.
df = pd.read_csv(data, sep='|')
print(df.shape[0])
df.head()

2526


Unnamed: 0,date,speakers,title,subtitle,contents
0,2021-10-20,Frank Elderson,Overcoming the tragedy of the horizon: requiri...,"Keynote speech by Frank Elderson, Member of th...",SPEECH Overcoming the tragedy of the horiz...
1,2021-10-19,Fabio Panetta,“Hic sunt leones” – open research questions on...,"Speech by Fabio Panetta, Member of the Executi...",SPEECH “Hic sunt leones” – open research q...
2,2021-10-19,Frank Elderson,The role of supervisors and central banks in t...,"Keynote speech by Frank Elderson, Member of th...",SPEECH The role of supervisors and central...
3,2021-10-16,Christine Lagarde,Globalisation after the pandemic,2021 Per Jacobsson Lecture by Christine Lagard...,SPEECH Globalisation after the pandemic ...
4,2021-10-14,Christine Lagarde,IMFC Statement,"Statement by Christine Lagarde, President of t...",SPEECH IMFC Statement Statement by Chri...


In [13]:
# NOTE(review): `targets` is not read again in the cells visible here; kept in
# case a later cell relies on it.
targets = []

# Give every distinct speaker an integer label, numbered in order of first
# appearance, and store it in a new `targets` column.
df['targets'] = ''
for speaker_code, speaker_name in enumerate(df['speakers'].unique().tolist()):
    speaker_rows = df['speakers'].isin([speaker_name])
    df.loc[speaker_rows, 'targets'] = speaker_code

# Keep only the label and the speech text.
df = df[['targets', 'contents']]

# Discard rows with missing values and exact duplicate (label, text) pairs,
# then report how many rows remain.
df = df.dropna().drop_duplicates()
print(len(df.index))

2492


In [14]:
# Delete apostrophes and lowercase every speech.
df.contents = df.contents.parallel_apply(lambda text: text.replace("'", '').lower())

# Characters to strip: the ASCII punctuation listed in string.punctuation.
exclude = set(string.punctuation)

# Rebuild each speech from the characters that are not in `exclude`.
# Typographic marks outside that set (for example ’ or –) are left in place.
df.contents = df.contents.parallel_apply(
    lambda text: ''.join([ch for ch in text if ch not in exclude])
)

In [15]:
# Translation table that deletes every character in string.digits.
remove_digits = str.maketrans('', '', digits)
df.contents = df.contents.parallel_apply(lambda text: text.translate(remove_digits))

# Trim leading/trailing whitespace, then squeeze each run of spaces into one space.
df.contents = df.contents.parallel_apply(lambda text: re.sub(" +", " ", text.strip()))

In [17]:
# Plain Python list of the cleaned speech texts, in row order.
data_list = df['contents'].to_list()

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on the cleaned speeches and count term occurrences.
cv = CountVectorizer()
term_counts = cv.fit_transform(data_list)

# Densify into a regular array (one row per speech, one column per vocabulary
# term) and show its dimensions.
X = term_counts.toarray()
X.shape

(2492, 79184)

In [23]:
# Show the dense count matrix built in the previous cell (the repr is truncated).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
import pickle
import numpy as np


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


# SECURITY: pickle.load can execute arbitrary code while deserialising, so the
# three files below must come from a trusted source.
# NOTE(review): none of these files is written by the cells shown above;
# presumably they were produced by a separate training run. Confirm provenance.

# Model whose .predict() is called in predict().
pkl_filename = "language_model.pkl"
lang_model = _load_pickle(pkl_filename)

# Object whose .transform() turns text into a bag-of-words vector in predict().
pkl_filename = "cv_model.pkl"
cv_model = _load_pickle(pkl_filename)

# Object whose .inverse_transform() maps predicted values back to language names.
pkl_filename = "le_model.pkl"
le_model = _load_pickle(pkl_filename)

In [41]:
def predict(text):
    """Return the language predicted for a single piece of text.

    Relies on the module-level ``cv_model``, ``lang_model`` and ``le_model``
    objects loaded from pickle files.

    Args:
        text: The text to classify.

    Returns:
        The first (and only) decoded label for ``text``.
    """
    # Bag-of-words vector for this one document.
    features = cv_model.transform([text]).toarray()
    # Predict the encoded label, then map it back to the language name.
    decoded = le_model.inverse_transform(lang_model.predict(features))
    return decoded[0]

In [43]:
# Report every speech whose predicted language is something other than English.
for txt in df['contents']:
    pred = predict(txt)
    if pred == "English":
        continue
    print(f'Predicted language: {pred},\nActual text: {txt}')

Predicted language: French,
Actual text: la situation économique dans la zone euro et le rôle de la bce fiches de présentation présentation de benoît cœuré membre du directoire de la bce medef – commission economie et financements paris le janvier fiches de présentation
Predicted language: French,
Actual text: les décisions de la bce depuis la crise un tour d’horizon fiches de présentation présentation de benoît cœuré membre du directoire de la bce conservatoire national des arts et métiers cnam paris le novembre fiches de présentation
Predicted language: German,
Actual text: zentralbankunabhängigkeit auf dem prüfstand beitrag von yves mersch mitglied des direktoriums der ezb für handelsblatt veröffentlicht am mai zunehmend treten kritiker auf den plan die die unabhängigkeit von zentralbanken in frage stellen zwei vorwürfe stehen dabei besonders im raum erstens mit dem einsatz von unkonventionellen maßnahmen wie dem ankauf von wertpapieren als quantitative easing bekannt oder negativen