In [1]:
import pandas as pd

In [87]:
df = pd.read_csv('press_release.csv')
df_source = pd.read_excel('Dataset_EA-MPD.xlsx', header=0, sheet_name=3)

#Flipping
df_source = df_source.loc[::-1].reset_index(drop=True)

In [104]:
df.head(6)

Unnamed: 0,Date,Text,char_cnt,word_cnt,avg_word_length
0,06/06/2024,the governing council today decided to lower t...,4929,757,6.511229
1,11/04/2024,the governing council today decided to keep th...,3919,586,6.687713
2,07/03/2024,the governing council today decided to keep th...,4257,649,6.559322
3,25/01/2024,the governing council today decided to keep th...,3663,541,6.770795
4,14/12/2023,the governing council today decided to keep th...,5179,799,6.481852
5,26/10/2023,the governing council today decided to keep th...,3720,549,6.775956


In [90]:
#Very simple data processing

# Change to lower case
df['Text'] = df['Text'].str.lower()

# Find the length of each text
df['char_cnt'] = df['Text'].str.len()

# Count the number of words in each text
df['word_cnt'] = df['Text'].str.split().str.len()

# Find the average length of word
df['avg_word_length'] = df['char_cnt'] / df['word_cnt']

# Print the first 5 rows of these columns
print(df[['Text', 'char_cnt', 'word_cnt', 'avg_word_length']])


                                                  Text  char_cnt  word_cnt  \
0    the governing council today decided to lower t...      4929       757   
1    the governing council today decided to keep th...      3919       586   
2    the governing council today decided to keep th...      4257       649   
3    the governing council today decided to keep th...      3663       541   
4    the governing council today decided to keep th...      5179       799   
..                                                 ...       ...       ...   
296  at today's meeting the governing council of th...       368        58   
297  at today's meeting, the governing council of t...       416        65   
298  at today's meeting the governing council of th...       684       115   
299  at today's meeting the governing council of th...       777       124   
300  at today's meeting the governing council revie...       928       147   

     avg_word_length  
0           6.511229  
1           6.687

In [105]:
#Fill with the correct lines
df_X = df.loc[5:,'Text']
df_y = df_source.loc[0:295,'OIS_SW']


#TIME SERIES BEHAVIOUR

In [61]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42, stratify=y)

# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(3,3))

# Fit the vectroizer and transform the data
tv_transformed = tv.fit_transform(X_train)

# Create a DataFrame with these features
tv_df = pd.DataFrame(tv_transformed.toarray(), 
                     columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
print(tv_df.head())

# Isolate the row to be examined
sample_row = tv_df.iloc[0]

# Print the top 5 words of the sorted output
print(sample_row.sort_values(ascending=False).head())


   TFIDF_00 00 00  TFIDF_00 00 respectively  TFIDF_00 respectively president  \
0             0.0                       0.0                              0.0   
1             0.0                       0.0                              0.0   
2             0.0                       0.0                              0.0   
3             0.0                       0.0                              0.0   
4             0.0                       0.0                              0.0   

   TFIDF_14 30 cet  TFIDF_25 basis points  TFIDF_25 percentage point  \
0              0.0               0.247255                        0.0   
1              0.0               0.000000                        0.0   
2              0.0               0.000000                        0.0   
3              0.0               0.000000                        0.0   
4              0.0               0.000000                        0.0   

   TFIDF_30 cet today  TFIDF_75 75 75  TFIDF_75 75 respectively  \
0                 0

In [20]:
#Bayesian

from sklearn.linear_model import BayesianRidge
brr = BayesianRidge(compute_score=True, max_iter=30).fit(X, y)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |
 |  For an example of usage, see
 |  :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`.
 |
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.
 |
 |  Read more in the :ref:`User Guide <

In [86]:
#Eavluate and aparameter tuning

In [107]:
df_X

5      the governing council today decided to keep th...
6      inflation continues to decline but is still ex...
7      inflation continues to decline but is still ex...
8      inflation has been coming down but is projecte...
9      the inflation outlook continues to be too high...
                             ...                        
296    at today's meeting the governing council of th...
297    at today's meeting, the governing council of t...
298    at today's meeting the governing council of th...
299    at today's meeting the governing council of th...
300    at today's meeting the governing council revie...
Name: Text, Length: 296, dtype: object

In [91]:
df['word_cnt'].mean()

171.95016611295682

In [56]:
# Sentence to search for
search_sentence = "the president of the ecb will comment on the considerations underlying these decisions at a press conference"

# Find indexes of rows containing the search sentence
indexes = df[~df['Text'].str.contains(search_sentence, case=False)].index.tolist()

# Print the indexes
print("Indexes with the specified sentence:", indexes)

Indexes with the specified sentence: [42, 49, 99, 114, 126, 138, 146, 147, 157, 159, 160, 161, 163, 164, 165, 166, 167, 168, 170, 172, 173, 175, 177, 178, 180, 181, 186, 187, 188, 190, 192, 194, 196, 197, 199, 201, 203, 205, 207, 209, 211, 212, 214, 215, 217, 219, 220, 221, 222]


In [65]:
df_source = pd.read_excel('Dataset_EA-MPD.xlsx', header=0, sheet_name=3)

In [66]:
df_source

Unnamed: 0,date,OIS_SW,OIS_1M,OIS_3M,OIS_6M,OIS_1Y,OIS_2Y,OIS_3Y,OIS_4Y,OIS_5Y,...,ES5Y,FR5Y,ES10Y,FR10Y,IT10Y,STOXX50,SX7E,EURUSD,EURGBP,EURJPY
0,1999-01-07 00:00:00,,-5.000000,-0.500000,-5.250000,-0.250000,,,,,...,2.050000,1.000000,2.050000,2.150000,,-1.231443,-1.177058,0.145748,0.084890,-0.001927
1,1999-01-21 00:00:00,,0.000000,0.000000,1.000000,0.000000,,,,,...,-0.900000,-0.500000,0.150000,-0.450000,,-0.212126,-0.245174,0.064838,-0.085409,0.101767
2,1999-02-18 00:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,...,-0.900000,-0.200000,-0.100000,0.150000,,0.174157,0.080538,0.022258,0.036386,0.040914
3,1999-03-04 00:00:00,-1.250000,0.000000,-0.500000,0.000000,0.000000,,,,,...,0.400000,-0.200000,0.650000,0.250000,,0.082407,-0.164442,0.050669,0.096357,0.340143
4,1999-03-18 00:00:00,0.500000,-0.500000,0.000000,-0.500000,1.000000,,,,,...,0.500000,0.400000,0.550000,0.750000,,0.060922,0.094642,0.045430,0.029581,-0.015405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,2023-05-04 00:00:00,-3.500009,-4.449987,-5.599999,-6.369996,-6.700015,-6.850004,-5.900002,-6.049991,-5.949998,...,-4.949999,-4.130006,-2.400017,-2.699995,-1.950026,0.506973,0.580817,-0.470666,-0.387108,-0.619926
295,2023-06-15 00:00:00,0.300002,0.250006,0.950003,1.990008,1.374984,-0.830007,-1.599979,-2.099991,-2.349997,...,-3.650022,-3.355002,-6.349993,-5.349994,-5.999994,0.141154,-0.192120,0.755142,0.151847,0.183084
296,2023-07-27 12:15:00,0.000000,-0.090003,-1.134992,-2.095008,-3.600001,-4.700017,-4.800010,-4.550004,-4.699993,...,-5.850005,-4.544997,-5.099988,-3.950000,-6.250000,0.577907,-0.758382,-1.147098,-0.291409,-0.710486
297,2023-09-14 12:15:00,5.799985,7.579994,6.220007,4.885006,2.745008,-0.300002,-1.600003,-2.349997,-2.399993,...,-3.999996,-3.710008,-5.150008,-3.200006,-7.299995,1.020578,1.568556,-0.626368,-0.168747,-0.628158


In [70]:
df_source = df_source.loc[::-1].reset_index(drop=True)

In [80]:
df_source

Unnamed: 0,date,OIS_SW,OIS_1M,OIS_3M,OIS_6M,OIS_1Y,OIS_2Y,OIS_3Y,OIS_4Y,OIS_5Y,...,ES5Y,FR5Y,ES10Y,FR10Y,IT10Y,STOXX50,SX7E,EURUSD,EURGBP,EURJPY
0,2023-10-26 12:15:00,-0.099993,-0.725007,-0.990009,-1.459980,-3.295016,-5.835009,-6.800008,-6.250000,-6.500006,...,-6.800008,-7.125020,-6.649995,-6.100011,-9.450006,0.472046,1.388453,0.075888,-0.189451,-0.037864
1,2023-09-14 12:15:00,5.799985,7.579994,6.220007,4.885006,2.745008,-0.300002,-1.600003,-2.349997,-2.399993,...,-3.999996,-3.710008,-5.150008,-3.200006,-7.299995,1.020578,1.568556,-0.626368,-0.168747,-0.628158
2,2023-07-27 12:15:00,0.000000,-0.090003,-1.134992,-2.095008,-3.600001,-4.700017,-4.800010,-4.550004,-4.699993,...,-5.850005,-4.544997,-5.099988,-3.950000,-6.250000,0.577907,-0.758382,-1.147098,-0.291409,-0.710486
3,2023-06-15 00:00:00,0.300002,0.250006,0.950003,1.990008,1.374984,-0.830007,-1.599979,-2.099991,-2.349997,...,-3.650022,-3.355002,-6.349993,-5.349994,-5.999994,0.141154,-0.192120,0.755142,0.151847,0.183084
4,2023-05-04 00:00:00,-3.500009,-4.449987,-5.599999,-6.369996,-6.700015,-6.850004,-5.900002,-6.049991,-5.949998,...,-4.949999,-4.130006,-2.400017,-2.699995,-1.950026,0.506973,0.580817,-0.470666,-0.387108,-0.619926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1999-03-18 00:00:00,0.500000,-0.500000,0.000000,-0.500000,1.000000,,,,,...,0.500000,0.400000,0.550000,0.750000,,0.060922,0.094642,0.045430,0.029581,-0.015405
295,1999-03-04 00:00:00,-1.250000,0.000000,-0.500000,0.000000,0.000000,,,,,...,0.400000,-0.200000,0.650000,0.250000,,0.082407,-0.164442,0.050669,0.096357,0.340143
296,1999-02-18 00:00:00,0.000000,0.000000,0.000000,0.000000,0.000000,,,,,...,-0.900000,-0.200000,-0.100000,0.150000,,0.174157,0.080538,0.022258,0.036386,0.040914
297,1999-01-21 00:00:00,,0.000000,0.000000,1.000000,0.000000,,,,,...,-0.900000,-0.500000,0.150000,-0.450000,,-0.212126,-0.245174,0.064838,-0.085409,0.101767


In [83]:
df.iloc[:,1:2]

Unnamed: 0,Date
2,07/03/2024
3,25/01/2024
4,15/12/2022
5,27/10/2022
6,08/09/2022
...,...
218,06/05/1999
219,22/04/1999
220,08/04/1999
221,18/03/1999


In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Text,char_cnt,word_cnt,avg_word_length
0,0,06/06/2024,the governing council today decided to lower t...,4929,757,6.511229
1,1,11/04/2024,the governing council today decided to keep th...,3919,586,6.687713
2,2,07/03/2024,the governing council today decided to keep th...,4257,649,6.559322
3,3,25/01/2024,the governing council today decided to keep th...,3663,541,6.770795
4,4,15/12/2022,the governing council today decided to raise t...,6218,952,6.531513
