Libraries

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
from stopwords import get_stopwords

Files

In [7]:
# Speeches come from a pipe-delimited file. Header names are lower-cased and
# only the date and the speech text are kept.
speeches = pd.read_csv("speeches.csv", sep="|")
speeches.columns = [name.lower() for name in speeches.columns]
speeches = speeches.loc[:, ["date", "contents"]]

# Exchange-rate file: lower-case the headers, then relabel the third column
# (position 2) as "rate" so later cells can refer to it by a fixed name.
fx = pd.read_csv("fx.csv")
fx.columns = [name.lower() for name in fx.columns]
fx = fx.rename(columns={fx.columns[2]: "rate"})

Merge

In [8]:
# Attach speeches to the exchange-rate rows (every fx row is kept), parse the
# dates, order chronologically, carry the last known rate forward over gaps
# and finally discard rows that still have no rate.
data = (
    pd.merge(fx, speeches, on="date", how="left")
    .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
    .sort_values("date")
    .assign(rate=lambda frame: frame["rate"].ffill())
    .dropna(subset=["rate"])
)

Exchange rate return

In [9]:
# Size (in percent) a daily move must exceed, in either direction, to be
# flagged as good or bad news.
NEWS_THRESHOLD_PCT = 0.5

# The left merge that built `data` repeats an fx row once for every speech
# sharing its date. A row-wise shift(1) would then compare a date's rate with
# itself for the second and later speeches of that date, yielding a return of
# 0 and hiding the news flag. The return is therefore computed once per
# distinct date and mapped back onto every row of that date.
# groupby sorts its keys, so `daily_rate` is in chronological order; rows whose
# date could not be parsed (NaT) form no group and end up with a NaN return.
daily_rate = data.groupby("date")["rate"].first()
daily_return = 100 * (daily_rate / daily_rate.shift(1) - 1)
data["return"] = data["date"].map(daily_return)

# Comparisons against NaN are False, so the first date and NaT rows get 0/0.
data["good_news"] = (data["return"] > NEWS_THRESHOLD_PCT).astype(int)
data["bad_news"] = (data["return"] < -NEWS_THRESHOLD_PCT).astype(int)

Find words and remove articles and other stop words

In [23]:
# Only rows that actually carry speech text are useful from here on.
data = data.dropna(subset=["contents"])

# Stop-word lists for English, Portuguese, Spanish, French and German,
# fetched in that order and then merged into a single lookup set.
stop_en, stop_pt, stop_es, stop_fr, stop_de = (
    get_stopwords(code) for code in ("en", "pt", "es", "fr", "de")
)
stop_words = set().union(stop_en, stop_pt, stop_es, stop_fr, stop_de)


# Matches any single ASCII punctuation character or digit. The punctuation set
# is passed through re.escape so that "\", "]", "^" and "-" are literal members
# of the character class; unescaped, the "\" would merely escape the "]" that
# follows it and backslashes would never be stripped from the text.
_PUNCT_OR_DIGIT = re.compile(f"[{re.escape(string.punctuation)}\\d]")


def tokenize(text, stopwords=None):
    """Split a text into lower-case word tokens without stop words.

    The text is lower-cased, every ASCII punctuation character and digit is
    replaced by a space, and the result is split on whitespace.

    Parameters
    ----------
    text : str
        Raw text to tokenize. Any non-string value (e.g. None or NaN) yields
        an empty list.
    stopwords : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words`` set
        is used.

    Returns
    -------
    list of str
        Tokens in their original order, excluding stop words and
        single-character tokens.
    """
    if stopwords is None:
        stopwords = stop_words
    if not isinstance(text, str):
        return []

    cleaned = _PUNCT_OR_DIGIT.sub(" ", text.lower())
    # Single characters are mostly leftovers of stripped punctuation
    # (e.g. the "t" of "don't"), so they are dropped together with stop words.
    return [word for word in cleaned.split() if len(word) > 1 and word not in stopwords]

# Build one record per token occurrence, each carrying the news flags of the
# row the token came from. itertuples avoids the per-row Series construction
# that iterrows performs.
rows = [
    {"word": word, "good_news": good, "bad_news": bad}
    for text, good, bad in data[["contents", "good_news", "bad_news"]].itertuples(
        index=False, name=None
    )
    for word in tokenize(text)
]

# Columns are given explicitly so the frame keeps its schema even when no
# tokens were produced; otherwise the column lookups in later cells would
# raise KeyError on an empty, column-less frame.
data_words = pd.DataFrame(rows, columns=["word", "good_news", "bad_news"])

Good indicator

In [24]:
# Count how often each word occurs in rows flagged as good news, keep the
# 20 most frequent ones as a two-column table (word, n) and save it.
good_indicators = (
    data_words
    .loc[data_words["good_news"].eq(1)]
    .groupby("word")
    .size()
    .sort_values(ascending=False)
    .iloc[:20]
    .rename("n")
    .reset_index()
)
good_indicators.to_csv("good_indicators.csv", index=False)

Bad indicator

In [25]:
# Count how often each word occurs in rows flagged as bad news, keep the
# 20 most frequent ones as a two-column table (word, n) and save it.
bad_indicators = (
    data_words
    .loc[data_words["bad_news"].eq(1)]
    .groupby("word")
    .size()
    .sort_values(ascending=False)
    .iloc[:20]
    .rename("n")
    .reset_index()
)
bad_indicators.to_csv("bad_indicators.csv", index=False)