# Data Preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

### Load the data into a Pandas dataframe

In [None]:
# Read the combined headlines CSV and show the frame as the cell output.
headlines_csv = "../data/combined_headlines_new.csv"
data = pd.read_csv(headlines_csv)
data

In [None]:
# Give the two columns the fixed names used by the rest of the notebook.
column_names = ["Date", "Headlines"]
data.columns = column_names

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

### Convert date column to datetime

In [None]:
# Parse the Date column to datetimes, then drop any timezone information so
# the stored values are timezone-naive.
parsed_dates = pd.to_datetime(data["Date"])
data["Date"] = parsed_dates.dt.tz_localize(None)

### Determine date range covered by the dataset

In [None]:
# Report the earliest and latest calendar dates present in the dataset.
date_column = data["Date"]
first_day = date_column.min().date()
last_day = date_column.max().date()
print("Start: ", first_day)
print("End:   ", last_day)

In [None]:
# Plot each row's date against its row index; the rows have not been sorted
# yet at this point in the notebook.
plt.plot(data["Date"])

### Sort data chronologically (will make it easier to verify that financial data is added correctly)

In [None]:
# Order the rows by date and renumber the index from zero, discarding the
# old index values.
data = data.sort_values(by="Date").reset_index(drop=True)

In [None]:
# Display the frame as the cell output to inspect the sorted rows.
data

In [None]:
# Plot date against row index again; after sorting, the curve should never
# decrease.
plt.plot(data["Date"])

## Collect financial data from Yahoo Finance

In [None]:
import yfinance as yf

In [None]:
# Pad the download window on both sides of the headline date range.
# Start: the first headline may fall on a weekend or market holiday, and the
# forward fill applied later can only copy a close from an earlier trading
# day. A 2-day pad is too short for a holiday weekend (e.g. a first headline
# on a holiday Monday), so reach back a full week.
# End: the look-ahead columns need prices at least 7 calendar days past the
# last headline; two weeks leaves slack for weekends and holidays.
START_BUFFER_DAYS = 7
END_BUFFER_DAYS = 14

ticker = yf.Ticker("SPY")
start_date = data["Date"].min().date() - pd.Timedelta(days=START_BUFFER_DAYS)
end_date = data["Date"].max().date() + pd.Timedelta(days=END_BUFFER_DAYS)
spy_data = ticker.history(start=start_date, end=end_date)

# Fail here with a clear message rather than letting an empty download turn
# into all-missing financial columns further down.
if spy_data.empty:
    raise RuntimeError(
        f"No SPY price data returned for {start_date} to {end_date}"
    )
spy_data

### Reindex to daily frequency, filling missing dates (i.e., when the markets are closed) with previous values

In [None]:
# Expand the price table to one row per calendar day, copying the previous
# available row into days with no data, then move the date index into a
# regular column.
daily_prices = spy_data.asfreq("D", method="ffill")
spy_data = daily_prices.reset_index()

In [None]:
# Print the column dtypes and non-null counts of the daily price table.
spy_data.info()

In [None]:
# Drop the timezone from the price dates, matching what was done to the
# headline dates earlier.
naive_dates = spy_data["Date"].dt.tz_localize(None)
spy_data["Date"] = naive_dates

### Add columns containing closing values up to 7 days after the current date

In [None]:
# For each horizon of 1 to 7 rows ahead, add a column holding the close
# found that many rows later (the final rows have no later value).
future_closes = {
    f"Close+{days_ahead}": spy_data["Close"].shift(-days_ahead)
    for days_ahead in range(1, 8)
}
spy_data = spy_data.assign(**future_closes)

### Calculate percentage shift in value up to 7 days after the current date

In [None]:
# Percentage change from the current close to the close 1 to 7 rows ahead:
# (later - current) / current * 100.
close = spy_data["Close"]
for days_ahead in range(1, 8):
    later_close = close.shift(-days_ahead)
    spy_data[f"PercentageD+{days_ahead}"] = (
        later_close.sub(close).div(close).mul(100)
    )

### Add category labels characterizing observed trends (tentative)
    - percentage shift < -0.5% --> 0
    - percentage shift between -0.5% and +0.5% (inclusive) --> 1
    - percentage shift > +0.5% --> 2

In [None]:
# Label each horizon's percentage change:
#   0: below -0.5, 1: within [-0.5, 0.5], 2: above 0.5.
# Rows with no look-ahead price have a NaN percentage. Every comparison with
# NaN is False, so an if/else chain would let them fall into the final branch
# and label them 2. Keep them missing instead; the nullable integer dtype
# lets the real labels stay integers alongside the missing entries.
for i in range(1, 8):
    pct_change = spy_data[f"PercentageD+{i}"]
    trend = pd.Series(1, index=pct_change.index, dtype="Int64")
    trend[pct_change < -0.5] = 0
    trend[pct_change > 0.5] = 2
    trend[pct_change.isna()] = pd.NA
    spy_data[f"TrendD+{i}"] = trend

In [None]:
# Display the daily price table, including the look-ahead columns, as the
# cell output.
spy_data

### Merge financial data into original dataframe

In [None]:
# Columns to copy over: the same-day close plus every look-ahead column.
horizons = range(1, 8)
price_columns = (
    ["Close"]
    + [f"Close+{i}" for i in horizons]
    + [f"PercentageD+{i}" for i in horizons]
    + [f"TrendD+{i}" for i in horizons]
)

# The price table has one row per calendar day. A merge needs the keys to be
# exactly equal, so a headline timestamp carrying a time of day would not
# match a price row stamped at the start of that day (assumed here: confirm
# against the price table). Join on the headline's calendar date through a
# temporary key instead, leaving the original Date column untouched.
merge_key = "_merge_date"
daily_prices = spy_data[["Date"] + price_columns].rename(
    columns={"Date": merge_key}
)
data = (
    data.assign(**{merge_key: data["Date"].dt.normalize()})
    # many_to_one: several headlines may share a day, but each day must map
    # to a single price row, otherwise headline rows would be duplicated.
    .merge(daily_prices, on=merge_key, how="left", validate="many_to_one")
    .drop(columns=merge_key)
)

### Save the dataframe to csv

In [None]:
# Write the merged frame to disk without the index column, formatting
# floating-point values with four decimal places.
output_csv = "../data/jvdm_data_prep.csv"
data.to_csv(output_csv, index=False, float_format="%0.4f")