In [1]:
from matplotlib import pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

import warnings
import numpy as np
import pandas as pd
import yfinance as yf

In [19]:
# Notebook-wide display settings.
# Render all matplotlib text with the GULIM font family.
plt.rc('font', family='GULIM')
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Blanket-suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy warnings raised by later cells.
warnings.simplefilter('ignore')

# Data Collection

In [20]:
# Download daily price history for each futures ticker from Yahoo Finance.
# auto_adjust=False is passed explicitly so the 'Adj Close' column that the
# return calculation reads is always present: older yfinance releases default
# to auto_adjust=False, newer ones default to True and drop that column.
CL = yf.download('CL=F', start='2000-08-23', auto_adjust=False)
HO = yf.download('HO=F', start='2000-09-01', auto_adjust=False)
NG = yf.download('NG=F', start='2000-08-30', auto_adjust=False)
RB = yf.download('RB=F', start='2000-11-01', auto_adjust=False)
BZ = yf.download('BZ=F', start='2007-07-30', auto_adjust=False)
ZL = yf.download('ZL=F', start='2000-03-15', auto_adjust=False)

# Locally stored WTI series: the first CSV column becomes the index and the
# rows are ordered by that index (chained call instead of in-place mutation).
# NOTE(review): the index is not parsed as dates here, so the sort orders the
# raw labels; that is chronological only for ISO-style date strings - confirm.
SP = pd.read_csv('Database/WTI.csv', index_col=0).sort_index()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


# Numerical Data Preprocessing

In [21]:
# Coerce the 'price' column to numbers (unparseable entries become NaN),
# then discard every row that contains a missing value in any column.
SP = SP.assign(price=pd.to_numeric(SP['price'], errors='coerce')).dropna()

In [22]:
# Daily log returns: log(P_t / P_{t-1}) == log(1 + pct_change).
# A single loop replaces seven copy-pasted statements, and np.log is applied
# to the whole Series at once instead of element by element through a lambda.
# The first row of every frame has no previous price, so its return is NaN.
for frame, price_col in (
    (CL, 'Adj Close'),
    (HO, 'Adj Close'),
    (NG, 'Adj Close'),
    (RB, 'Adj Close'),
    (BZ, 'Adj Close'),
    (ZL, 'Adj Close'),
    (SP, 'price'),
):
    frame['return'] = np.log(1 + frame[price_col].pct_change())

In [23]:
# Remove every row with a missing value in any column from each price frame;
# this also discards the first row, whose return is undefined.
CL, HO, NG, RB, BZ, ZL, SP = (
    frame.dropna() for frame in (CL, HO, NG, RB, BZ, ZL, SP)
)

In [24]:
# Volatility proxy: standard deviation of the log returns over a rolling
# window of 5 observations, computed separately for every series.
CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol = (
    frame['return'].rolling(window=5).std()
    for frame in (CL, HO, NG, RB, BZ, ZL, SP)
)

In [25]:
# The first entries of each rolling window are NaN (window not yet full);
# drop them from every volatility series.
CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol = (
    series.dropna()
    for series in (CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol)
)

In [27]:
# Give every volatility series a datetime index so they can later be aligned
# with the date-indexed text embeddings.
for series in (CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol):
    series.index = pd.to_datetime(series.index)

# Text Data Preprocessing

In [28]:
# Load the pre-computed text embeddings for the news and book corpora.
news, book = map(
    pd.read_csv,
    (
        'Database/news_newsdata_embedding.csv',
        'Database/books_newsdata_embedding.csv',
    ),
)

In [29]:
# Manual patch of one cell before parsing (row position 468, column position 3).
# NOTE(review): presumably column 3 is 'published_date' and this row holds a
# malformed date - confirm; the positional patch breaks if the CSV layout changes.
news.iloc[468, 3] = '2003-05-21'

# Parse the publication dates of both corpora; values that cannot be parsed
# become NaT instead of raising.
for corpus in (book, news):
    corpus['published_date'] = pd.to_datetime(corpus['published_date'], errors='coerce')

In [30]:
# Append the book embeddings for dates that have no news embedding.
#
# The previous version tested `p not in news['published_date']`, but `in` on a
# Series checks the index labels rather than the values, so every book date was
# reported as missing; the follow-up `== book_notin_news` then compared the
# column element-wise with itself and selected every non-NaT row.
# Series.isin performs the intended value-membership test.
book_dates = book['published_date']
is_book_only = book_dates.notna() & ~book_dates.isin(news['published_date'])

add_df = book.loc[is_book_only, ['published_date', 'embedding']]
# Kept as a plain list under its original name in case later cells use it.
book_notin_news = add_df['published_date'].tolist()
news = pd.concat([news, add_df])

In [31]:
# Keep only the date and embedding columns. reset_index moves the old row
# labels into an 'index' column; the publication date then becomes the row
# label and also stays available as a regular column.
NLP_vec = news.loc[:, ['published_date', 'embedding']].reset_index()
NLP_vec.index = NLP_vec['published_date']

In [32]:
# When several rows share a publication date, keep the first one only and
# reduce the frame to the 'embedding' Series indexed by date.
is_first_for_date = ~NLP_vec.index.duplicated(keep='first')
NLP_vec = NLP_vec.loc[is_first_for_date, 'embedding']

In [33]:
def str_to_array(s):
    """Parse the text form of a 1-D numeric vector into a float NumPy array.

    Accepts the format produced by ``str(np.ndarray)`` (whitespace-separated
    numbers, possibly spread over several lines, wrapped in square brackets)
    and also tolerates leading/trailing whitespace around the brackets and
    comma separators such as ``'[1.0, 2.0]'``.

    Parameters
    ----------
    s : str
        Text such as ``'[0.1 0.2 0.3]'``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array; empty when ``s`` contains no numbers.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    # Strip outer whitespace first so a trailing newline cannot shield the
    # closing bracket from strip('[]'); commas are treated as separators.
    tokens = s.strip().strip('[]').replace(',', ' ').split()
    return np.array([float(tok) for tok in tokens], dtype=float)

# Join each volatility series with the date-indexed embeddings and expand the
# embedding text into one numeric column per vector component ('0' ... '383').
#
# The component columns are built as one matrix and attached with a single
# concat, replacing 2 x 384 individual column insertions per frame, and the
# embedding column is addressed by label rather than by position.
EMBEDDING_DIM = 384  # number of components in every embedding vector
embedding_cols = [f'{k}' for k in range(EMBEDDING_DIM)]
zero_embedding = str(np.zeros(EMBEDDING_DIM))

vol_list = [CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol]
for i, vol in enumerate(vol_list):
    # Outer-join on the date index, then keep only dates that have a volatility value.
    merged = pd.concat([vol, NLP_vec], axis=1).dropna(subset=['return'])

    # Dates without any text get an all-zero embedding (stored as text so the
    # same parser handles every row).
    merged.loc[merged['embedding'].isna(), 'embedding'] = zero_embedding

    # Parse every embedding; components beyond EMBEDDING_DIM are ignored as
    # before, while a shorter vector makes the array construction raise.
    vectors = [str_to_array(text)[:EMBEDDING_DIM] for text in merged['embedding']]
    components = pd.DataFrame(
        # reshape keeps the (0, EMBEDDING_DIM) shape when the frame is empty
        np.array(vectors, dtype=float).reshape(-1, EMBEDDING_DIM),
        index=merged.index,
        columns=embedding_cols,
    )
    vol_list[i] = pd.concat([merged, components], axis=1)

In [34]:
# The raw embedding text is no longer needed once it has been expanded into
# numeric columns; rebind each name to its merged frame without that column.
CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol, SP_vol = (
    merged.drop(columns=['embedding']) for merged in vol_list[:7]
)

In [35]:
# Keep the futures frames from 2015-01-01 onwards.
CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol = (
    frame.loc['2015-01-01':]
    for frame in (CL_vol, HO_vol, NG_vol, RB_vol, BZ_vol, ZL_vol)
)

# SP_vol is sliced the other way: everything up to and including 2015-01-01.
# NOTE(review): this differs from the six frames above - confirm it is
# intentional and not a typo for '2015-01-01':.
SP_vol = SP_vol.loc[:'2015-01-01']

# Save Files

In [36]:
# Write every prepared frame to disk. The futures frames go under
# Database/oil_std/, while SP_vol is written directly under Database/.
# Both directories must already exist.
csv_targets = (
    (CL_vol, 'Database/oil_std/CL_vol.csv'),
    (HO_vol, 'Database/oil_std/HO_vol.csv'),
    (NG_vol, 'Database/oil_std/NG_vol.csv'),
    (RB_vol, 'Database/oil_std/RB_vol.csv'),
    (BZ_vol, 'Database/oil_std/BZ_vol.csv'),
    (ZL_vol, 'Database/oil_std/ZL_vol.csv'),
    (SP_vol, 'Database/SP_vol.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path)