# Load packages

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import yfinance as yf
import seaborn as sns
import missingno as msno
import warnings

from matplotlib import pyplot as plt
from statsmodels.tsa.stattools import adfuller, kpss

# Configuration

In [None]:
# Korean font: point matplotlib at the TTF below so Hangul text can render.
from matplotlib import font_manager, rc
try:
    font_path = "C:/Windows/Fonts/malgun.TTF"
    Kfont = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=Kfont)
except Exception:
    # Best effort: the path above is a Windows font location, so on other
    # machines the lookup fails and matplotlib keeps its default font.
    # Catching Exception (not a bare except) lets Ctrl-C / SystemExit through.
    pass

# Fix minus presentation: draw an ASCII hyphen instead of the Unicode minus
mpl.rcParams['axes.unicode_minus'] = False

In [None]:
# Ignore warnings: install a blanket "ignore" filter for every Warning category
warnings.filterwarnings(action='ignore', category=Warning)

# Custom functions

In [None]:
def to_date_inv(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* indexed by its parsed, ascending "날짜" column.

    The "날짜" column is parsed with the format "%Y년 %m월 %d일", the rows are
    sorted oldest-first, and the column becomes the index.

    Args:
        df: Frame containing a "날짜" column of date strings.

    Returns:
        A new DataFrame; the caller's frame is left untouched.
    """
    parsed = pd.to_datetime(df["날짜"], format="%Y년 %m월 %d일")
    # assign() builds a new frame, so the input's "날짜" column is not
    # overwritten as a side effect of calling this function.
    out = df.assign(**{"날짜": parsed})
    return out.sort_values(by="날짜", ascending=True).set_index("날짜")

def to_numeric_inv(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the "종가" column of *df* to a numeric dtype, in place.

    String cells have their "," thousands separators removed before the
    conversion; cells that are already numbers are passed through unchanged.

    Args:
        df: Frame containing a "종가" column (strings and/or numbers).

    Returns:
        The same DataFrame object, with "종가" converted.

    Raises:
        ValueError: If a cell still cannot be parsed as a number.
    """
    close = df["종가"]
    if not pd.api.types.is_numeric_dtype(close):
        # Strip separators cell by cell so non-string values in a mixed
        # column survive (the .str accessor would turn them into NaN).
        close = close.map(
            lambda cell: cell.replace(",", "") if isinstance(cell, str) else cell
        )
    df["종가"] = pd.to_numeric(close)

    return df

def dtype_chg(df: pd.DataFrame) -> pd.DataFrame:
    """Run *df* through ``to_numeric_inv`` and then ``to_date_inv``.

    Returns whatever ``to_date_inv`` returns for the converted frame.
    """
    with_numeric_close = to_numeric_inv(df)
    return to_date_inv(with_numeric_close)

# Load data

In [None]:
# Load S&P 500 index history via yfinance.
# The result is kept in df_snp500; previously the download was discarded.
snp500 = yf.Ticker('^GSPC')
df_snp500 = snp500.history(period='max')

# Get data path from local directory
path_brent_crude_future = 'data/external_data/브렌트유 선물 내역.csv'
path_wti_crude_future = 'data/external_data/WTI유 선물 내역.csv'
path_natural_gas = 'data/external_data/천연가스 선물 내역.csv'
path_kospi = 'data/external_data/코스피지수 내역.csv'
path_carbon_credit = 'data/external_data/탄소배출권 선물 내역.csv'
path_exchange_rate = 'data/external_data/EUR_KRW 내역.csv'
# NOTE(review): the three paths below are not read anywhere in the visible
# part of this notebook — confirm whether they are still needed.
path_gold = 'data/external_data/금 선물 내역.csv'
path_nickel = 'data/external_data/니켈 선물 내역.csv'
path_use_corn = 'data/external_data/미국 옥수수 선물 내역.csv'

In [None]:
# Read the Brent crude oil futures CSV into a DataFrame and display it
df_brent_crude_fut = pd.read_csv(filepath_or_buffer=path_brent_crude_future)
df_brent_crude_fut

In [None]:
# Read the WTI (West Texas Intermediate) crude oil futures CSV and display it
df_wti_crude_fut = pd.read_csv(filepath_or_buffer=path_wti_crude_future)
df_wti_crude_fut

In [None]:
# Read the natural gas futures CSV into a DataFrame and display it
df_natural_gas = pd.read_csv(filepath_or_buffer=path_natural_gas)
df_natural_gas

In [None]:
# Read the KOSPI index CSV into a DataFrame and display it
df_kospi = pd.read_csv(filepath_or_buffer=path_kospi)
df_kospi

In [None]:
# Read the carbon credit futures CSV into a DataFrame and display it
df_carbon_credit = pd.read_csv(filepath_or_buffer=path_carbon_credit)
df_carbon_credit

In [None]:
# Read the EUR/KRW exchange rate CSV into a DataFrame and display it
df_exch_rate = pd.read_csv(filepath_or_buffer=path_exchange_rate)
df_exch_rate

# Preprocessing & Visualization

In [None]:
# Simple dtype change: pass every raw frame through dtype_chg, in the same
# order as before, and rebind each name to the converted result.
(
    df_brent_crude_fut,
    df_wti_crude_fut,
    df_natural_gas,
    df_kospi,
    df_exch_rate,
    df_carbon_credit,
) = [
    dtype_chg(raw_frame)
    for raw_frame in (
        df_brent_crude_fut,
        df_wti_crude_fut,
        df_natural_gas,
        df_kospi,
        df_exch_rate,
        df_carbon_credit,
    )
]

In [None]:
# Combine the "종가" (close) column of every series side by side, one column
# per series. The dict keeps each output column name next to its source frame.
close_source_by_name = {
    'carbon_credit': df_carbon_credit,
    'brent_crude_future': df_brent_crude_fut,
    'wti_crude_future': df_wti_crude_fut,
    'kospi': df_kospi,
    'natural_gas': df_natural_gas,
    'exchange_rate': df_exch_rate,
}

tot_col_nm = list(close_source_by_name)

df_tot = pd.concat(
    [source['종가'] for source in close_source_by_name.values()],
    axis=1,
)
df_tot.columns = tot_col_nm

df_tot

In [None]:
# Check missing values of each column, fill what can be filled, then drop
# the rows that still contain a gap.
msno.matrix(df_tot, figsize=(12, 6))
print('Missing value before drop and fill na\n', df_tot.isnull().sum())

# The keyword is `limit_area`; the former spelling `limit_are` was silently
# ignored by linear interpolation. With 'outside', only NaNs outside the
# first/last valid value are filled (at most 10 in a row); NaNs surrounded
# by valid values are left for the dropna() below.
# NOTE(review): this changes which rows survive compared with the old,
# unrestricted fill — confirm 'outside' is the intended area.
df_tot_f = df_tot.interpolate(limit_area='outside', limit=10)
msno.matrix(df_tot_f, figsize=(12, 6))
print('Missing value After fill na\n', df_tot_f.isnull().sum())

df_tot_f = df_tot_f.dropna()
msno.matrix(df_tot_f, figsize=(12, 6))
print('Missing value After fill and drop na\n', df_tot_f.isnull().sum())

In [None]:
# plot - one line-plot panel per column of the cleaned frame
num_col = len(df_tot_f.columns)
grid_rows = int(np.ceil(num_col*2/3))
grid_cols = int(np.ceil(num_col*1/3))

# NOTE(review): matplotlib's figsize is (width, height); here the width is
# scaled by the row count and the height by the column count — confirm.
df_tot_f.plot(
    subplots=True,
    layout=(grid_rows, grid_cols),
    figsize=(5*grid_rows, 8*grid_cols),
    )

In [None]:
# Test - run the ADF and KPSS tests on a synthetic cosine wave
x_test = np.cos(np.linspace(0, 100/np.pi, 500))
x_test2 = df_carbon_credit['종가']
# Plot the series that is actually tested below (the name `x` was never
# defined in this notebook, so the previous call raised NameError).
plt.plot(x_test)
print(f'ADF {adfuller(x_test)}')
print(f'KPSS {kpss(x_test)}')

In [None]:
# plot - lag plot plus ADF/KPSS p-values for every series, one row per
# column: raw series on the left, first difference on the right.
# squeeze=False keeps the axes array 2-D even when there is a single row.
fig_lag_plot, ax_lag_plot = plt.subplots(
    len(tot_col_nm), 2, figsize=(14, 7*len(tot_col_nm)), squeeze=False
)

for idx, col_nm in enumerate(tot_col_nm):

    # Compute each variant once instead of re-deriving it for every call.
    variants = (
        ('No diff', df_tot_f[col_nm].dropna()),
        ('Diff 1', df_tot_f[col_nm].diff().dropna()),
    )

    for col_idx, (label, series) in enumerate(variants):
        ax = ax_lag_plot[idx, col_idx]
        pd.plotting.lag_plot(series, ax=ax)
        r_adf = adfuller(series)
        r_kpss = kpss(series)
        # Index 1 of each result is printed as the test's p-value.
        # 'bold' is a font weight, not a font family, so pass it as fontweight.
        ax.set_title(
            f'{col_nm} || {label}\nADF[{r_adf[1]:.2}] || KPSS [{r_kpss[1]:.2}]',
            fontsize=15,
            fontweight='bold',
        )

fig_lag_plot.tight_layout()

In [None]:
# First-difference every column; print frame summaries before and after
df_tot_f.info()
first_difference = df_tot_f.diff()
# The first row of a difference has no predecessor and is NaN, so drop it
df_tot_f_diff = first_difference.dropna()
df_tot_f_diff.info()

In [None]:
# plot - one line-plot panel per column of the differenced frame
num_col = len(df_tot_f_diff.columns)
grid_rows = int(np.ceil(num_col*2/3))
grid_cols = int(np.ceil(num_col*1/3))

# NOTE(review): matplotlib's figsize is (width, height); here the width is
# scaled by the row count and the height by the column count — confirm.
df_tot_f_diff.plot(
    subplots=True,
    layout=(grid_rows, grid_cols),
    figsize=(5*grid_rows, 8*grid_cols),
    )


# Save preprocessed data 

In [None]:
# Write the differenced frame (index included) to the preprocessed CSV
df_tot_f_diff.to_csv(path_or_buf='data/preprocessed_data.csv')