# Data Processing

In [1]:
import os
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib

In [2]:
# Directory that holds the input Excel workbooks; the combined output file
# (data.sav) is written into the same directory.
folder_path = "data"

In [3]:
# Load every sheet of the daily workbook into a dict keyed by sheet name.
file_name = "DATA_DAILY.xlsx"
file_path = os.path.join(folder_path, file_name)
data_daily = dict()
# Open the workbook once and reuse the handle for every sheet. Passing the
# path to pd.read_excel inside the loop re-opens the file on each iteration,
# and a bare pd.ExcelFile(...) used only for sheet_names is never closed.
with pd.ExcelFile(file_path) as workbook:
    sheet_list = workbook.sheet_names
    for sheet in tqdm(sheet_list):
        # Column labels come from row index 8; row indices 9-13 are skipped
        # (presumably extra metadata rows under the header - confirm against
        # the workbook layout).
        frame = pd.read_excel(workbook,
                              sheet_name=sheet,
                              header=8,
                              index_col=0,
                              skiprows=range(9, 14))
        # Fail loudly here if a sheet contains anything non-numeric.
        frame = frame.astype("float64")
        frame.index.name = "Date"
        data_daily[sheet] = frame

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:03<00:00, 24.65s/it]


In [4]:
# Load every sheet of the monthly accounting workbook, clean the textual
# placeholders, forward-fill gaps and lag the values before storing them.
accounting_lag = 3 # lagging for accounting variables (number of rows shifted)
# Non-numeric placeholders found in the accounting sheets; treated as missing.
nan_replacements = ['N/A(IFRS)', '완전잠식', '적전', '흑전', '적지']
file_name = "DATA_MONTHLY_ACCOUNTING.xlsx"
file_path = os.path.join(folder_path, file_name)
data_monthly_acc = dict()
# Open the workbook once and reuse the handle for every sheet. Passing the
# path to pd.read_excel inside the loop re-opens the file on each iteration,
# and a bare pd.ExcelFile(...) used only for sheet_names is never closed.
with pd.ExcelFile(file_path) as workbook:
    sheet_list = workbook.sheet_names
    for sheet in tqdm(sheet_list):
        # Column labels come from row index 8; row indices 9-13 are skipped.
        frame = pd.read_excel(workbook,
                              sheet_name=sheet,
                              header=8,
                              index_col=0,
                              skiprows=range(9, 14))
        frame = frame.replace(nan_replacements, np.nan)
        # Any string not listed in nan_replacements makes this cast raise,
        # which surfaces unexpected placeholders instead of hiding them.
        frame = frame.astype("float64")
        frame.index.name = "Date"
        # .ffill() replaces the deprecated fillna(method="ffill") spelling.
        frame = frame.ffill()
        # Shift down by accounting_lag rows so each date only sees values
        # from accounting_lag rows earlier.
        frame = frame.shift(accounting_lag)
        data_monthly_acc[sheet] = frame

100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:22<00:00,  1.25s/it]


In [5]:
# Load every sheet of the monthly market workbook into a dict keyed by
# sheet name.
file_name = "DATA_MONTHLY_MARKET.xlsx"
file_path = os.path.join(folder_path, file_name)
data_monthly_mkt = dict()
# Open the workbook once and reuse the handle for every sheet. Passing the
# path to pd.read_excel inside the loop re-opens the file on each iteration,
# and a bare pd.ExcelFile(...) used only for sheet_names is never closed.
with pd.ExcelFile(file_path) as workbook:
    sheet_list = workbook.sheet_names
    for sheet in tqdm(sheet_list):
        # Column labels come from row index 8; row indices 9-13 are skipped.
        frame = pd.read_excel(workbook,
                              sheet_name=sheet,
                              header=8,
                              index_col=0,
                              skiprows=range(9, 14))
        # Fail loudly here if a sheet contains anything non-numeric.
        frame = frame.astype("float64")
        frame.index.name = "Date"
        data_monthly_mkt[sheet] = frame

100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:30<00:00,  1.26s/it]


In [6]:
# Load every sheet of the index workbook. For each sheet the last column is
# discarded and the column that is then last is renamed to the sheet name.
file_name = "DATA_INDEX.xlsx"
file_path = os.path.join(folder_path, file_name)
data_index = dict()
# Open the workbook once and reuse the handle for every sheet. Passing the
# path to pd.read_excel inside the loop re-opens the file on each iteration,
# and a bare pd.ExcelFile(...) used only for sheet_names is never closed.
with pd.ExcelFile(file_path) as workbook:
    sheet_list = workbook.sheet_names
    for sheet in tqdm(sheet_list):
        # Column labels come from row index 8; row indices 9-13 are skipped.
        frame = pd.read_excel(workbook,
                              sheet_name=sheet,
                              header=8,
                              index_col=0,
                              skiprows=range(9, 14))
        frame = frame.astype("float64")
        frame.index.name = "Date"
        # Drop the trailing column by position, without mutating in place.
        frame = frame.iloc[:, :-1]
        # Label the remaining last column with the sheet name.
        frame = frame.rename(columns={frame.columns[-1]: sheet})
        data_index[sheet] = frame

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  6.04it/s]


In [7]:
# Load every sheet of the classification workbook into a dict keyed by
# sheet name. Values are kept as read (no float cast in this cell).
file_name = "DATA_CLASSIFICATION.xlsx"
file_path = os.path.join(folder_path, file_name)
data_classification = dict()
# Open the workbook once and reuse the handle for every sheet. Passing the
# path to pd.read_excel inside the loop re-opens the file on each iteration,
# and a bare pd.ExcelFile(...) used only for sheet_names is never closed.
with pd.ExcelFile(file_path) as workbook:
    sheet_list = workbook.sheet_names
    for sheet in tqdm(sheet_list):
        # Column labels come from row index 8; row indices 9-13 are skipped.
        frame = pd.read_excel(workbook,
                              sheet_name=sheet,
                              header=8,
                              index_col=0,
                              skiprows=range(9, 14))
        frame.index.name = "Date"
        data_classification[sheet] = frame

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.21s/it]


In [8]:
# Build {snapshot date -> {"A" + stock code -> weight}} from the files in
# the Kodex_200 folder (one Excel file per snapshot).
data_weight = dict()
path_kodex200 = os.path.join(folder_path, "Kodex_200")
# os.listdir returns entries in arbitrary order; sorting makes the insertion
# order of data_weight deterministic across machines and runs.
for file_name in tqdm(sorted(os.listdir(path_kodex200))):
    # Skip Excel lock files ("~$...") and hidden files, which would otherwise
    # crash the date parsing or the Excel reader.
    if file_name.startswith(("~$", ".")):
        continue
    # The last 8 characters of the file stem encode the date as YYYYMMDD.
    date_str = os.path.splitext(file_name)[0][-8:]
    date = datetime.datetime.strptime(date_str, "%Y%m%d")
    workbook_path = os.path.join(path_kodex200, file_name)
    # Column labels are on row index 2 of each file.
    holdings = pd.read_excel(workbook_path, header=2)
    # Build a new frame in one chain instead of mutating a filtered slice in
    # place, which can raise SettingWithCopyWarning.
    holdings = (
        # Exclude the row named "원화예금" and keep only code and weight.
        holdings.loc[holdings['종목명'] != "원화예금", ['종목코드', '비중(%)']]
        .rename(columns={'비중(%)': '비중'})
        # Prefix codes with "A" (presumably to match the ticker format used
        # by the other datasets - confirm).
        .assign(**{'종목코드': lambda df: "A" + df['종목코드']})
    )
    data_weight[date] = holdings.set_index("종목코드")['비중'].to_dict()

100%|███████████████████████████████████████████████████████████████████████████████| 155/155 [00:01<00:00, 108.84it/s]


In [9]:
# Bundle every loaded dataset into a single list and persist it with joblib.
# The position of each dataset in the list is what a consumer of data.sav
# will see, so keep the order stable.
datas = [
    data_daily,
    data_monthly_acc,
    data_monthly_mkt,
    data_index,
    data_classification,
    data_weight,
]
joblib.dump(datas, filename=os.path.join(folder_path, 'data.sav'), compress=1)

['data\\data.sav']