In [5]:
"""
FROM 
    ETL
"""
import pandas as pd

# Dynamic inputs (per the cell header these come from the ETL stage).
# The first CSV column of each file is used as the DataFrame index.
# NOTE(review): dtypes are inferred here, so any identifier column with
# leading zeros would be parsed as integers — confirm and pass `dtype` if needed.
info_df = pd.read_csv(
    "./data/dynamic/total_info_df.csv",
    index_col=0,
)
ohlcv_df = pd.read_csv(
    "./data/dynamic/total_ohlcv_df.csv",
    index_col=0,
    low_memory=False,  # parse in one pass to avoid chunk-wise mixed-type inference
)
fundamental_df = pd.read_csv(
    "./data/dynamic/total_fundamental_df.csv",
    index_col=0,
)

# Static inputs: sector listings stored as cp949-encoded CSV files.
kosdaq_sector_df = pd.read_csv(
    "./data/static/kosdaq_20231027.csv",
    encoding="cp949",
)
kospi_sector_df = pd.read_csv(
    "./data/static/kospi_20231027.csv",
    encoding="cp949",
)

In [6]:
"""
Preprocessing
"""

class PPS:
    """Preprocessing helpers that reduce the raw tables to a fixed set of renamed columns.

    Every ``get_pps_*`` method renames the source columns, keeps only the
    columns listed in its ``main_columns`` and (except for the sector table)
    drops rows that contain missing values. The input frames are never
    modified; each method returns a new DataFrame.
    """

    def get_pps_ohlcv_df(self, ohlcv_df):
        """Build the preprocessed OHLCV table.

        Parameters
        ----------
        ohlcv_df : pandas.DataFrame
            Must contain a ``StockCode`` column plus the source columns named
            as keys of ``column_rename_dict`` below.

        Returns
        -------
        pandas.DataFrame
            Columns ``StockCode, Date, Close, Open, High, Low, Volume``; rows
            with a missing value in any of them are removed and ``Date`` is
            parsed to datetimes where possible.

        Raises
        ------
        KeyError
            If one of the required columns is absent after renaming.
        """
        column_rename_dict = {
            "stck_bsop_date": "Date",
            "stck_clpr": "Close",
            "stck_oprc": "Open",
            "stck_hgpr": "High",
            "stck_lwpr": "Low",
            "acml_vol": "Volume",
        }
        main_columns = ["StockCode", "Date", "Close", "Open", "High", "Low", "Volume"]

        pps_ohlcv_df = self._rename_columns(ohlcv_df, column_rename_dict)
        pps_ohlcv_df = self._filter_main_columns(pps_ohlcv_df, main_columns)
        # Drop incomplete rows first: _format_date calls int() on every date value.
        pps_ohlcv_df = self._filter_na(pps_ohlcv_df)
        pps_ohlcv_df = self._format_date(pps_ohlcv_df, "Date")
        return pps_ohlcv_df

    def get_pps_info_df(self, info_df):
        """Build the preprocessed stock-info table.

        Parameters
        ----------
        info_df : pandas.DataFrame
            Must contain the source columns named as keys of
            ``column_rename_dict`` below.

        Returns
        -------
        pandas.DataFrame
            Columns ``StockCode, StockName, VolumeTurnOverRatio, TotalShare``
            with rows containing any missing value removed.

        Raises
        ------
        KeyError
            If one of the required columns is absent after renaming.
        """
        column_rename_dict = {
            "vol_tnrt": "VolumeTurnOverRatio",
            "lstn_stcn": "TotalShare",
            "stck_shrn_iscd": "StockCode",
            "hts_kor_isnm": "StockName",
        }
        main_columns = ["StockCode", "StockName", "VolumeTurnOverRatio", "TotalShare"]

        pps_info_df = self._rename_columns(info_df, column_rename_dict)
        pps_info_df = self._filter_main_columns(pps_info_df, main_columns)
        pps_info_df = self._filter_na(pps_info_df)
        return pps_info_df

    def get_pps_fundamental_df(self, fundamental_df):
        """Build the preprocessed fundamentals table.

        Parameters
        ----------
        fundamental_df : pandas.DataFrame
            Must contain the source columns named as keys of
            ``column_rename_dict`` below. The amount column may hold
            comma-grouped digit strings, the ``"-"`` placeholder, missing
            values, or values that are already numeric.

        Returns
        -------
        pandas.DataFrame
            Columns ``StockCode, FSName, AccountName, Amount``. ``Amount`` is
            numeric, and rows whose amount was missing / ``"-"`` (or that have
            any other missing value) are removed.

        Raises
        ------
        KeyError
            If one of the required columns is absent after renaming.
        ValueError
            If an amount string is not an integer once commas are removed.
        """
        column_rename_dict = {
            "stock_code": "StockCode",
            "fs_nm": "FSName",
            "account_nm": "AccountName",
            "frmtrm_amount": "Amount",
        }
        main_columns = ["StockCode", "FSName", "AccountName", "Amount"]

        pps_fundamental_df = self._rename_columns(fundamental_df, column_rename_dict)
        pps_fundamental_df = self._filter_main_columns(pps_fundamental_df, main_columns)
        # Amount placeholders become missing values here and are dropped just below.
        pps_fundamental_df = self._preprocess_amount(pps_fundamental_df)
        pps_fundamental_df = self._filter_na(pps_fundamental_df)
        return pps_fundamental_df

    def get_pps_sector_df(self, kosdaq_sector_df, kospi_sector_df):
        """Stack the two sector listings and keep the renamed main columns.

        Parameters
        ----------
        kosdaq_sector_df, kospi_sector_df : pandas.DataFrame
            Must both contain the source columns named as keys of
            ``column_rename_dict`` below.

        Returns
        -------
        pandas.DataFrame
            Columns ``StockCode, StockName, MarketName, SectorName``; the
            first frame's rows come first. Missing values are NOT dropped, and
            the row labels of both inputs are kept as they are, so the result
            may contain repeated index labels.

        Raises
        ------
        KeyError
            If one of the required columns is absent after renaming.
        """
        column_rename_dict = {
            "종목코드": "StockCode",
            "종목명": "StockName",
            "시장구분": "MarketName",
            "업종명": "SectorName",
        }
        main_columns = ["StockCode", "StockName", "MarketName", "SectorName"]

        sector_df = pd.concat([kosdaq_sector_df, kospi_sector_df], axis=0)
        pps_sector_df = self._rename_columns(sector_df, column_rename_dict)
        pps_sector_df = self._filter_main_columns(pps_sector_df, main_columns)
        return pps_sector_df

    @staticmethod
    def _filter_na(df, axis=0):
        """Return ``df`` without the rows (``axis=0``) or columns (``axis=1``) that contain NaN."""
        return df.dropna(axis=axis)

    @staticmethod
    def _rename_columns(df, column_rename_dict):
        """Return a copy of ``df`` whose columns are renamed via ``column_rename_dict``."""
        return df.rename(columns=column_rename_dict)

    @staticmethod
    def _format_date(df, column):
        """Return a copy of ``df`` with ``column`` parsed into datetimes.

        Each value is normalised with ``str(int(value))`` (so ``20231027.0``
        becomes ``"20231027"``) and the result is passed to
        ``pd.to_datetime``. If parsing fails, the column is left as those
        normalised strings; this reproduces the best-effort result of the
        deprecated ``errors="ignore"`` option without relying on it.

        Raises
        ------
        ValueError, TypeError
            From ``int()`` when a value is missing or not integer-like.
        """
        date_strings = df[column].map(lambda value: str(int(value)))
        try:
            parsed_dates = pd.to_datetime(date_strings)
        except (ValueError, TypeError, OverflowError):
            # Deliberate best-effort: keep the normalised strings when unparseable.
            parsed_dates = date_strings

        # Assign into a copy so the caller's frame is left untouched.
        formatted_df = df.copy()
        formatted_df[column] = parsed_dates
        return formatted_df

    @staticmethod
    def _filter_main_columns(df, main_columns):
        """Return only ``main_columns`` of ``df``, in that order (KeyError if one is missing)."""
        return df.loc[:, main_columns]

    @staticmethod
    def _preprocess_amount(df):
        """Return a copy of ``df`` whose ``Amount`` column is converted to numbers.

        Conversion rules per value:

        * string: thousands separators (``,``) and surrounding whitespace are
          removed; ``"-"`` or an empty string becomes a missing value,
          anything else is converted with ``int()``;
        * missing value (``None`` / NaN): stays missing;
        * any other value (e.g. already numeric): returned unchanged.

        Raises
        ------
        ValueError
            If a string is not an integer after the commas are removed.
        """

        def _parse_amount(amount):
            if isinstance(amount, str):
                cleaned = amount.replace(",", "").strip()
                if cleaned in ("", "-"):
                    return None
                return int(cleaned)
            if pd.isna(amount):
                return None
            return amount

        # Assign into a copy so the caller's frame is left untouched.
        converted_df = df.copy()
        converted_df["Amount"] = df["Amount"].map(_parse_amount)
        return converted_df

## Codes

In [7]:
# Run every preprocessing step; each call reads only its own input table(s),
# so the calls are independent of one another.
pps = PPS()

pps_info_df = pps.get_pps_info_df(info_df)
pps_ohlcv_df = pps.get_pps_ohlcv_df(ohlcv_df)
pps_fundamental_df = pps.get_pps_fundamental_df(fundamental_df)
pps_sector_df = pps.get_pps_sector_df(
    kosdaq_sector_df,
    kospi_sector_df,
)

In [8]:
# Dynamic data: write each preprocessed table to CSV under ./data/dynamic.
# Per the original note, these files are not saved locally in production.
for frame, csv_path in (
    (pps_ohlcv_df, "./data/dynamic/pps_ohlcv_df.csv"),
    (pps_fundamental_df, "./data/dynamic/pps_fundamental_df.csv"),
    (pps_info_df, "./data/dynamic/pps_info_df.csv"),
    (pps_sector_df, "./data/dynamic/pps_sector_df.csv"),
):
    frame.to_csv(csv_path)