# KRX price-volume dataset 

단계별 sandbox

## Import libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from pathlib import Path

cur_path = Path('.').resolve()
cur_path

WindowsPath('E:/VSCodeProjects/FinanceDashboard')

In [4]:
from pricevolume.processor import Preprocessor, Lv2Converter

## Load data

lv1 data load

In [5]:
kosdaq_df = pd.read_pickle(cur_path / "cache" / "KSQ_20150101_to_20211031_lv1_df.pkl")
kospi_df = pd.read_pickle(cur_path / "cache" / "STK_20150101_to_20211031_lv1_df.pkl")

In [6]:
kospi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2244861 entries, 0 to 2244860
Data columns (total 17 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   ISU_SRT_CD     object
 1   ISU_ABBRV      object
 2   MKT_NM         object
 3   SECT_TP_NM     object
 4   TDD_CLSPRC     object
 5   FLUC_TP_CD     object
 6   CMPPREVDD_PRC  object
 7   FLUC_RT        object
 8   TDD_OPNPRC     object
 9   TDD_HGPRC      object
 10  TDD_LWPRC      object
 11  ACC_TRDVOL     object
 12  ACC_TRDVAL     object
 13  MKTCAP         object
 14  LIST_SHRS      object
 15  MKT_ID         object
 16  trdDd          int64 
dtypes: int64(1), object(16)
memory usage: 291.2+ MB


In [7]:
kospi_df.iloc[8000]

ISU_SRT_CD              001755
ISU_ABBRV                한양증권우
MKT_NM                   KOSPI
SECT_TP_NM                    
TDD_CLSPRC               6,600
FLUC_TP_CD                   2
CMPPREVDD_PRC              -90
FLUC_RT                  -1.35
TDD_OPNPRC               6,500
TDD_HGPRC                6,600
TDD_LWPRC                6,500
ACC_TRDVOL                  21
ACC_TRDVAL             136,600
MKTCAP           3,465,000,000
LIST_SHRS              525,000
MKT_ID                     STK
trdDd                 20150109
Name: 8000, dtype: object

In [8]:
kospi_df.columns[4:15]

Index(['TDD_CLSPRC', 'FLUC_TP_CD', 'CMPPREVDD_PRC', 'FLUC_RT', 'TDD_OPNPRC',
       'TDD_HGPRC', 'TDD_LWPRC', 'ACC_TRDVOL', 'ACC_TRDVAL', 'MKTCAP',
       'LIST_SHRS'],
      dtype='object')

## Preprocess

In [9]:
kospi_df = Preprocessor.comma_number_2_float(
    kospi_df, 
    columns=kospi_df.columns[4:15]
    )

In [10]:
kospi_df = Preprocessor.nullstr_2_nan(
    kospi_df,
    columns=kospi_df.columns[4:15],
    nullstr="-"
)

## lv2 conversion

### method 1: dataframe pivot 

(selected method)

In [11]:
lv2_df = kospi_df.pivot(index="trdDd", columns="ISU_SRT_CD", values="MKTCAP")
lv2_df

ISU_SRT_CD,000020,000030,000040,000050,000060,000070,000075,000080,000087,000100,...,381970,383220,383800,38380K,395400,900050,900140,950010,950100,950210
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20150101,,,,,,,,,,,...,,,,,,,,,,
20150102,154461029100,6620765252090,124711741055,514036312500,1335133800000,769071535800,12451175100,1585019608600,20342484000,1884780274000,...,,,,,,412322493600,907127176600,3314412930,427338000000,
20150103,,,,,,,,,,,...,,,,,,,,,,
20150104,,,,,,,,,,,...,,,,,,,,,,
20150105,153623085000,6512560712730,125905154845,508553258500,1313941200000,759650837700,12299146100,1616579733550,19890428800,1873627728000,...,,,,,,387799280775,899946750400,3314412930,427338000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211027,458076108000,,97098741680,364623091000,3558437500000,924941268000,21436089000,2507276593250,23789404900,4310334274400,...,1197354671700,6596478315000,693391472100,20513990000,1002982842040,,335164350890,,,1508413490500
20211028,473438416500,,99502175880,360510800500,3510187500000,894966319500,21223248400,2475716468300,23563377300,4254355907200,...,1204567651650,6688415295000,681949368600,20367461500,992131404800,,333175244950,,,1496394259500
20211029,456679534500,,99502175880,364623091000,3341312500000,882119913000,20949596200,2440649662800,23619884200,4191380244100,...,1178120058500,6650108220000,672795685800,20220933000,993681610120,,332180691980,,,1448317335500
20211030,,,,,,,,,,,...,,,,,,,,,,


### method 2: numpy iteration

slower than pandas pivot. 

could be improved with `numba`

In [25]:
date_list = kospi_df.loc[:, "trdDd"].unique()
sid_list = kospi_df.loc[:, "ISU_SRT_CD"].unique()

lv2_arr = np.empty((len(date_list), len(sid_list)))
lv2_arr[:] = np.nan

In [42]:
date_list_mapper = dict(zip(date_list, range(len(date_list))))
sid_list_mapper = dict(zip(sid_list, range(len(sid_list))))

In [43]:
mktcap_df = kospi_df.loc[:, ["trdDd", "ISU_SRT_CD", "MKTCAP"]].copy()

In [44]:
mktcap_arr = np.array(mktcap_df)

In [50]:
for row in mktcap_arr:
    date_idx = date_list_mapper[row[0]]
    sid_idx = sid_list_mapper[row[1]]
    value = row[2]

    lv2_arr[date_idx, sid_idx] = value

### method 3: numpy broadcasting

very slow

In [87]:
date_list = kospi_df.loc[:, "trdDd"].unique()
sid_list = kospi_df.loc[:, "ISU_SRT_CD"].unique()

lv2_arr = np.empty((len(date_list), len(sid_list)))
lv2_arr[:] = np.nan

In [54]:
date_list_mapper = dict(zip(date_list, range(len(date_list))))
sid_list_mapper = dict(zip(sid_list, range(len(sid_list))))

In [94]:
date_list_arr = np.array(list(map(lambda x: date_list_mapper[x], kospi_df["trdDd"]))) # row
date_list_arr = date_list_arr[:, None]

sid_list_arr = np.array(list(map(lambda x: sid_list_mapper[x], kospi_df["ISU_SRT_CD"]))) # column
sid_list_arr = sid_list_arr[None, :]

In [98]:
value_list_arr = np.array(kospi_df.loc[:, "MKTCAP"])

In [101]:
lv2_arr[date_list_arr, sid_list_arr] = value_list_arr

## Get derived dataset

### trading dates

date range를 선택할 땐 이렇게. 

```python
dr = pd.date_range(start='20210101', end='20211231')
dr
```

In [40]:
trdDd_df = kospi_df.groupby(by='trdDd')['MKTCAP'].sum(min_count=1).isnull()
trdDd_df

trdDd
20150101     True
20150102    False
20150103     True
20150104     True
20150105    False
            ...  
20211027    False
20211028    False
20211029    False
20211030     True
20211031     True
Name: MKTCAP, Length: 2496, dtype: bool

In [41]:
trading_dates = trdDd_df[trdDd_df].index
trading_dates

Int64Index([20150101, 20150103, 20150104, 20150110, 20150111, 20150117,
            20150118, 20150124, 20150125, 20150131,
            ...
            20211004, 20211009, 20211010, 20211011, 20211016, 20211017,
            20211023, 20211024, 20211030, 20211031],
           dtype='int64', name='trdDd', length=817)

In [42]:
lv2_df

ISU_SRT_CD,000020,000030,000040,000050,000060,000070,000075,000080,000087,000100,...,381970,383220,383800,38380K,395400,900050,900140,950010,950100,950210
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20150101,,,,,,,,,,,...,,,,,,,,,,
20150102,154461029100,6620765252090,124711741055,514036312500,1335133800000,769071535800,12451175100,1585019608600,20342484000,1884780274000,...,,,,,,412322493600,907127176600,3314412930,427338000000,
20150103,,,,,,,,,,,...,,,,,,,,,,
20150104,,,,,,,,,,,...,,,,,,,,,,
20150105,153623085000,6512560712730,125905154845,508553258500,1313941200000,759650837700,12299146100,1616579733550,19890428800,1873627728000,...,,,,,,387799280775,899946750400,3314412930,427338000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211027,458076108000,,97098741680,364623091000,3558437500000,924941268000,21436089000,2507276593250,23789404900,4310334274400,...,1197354671700,6596478315000,693391472100,20513990000,1002982842040,,335164350890,,,1508413490500
20211028,473438416500,,99502175880,360510800500,3510187500000,894966319500,21223248400,2475716468300,23563377300,4254355907200,...,1204567651650,6688415295000,681949368600,20367461500,992131404800,,333175244950,,,1496394259500
20211029,456679534500,,99502175880,364623091000,3341312500000,882119913000,20949596200,2440649662800,23619884200,4191380244100,...,1178120058500,6650108220000,672795685800,20220933000,993681610120,,332180691980,,,1448317335500
20211030,,,,,,,,,,,...,,,,,,,,,,


In [43]:
lv2_df.loc[trading_dates, :]

ISU_SRT_CD,000020,000030,000040,000050,000060,000070,000075,000080,000087,000100,...,381970,383220,383800,38380K,395400,900050,900140,950010,950100,950210
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20150101,,,,,,,,,,,...,,,,,,,,,,
20150103,,,,,,,,,,,...,,,,,,,,,,
20150104,,,,,,,,,,,...,,,,,,,,,,
20150110,,,,,,,,,,,...,,,,,,,,,,
20150111,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211017,,,,,,,,,,,...,,,,,,,,,,
20211023,,,,,,,,,,,...,,,,,,,,,,
20211024,,,,,,,,,,,...,,,,,,,,,,
20211030,,,,,,,,,,,...,,,,,,,,,,


In [44]:
universe_df = ~lv2_df.isnull()
universe_df

ISU_SRT_CD,000020,000030,000040,000050,000060,000070,000075,000080,000087,000100,...,381970,383220,383800,38380K,395400,900050,900140,950010,950100,950210
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20150101,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20150102,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,True,True,True,True,False
20150103,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20150104,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20150105,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211027,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,False,False,True
20211028,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,False,False,True
20211029,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,False,False,True
20211030,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Subuniverse

In [None]:
lv2_df

In [None]:
ss = kospi_df.reindex(index=trdDd_df.index, columns=kospi_df["ISU_SRT_CD"].unique(),) # lv2를 load할 때 필요. 
ss

In [100]:
ss.loc[20150101, '068400'] = 1

In [106]:
# TODO: Use numba

np.array(kospi_df)[7000]


array(['005740', '크라운제과', 'KOSPI', '', '206,000', '1', '1,000', '0.49',
       '205,000', '208,000', '199,000', '12,798', '2,602,134,500',
       303545944000.0, '1,473,524', 'STK', 20150108], dtype=object)