# 자체 생성 KRX 데이터셋 소개

KRX 스크래핑을 통해 만든 자체 데이터셋

- survivorship bias 없음
- adjusted open, high, low, close 있음
- 추가적인 volume, dollarvolume, marketcap, market category (KOSPI/KOSDAQ/KONEX) 데이터


In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Absolute path of the directory the notebook is run from.
BASE_PATH = Path('.').resolve()
# All dataset files (pickles, train.csv) are read from the 'data' subfolder.
DATA_PATH = BASE_PATH.joinpath('data')

## 1. 수정주가

### 데이터 불러오기

In [8]:
# Load the adjusted open/high/low/close panels and the return panel.
# return_df is a wide frame (trading dates as rows, ticker codes as columns);
# the price panels presumably share that layout -- TODO confirm.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load pickle files that were generated by a trusted source.
adjOpen_df, adjHigh_df, adjLow_df, adjClose_df, return_df = (
    pd.read_pickle(DATA_PATH / pickle_name)
    for pickle_name in (
        'adjOpen_20140101_20230705.pickle',
        'adjHigh_20140101_20230705.pickle',
        'adjLow_20140101_20230705.pickle',
        'adjClose_20140101_20230705.pickle',
        'return_20140101_20230705.pickle',
    )
)

In [16]:
# Display the full return panel: one row per calendar date, one column per
# ticker code; NaN where no return is available for that date/ticker.
return_df

ISU_SRT_CD,000020,000040,000050,000060,000070,000075,000080,000087,000100,000105,...,405920,439090,440320,450050,451700,454640,455250,460850,460860,419700
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,,,,,,,,,,,...,,,,,,,,,,
2014-01-02,0.011390,0.000000,0.022936,0.023102,-0.014745,-0.041176,-0.013605,0.015060,-0.024031,0.011861,...,,,,,,,,,,
2014-01-03,0.022523,0.004484,0.022422,-0.006452,-0.023129,0.024540,-0.013793,0.000000,-0.011009,-0.003156,...,,,,,,,,,,
2014-01-04,,,,,,,,,,,...,,,,,,,,,,
2014-01-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-01,,,,,,,,,,,...,,,,,,,,,,
2023-07-02,,,,,,,,,,,...,,,,,,,,,,
2023-07-03,-0.004812,0.005068,-0.001006,,0.005510,0.007326,0.020833,-0.003064,-0.001656,0.001821,...,0.000000,-0.006402,0.078095,0.002353,0.002463,0.000000,0.000000,-0.004533,-0.018484,
2023-07-04,0.004836,-0.018487,-0.017120,,0.002740,-0.001818,-0.013605,0.000615,-0.014925,-0.009091,...,0.031680,-0.003866,0.177856,-0.007042,0.002457,0.000000,-0.002347,-0.016393,-0.024482,


### 대회에 맞게 수정

KOSPI + KOSDAQ + KONEX, 2014-01-01 ~ 2023-07-05 까지의 기간동안 존재한 모든 주식들이 들어있기 때문에, 

대회에서 주어진 기간에 존재하는 종목만 남겨야 합니다. (2021-06-01 ~ 2023-05-30)

날짜도 거래일만 남깁니다. 

In [21]:
## date list

# A calendar date is treated as a holiday when the adjusted close is missing
# for every ticker on that date; all remaining dates are trading days.
holidays = adjClose_df.isna().all(axis='columns')

# Convert the boolean per-date mask into two date indexes. tradingdays is
# derived first because it still needs the mask stored in `holidays`.
tradingdays = holidays.index[~holidays]
holidays = holidays.index[holidays]

In [18]:
# First and last day of the competition window (both used as inclusive bounds).
START, END = (
    pd.to_datetime(day, format='%Y-%m-%d')
    for day in ('2021-06-01', '2023-05-30')
)

In [22]:
# Keep only the trading days that fall inside [START, END], both ends inclusive.
tradingdays = tradingdays[(START <= tradingdays) & (END >= tradingdays)]
tradingdays

DatetimeIndex(['2021-06-01', '2021-06-02', '2021-06-03', '2021-06-04',
               '2021-06-07', '2021-06-08', '2021-06-09', '2021-06-10',
               '2021-06-11', '2021-06-14',
               ...
               '2023-05-16', '2023-05-17', '2023-05-18', '2023-05-19',
               '2023-05-22', '2023-05-23', '2023-05-24', '2023-05-25',
               '2023-05-26', '2023-05-30'],
              dtype='datetime64[ns]', name='trdDd', length=470, freq=None)

In [27]:
# Returns on in-window trading days, dropping tickers that have no return at
# all inside the window.
return_df.loc[tradingdays].dropna(axis=1, how='all')

ISU_SRT_CD,000020,000040,000050,000060,000070,000075,000080,000087,000100,000105,...,453340,012210,271830,304360,340810,417790,420770,434480,446840,456190
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,-0.003413,0.004274,-0.003333,0.014006,0.004405,0.006519,0.000000,0.016563,-0.007598,0.000000,...,,,,,,,,,,
2021-06-02,-0.006849,0.029787,0.010033,-0.002762,0.004386,-0.010363,0.012626,0.000000,-0.013821,-0.006465,...,,,,,,,,,,
2021-06-03,0.006897,-0.008264,0.019868,-0.008310,0.000000,0.000000,-0.004988,0.004073,-0.001546,-0.001645,...,,,,,,,,,,
2021-06-04,0.006849,-0.004167,-0.025974,-0.002793,-0.008734,0.007853,-0.017544,-0.008114,-0.009341,-0.003277,...,,,,,,,,,,
2021-06-07,0.030612,0.012552,-0.006667,-0.011204,-0.004405,0.005195,0.011480,-0.016360,0.003143,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23,0.052802,-0.015235,-0.002885,,0.000000,0.005484,-0.004184,-0.005917,0.020408,0.018484,...,0.000000,-0.134021,,0.017230,0.299489,0.027060,,0.116955,0.004796,0.0
2023-05-24,-0.003071,0.025316,0.000000,,0.001332,-0.007273,0.002101,0.002381,-0.005000,-0.009074,...,-0.007252,-0.000595,0.065109,-0.024467,-0.070826,-0.043713,,-0.070632,-0.004773,0.0
2023-05-25,0.030801,-0.032922,0.000964,,-0.009309,0.005495,-0.006289,-0.000594,-0.006700,0.000000,...,-0.007305,-0.001191,-0.028213,-0.012862,-0.047187,-0.007514,-0.055696,-0.036000,0.002398,0.0
2023-05-26,-0.018924,-0.007092,0.005780,,-0.024161,0.001821,-0.010549,0.002377,0.003373,0.001832,...,-0.040883,0.118664,-0.008065,-0.036482,-0.039365,0.037224,0.002681,-0.044952,0.004785,0.0


In [28]:
# Ticker codes that have at least one non-missing return inside the window.
sid_list = (
    return_df
    .loc[tradingdays]
    .dropna(axis=1, how='all')
    .columns
)
sid_list

Index(['000020', '000040', '000050', '000060', '000070', '000075', '000080',
       '000087', '000100', '000105',
       ...
       '453340', '012210', '271830', '304360', '340810', '417790', '420770',
       '434480', '446840', '456190'],
      dtype='object', name='ISU_SRT_CD', length=2829)

- 000060: 메리츠화재

'어닝서프라이즈' 메리츠화재, 20일 뒤 상장폐지 된다 (2023-02-03)

메리츠화재가 어닝서프라이즈를 장식하며 이달 21일 상장폐지된다. 메리츠금융지주와의 주식스왑을 마무리하고 지주의 완전 자회사로 편입됐다. 역대 최대 실적을 끝으로 메리츠화재 주식은 매매정지에 들어갔다.

- 012210: 삼미금속

2023-05-16 상장

### 그런데

음... 근데 생각해보니 어차피 예선 submission은 survivorship bias가 존재하는, train.csv에 존재하는 종목으로만 해야 하네요. 

- 전략 백테스팅을 엄밀하게 하기 위해 위의 데이터셋을 필터 없이 사용하거나 (survivorship bias 없음)
- 대회에서 주어진 2000종목만 남기도록 필터링하고, 가격 데이터만 수정주가로 바꿔서 쓰거나 (survivorship bias 있음)

해야 할 것 같습니다. 

In [34]:
# Load the competition's train.csv and replace its header with short English
# column names (the file must have exactly these eight columns).
krx_df = pd.read_csv(DATA_PATH / 'train.csv').set_axis(
    ['date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close'],
    axis='columns',
)
# Dates are stored as YYYYMMDD; convert them to datetimes.
krx_df['date'] = pd.to_datetime(krx_df['date'], format='%Y%m%d')

# Drop the first character of every competition code so it can be used as a
# column label of the KRX panels (presumably a one-letter prefix such as 'A'
# in front of the 6-digit code -- TODO confirm against train.csv).
dacon_sid_list = [code[1:] for code in krx_df['code'].unique()]

In [38]:
# Example: restrict the return panel to in-window trading days and to the
# tickers provided by the competition.
return_df.loc[tradingdays, dacon_sid_list]  # like this.

ISU_SRT_CD,060310,095570,006840,054620,265520,211270,027410,282330,126600,138930,...,243070,084110,145020,024060,010240,189980,000540,003280,037440,238490
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,0.010381,-0.023649,-0.028249,0.023973,-0.001718,0.052288,0.004274,0.013774,0.014263,0.005031,...,-0.001582,-0.018570,0.003484,0.015439,0.025806,0.023196,0.023204,0.000000,0.107948,-0.004329
2021-06-02,-0.006849,0.003460,0.059593,-0.030100,0.003442,-0.015528,0.004255,0.008152,-0.004687,-0.001252,...,-0.023762,-0.046587,-0.002976,0.005848,0.000000,0.021411,0.018359,0.000000,0.134904,-0.008696
2021-06-03,0.000000,-0.013793,-0.004115,0.003448,0.006861,-0.022082,0.001412,0.016173,-0.009419,-0.006266,...,0.000000,0.009171,0.024876,0.032558,0.006289,-0.016030,-0.006363,0.000000,0.023585,0.013158
2021-06-04,0.017241,-0.026224,-0.011019,-0.020619,-0.008518,-0.009677,-0.002821,-0.029178,0.020602,0.007566,...,0.009736,0.030260,0.062621,-0.023649,0.014583,-0.015038,0.006403,0.000000,-0.027650,0.017316
2021-06-07,0.067797,-0.026930,-0.030641,0.045614,-0.001718,0.032573,0.004243,0.049180,-0.021739,0.001252,...,0.009642,-0.005880,0.004568,-0.014994,-0.026694,-0.021628,-0.011665,0.000000,-0.052133,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23,-0.012959,0.004535,-0.002494,-0.027826,0.002375,-0.031111,-0.008343,-0.005949,0.014706,0.000000,...,0.006061,0.002421,0.010830,-0.001776,-0.001502,0.023609,0.010753,-0.012126,-0.010428,-0.041729
2023-05-24,-0.002188,0.004515,-0.001500,-0.042934,0.004739,-0.038991,0.009615,0.034820,0.003953,0.000000,...,0.003012,0.002415,0.004464,0.017794,-0.007519,0.008237,0.006079,-0.005776,-0.016860,-0.017107
2023-05-25,-0.015351,-0.007865,-0.016024,0.051402,-0.002358,-0.006563,0.000000,-0.025762,-0.017060,0.002950,...,-0.012012,-0.016867,-0.020444,0.003497,-0.019697,0.008170,-0.004532,-0.015977,-0.015005,0.001582
2023-05-26,0.296214,-0.020385,-0.012214,-0.008889,-0.007092,-0.091892,-0.014286,-0.013492,-0.018692,-0.011765,...,-0.007599,0.012255,-0.001815,-0.012195,-0.029366,-0.027553,-0.018209,-0.006642,0.001088,0.000000


## 2. 기타 데이터

- volume (not adjusted)
- dollarvolume (volume * unadjusted price)
    - = 유동성
- marketcap
    - 총 시가총액으로 보임. (유동주식 기준 시가총액이 아님)
    - KRX에 별 설명없이 시가총액이라고 나와있고, 네이버 증권 정보와 일치. 
- market category (KOSPI, KOSDAQ, KONEX)

In [39]:
# Load the additional panels: traded volume, traded value (dollar volume),
# market capitalisation and the per-date market category table.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load pickle files that were generated by a trusted source.
volume_df = pd.read_pickle(DATA_PATH.joinpath('volume_df_20140101_20230705.pickle'))
dollarvolume_df = pd.read_pickle(DATA_PATH.joinpath('dollarvolume_df_20140101_20230705.pickle'))
marketcap_df = pd.read_pickle(DATA_PATH.joinpath('marketcap_df_20140101_20230705.pickle'))
market_cat_df = pd.read_pickle(DATA_PATH.joinpath('market_cat_df_20140101_20230705.pickle'))

In [42]:
# Traded value for the competition tickers on in-window trading days.
dollarvolume_df.loc[tradingdays, dacon_sid_list]

ISU_SRT_CD,060310,095570,006840,054620,265520,211270,027410,282330,126600,138930,...,243070,084110,145020,024060,010240,189980,000540,003280,037440,238490
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,4.874560e+08,3.714400e+08,3.580614e+09,6.605258e+09,3.831133e+09,9.820027e+09,2.438079e+09,7.234372e+09,4.040961e+09,1.034730e+10,...,4.495156e+09,2.187627e+10,5.427204e+09,6.368878e+09,4.365115e+09,2.910126e+09,917804860.0,0.0,7.656530e+10,1.228097e+09
2021-06-02,3.887956e+08,2.274799e+08,7.067376e+09,2.476580e+09,3.275446e+09,5.837519e+09,2.668186e+09,4.823126e+09,2.106360e+09,8.984141e+09,...,7.545659e+09,2.583707e+10,4.265713e+09,1.918429e+10,2.384016e+09,1.137642e+10,870020490.0,0.0,1.176564e+11,6.397364e+08
2021-06-03,4.196683e+08,4.458985e+08,4.265965e+09,2.944094e+09,5.204517e+09,3.303811e+09,2.254491e+09,7.518697e+09,2.359949e+09,1.009959e+10,...,3.690912e+09,1.643655e+10,1.118883e+10,4.261731e+10,4.173583e+09,2.686582e+09,833253210.0,0.0,4.588978e+10,8.967736e+08
2021-06-04,2.840374e+09,1.097634e+09,2.408190e+09,1.213409e+09,3.086342e+09,2.968876e+09,2.991015e+09,6.367442e+09,3.289530e+09,2.281765e+10,...,6.226065e+09,2.269498e+10,3.848989e+10,5.065369e+09,2.542921e+09,1.369134e+09,922612840.0,0.0,1.813047e+10,1.000769e+09
2021-06-07,2.929678e+09,9.137958e+08,3.674688e+09,2.889126e+09,3.327606e+09,4.891162e+09,1.749588e+09,8.445268e+09,3.516365e+09,6.622256e+09,...,4.944401e+09,1.446767e+10,1.198800e+10,4.484540e+09,2.058471e+09,1.524905e+09,721614030.0,0.0,1.379373e+10,6.889412e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23,2.672013e+08,3.696248e+08,1.353981e+08,5.859243e+09,1.591561e+09,1.402829e+10,3.260473e+08,4.054561e+09,1.257598e+09,3.583622e+09,...,6.046415e+08,1.439050e+08,3.821219e+09,2.507889e+08,1.992469e+08,5.508579e+09,49648765.0,162553600.0,2.609604e+09,2.662077e+08
2023-05-24,1.529127e+08,1.112857e+09,6.575499e+07,1.921732e+09,9.507347e+08,1.433276e+10,4.133310e+08,5.822567e+09,1.103507e+09,2.009493e+09,...,1.720486e+09,3.552556e+08,2.276703e+09,3.650234e+09,8.522629e+07,2.397411e+09,119485065.0,180396448.0,2.043797e+09,1.236356e+08
2023-05-25,2.074794e+08,3.692479e+08,2.813543e+08,5.168703e+09,1.681690e+09,8.682289e+09,3.153353e+08,8.510456e+09,1.495469e+09,5.139083e+09,...,1.047007e+09,3.445306e+08,5.380137e+09,1.531942e+09,1.374130e+08,1.719546e+09,121354685.0,138656493.0,1.829468e+09,4.086213e+07
2023-05-26,3.719003e+10,9.013574e+08,1.286980e+08,1.447042e+09,1.797020e+09,1.757343e+10,2.503709e+08,5.090645e+09,1.178733e+10,3.868385e+09,...,8.872376e+08,1.675054e+08,3.766218e+09,4.867517e+08,1.851595e+08,1.093983e+09,80660070.0,119972459.0,1.287421e+09,5.662154e+07


In [43]:
# Market capitalisation for the competition tickers on in-window trading days.
marketcap_df.loc[tradingdays, dacon_sid_list]

ISU_SRT_CD,060310,095570,006840,054620,265520,211270,027410,282330,126600,138930,...,243070,084110,145020,024060,010240,189980,000540,003280,037440,238490
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,1.351128e+11,2.706329e+11,4.557161e+11,3.048936e+11,4.206506e+11,2.428251e+11,6.748034e+11,3.180239e+12,1.351401e+11,2.604223e+12,...,6.852452e+11,8.264809e+11,2.517068e+12,1.282500e+11,1.175585e+11,1.523969e+11,2.974434e+11,3.011700e+10,1.300359e+11,1.300907e+11
2021-06-02,1.341874e+11,2.715693e+11,4.828736e+11,2.957162e+11,4.220986e+11,2.390545e+11,6.776749e+11,3.206165e+12,1.345066e+11,2.600963e+12,...,6.689557e+11,7.879839e+11,2.509576e+12,1.290000e+11,1.175585e+11,1.556598e+11,3.029041e+11,3.011700e+10,1.475782e+11,1.289595e+11
2021-06-03,1.341874e+11,2.678235e+11,4.808865e+11,2.967359e+11,4.249947e+11,2.337757e+11,6.786320e+11,3.258016e+12,1.332397e+11,2.584667e+12,...,6.689557e+11,7.952021e+11,2.572004e+12,1.332000e+11,1.182979e+11,1.531646e+11,3.009768e+11,3.011700e+10,1.510589e+11,1.306563e+11
2021-06-04,1.365010e+11,2.608002e+11,4.755874e+11,2.906176e+11,4.213746e+11,2.315134e+11,6.767177e+11,3.162955e+12,1.359847e+11,2.604223e+12,...,6.754715e+11,8.192627e+11,2.733066e+12,1.300500e+11,1.200231e+11,1.508614e+11,3.029041e+11,3.011700e+10,1.468821e+11,1.329188e+11
2021-06-07,1.457553e+11,2.537768e+11,4.610151e+11,3.038739e+11,4.206506e+11,2.390545e+11,6.795892e+11,3.318510e+12,1.330285e+11,2.607482e+12,...,6.819873e+11,8.144506e+11,2.745552e+12,1.281000e+11,1.168192e+11,1.475985e+11,2.993707e+11,3.011700e+10,1.392248e+11,1.329188e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-23,1.109062e+11,2.074228e+11,2.649512e+11,2.280074e+11,3.224380e+11,2.630354e+11,3.981819e+11,3.176782e+12,3.101958e+11,2.209841e+12,...,3.962602e+11,2.612649e+11,1.387171e+12,8.430000e+10,8.194593e+10,1.218183e+11,2.113583e+11,3.329885e+11,1.321243e+11,7.273768e+10
2023-05-24,1.106635e+11,2.083592e+11,2.645538e+11,2.182182e+11,3.239661e+11,2.527794e+11,4.020105e+11,3.287399e+12,3.114219e+11,2.209841e+12,...,3.974538e+11,2.618960e+11,1.393364e+12,8.580000e+10,8.132979e+10,1.228218e+11,2.126432e+11,3.310651e+11,1.298967e+11,7.149333e+10
2023-05-25,1.089648e+11,2.067204e+11,2.603146e+11,2.294350e+11,3.232021e+11,2.511204e+11,4.020105e+11,3.202708e+12,3.061089e+11,2.216360e+12,...,3.926796e+11,2.574784e+11,1.364877e+12,8.610000e+10,7.972784e+10,1.238252e+11,2.116795e+11,3.257757e+11,1.279475e+11,7.160645e+10
2023-05-26,1.412416e+11,2.025064e+11,2.571352e+11,2.273956e+11,3.209098e+11,2.280444e+11,3.962675e+11,3.159498e+12,3.003873e+11,2.190285e+12,...,3.896957e+11,2.606338e+11,1.362400e+12,8.505000e+10,7.738653e+10,1.204135e+11,2.078250e+11,3.236119e+11,1.280868e+11,7.160645e+10


In [46]:
# market_cat_df is a long table (one row per ticker per date); keep only the
# rows whose date is one of the in-window trading days.
market_cat_inrange = market_cat_df.loc[market_cat_df['trdDd'].isin(tradingdays)]
market_cat_inrange

Unnamed: 0,ISU_SRT_CD,MKT_NM,trdDd,is_KOSPI,is_KOSDAQ,is_KONEX
6123995,060310,KOSDAQ,2021-06-01,False,True,False
6123996,095570,KOSPI,2021-06-01,True,False,False
6123997,006840,KOSPI,2021-06-01,True,False,False
6123998,054620,KOSDAQ,2021-06-01,False,True,False
6123999,265520,KOSDAQ,2021-06-01,False,True,False
...,...,...,...,...,...,...
7952133,000547,KOSPI,2023-05-30,True,False,False
7952134,000545,KOSPI,2023-05-30,True,False,False
7952135,003280,KOSPI,2023-05-30,True,False,False
7952136,037440,KOSDAQ,2023-05-30,False,True,False


In [49]:
# For each market flag, collect the distinct ticker codes that were listed on
# that market on at least one trading day inside the window.
KOSPI_sid_list, KOSDAQ_sid_list, KONEX_sid_list = (
    market_cat_inrange.loc[market_cat_inrange[flag].eq(True), 'ISU_SRT_CD'].unique()
    for flag in ('is_KOSPI', 'is_KOSDAQ', 'is_KONEX')
)

이 기간 내에 코스닥 종목이 코스피로 옮겨가는 등의 케이스가 존재하는가? 

--> 존재한다. 

In [50]:
# Tickers that appear under both KOSPI and KOSDAQ within the window,
# i.e. they changed market during the period.
set(KOSPI_sid_list).intersection(KOSDAQ_sid_list)

{'097520', '100090', '108320', '178920'}

In [51]:
# Tickers that appear under both KOSPI and KONEX within the window.
set(KOSPI_sid_list).intersection(KONEX_sid_list)

set()

In [52]:
# Tickers that appear under both KOSDAQ and KONEX within the window.
set(KOSDAQ_sid_list).intersection(KONEX_sid_list)

{'058970',
 '067370',
 '084440',
 '148780',
 '179530',
 '199800',
 '200350',
 '203400',
 '211050',
 '222160',
 '232680',
 '260970',
 '270660',
 '344860',
 '393210'}

DACON에서 주어진 2000 종목은 어디 속하나? 

어차피 survivorship bias 있으니, 가장 최신 날짜 기준으로 확인하겠음. 

--> KOSPI, KOSDAQ, KONEX 다 있다. (아래 결과 기준 KOSPI 751, KOSDAQ 1199, KONEX 1 종목)

In [60]:
# Market-category rows for the last day of the window only.
# NOTE(review): the date is compared against a 'YYYYMMDD' string, which relies
# on trdDd being a datetime column -- TODO confirm.
market_cat_lastday = market_cat_inrange.loc[market_cat_inrange['trdDd'] == '20230530']

In [61]:
# Rebuild the per-market ticker lists from the last-day snapshot, so every
# ticker is assigned to the market it was listed on at that date.
KOSPI_sid_list = market_cat_lastday.loc[market_cat_lastday['is_KOSPI'].eq(True), 'ISU_SRT_CD'].unique()
KOSDAQ_sid_list = market_cat_lastday.loc[market_cat_lastday['is_KOSDAQ'].eq(True), 'ISU_SRT_CD'].unique()
KONEX_sid_list = market_cat_lastday.loc[market_cat_lastday['is_KONEX'].eq(True), 'ISU_SRT_CD'].unique()

In [68]:
# Number of competition tickers listed on KOSPI on the last day of the window.
len(set(dacon_sid_list).intersection(KOSPI_sid_list))

751

In [69]:
# Number of competition tickers listed on KOSDAQ on the last day of the window.
len(set(dacon_sid_list).intersection(KOSDAQ_sid_list))

1199

In [70]:
# Number of competition tickers listed on KONEX on the last day of the window.
len(set(dacon_sid_list).intersection(KONEX_sid_list))

1