# KRX에서 산업 분류 현황을 가져온다. 

https://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020203#

[12025] 업종분류 현황

다행히도 산업 분류는 조회일자를 지정할 수 있다. 따라서 중간에 상장폐지된 종목의 industry도 구할 수 있으므로 survivorship bias에서 자유로울 것으로 보인다.

In [2]:
from pathlib import Path
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

import pickle

In [3]:
import FinanceDataReader as fdr
import quantstats as qs

In [4]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [5]:
## custom libs

from korquanttools.pricevolume.loader import KRXPriceDM
from korquanttools.pricevolume.utils import DateUtil
from korquanttools.pricevolume.config import PathConfig

## Initial setting and import basis data

In [6]:
# Global parameters

# Sample period as YYYYMMDD integers; the digit groups read as year_month_day.
START = 2014_01_01
END = 2022_12_02

# Window length, counted in trading days.
WINDOW = 60

In [32]:
## stock return
# Load the cached return panel for START..END. Later cells show it is indexed by
# date (index name `trdDd`) with one column per stock short code (`ISU_SRT_CD`).
# NOTE: read_pickle can execute arbitrary code, so only load caches produced locally.
return_df = pd.read_pickle(PathConfig.cache_path / f"temp_return_{START}_{END}")

In [33]:
# A date on which every stock's return is missing is treated as a holiday;
# every other date is a trading day.
is_holiday = return_df.isna().all(axis=1)

holidays = is_holiday.index[is_holiday]
tradingdays = is_holiday.index[~is_holiday]

tradingdays

DatetimeIndex(['2014-01-02', '2014-01-03', '2014-01-06', '2014-01-07',
               '2014-01-08', '2014-01-09', '2014-01-10', '2014-01-13',
               '2014-01-14', '2014-01-15',
               ...
               '2022-11-21', '2022-11-22', '2022-11-23', '2022-11-24',
               '2022-11-25', '2022-11-28', '2022-11-29', '2022-11-30',
               '2022-12-01', '2022-12-02'],
              dtype='datetime64[ns]', name='trdDd', length=2190, freq=None)

In [34]:
# Keep only the trading-day rows (drops the all-NaN holiday rows); .copy() makes
# the result an independent frame rather than a selection of the loaded one.
return_df = return_df.loc[tradingdays, :].copy()

## Get sample data (crawling setup)

In [10]:
# JSON endpoint of the KRX data portal that every request in this notebook posts to.
# NOTE(review): the "bld" form field presumably selects which report is returned — confirm.
request_url="http://data.krx.co.kr/comm/bldAttendant/getJsonData.cmd"

In [11]:
# Browser-like headers sent with every POST to the KRX industry-classification
# endpoint. The Referer points at the [12025] page (menuId=MDC0201020203).
# No Cookie header is sent.
request_headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9,ko-KR;q=0.8,ko;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    # Content-Length is deliberately not hard-coded: requests computes it from
    # the encoded form body, so a fixed value could only ever be stale.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "data.krx.co.kr",
    "Origin": "http://data.krx.co.kr",
    "Referer": "https://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020203",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

In [12]:
# Keyword arguments for urllib3's Retry, kept as a plain dict until the session
# is built.
retry_strategy = {
    # Give up after this many retries in total.
    "total": 10,
    # HTTP status codes that should trigger a retry.
    "status_forcelist": [413, 429, 500, 502, 503, 504],
    # POST is listed explicitly because the crawl itself uses POST.
    "allowed_methods": ["GET", "POST"],
    # Multiplier for the growing sleep between attempts.
    "backoff_factor": 2,
}

In [13]:
def assert_status_hook(response, *args, **kwargs):
    """Response hook: raise requests.HTTPError for any 4xx/5xx reply.

    Returns None, which tells requests to keep the original response object.
    """
    response.raise_for_status()


# requests.session() is only a legacy alias; construct the Session directly.
session = requests.Session()
session.hooks["response"] = [assert_status_hook]

# `retry_strategy` starts out as a dict of keyword arguments and is rebound to
# the Retry object here. The guard makes this cell safe to re-run: without it a
# second execution would attempt Retry(**<Retry instance>) and raise TypeError.
if not isinstance(retry_strategy, Retry):
    retry_strategy = Retry(**retry_strategy)

# Use the same retrying adapter for both URL schemes.
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)

KOSPI, KOSDAQ 따로 해줘야 한다. ALL은 되지 않는다. 

KRX 에서도 ALL이랑 KOSDAQ GLOBAL은 greyed out 되어있음. 

In [14]:
# Form fields for one industry-classification query; this sample targets KOSDAQ.
POST_data = {
    "bld": "dbms/MDC/STAT/standard/MDCSTAT03901",
    # Market selector. KOSPI: "STK", KOSDAQ: "KSQ", KONEX: "KNX"
    "mktId": "KSQ",
    # Query date, filled in before sending; format like: "20211029"
    "trdDd": None,
    # Currency-unit option (KRW, thousand KRW, million KRW, billion KRW)
    "money": 1,
    "csvxls_isNo": "false",
}

In [15]:
# Probe with a single date. 2014-01-01 is not among `tradingdays` (which start
# on 2014-01-02), yet the sample output below still lists an industry for
# every stock; only the price columns come back as "-".
POST_data['trdDd'] = '20140101'

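데이터를 가져오려면 주식의 short code가 아닌 full code가 필요함. (참고: 아래의 업종분류 조회 요청은 `mktId`와 `trdDd`만 보내므로 종목 코드를 넣지 않으며, 응답의 종목 식별자는 short code인 `ISU_SRT_CD`이다.)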

In [16]:
# Send the sample payload once to inspect what the endpoint returns.
response = session.post(request_url, headers=request_headers, data=POST_data)

In [17]:
# The records are read from the "block1" key of the JSON payload; each record
# becomes one row of the frame.
df = pd.DataFrame(response.json()['block1'])
df

Unnamed: 0,ISU_SRT_CD,ISU_ABBRV,MKT_TP_NM,IDX_IND_NM,TDD_CLSPRC,CMPPREVDD_PRC,FLUC_RT,MKTCAP,FLUC_TP_CD
0,060310,3S,KOSDAQ,반도체,-,-,-,-,-
1,013340,AJS,KOSDAQ,금속,-,-,-,-,-
2,054620,AP시스템,KOSDAQ,반도체,-,-,-,-,-
3,090470,AST젯텍,KOSDAQ,반도체,-,-,-,-,-
4,032040,C&S자산관리,KOSDAQ,기타서비스,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...
1009,065510,휴비츠,KOSDAQ,의료·정밀기기,-,-,-,-,-
1010,084110,휴온스,KOSDAQ,제약,-,-,-,-,-
1011,024060,흥구석유,KOSDAQ,유통,-,-,-,-,-
1012,010240,흥국,KOSDAQ,기계·장비,-,-,-,-,-


## Map industry name to code

### Make industry-name-to-code mapper

In [18]:
# Registry mapping each industry name to its integer code. It starts empty and
# is filled lazily by update_industry_mapper as new names are encountered.
INDUSTRY_NAME2CODE = {}

In [19]:
def update_industry_mapper(industry_name):
    """Return the integer code for ``industry_name``, registering it if unseen.

    A name already present in the module-level ``INDUSTRY_NAME2CODE`` dict
    returns its stored code unchanged. A new name is assigned the smallest
    non-negative integer that is not currently used as a code; the pair is
    stored in ``INDUSTRY_NAME2CODE`` and the new code is returned.
    """
    try:
        return INDUSTRY_NAME2CODE[industry_name]
    except KeyError:
        pass

    in_use = INDUSTRY_NAME2CODE.values()
    # With n registered names at most n codes are taken, so one of the n + 1
    # candidates 0..n is guaranteed to be free; take the first such candidate.
    new_code = next(
        candidate
        for candidate in range(len(INDUSTRY_NAME2CODE) + 1)
        if candidate not in in_use
    )
    INDUSTRY_NAME2CODE[industry_name] = new_code
    return new_code

### Make full dataset

In [51]:
# Copy return_df so the result has the same index and columns as the return panel.
industry_df = return_df.copy()

# Blank out every cell; the crawl loops below fill in the industry codes.
industry_df[:] = np.nan

#### KOSPI

In [52]:
# Form fields for the KOSPI pass of the crawl.
POST_data = {
    "bld": "dbms/MDC/STAT/standard/MDCSTAT03901",
    # Market selector. KOSPI: "STK", KOSDAQ: "KSQ", KONEX: "KNX"
    "mktId": "STK",
    # Query date, overwritten on every loop iteration; format like: "20211029"
    "trdDd": None,
    # Currency-unit option (KRW, thousand KRW, million KRW, billion KRW)
    "money": 1,
    "csvxls_isNo": "false",
}

In [53]:
# One request per trading day: fetch that day's classification and write the
# integer industry codes into the matching row of industry_df.
for di in tqdm(tradingdays):
    # The endpoint takes the query date as a "YYYYMMDD"-style string.
    intdate = DateUtil.timestamp_2_intDate(di)
    POST_data['trdDd'] = str(intdate)

    response = session.post(request_url, headers=request_headers, data=POST_data)
    df = pd.DataFrame(response.json()['block1'])

    sid_list = df['ISU_SRT_CD'].values
    ind_list = df['IDX_IND_NM'].values

    # Translate names to codes, registering any industry name seen for the first time.
    ind_code_list = list(map(update_industry_mapper, ind_list))

    # NOTE(review): sid_list may contain codes that are not yet columns of
    # industry_df; how .loc assignment treats those depends on the pandas
    # version — confirm.
    industry_df.loc[di, sid_list] = ind_code_list

100%|██████████| 2190/2190 [05:36<00:00,  6.51it/s]


#### KOSDAQ

In [54]:
# Form fields for the KOSDAQ pass of the crawl.
POST_data = {
    "bld": "dbms/MDC/STAT/standard/MDCSTAT03901",
    # Market selector. KOSPI: "STK", KOSDAQ: "KSQ", KONEX: "KNX"
    "mktId": "KSQ",
    # Query date, overwritten on every loop iteration; format like: "20211029"
    "trdDd": None,
    # Currency-unit option (KRW, thousand KRW, million KRW, billion KRW)
    "money": 1,
    "csvxls_isNo": "false",
}

In [55]:
# Same per-day crawl as the KOSPI pass, run with the KOSDAQ payload; results
# are written into the same industry_df and share the same name -> code registry.
for di in tqdm(tradingdays):
    # The endpoint takes the query date as a "YYYYMMDD"-style string.
    intdate = DateUtil.timestamp_2_intDate(di)
    POST_data['trdDd'] = str(intdate)

    response = session.post(request_url, headers=request_headers, data=POST_data)
    df = pd.DataFrame(response.json()['block1'])

    sid_list = df['ISU_SRT_CD'].values
    ind_list = df['IDX_IND_NM'].values

    # Translate names to codes, registering any industry name seen for the first time.
    ind_code_list = list(map(update_industry_mapper, ind_list))

    # NOTE(review): sid_list may contain codes that are not yet columns of
    # industry_df; how .loc assignment treats those depends on the pandas
    # version — confirm.
    industry_df.loc[di, sid_list] = ind_code_list

100%|██████████| 2190/2190 [06:43<00:00,  5.43it/s]


In [56]:
industry_df

ISU_SRT_CD,000020,000040,000050,000060,000070,000075,000080,000087,000100,000105,...,417500,419080,425420,438220,438580,440200,440790,446070,439410,442130
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-02,11.0,17.0,2.0,16.0,0.0,0.0,4.0,4.0,11.0,11.0,...,,,,,,,,,,
2014-01-03,11.0,17.0,2.0,16.0,0.0,0.0,4.0,4.0,11.0,11.0,...,,,,,,,,,,
2014-01-06,11.0,17.0,2.0,16.0,0.0,0.0,4.0,4.0,11.0,11.0,...,,,,,,,,,,
2014-01-07,11.0,17.0,2.0,16.0,0.0,0.0,4.0,4.0,11.0,11.0,...,,,,,,,,,,
2014-01-08,11.0,17.0,2.0,16.0,0.0,0.0,4.0,4.0,11.0,11.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-28,11.0,17.0,7.0,16.0,1.0,1.0,4.0,4.0,11.0,11.0,...,24.0,32.0,24.0,41.0,41.0,41.0,41.0,13.0,,
2022-11-29,11.0,17.0,7.0,16.0,1.0,1.0,4.0,4.0,11.0,11.0,...,24.0,32.0,24.0,41.0,41.0,41.0,41.0,13.0,,
2022-11-30,11.0,17.0,7.0,16.0,1.0,1.0,4.0,4.0,11.0,11.0,...,24.0,32.0,24.0,41.0,41.0,41.0,41.0,13.0,,
2022-12-01,11.0,17.0,7.0,16.0,1.0,1.0,4.0,4.0,11.0,11.0,...,24.0,32.0,24.0,41.0,41.0,41.0,41.0,13.0,41.0,41.0


In [58]:
# Persist the date x stock panel of industry codes. The path is relative, so the
# file is written to the current working directory rather than PathConfig.cache_path.
industry_df.to_pickle(f'krx_industry_df_{START}_{END}.pickle')

In [57]:
INDUSTRY_NAME2CODE

{'서비스업': 0,
 '기타금융': 1,
 '섬유의복': 2,
 '운수창고업': 3,
 '음식료품': 4,
 '화학': 5,
 '철강금속': 6,
 '유통업': 7,
 '건설업': 8,
 '증권': 9,
 '전기전자': 10,
 '의약품': 11,
 '기계': 12,
 '종이목재': 13,
 '통신업': 14,
 '기타제조업': 15,
 '보험': 16,
 '운수장비': 17,
 '전기가스업': 18,
 '비금속광물': 19,
 '은행': 20,
 '광업': 21,
 '농업, 임업 및 어업': 22,
 '의료정밀': 23,
 '반도체': 24,
 '금속': 25,
 '기타서비스': 26,
 '방송서비스': 27,
 '유통': 28,
 '제약': 29,
 '통신장비': 30,
 'IT부품': 31,
 '기계·장비': 32,
 '오락·문화': 33,
 '운송장비·부품': 34,
 '건설': 35,
 '전기·가스·수도': 36,
 '소프트웨어': 37,
 '인터넷': 38,
 '디지털컨텐츠': 39,
 '출판·매체복제': 40,
 '금융': 41,
 '운송': 42,
 '통신서비스': 43,
 '일반전기전자': 44,
 '섬유·의류': 45,
 '컴퓨터서비스': 46,
 '음식료·담배': 47,
 '비금속': 48,
 '종이·목재': 49,
 '의료·정밀기기': 50,
 '기타제조': 51,
 '정보기기': 52,
 '숙박·음식': 53}

In [59]:
# Save the name -> code registry next to the panel; the integer codes stored in
# industry_df can only be mapped back to industry names through it.
with Path('INDUSTRY_NAME2CODE.pickle').open('wb') as fout:
    pickle.dump(INDUSTRY_NAME2CODE, fout)