In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from multiprocessing.pool import ThreadPool

from data_io import download_stock_data, load_stock_data
from data_io import download_stock_data_from_naver

In [2]:
kospi = pd.read_csv('metadata/kospi.csv', error_bad_lines=False)
kospi[0:2]

b'Skipping line 437: expected 12 fields, saw 13\n'


Unnamed: 0,번호,종목코드,기업명,업종코드,업종,상장주식수(주),자본금(원),액면가(원),통화구분,대표전화,주소,총카운트
0,1,95570,AJ네트웍스,147603.0,산업용 기계 및 장비 임대업,46822295,46822295000,1000,원(KRW),02-6363-9999,"서울특별시 송파구 정의로8길 9 (문정동,AJ빌딩)",789.0
1,2,68400,AJ렌터카,147601.0,운송장비 임대업,22146300,11073150000,500,원(KRW),1544-1600,서울특별시 구로구 서부샛길 822,789.0


In [3]:
stocklist = pd.read_pickle('metadata/stocklist.pkl')
stocklist['종목코드'] = stocklist['종목코드'].apply(lambda x: str(int(x)).zfill(6))
stocklist.to_pickle('metadata/stocklist.pkl')
stocklist[0:2]

Unnamed: 0,회사명,종목코드,업종,주요제품,상장일,결산월,대표자명,홈페이지,지역
0,DSR,155660,1차 비철금속 제조업,합섬섬유로프,2013-05-15,12월,홍석빈,http://www.dsr.com,부산광역시
1,GS글로벌,1250,상품 종합 도매업,"수출입업(시멘트,철강금속,전기전자,섬유,기계화학),상품중개,광업,채석업/하수처리 서...",1976-06-26,12월,김태형,http://www.gsgcorp.com,서울특별시


In [4]:
df_kospi_metadata = pd.DataFrame(columns=['기업명', '종목코드', '상장일'])
df_kospi_metadata['기업명'] = kospi['기업명']
df_kospi_metadata['종목코드'] = kospi['종목코드']

In [5]:
def get_ipo_date(ticker) : 
    data = stocklist[stocklist['종목코드'] == ticker]['상장일']
    try : 
        if data.shape[0] != 0 :
            return data.values[0]
        else : 
            return 0
    except Exception as e :
        print(e)
        return 0

df_kospi_metadata['상장일'] = df_kospi_metadata['종목코드'].apply(lambda ticker : get_ipo_date(ticker))

In [6]:
wrong_idx = df_kospi_metadata[df_kospi_metadata['상장일']==0].index
df_kospi_metadata = df_kospi_metadata.drop(index=wrong_idx)

In [7]:
df_kospi_metadata['상장일'] = pd.to_datetime(df_kospi_metadata['상장일'])

# Process with single core

In [8]:
def download_stock_data(df_metadata, folder_name) : 
    df_result = pd.DataFrame(columns=['종목코드', '상장일', '결과'])
    outer = tqdm(total=df_metadata.shape[0], desc='Epoch', position=0)
    
    args = []
    for i in range(df_metadata.shape[0]) :
        args.append((i, i+1))
    
    for idx in range(0, df_metadata.shape[0]) :
        try : 
            download_stock_data('data/%s/%s.data'%(folder_name, df_metadata['종목코드'].iloc[idx]),
                                df_metadata['종목코드'].iloc[idx],
                                int(df_metadata['상장일'].dt.year.iloc[idx]),
                                int(df_metadata['상장일'].dt.month.iloc[idx]),
                                int(df_metadata['상장일'].dt.day.iloc[idx]),
                                pd.Timestamp.today().year,
                                pd.Timestamp.today().month,
                                pd.Timestamp.today().day)
            df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                                  df_metadata['상장일'].iloc[idx],
                                  'Yahoo']
        except Exception as e:
            try : 
                download_stock_data_from_naver('data/%s/%s.data'%(folder_name, df_metadata['종목코드'].iloc[idx]),
                                               df_metadata['종목코드'].iloc[idx],
                                               'day',
                                               '100000')
                df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                                  df_metadata['상장일'].iloc[idx],
                                  'Naver']
            except Exception as e : 
                df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                                  df_metadata['상장일'].iloc[idx],
                                  e]  
        outer.update(1)
    return df_result
    
result = download_stock_data(df_kospi_metadata, 'kospi')
result.to_pickle('history/kospi_download_result.pkl')

Epoch: 100%|██████████| 788/788 [21:19<00:00,  1.69s/it]

In [12]:
result

Unnamed: 0,종목코드,상장일,결과
0,095570,2015-08-21,Naver
1,068400,2012-07-27,Naver
2,006840,1999-08-11,Naver
3,027410,2014-05-19,Naver
4,282330,2017-12-08,Naver
...,...,...,...
783,079980,2012-02-23,Naver
784,005010,1973-06-29,Naver
785,069260,2002-10-07,Naver
786,000540,1974-12-05,Naver


# Process with multi core
- not tested

In [None]:
def inner_function(df_metadata, folder_name, idx) :
    try : 
        download_stock_data('data/%s/%s.data'%(folder_name, df_metadata['종목코드'].iloc[idx]),
                            df_metadata['종목코드'].iloc[idx],
                            int(df_metadata['상장일'].dt.year.iloc[idx]),
                            int(df_metadata['상장일'].dt.month.iloc[idx]),
                            int(df_metadata['상장일'].dt.day.iloc[idx]),
                            pd.Timestamp.today().year,
                            pd.Timestamp.today().month,
                            pd.Timestamp.today().day)
        df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                              df_metadata['상장일'].iloc[idx],
                              'Yahoo']
        pbar.update()
    except Exception as e:
        try : 
            download_stock_data_from_naver('data/%s/%s.data'%(folder_name, df_metadata['종목코드'].iloc[idx]),
                                           df_metadata['종목코드'].iloc[idx],
                                           'day',
                                           '100000')
            df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                              df_metadata['상장일'].iloc[idx],
                              'Naver']
            pbar.update()
        except Exception as e : 
            df_result.loc[idx] = [df_metadata['종목코드'].iloc[idx],
                              df_metadata['상장일'].iloc[idx],
                              e]
            pbar.update() 
    
    return

def download_stock_data_renewed(df_metadata, folder_name) : 
    df_result = pd.DataFrame(columns=['종목코드', '상장일', '결과'])
    
    args = []
    for j in range(df_metadata.shape[0]) :
        args.append((df_metadata, folder_name, j))
    
    pool = ThreadPool(24)   
    with tqdm(total=len(args)) as pbar:
        for i in range(len(args)):
            pool.apply_async(inner_function, args[i])
        pool.close()
        pool.join()

    return df_result
    
result = download_stock_data_renewed(df_kospi_metadata, 'kospi')
result.to_pickle('kospi_download_result.pkl')