In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import multiprocessing
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import tqdm
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [3]:
from data_io import load_data_by_ticker, get_call_data_for_given_ticker, display_all

In [4]:
calls = pd.read_pickle('data/CALLS_TRANSFORMED.pkl')
kospi_downloaded = pd.read_pickle('history/kospi_download_result.pkl')
kospi = pd.read_csv('metadata/kospi.csv', error_bad_lines=False)

b'Skipping line 437: expected 12 fields, saw 13\n'


In [5]:
tickers = kospi_downloaded['종목코드']

In [6]:
_, columns_for_df = get_call_data_for_given_ticker(calls, kospi, tickers[0])
columns_for_df = columns_for_df.columns.insert(0, 'Time lag').insert(0, 'Correlation with').insert(0, 'ticker').insert(0, '기업명')
df_corr = pd.DataFrame(columns=columns_for_df)
df_corr

Unnamed: 0,기업명,ticker,Correlation with,Time lag,High,Low,Open,Close,Volume,Adj Close,...,"중랑구, 20대, 남","중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여"


In [7]:
def get_correlation_by_ticker(calls, kospi, ticker, idx) : 

    company_name, df_totest = get_call_data_for_given_ticker(calls, kospi, ticker)
    
    
    time_lags = [0,1,2,5,10,25,50]
    
    # for time lag
    for idx_time_lag, time_lag in enumerate(time_lags) : 
        df = df_totest.copy()
        if time_lag != 0 :
            df[['High', 'Low', 'Open', 'Close', 'Volume']] = df[['High', 'Low', 'Open', 'Close', 'Volume']].shift(periods=time_lag*(-1))
        
        # for price and volume
        candidates = ['High', 'Low', 'Open', 'Close', 'Volume']
        for idx_candidate, candidate in enumerate(candidates) : 
            temp = df.corr()[candidate]
            temp = pd.concat([pd.Series([company_name, ticker, candidate, time_lag], index=['기업명', 'ticker', 'Correlation with', 'Time lag']), temp])
            df_corr.loc[idx*(len(time_lags)*len(candidates))+idx_time_lag*len(candidates)+idx_candidate] = temp

    pbar.update(1)
    return

#idx = 0
#get_correlation_by_ticker(calls, kospi, tickers[idx], idx)
#df_corr.head()

In [9]:
pool = ThreadPool(24)
with tqdm.tqdm(total=tickers.shape[0]) as pbar :
    for idx in range(tickers.shape[0]) : 
        pool.apply_async(get_correlation_by_ticker, [calls, kospi, tickers[idx], idx])
        
    pool.close()
    pool.join()
    
df_corr.to_pickle('metadata/df_corr.pkl')

 54%|█████▍    | 428/788 [41:11<34:38,  5.77s/it]  


In [28]:
kospi.head()

Unnamed: 0,번호,종목코드,기업명,업종코드,업종,상장주식수(주),자본금(원),액면가(원),통화구분,대표전화,주소,총카운트
0,1,95570,AJ네트웍스,147603.0,산업용 기계 및 장비 임대업,46822295,46822295000,1000,원(KRW),02-6363-9999,"서울특별시 송파구 정의로8길 9 (문정동,AJ빌딩)",789.0
1,2,68400,AJ렌터카,147601.0,운송장비 임대업,22146300,11073150000,500,원(KRW),1544-1600,서울특별시 구로구 서부샛길 822,789.0
2,3,6840,AK홀딩스,116409.0,기타 금융업,13247561,66237805000,5000,원(KRW),02-768-2923,서울특별시 마포구 양화로 188 -,789.0
3,4,27410,BGF,116409.0,기타 금융업,95716791,95716791000,1000,,1577-3663,서울특별시 강남구 테헤란로 405,789.0
4,5,282330,BGF리테일,74701.0,종합 소매업,17283906,17283906000,1000,원(KRW),02-1577-8007,서울특별시 강남구 테헤란로 405 BGF사옥,789.0


In [29]:
df_corr_original = pd.read_pickle('metadata/df_corr.pkl')
df_corr = df_corr_original.copy()
df_corr = df_corr.drop(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], axis=1)
df_corr = df_corr.sort_index()

def get_company_sector(ticker) : 
    df = kospi[kospi['종목코드']==ticker]
    return df['업종'].values[0]

df_corr['업종'] = df_corr['ticker'].apply(lambda x : get_company_sector(x))

print('df_corr.shape : ', df_corr.shape)
print("df_corr['기업명'].unique().shape : ", df_corr['기업명'].unique().shape)
display(df_corr.head())

df_corr.shape :  (10461, 306)
df_corr['기업명'].unique().shape :  (773,)


Unnamed: 0,기업명,ticker,Correlation with,Time lag,총건수,"강남구, 10대, 남","강남구, 10대, 여","강남구, 20대, 남","강남구, 20대, 여","강남구, 30대, 남",...,"중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여",업종
0,AJ네트웍스,95570,High,0,0.28672,0.059425,0.030947,0.432957,0.490182,0.318886,...,0.414426,0.353341,0.402528,0.13733,0.099658,0.021042,0.032105,-0.14041,-0.100652,산업용 기계 및 장비 임대업
1,AJ네트웍스,95570,Low,0,0.288794,0.063129,0.039238,0.430153,0.482587,0.318194,...,0.405722,0.351589,0.398827,0.141923,0.104124,0.026743,0.040405,-0.136346,-0.097204,산업용 기계 및 장비 임대업
3,AJ네트웍스,95570,Close,0,0.351568,0.1325,0.107238,0.537662,0.579584,0.364488,...,0.499572,0.424915,0.470909,0.152728,0.130431,0.040273,0.031837,-0.11393,-0.067042,산업용 기계 및 장비 임대업
4,AJ네트웍스,95570,Volume,0,0.01466,0.05894,0.056624,-0.048749,-0.045125,-0.01424,...,0.006909,0.03749,0.01244,0.03791,0.042405,-0.019231,0.049486,0.0237,0.060874,산업용 기계 및 장비 임대업
5,AJ네트웍스,95570,High,1,0.285336,0.051969,0.0309,0.4272,0.483351,0.31564,...,0.414774,0.35401,0.403931,0.136317,0.09465,0.029805,0.027707,-0.142246,-0.096205,산업용 기계 및 장비 임대업


In [None]:
df_corr.groupby(['업종', 'Time lag', 'Correlation with'])['총건수'].mean().sort_values().to_csv('metadata/correlation_sector_time_lag.csv')

In [35]:
display_all(df_corr.groupby(['업종', 'Time lag', 'Correlation with'])['총건수'].mean().sort_values())

업종                                     Time lag  Correlation with
소프트웨어 개발 및 공급업                         0         Close              -0.536362
기반조성 및 시설물 축조관련 전문공사업                  10        Low                -0.475132
선박 및 보트 건조업                            0         Open               -0.475026
기반조성 및 시설물 축조관련 전문공사업                  2         Low                -0.474873
                                       10        Open               -0.473630
선박 및 보트 건조업                            5         Low                -0.473235
기반조성 및 시설물 축조관련 전문공사업                  5         High               -0.473109
                                       2         Close              -0.473060
                                       0         Open               -0.472965
                                       1         Low                -0.472737
                                       10        Close              -0.472510
                                                 High               -0.47163