In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2 so imported modules (e.g. data_io)
# are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
import datetime
import multiprocessing
import os
import threading
from multiprocessing.pool import ThreadPool

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from ipywidgets import interact, interact_manual

In [4]:
from data_io import load_data_by_ticker, get_call_data_for_given_ticker, display_all, calls_amount

# Load Data

In [5]:
# Load the three inputs used by the rest of the notebook: the call data (`calls`),
# the price-download result table (`kospi_downloaded`) and the KOSPI listing (`kospi`).
# NOTE(review): unpickling executes arbitrary code — only load pickle files you trust.
calls = pd.read_pickle('data/CALLS_TRANSFORMED.pkl')
kospi_downloaded = pd.read_pickle('history/kospi_download_result.pkl')
# error_bad_lines=False skips malformed CSV rows instead of raising; the recorded
# output shows line 437 being skipped for having 13 fields instead of 12.
# NOTE(review): this keyword is deprecated in newer pandas in favour of
# on_bad_lines='skip' — confirm the pandas version in use before changing it.
kospi = pd.read_csv('metadata/kospi.csv', error_bad_lines=False)

b'Skipping line 437: expected 12 fields, saw 13\n'


In [5]:
# Ticker codes ('종목코드' column) taken from the download result table; the cells
# below index this Series with integer positions 0..len-1 (assumes a default
# integer index — TODO confirm).
tickers = kospi_downloaded['종목코드']

In [6]:
# Use the first ticker's frame as a template for the result columns, then put the
# four identifying columns in front of them.
_, columns_for_df = get_call_data_for_given_ticker(calls, kospi, tickers[0])
columns_for_df = columns_for_df.columns
# Inserting at position 0 in reverse order leaves '기업명' first and 'Time lag' fourth.
for leading_label in ('Time lag', 'Correlation with', 'ticker', '기업명'):
    columns_for_df = columns_for_df.insert(0, leading_label)

# Empty accumulator that get_correlation_by_ticker fills row by row.
df_corr = pd.DataFrame(columns=columns_for_df)
df_corr

Unnamed: 0,기업명,ticker,Correlation with,Time lag,High,Low,Open,Close,Volume,Adj Close,...,"중랑구, 20대, 남","중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여"


# Get Correlation

In [7]:
# Serialises writes to the shared `df_corr` frame (and the progress bar) when
# get_correlation_by_ticker runs on several ThreadPool workers at once.
# Enlarging a DataFrame with `.loc` from multiple threads is not safe and can
# silently drop rows.
_DF_CORR_LOCK = threading.Lock()


def get_correlation_by_ticker(calls, kospi, ticker, idx) :
    """Compute lagged correlations for one ticker and store them in `df_corr`.

    For every time lag in [0, 1, 2, 5, 10, 25, 50] the High/Low/Open/Close/Volume
    columns are shifted backwards by that many rows, the full correlation
    matrix is computed once, and one result row per price/volume column is
    written to the module-level `df_corr` frame.

    Parameters
    ----------
    calls, kospi :
        Passed straight through to `get_call_data_for_given_ticker`.
    ticker :
        Ticker code whose data should be correlated.
    idx : int
        Position of the ticker in the overall run; determines the row labels
        used in `df_corr` (35 consecutive labels per ticker).

    Returns
    -------
    None
        Results are written to the global `df_corr`; the global progress bar
        `pbar` is advanced by one.
    """
    company_name, df_totest = get_call_data_for_given_ticker(calls, kospi, ticker)

    time_lags = [0,1,2,5,10,25,50]
    # NOTE(review): 'Adj Close' is not shifted with the other price columns —
    # confirm this is intended.
    candidates = ['High', 'Low', 'Open', 'Close', 'Volume']
    rows_per_ticker = len(time_lags) * len(candidates)

    # Build all rows locally first so the shared frame is only touched once,
    # inside the lock.
    rows = []
    for idx_time_lag, time_lag in enumerate(time_lags) :
        df = df_totest.copy()
        if time_lag != 0 :
            df[candidates] = df[candidates].shift(periods=-time_lag)

        # The correlation matrix is the same for every candidate at this lag,
        # so compute it once instead of once per candidate.
        corr_matrix = df.corr()

        for idx_candidate, candidate in enumerate(candidates) :
            header = pd.Series([company_name, ticker, candidate, time_lag],
                               index=['기업명', 'ticker', 'Correlation with', 'Time lag'])
            row_label = idx*rows_per_ticker + idx_time_lag*len(candidates) + idx_candidate
            rows.append((row_label, pd.concat([header, corr_matrix[candidate]])))

    with _DF_CORR_LOCK :
        for row_label, row in rows :
            df_corr.loc[row_label] = row
        pbar.update(1)
    return

#idx = 0
#get_correlation_by_ticker(calls, kospi, tickers[idx], idx)
#df_corr.head()

In [9]:
# Fan the per-ticker correlation jobs out over a thread pool.
# apply_async swallows worker exceptions unless the AsyncResult is inspected,
# so keep every handle and report the failures instead of losing them silently.
pool = ThreadPool(24)
failed_tickers = []
with tqdm.tqdm(total=tickers.shape[0]) as pbar :
    pending = []
    try :
        for idx in range(tickers.shape[0]) :
            async_result = pool.apply_async(get_correlation_by_ticker, [calls, kospi, tickers[idx], idx])
            pending.append((tickers[idx], async_result))
    finally :
        # Always stop accepting work and wait for the submitted jobs, even if
        # submission itself failed part-way through.
        pool.close()
        pool.join()

    for ticker, async_result in pending :
        try :
            async_result.get()
        except Exception as exc :
            failed_tickers.append((ticker, exc))

if failed_tickers :
    print('%d of %d tickers failed:' % (len(failed_tickers), tickers.shape[0]))
    for ticker, exc in failed_tickers :
        print('  %s: %r' % (ticker, exc))

# Persist whatever was computed so later cells can reload it without re-running.
df_corr.to_pickle('metadata/df_corr.pkl')

 54%|█████▍    | 428/788 [41:11<34:38,  5.77s/it]  


In [6]:
# Preview the first rows of the KOSPI listing metadata.
kospi.head()

Unnamed: 0,번호,종목코드,기업명,업종코드,업종,상장주식수(주),자본금(원),액면가(원),통화구분,대표전화,주소,총카운트
0,1,95570,AJ네트웍스,147603.0,산업용 기계 및 장비 임대업,46822295,46822295000,1000,원(KRW),02-6363-9999,"서울특별시 송파구 정의로8길 9 (문정동,AJ빌딩)",789.0
1,2,68400,AJ렌터카,147601.0,운송장비 임대업,22146300,11073150000,500,원(KRW),1544-1600,서울특별시 구로구 서부샛길 822,789.0
2,3,6840,AK홀딩스,116409.0,기타 금융업,13247561,66237805000,5000,원(KRW),02-768-2923,서울특별시 마포구 양화로 188 -,789.0
3,4,27410,BGF,116409.0,기타 금융업,95716791,95716791000,1000,,1577-3663,서울특별시 강남구 테헤란로 405,789.0
4,5,282330,BGF리테일,74701.0,종합 소매업,17283906,17283906000,1000,원(KRW),02-1577-8007,서울특별시 강남구 테헤란로 405 BGF사옥,789.0


# Correlation Matrix

In [7]:
# Reload the saved correlation table and keep an untouched copy of it.
df_corr_original = pd.read_pickle('metadata/df_corr.pkl')

# Work on a copy without the price/volume self-correlation columns, ordered by
# row label.
df_corr = (
    df_corr_original
    .copy()
    .drop(columns=['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'])
    .sort_index()
)

def get_company_sector(ticker) :
    """Return the sector ('업종') of `ticker` according to the global `kospi` frame.

    The first row of `kospi` whose '종목코드' equals `ticker` is used.  When no
    row matches (for example because the listing row was skipped while reading
    the CSV), NaN is returned instead of raising an opaque IndexError.
    """
    df = kospi[kospi['종목코드']==ticker]
    if df.empty :
        return np.nan
    return df['업종'].values[0]

# Attach each row's sector by looking its ticker up in the KOSPI listing.
df_corr['업종'] = df_corr['ticker'].apply(get_company_sector)

# Quick sanity summary: overall shape and number of distinct companies.
company_names = df_corr['기업명'].unique()
print('df_corr.shape : ', df_corr.shape)
print("df_corr['기업명'].unique().shape : ", company_names.shape)
display(df_corr.head())

df_corr.shape :  (10461, 306)
df_corr['기업명'].unique().shape :  (773,)


Unnamed: 0,기업명,ticker,Correlation with,Time lag,총건수,"강남구, 10대, 남","강남구, 10대, 여","강남구, 20대, 남","강남구, 20대, 여","강남구, 30대, 남",...,"중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여",업종
0,AJ네트웍스,95570,High,0,0.28672,0.059425,0.030947,0.432957,0.490182,0.318886,...,0.414426,0.353341,0.402528,0.13733,0.099658,0.021042,0.032105,-0.14041,-0.100652,산업용 기계 및 장비 임대업
1,AJ네트웍스,95570,Low,0,0.288794,0.063129,0.039238,0.430153,0.482587,0.318194,...,0.405722,0.351589,0.398827,0.141923,0.104124,0.026743,0.040405,-0.136346,-0.097204,산업용 기계 및 장비 임대업
3,AJ네트웍스,95570,Close,0,0.351568,0.1325,0.107238,0.537662,0.579584,0.364488,...,0.499572,0.424915,0.470909,0.152728,0.130431,0.040273,0.031837,-0.11393,-0.067042,산업용 기계 및 장비 임대업
4,AJ네트웍스,95570,Volume,0,0.01466,0.05894,0.056624,-0.048749,-0.045125,-0.01424,...,0.006909,0.03749,0.01244,0.03791,0.042405,-0.019231,0.049486,0.0237,0.060874,산업용 기계 및 장비 임대업
5,AJ네트웍스,95570,High,1,0.285336,0.051969,0.0309,0.4272,0.483351,0.31564,...,0.414774,0.35401,0.403931,0.136317,0.09465,0.029805,0.027707,-0.142246,-0.096205,산업용 기계 및 장비 임대업


# Mean correlation per sector

In [21]:
# Preview the correlation table (one row per company / target column / time lag).
df_corr.head()

Unnamed: 0,기업명,ticker,Correlation with,Time lag,총건수,"강남구, 10대, 남","강남구, 10대, 여","강남구, 20대, 남","강남구, 20대, 여","강남구, 30대, 남",...,"중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여",업종
0,AJ네트웍스,95570,High,0,0.28672,0.059425,0.030947,0.432957,0.490182,0.318886,...,0.414426,0.353341,0.402528,0.13733,0.099658,0.021042,0.032105,-0.14041,-0.100652,산업용 기계 및 장비 임대업
1,AJ네트웍스,95570,Low,0,0.288794,0.063129,0.039238,0.430153,0.482587,0.318194,...,0.405722,0.351589,0.398827,0.141923,0.104124,0.026743,0.040405,-0.136346,-0.097204,산업용 기계 및 장비 임대업
3,AJ네트웍스,95570,Close,0,0.351568,0.1325,0.107238,0.537662,0.579584,0.364488,...,0.499572,0.424915,0.470909,0.152728,0.130431,0.040273,0.031837,-0.11393,-0.067042,산업용 기계 및 장비 임대업
4,AJ네트웍스,95570,Volume,0,0.01466,0.05894,0.056624,-0.048749,-0.045125,-0.01424,...,0.006909,0.03749,0.01244,0.03791,0.042405,-0.019231,0.049486,0.0237,0.060874,산업용 기계 및 장비 임대업
5,AJ네트웍스,95570,High,1,0.285336,0.051969,0.0309,0.4272,0.483351,0.31564,...,0.414774,0.35401,0.403931,0.136317,0.09465,0.029805,0.027707,-0.142246,-0.096205,산업용 기계 및 장비 임대업


In [12]:
# Average every correlation column within each sector ('업종').
sector_groups = df_corr.groupby(['업종'])
mean_corr_per_sector = sector_groups.mean()
display(mean_corr_per_sector.head())

Unnamed: 0_level_0,총건수,"강남구, 10대, 남","강남구, 10대, 여","강남구, 20대, 남","강남구, 20대, 여","강남구, 30대, 남","강남구, 30대, 여","강남구, 40대, 남","강남구, 40대, 여","강남구, 50대, 남",...,"중랑구, 20대, 남","중랑구, 20대, 여","중랑구, 30대, 남","중랑구, 30대, 여","중랑구, 40대, 남","중랑구, 40대, 여","중랑구, 50대, 남","중랑구, 50대, 여","중랑구, 60대, 남","중랑구, 60대, 여"
업종,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1차 비철금속 제조업,-0.097576,-0.081651,-0.081729,-0.14493,-0.171943,-0.122483,-0.137503,-0.027517,0.033339,0.043686,...,-0.158462,-0.169514,-0.145435,-0.136072,-0.05952,-0.030679,0.01711,0.040549,0.108375,0.101745
1차 철강 제조업,0.020296,0.00503,0.005292,-0.003687,-0.012871,0.001594,-0.012251,0.007421,0.021423,0.027428,...,0.001887,-0.007519,0.004742,0.001926,0.01688,0.01738,0.020295,0.035175,0.032004,0.033148
가구 제조업,0.012597,-0.049743,-0.087599,0.009434,0.006246,-0.003743,0.032007,-0.000713,0.062322,-0.034983,...,-0.002596,-0.002885,-0.001161,0.02578,0.000217,0.015651,0.007723,0.00299,-0.044243,-0.027172
가전제품 및 정보통신장비 소매업,0.24385,0.221267,0.213346,0.3153,0.363227,0.272635,0.293664,0.103641,-0.035414,-0.071842,...,0.37643,0.358079,0.37431,0.3395,0.197694,0.136746,0.017984,0.012892,-0.188809,-0.136121
가정용 기기 제조업,-0.288167,-0.243736,-0.228141,-0.453483,-0.495524,-0.342074,-0.385391,-0.037783,0.089753,0.112565,...,-0.444171,-0.474776,-0.401579,-0.393591,-0.125226,-0.052785,0.098145,0.169605,0.321585,0.257228


In [35]:
# List the column labels of the correlation table.
df_corr.columns

Index(['기업명', 'ticker', 'Correlation with', 'Time lag', '총건수', '강남구, 10대, 남',
       '강남구, 10대, 여', '강남구, 20대, 남', '강남구, 20대, 여', '강남구, 30대, 남',
       ...
       '중랑구, 20대, 여', '중랑구, 30대, 남', '중랑구, 30대, 여', '중랑구, 40대, 남',
       '중랑구, 40대, 여', '중랑구, 50대, 남', '중랑구, 50대, 여', '중랑구, 60대, 남',
       '중랑구, 60대, 여', '업종'],
      dtype='object', length=306)

# Get 

# Get mean correlation for given condition per sector

In [1]:
# Raw call data; only used here to enumerate the distinct area / age / sex values.
calls_original = pd.read_pickle('data/CALLS_CLIMATE.pkl')
display(calls_original.head())

# Distinct values of each dimension that will be combined below.
sector_unique = df_corr['업종'].unique()
area_unique, age_unique, sex_unique = (
    calls_original[field].unique() for field in ('시군구', '연령대', '성별')
)

# Empty result table: one row per (sector, area, age, sex) combination.
df_corr_mean = pd.DataFrame(columns=['업종', '지역', '연령대', '성별', 'correlation'])
display(df_corr_mean, df_corr.columns)

def get_correlation_for_given_combination(idx, sector, area, age, sex) :
    """Store the mean 'Close' correlation of one sector for one call segment.

    The call segment is the `df_corr` column named '<area>, <age>, <sex>'.  Its
    values are averaged over all rows of the global `df_corr` whose
    'Correlation with' is 'Close' and whose '업종' equals `sector`.
    NOTE(review): rows of every time lag are averaged together — confirm this
    is intended.

    Parameters
    ----------
    idx :
        Row label under which the result is written to the global `df_corr_mean`.
    sector, area, age, sex :
        The combination to evaluate.

    Returns
    -------
    None
        The row [sector, area, age, sex, mean] is written to `df_corr_mean`
        and the global progress bar `pbar` is advanced by one.
    """
    column_name = '%s, %s, %s' % (area, age, sex)
    # Single boolean mask instead of two chained filters; the per-call debug
    # display() of the intermediate frames was removed because it rendered a
    # table for every combination.
    is_target = (df_corr['Correlation with']=='Close') & (df_corr['업종']==sector)
    mean_corr = df_corr.loc[is_target, column_name].mean()
    df_corr_mean.loc[idx] = [sector, area, age, sex, mean_corr]
    pbar.update(1)

    return
    
# Enumerate every (sector, area, age, sex) combination in the same nesting order
# as four nested loops would, then evaluate them one by one.
combinations = [
    (sector, area, age, sex)
    for sector in sector_unique
    for area in area_unique
    for age in age_unique
    for sex in sex_unique
]

i = 0
with tqdm.tqdm(total=(sector_unique.shape[0] * area_unique.shape[0] * age_unique.shape[0] * sex_unique.shape[0])) as pbar :
    for sector, area, age, sex in combinations :
        get_correlation_for_given_combination(i, sector, area, age, sex)
        i += 1

NameError: name 'pd' is not defined

In [60]:
# Persist the per-combination mean correlations in two formats: a pickle for
# reloading in this notebook and a CSV copy of the same table.
df_corr_mean.to_pickle('metadata/df_corr_mean.pkl')
df_corr_mean.to_csv('metadata/df_corr_mean.csv')

In [None]:
# Reload the saved per-combination means so the plotting cells can run without
# recomputing them, and display the table.
df_corr_mean = pd.read_pickle('metadata/df_corr_mean.pkl')
df_corr_mean

# Plot Mean Correlations per Sector

In [None]:
# Line colours for the sector plots; the plotting cell picks a colour by the
# position of each unique '연령대' (age group) value.
colors = ['orange', 'skyblue', 'red', 'green', 'blue', 'brown']

In [None]:
# One figure per sector: a faint male and female line per age group plus the
# black mean over all segments, with one x position per area.
for sector in df_corr_mean['업종'].unique() :
    print(sector)
    target = df_corr_mean[df_corr_mean['업종']==sector]

    plt.figure(figsize=(20, 5))
    for idx, age in enumerate(df_corr_mean['연령대'].unique()) :
        target_segmented = target[target['연령대']==age]
        # Wrap around the palette so more age groups than colours cannot raise IndexError.
        color = colors[idx % len(colors)]

        # Only the male line carries a label so each age group appears once in the legend.
        plt.plot(target_segmented[target_segmented['성별']=='남']['correlation'].reset_index(drop=True), label='%s male, female' % age, c=color, alpha=0.3)
        plt.plot(target_segmented[target_segmented['성별']=='여']['correlation'].reset_index(drop=True), c=color, alpha=0.3)

    # sort=False keeps the areas in row order, i.e. the same x positions as the
    # per-age lines above; the default sorted grouping could misalign the mean
    # line with them.  `target` is already restricted to this sector, so no
    # further filtering of the grouped result is needed.
    mean = target.groupby(['업종', '지역'], sort=False)['correlation'].mean()

    plt.plot(mean.values, c='black', label='mean')
    plt.xlabel('area')
    plt.ylabel('correlation')
    plt.legend()
    plt.show()

# Display Correlation

In [42]:
# Discard the rows for every target column except 'Close' in a single pass.
excluded_targets = ['High', 'Low', 'Volume', 'Open']
is_excluded = df_corr['Correlation with'].isin(excluded_targets)
df_corr = df_corr[~is_excluded]

In [67]:
# Pick the call-count column to inspect by its position in `calls_amount`.
idx = 0
column_name = calls_amount[idx]

# Mean correlation of that column per (sector, time lag, target), ascending.
sector_lag_groups = df_corr.groupby(['업종', 'Time lag', 'Correlation with'])
sector_lag_groups[column_name].mean().sort_values()

업종                        Time lag  Correlation with
소프트웨어 개발 및 공급업            0         Close              -0.536362
기반조성 및 시설물 축조관련 전문공사업     2         Close              -0.473060
                          10        Close              -0.472510
                          50        Close              -0.458782
석유 정제품 제조업                5         Close              -0.444110
                                                          ...   
합성고무 및 플라스틱 물질 제조업        2         Close               0.497212
석탄 광업                     2         Close               0.499183
                          1         Close               0.500857
                          0         Close               0.501070
건축기술, 엔지니어링 및 관련 기술 서비스업  1         Close               0.503192
Name: 총건수, Length: 622, dtype: float64

In [49]:
# Time lags to inspect below; these are the same values as the list hard-coded
# inside get_correlation_by_ticker, so keep the two in sync.
time_lags = [0,1,2,5,10,25,50]

In [50]:
# Sector ranking by mean '총건수' correlation at time_lags[0].
idx_time_lag = 0
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                     Time lag  Correlation with
소프트웨어 개발 및 공급업                         0         Close              -0.536362
신발 및 신발 부분품 제조업                        0         Close              -0.438949
기타 비금속광물 광업                            0         Close              -0.414397
컴퓨터 프로그래밍, 시스템 통합 및 관리업                0         Close              -0.410137
기타 전기장비 제조업                            0         Close              -0.397717
비료, 농약 및 살균, 살충제 제조업                   0         Close              -0.392454
육상 여객 운송업                              0         Close              -0.323178
가정용 기기 제조업                             0         Close              -0.316837
기계장비 및 관련 물품 도매업                       0         Close              -0.259661
금융 지원 서비스업                             0         Close              -0.257121
골판지, 종이 상자 및 종이용기 제조업                  0         Close              -0.245262
의료용 기기 제조업                             0         Close              -0.23091

In [51]:
# Sector ranking by mean '총건수' correlation at time_lags[1].
idx_time_lag = 1
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                     Time lag  Correlation with
기반조성 및 시설물 축조관련 전문공사업                  1         Close              -0.443611
영상 및 음향기기 제조업                          1         Close              -0.430170
기타 전기장비 제조업                            1         Close              -0.399677
비료, 농약 및 살균, 살충제 제조업                   1         Close              -0.394840
직물직조 및 직물제품 제조업                        1         Close              -0.374695
섬유, 의복, 신발 및 가죽제품 소매업                  1         Close              -0.346344
가정용 기기 제조업                             1         Close              -0.316324
구조용 금속제품, 탱크 및 증기발생기 제조업               1         Close              -0.294973
골판지, 종이 상자 및 종이용기 제조업                  1         Close              -0.280872
기계장비 및 관련 물품 도매업                       1         Close              -0.256284
육상 여객 운송업                              1         Close              -0.237220
의료용 기기 제조업                             1         Close              -0.23584

In [52]:
# Sector ranking by mean '총건수' correlation at time_lags[2].
idx_time_lag = 2
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                 Time lag  Correlation with
기반조성 및 시설물 축조관련 전문공사업              2         Close              -0.473060
기타 비금속 광물제품 제조업                    2         Close              -0.435046
비료, 농약 및 살균, 살충제 제조업               2         Close              -0.393998
전자부품 제조업                           2         Close              -0.383837
골판지, 종이 상자 및 종이용기 제조업              2         Close              -0.383591
직물직조 및 직물제품 제조업                    2         Close              -0.374173
화학섬유 제조업                           2         Close              -0.371832
기타 운송관련 서비스업                       2         Close              -0.321981
도축, 육류 가공 및 저장 처리업                 2         Close              -0.318824
가정용 기기 제조업                         2         Close              -0.317399
반도체 제조업                            2         Close              -0.316703
기타 금속 가공제품 제조업                     2         Close              -0.279451
내화, 비내화 요업제품 제조업                   2         Close

In [53]:
# Sector ranking by mean '총건수' correlation at time_lags[3].
idx_time_lag = 3
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                     Time lag  Correlation with
석유 정제품 제조업                             5         Close              -0.444110
건축기술, 엔지니어링 및 관련 기술 서비스업               5         Close              -0.436889
기타 비금속광물 광업                            5         Close              -0.413577
자료처리, 호스팅, 포털 및 기타 인터넷 정보매개 서비스업       5         Close              -0.396188
화학섬유 제조업                               5         Close              -0.371046
구조용 금속제품, 탱크 및 증기발생기 제조업               5         Close              -0.344101
비료, 농약 및 살균, 살충제 제조업                   5         Close              -0.327262
가정용 기기 제조업                             5         Close              -0.321102
의약품 제조업                                5         Close              -0.254586
내화, 비내화 요업제품 제조업                       5         Close              -0.251931
전자부품 제조업                               5         Close              -0.226244
가구 제조업                                 5         Close              -0.19843

In [54]:
# Sector ranking by mean '총건수' correlation at time_lags[4].
idx_time_lag = 4
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                     Time lag  Correlation with
기반조성 및 시설물 축조관련 전문공사업                  10        Close              -0.472510
건축자재, 철물 및 난방장치 도매업                    10        Close              -0.426311
영상 및 음향기기 제조업                          10        Close              -0.423370
비료, 농약 및 살균, 살충제 제조업                   10        Close              -0.420431
기타 비금속광물 광업                            10        Close              -0.409488
여행사 및 기타 여행보조 서비스업                     10        Close              -0.349297
반도체 제조업                                10        Close              -0.338703
가정용 기기 제조업                             10        Close              -0.326148
절연선 및 케이블 제조업                          10        Close              -0.268056
육상 여객 운송업                              10        Close              -0.261055
컴퓨터 프로그래밍, 시스템 통합 및 관리업                10        Close              -0.257308
무기 및 총포탄 제조업                           10        Close              -0.22231

In [55]:
# Sector ranking by mean '총건수' correlation at time_lags[5].
idx_time_lag = 5
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                 Time lag  Correlation with
건축자재, 철물 및 난방장치 도매업                25        Close              -0.426042
비료, 농약 및 살균, 살충제 제조업               25        Close              -0.417338
기타 전기장비 제조업                        25        Close              -0.400052
기타 비금속광물 광업                        25        Close              -0.393580
화학섬유 제조업                           25        Close              -0.381732
도축, 육류 가공 및 저장 처리업                 25        Close              -0.363673
여행사 및 기타 여행보조 서비스업                 25        Close              -0.340040
가정용 기기 제조업                         25        Close              -0.327834
내화, 비내화 요업제품 제조업                   25        Close              -0.319636
반도체 제조업                            25        Close              -0.312704
골판지, 종이 상자 및 종이용기 제조업              25        Close              -0.296500
기계장비 및 관련 물품 도매업                   25        Close              -0.230251
1차 비철금속 제조업                        25        Close

In [56]:
# Sector ranking by mean '총건수' correlation at time_lags[6].
idx_time_lag = 6
lag_rows = df_corr[df_corr['Time lag']==time_lags[idx_time_lag]]
mean_correlation = (
    lag_rows
    .groupby(['업종', 'Time lag', 'Correlation with'])['총건수']
    .mean()
    .sort_values()
)
display_all(mean_correlation)

업종                                     Time lag  Correlation with
기반조성 및 시설물 축조관련 전문공사업                  50        Close              -0.458782
건축자재, 철물 및 난방장치 도매업                    50        Close              -0.422748
기타 전기장비 제조업                            50        Close              -0.415287
비료, 농약 및 살균, 살충제 제조업                   50        Close              -0.376637
도축, 육류 가공 및 저장 처리업                     50        Close              -0.373434
선박 및 보트 건조업                            50        Close              -0.364573
골판지, 종이 상자 및 종이용기 제조업                  50        Close              -0.352815
측정, 시험, 항해, 제어 및 기타 정밀기기 제조업; 광학기기 제외  50        Close              -0.352105
석유 정제품 제조업                             50        Close              -0.330144
가정용 기기 제조업                             50        Close              -0.327002
건축기술, 엔지니어링 및 관련 기술 서비스업               50        Close              -0.221280
1차 비철금속 제조업                            50        Close              -0.20427