In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import statsmodels.api as sm
import seaborn as sns
import pickle
from sklearn.decomposition import PCA

%matplotlib inline

In [3]:
kscap = pd.read_csv('data/코스피시가총액.csv', index_col=0, parse_dates=True)
samcap = pd.read_csv('data/삼성전자시가총액.csv', index_col=0, parse_dates = True)

In [5]:
kscap.head(2)

Unnamed: 0_level_0,Large,Large Point,Mid,Mid Point,Small,Small Point,Kospi Total,Kospi Point
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-01-02,491624628.0,1132.1,59002319.0,1398.72,16699437.0,764.58,593842360.0,1157.4
2009-01-05,500247280.0,1151.94,58410046.0,1384.63,16768471.0,767.62,602228320.0,1173.57


In [6]:
samcap.head(2)

Unnamed: 0_level_0,Samsung
date,Unnamed: 1_level_1
2009-01-02,69230688
2009-01-05,70114484


In [7]:
ks_star = kscap["Kospi Total"] - samcap["Samsung"]
ks_star.head(2)

date
2009-01-02    524611672.0
2009-01-05    532113836.0
dtype: float64

In [14]:
cap_korea = pd.concat([kscap[["Large Point", "Mid Point", 'Small Point', 'Kospi Point']], ks_star], axis=1)
ret_korea = cap_korea.pct_change().dropna()

rename = ['large', 'mid', 'small', 'kospi', 'new_kospi']
ret_korea.columns = rename
ret_korea.head(2)

Unnamed: 0_level_0,large,mid,small,kospi,new_kospi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-01-05,0.017525,-0.010073,0.003976,0.013971,0.0143
2009-01-06,0.01909,0.013217,0.01605,0.017647,0.014486


In [20]:
stk_prices = pd.read_pickle('data/수정주가.pkl')
df = pd.DataFrame()
for i in list(stk_prices.keys()):
    df[i] = stk_prices[i][i]
stk_prices = df.copy()
stk_prices.head(2)

Unnamed: 0_level_0,000020,000030,000040,000050,000060,000070,000075,000080,000087,000100,...,285130,28513K,286940,293940,294870,298000,298020,298040,298050,300720
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-07-14,9900,,692,11020,6666,43972,28050,,,155010,...,,,,,,,,,,
2009-07-15,9900,,714,11287,7052,44491,29400,,,154172,...,,,,,,,,,,


In [23]:
ret_kr_stks = stk_prices.pct_change()["2010":"2017"].dropna(axis=1)
ret_kr_stks.head(2)

Unnamed: 0_level_0,000020,000040,000050,000060,000070,000075,000080,000100,000105,000120,...,104700,105560,105630,107590,108670,108675,109070,111770,114090,118000
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,-0.007916,0.062945,0.015563,0.052663,0.00447,-0.007937,0.003769,-0.011236,-0.017724,-0.038801,...,0.006803,-0.005025,0.046481,0.007109,-0.004167,-0.003846,0.004057,-0.029915,0.005089,-0.149235
2010-01-05,-0.00266,-0.008939,0.00373,0.019436,-0.005568,-0.002667,0.006258,0.005679,0.061373,-0.001835,...,0.032658,-0.011785,0.073008,0.122353,-0.012552,0.0,-0.016236,-0.013216,0.050633,0.0


# PCA

In [24]:
centered = ret_kr_stks - ret_kr_stks.mean()
cov = centered.cov()
eigval, eigvec = np.linalg.eig(cov)

In [25]:
reorder_idx = eigval.argsort()[::-1]
eigval = eigval[reorder_idx]
eigvec = eigvec[reorder_idx]

In [28]:
eigval[0] / eigval.sum()

0.09564175165676816

In [26]:
pc1 = centered.dot(eigvec[:,0])

In [27]:
# Correlation : PC1 ~ Samsung

pc1.corr(ret_kr_stks['005930'])

0.3360973629366128

In [29]:
# Correlation : PC1 ~ Equal weight market

pc1.corr(ret_kr_stks.mean(axis=1))

0.9988309412530412

# Regression

## Making factor returns (smb, hml)

In [33]:
kr_factor = pd.read_excel("data/kr_factor_data.xlsx", index_col=0, parse_dates=True)
kr_factor.head(2)

Unnamed: 0_level_0,대형 - High,대형 - Medium,대형 - Low,소형 - High,소형 - Medium,소형 - Low
Symbol Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,0.74,0.92,0.83,1.33,2.26,3.17
2010-01-05,-1.73,0.22,-0.42,0.56,0.92,1.69


In [35]:
col_nms = kr_factor.columns
col_nms

Index(['대형 - High', '대형 - Medium', '대형 - Low', '소형 - High', '소형 - Medium',
       '소형 - Low'],
      dtype='object')

In [36]:
large_ = col_nms[:3]
small_ = col_nms[3:]
small_

Index(['소형 - High', '소형 - Medium', '소형 - Low'], dtype='object')

In [37]:
kr_smb = 1/3*(kr_factor[small_].sum(axis=1) - kr_factor[large_].sum(axis=1))
kr_hml = 1/2*((kr_factor['대형 - High']+kr_factor['소형 - High']) - (kr_factor['대형 - Low']+kr_factor['소형 - Low']))

In [39]:
ret_kr_factor = pd.concat([kr_smb, kr_hml], axis=1) / 100
ret_kr_factor.columns = ["SMB", "HML"]
ret_kr_factor.head(2)

Unnamed: 0_level_0,SMB,HML
Symbol Name,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-04,0.014233,-0.00965
2010-01-05,0.017,-0.0122


In [46]:
ret_regr = pd.concat([ret_kr_factor, ret_korea, ret_kr_stks['005930']], axis=1)["2010":"2017"]
ret_regr.columns = ret_regr.columns.tolist()[:-1] + ["samsung"]
ret_regr

Unnamed: 0,SMB,HML,large,mid,small,kospi,new_kospi,samsung
2010-01-04,0.014233,-0.00965,0.008359,0.003831,0.006816,0.007945,0.006932,0.012516
2010-01-05,0.017000,-0.01220,-0.003686,-0.004373,0.002633,-0.003254,-0.006144,0.016069
2010-01-06,-0.004567,-0.00095,0.009240,0.004075,0.005132,0.008695,0.006421,0.023114
2010-01-07,0.006867,-0.00290,-0.014304,-0.006125,-0.004970,-0.012825,-0.009539,-0.033294
2010-01-08,-0.002233,-0.00060,0.006410,0.006364,0.004006,0.007015,0.006243,0.009840
...,...,...,...,...,...,...,...,...
2017-12-21,-0.002033,0.00695,-0.018329,-0.010281,-0.012679,-0.017206,-0.013141,-0.034198
2017-12-22,-0.000867,-0.00775,0.003586,0.006940,0.011827,0.004408,0.003291,0.011396
2017-12-26,-0.000533,0.00855,-0.006686,0.001897,-0.000616,-0.005409,0.000481,-0.030181
2017-12-27,0.004233,-0.03105,0.003723,0.000723,0.004998,0.003844,-0.000834,0.024066


In [47]:
ret_regr.isnull().sum()

SMB          0
HML          0
large        0
mid          0
small        0
kospi        0
new_kospi    0
samsung      0
dtype: int64

### y ~ x1 + x2 + ... + xn

In [49]:
select_y = 'large'                     # 종속변수선택
select_xs = ['kospi', 'SMB', 'HML']    # 설명변수선택

y = ret_regr[select_y]
x = ret_regr[select_xs]
model = sm.OLS(y, x)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  large   R-squared:                       0.990
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                 6.200e+04
Date:                Tue, 20 Apr 2021   Prob (F-statistic):               0.00
Time:                        21:15:21   Log-Likelihood:                 10794.
No. Observations:                1976   AIC:                        -2.158e+04
Df Residuals:                    1973   BIC:                        -2.156e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
kospi          1.0293      0.003    390.153      0.0

# US MARKET

In [50]:
us_data = pd.read_csv("data/미국회귀분석데이터.csv", index_col=0, parse_dates=True)
us_data.head(2)

Unnamed: 0_level_0,SP500,SP400,SP600,SP1500,NewSP1500,AAPL,SMB,HML
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-05,0.003122,0.00266,-0.003438,0.00285,0.002863,0.001963,-0.0059,0.0122
2010-01-06,0.000541,0.005274,-0.000954,0.000867,0.0011,-0.015997,-0.0025,0.0052


In [53]:
select_y = 'SP500'                     # 종속변수선택
select_xs = ['NewSP1500', 'SMB', 'HML']    # 설명변수선택

y = us_data[select_y]
x = us_data[select_xs]
model = sm.OLS(y, x)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  SP500   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 2.937e+05
Date:                Tue, 20 Apr 2021   Prob (F-statistic):               0.00
Time:                        21:39:25   Log-Likelihood:                 12670.
No. Observations:                2011   AIC:                        -2.533e+04
Df Residuals:                    2008   BIC:                        -2.532e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
NewSP1500      0.9986      0.001    864.368      0.0