In [131]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.api import *
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels as sm
from patsy import dmatrix

## 가상의 주식 테이블 생성하기
- 시점 $T$ 는 1부터 1000까지 생성
- 개별 종목의 갯수는 400개로 설정
- 난수 발생으로 임의로 정하였기 때문에, 데이터 포맷 그 이상의 의미를 가지지 못한다.

In [134]:
code_df = pd.read_csv('industrial_code.csv')
code_df.rename(columns={'종목코드':'code'},inplace=True)
code_df = code_df[:401]
num_of_stock = 400
T = 1000
df = pd.DataFrame(columns=[i for i in range(1,T+1)])

for i in df.columns:
    df[i] = [0 for i in range(num_of_stock + 1)]
    
df.index = list(code_df['code'])
df.index = [str(i) for i in df.index]

for i in range(len(df)):
    df.iloc[i] = [(np.random.randint(10))*(1+np.random.rand()) for _ in range(1000)]
    
df.index = [str(i) for i in df.index]
df = df.append(pd.Series([1 for _ in range(T)], index=df.columns), ignore_index=True) 
df.rename(index={df.index[-1]:'bias'},inplace=True)
df.rename(index={df.index[-2]:'market'},inplace=True)
individual_stock_df = df[(df.index != 'market') & (df.index != 'bias')]
df.index = list(code_df['code'])[:-1] + ['market','bias']
display(df.tail())
print(df.shape)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
17810,11.976051,3.681654,4.621396,1.832938,6.439395,10.794017,4.727641,4.396189,7.563447,1.93423,...,1.352894,6.216272,11.330546,3.506029,6.287101,11.850657,5.146703,4.826293,9.938349,17.001989
11200,1.233398,1.942859,4.67993,11.122693,1.497765,5.951642,7.195653,6.169287,1.684776,8.736454,...,1.43383,4.275754,10.891971,11.061237,8.935565,1.232703,8.833515,9.543152,9.476768,4.995905
17390,1.465559,7.43151,11.860115,11.614737,13.030391,15.080793,8.077052,13.136256,0.0,3.850219,...,6.517146,6.971129,7.949062,1.01138,10.529499,2.548805,3.391416,8.054497,4.399376,10.321916
market,16.087387,5.029418,6.024766,1.507632,0.0,6.996546,0.0,6.464027,9.563055,5.475706,...,11.569359,5.230175,3.248323,8.934551,15.885012,4.288474,0.0,0.0,4.40365,5.719301
bias,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


(402, 1000)


## 1단계 : 포트폴리오 구성시기 (portfolio formation period)
- 시장모형의 시계열회귀분석으로 개별 종목에 대한 $\beta$ 계수를 추정한다.
- 개별 종목에 대한 $\beta$ 400개 계수 측정이 완료되면, 크기 순으로 나열해서 20개씩 묶어, 포트폴리오를 형성한다.
    - 개별 종목의 갯수를 400개로 한 이유가 이와 같다.

In [73]:
for i,value in enumerate(individual_stock_df.index):
    globals()['model'+str(i)] = OLS(individual_stock_df.loc[value][:400].T,df.loc['market'][:, np.newaxis][:400])
    globals()['result'+str(i)] = globals()['model'+str(i)].fit()
    globals()['params'+str(i)] = globals()['result'+str(i)].params.values[0]
    globals()['resid'+str(i)] = np.std(globals()['result'+str(i)].resid)
#     display(globals()['result'+str(i)].summary())

In [83]:
beta_bundles1 = [globals()['params'+str(i)] for i in range(len(individual_stock_df.index))]
resid_bundles1 = [globals()['resid'+str(i)] for i in range(len(individual_stock_df.index))]
len(beta_bundles1) , len(resid_bundles1)

(400, 400)

In [85]:
rank_df1 = pd.DataFrame()
rank_df1['beta'] = beta_bundles1
rank_df1['resid'] = resid_bundles1
rank_df1['stock'] = df[(df.index != 'market')&(df.index != 'bias')].index
rank_df1.reset_index(inplace=True)
rank_df1.tail()

Unnamed: 0,index,beta,resid,stock
395,395,0.677183,5.454544,11300
396,396,0.631338,5.525748,12610
397,397,0.658216,5.548866,17810
398,398,0.686505,5.468022,11200
399,399,0.651519,5.58821,17390


In [135]:
sorted_rank_df1 = rank_df1.sort_values(by='beta',ascending=True)
sorted_rank_df1.tail()

Unnamed: 0,index,beta,resid,stock
104,104,0.751625,5.541176,14710
289,289,0.754841,5.580611,69460
181,181,0.756052,5.688531,105630
201,201,0.756433,5.683429,99340
284,284,0.763531,5.520748,71090


In [117]:
testing_ls = []
ranked_idx_ls = []
popped_ls = list(sorted_rank_df1['index'])
for _ in range(20):
    testing_ls = []
    for _ in range(20):
        testing_ls.append(popped_ls.pop())
    ranked_idx_ls.append(testing_ls)

In [119]:
testing_ls = []
ranked_stock_ls = []
popped_ls = list(sorted_rank_df1['stock'])
for _ in range(20):
    testing_ls = []
    for _ in range(20):
        testing_ls.append(popped_ls.pop())
    ranked_stock_ls.append(testing_ls)

In [136]:
len(ranked_idx_ls[0]) , len(ranked_idx_ls) , len(ranked_stock_ls[0]) , len(ranked_stock_ls)

(20, 20, 20, 20)

In [76]:
sorted_beta_bundles = sorted(beta_bundles1)
len(sorted_beta_bundles)

400

In [77]:
for i in range(20):
    ls = []
    for _ in range(20):
        ls.append(sorted_beta_bundles.pop())
    globals()['portfolio_beta_20_'+str(i)] = ls.copy()

In [78]:
for i in range(20):
    ls = []
    for _ in range(20):
        ls.append(resid_bundles1.pop())
    globals()['portfolio_resid_20_'+str(i)] = ls.copy()

In [79]:
beta_ls1 = []
for i in range(20):
    beta_ls1.append(sum(globals()['portfolio_beta_20_'+str(i)])/len(globals()['portfolio_beta_20_'+str(i)]))
print(beta_ls1)

[0.74493594167673272, 0.72455595427677699, 0.71602529071004439, 0.71013126876695343, 0.70499886357923636, 0.69923779567652444, 0.69464826135875624, 0.6888141743345042, 0.68484937783264754, 0.68139372109177077, 0.67791103914684414, 0.67476930314403338, 0.67131723513305297, 0.66723944725295636, 0.66360745103666996, 0.66023083681468164, 0.65648497774087922, 0.65097228207701718, 0.6408368765350041, 0.62583456962153239]


In [80]:
resid_ls1 = []
for i in range(20):
    resid_ls1.append(sum(globals()['portfolio_resid_20_'+str(i)])/len(globals()['portfolio_resid_20_'+str(i)]))
print(resid_ls1)

[5.59124687465759, 5.524405415256554, 5.459948808455871, 5.56498177724697, 5.53424701462567, 5.505716961009198, 5.527742695127253, 5.538821159652453, 5.54505495837789, 5.51778730100123, 5.533810680144086, 5.481086325380909, 5.536685694216966, 5.5188830343771045, 5.5594147654234, 5.53260294868829, 5.513435504445243, 5.506304455257257, 5.537826687571326, 5.495624834617161]


In [137]:
testing_df = pd.DataFrame()
testing_df['beta'] = beta_ls1
testing_df['resid'] = resid_ls1
testing_df['valued_stock_idx'] = ranked_idx_ls
testing_df['valued_stock'] = ranked_stock_ls
testing_df.tail()

Unnamed: 0,beta,resid,valued_stock_idx,valued_stock
15,0.660231,5.532603,"[73, 124, 392, 171, 264, 5, 238, 279, 302, 323...","[28260, 139130, 11690, 82640, 23810, 298040, 8..."
16,0.656485,5.513436,"[97, 321, 397, 4, 342, 53, 68, 333, 301, 184, ...","[163560, 10040, 17810, 286940, 29460, 194370, ..."
17,0.650972,5.506304,"[290, 382, 353, 357, 59, 128, 372, 165, 250, 3...","[69620, 20000, 23960, 31820, 227840, 23350, 25..."
18,0.640837,5.537827,"[173, 162, 224, 213, 249, 121, 293, 141, 7, 39...","[93240, 71320, 44380, 97230, 83370, 69640, 692..."
19,0.625835,5.495625,"[11, 297, 267, 82, 41, 63, 177, 77, 396, 36, 1...","[18250, 67830, 58730, 27410, 229640, 95570, 53..."


# 2단계 : 포트폴리오 $\beta$ 의 추정시기 (estimation period)

- 시장 모형의 시계열회귀분석으로 20개 포트폴리오에 대한 $\beta$ 계수를 추정한다.
- 개별 종목에 대한 $\beta_i$ 를 구하고, 포트폴리오를 형성, 시계열 평균으로 $\beta_P$ 를 추정한다.

In [122]:
for i,value in enumerate(individual_stock_df.index):
    globals()['model'+str(i)] = OLS(individual_stock_df.loc[value][400:900].T,df.loc['market'][:, np.newaxis][400:900])
    globals()['result'+str(i)] = globals()['model'+str(i)].fit()
    globals()['params'+str(i)] = globals()['result'+str(i)].params.values[0]
    globals()['resid'+str(i)] = np.std(globals()['result'+str(i)].resid)
#     display(globals()['result'+str(i)].summary())

In [123]:
beta_bundles2 = [globals()['params'+str(i)] for i in range(len(individual_stock_df.index))]
resid_bundles2 = [globals()['resid'+str(i)] for i in range(len(individual_stock_df.index))]
len(beta_bundles2) , len(resid_bundles2)

(400, 400)

In [124]:
rank_df2 = pd.DataFrame()
rank_df2['beta'] = beta_bundles2
rank_df2['resid'] = resid_bundles2
rank_df2['stock'] = df[(df.index != 'market')&(df.index != 'bias')].index
rank_df2.reset_index(inplace=True)
rank_df2.tail()

Unnamed: 0,index,beta,resid,stock
395,395,0.639183,5.444483,11300
396,396,0.677091,5.501567,12610
397,397,0.643946,5.568284,17810
398,398,0.692209,5.90864,11200
399,399,0.676428,5.668885,17390


In [125]:
beta_ls2 = []
for i in range(len(testing_df['valued_stock_idx'])):
    beta_ls2.append(np.mean(rank_df2.iloc[testing_df['valued_stock_idx'][i]]['beta']))

In [126]:
resid_ls2 = []
for i in range(len(testing_df['valued_stock_idx'])):
    resid_ls2.append(np.mean(rank_df2.iloc[testing_df['valued_stock_idx'][i]]['resid']))

In [127]:
print(beta_ls2)

[0.6797728888599189, 0.6830399311930461, 0.6833875680109797, 0.6822231661769261, 0.687356118962339, 0.6836915336734921, 0.6902079483759234, 0.6840908604360926, 0.6949023413282639, 0.6820766768339164, 0.6836742375484935, 0.6767584319869511, 0.6767821150838075, 0.6891428000927485, 0.6882264666464842, 0.6874911546959102, 0.6800463588446124, 0.6796010332802509, 0.6826846463317491, 0.6746975897335465]


In [128]:
print(resid_ls2)

[5.571231074544103, 5.585573999089194, 5.561264089078766, 5.533532937373075, 5.569065857693423, 5.54867289845213, 5.549854550304219, 5.59518736089818, 5.588631392520929, 5.560538251645942, 5.536620716545706, 5.496269625114622, 5.538921220130997, 5.521374126858784, 5.561411003611129, 5.540291064360789, 5.483562910483104, 5.530815194810778, 5.485042320701187, 5.566716032355822]


In [132]:
final_df = pd.DataFrame()
final_df['beta'] = beta_ls2
final_df['resid'] = resid_ls2

In [133]:
dmatrix('beta + I(beta ** 2) + resid',final_df)

DesignMatrix with shape (20, 4)
  Intercept     beta  I(beta ** 2)    resid
          1  0.67977       0.46209  5.57123
          1  0.68304       0.46654  5.58557
          1  0.68339       0.46702  5.56126
          1  0.68222       0.46543  5.53353
          1  0.68736       0.47246  5.56907
          1  0.68369       0.46743  5.54867
          1  0.69021       0.47639  5.54985
          1  0.68409       0.46798  5.59519
          1  0.69490       0.48289  5.58863
          1  0.68208       0.46523  5.56054
          1  0.68367       0.46741  5.53662
          1  0.67676       0.45800  5.49627
          1  0.67678       0.45803  5.53892
          1  0.68914       0.47492  5.52137
          1  0.68823       0.47366  5.56141
          1  0.68749       0.47264  5.54029
          1  0.68005       0.46246  5.48356
          1  0.67960       0.46186  5.53082
          1  0.68268       0.46606  5.48504
          1  0.67470       0.45522  5.56672
  Terms:
    'Intercept' (column 0)
    'bet