# Factor Research 

for FnGuide Application

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

from pprint import pprint as pp

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

In [5]:
# Data directories, all resolved under ./data of the current working directory.
cwd = Path.cwd()
data_dir = cwd.joinpath("data")
fnguide_dir = data_dir.joinpath("fnguide")
kqdl_dir = data_dir.joinpath("kqdl")
navernews_dir = data_dir.joinpath("navernews", "navernews")

## 1. Import Dataset

학교에서 추출한 DataGuide 파일들을 불러오기

추출할 때 기간 내 delisted 종목도 포함시켰기 때문에 survivorship bias 적음.

In [52]:
## Preprocess an exported FnGuide (DataGuide) CSV into a DataFrame

def preprocess_dataguide_csv(fn_file_name, cols, skiprows=8):
    """Read a DataGuide CSV export and reshape it from wide to long format.

    Args:
        fn_file_name: Name of the CSV file, resolved against ``fnguide_dir``.
        cols: Identifier columns to keep as-is. Every other column is
            unpivoted: its label goes to ``date`` and its cell to ``value``.
        skiprows: Forwarded to ``pd.read_csv`` as ``skiprows``.

    Returns:
        DataFrame holding the columns in ``cols`` plus ``date`` and ``value``.
    """
    csv_path = fnguide_dir / fn_file_name
    wide_df = pd.read_csv(csv_path, encoding="cp949", skiprows=skiprows)
    return wide_df.melt(id_vars=cols, var_name="date", value_name="value")

In [53]:
# DataGuide export file names: price/volume, market cap, foreigner.
fn_file_names = [
    "dataguide_kse+kosdaq_20140101-20231215_stc_pricevolume.CSV",
    "dataguide_kse+kosdaq_20140101-20231219_stc_mktcap.CSV",
    "dataguide_kse+kosdaq_20140101-20231219_foreigner.CSV",
]

In [54]:
## Columns that are not dates

cols = [
    "Symbol",
    "Symbol Name",
    "Kind",
    "Item",
    "Item Name ",  # the trailing space is part of the column name
    "Frequency",
]

In [47]:
# Load each DataGuide export into its own long-format frame.
pricevolume_df = preprocess_dataguide_csv(fn_file_names[0], cols=cols)
mktcap_df = preprocess_dataguide_csv(fn_file_names[1], cols=cols)
foreigner_df = preprocess_dataguide_csv(fn_file_names[2], cols=cols)

  fn_df = pd.read_csv(fnguide_dir / fn_file_name, encoding="cp949", skiprows=skiprows)
  fn_df = pd.read_csv(fnguide_dir / fn_file_name, encoding="cp949", skiprows=skiprows)


In [48]:
# Distinct values of the 'Item Name ' column in the price/volume frame.
pricevolume_df['Item Name '].unique()

array(['수익률(%)', '수정주가(원)', '수정저가(원)', '수정고가(원)', '수정시가(원)', '거래량(주)',
       '거래대금(원)'], dtype=object)

In [49]:
# Distinct values of the 'Item Name ' column in the market-cap frame.
mktcap_df['Item Name '].unique()

array(['시가총액 (티커-상장예정주식수 미포함)(백만원)'], dtype=object)

In [50]:
# Distinct values of the 'Item Name ' column in the foreigner frame.
foreigner_df['Item Name '].unique()

array(['외국인보유비중(티커)(%)'], dtype=object)

## 2. Transform dataset

퀀트 분석을 위한 panel 형태로 변환

In [56]:
def get_panel_df(df, item_name):
    """Pivot the rows of one item into a wide date-by-symbol table.

    Args:
        df: Long-format frame with 'Item Name ', 'date', 'Symbol' and
            'value' columns.
        item_name: Value of 'Item Name ' selecting the rows to keep.

    Returns:
        DataFrame with a 'date' column followed by one column per symbol.
    """
    is_item = df['Item Name '] == item_name
    selected = df.loc[is_item].copy()
    wide = selected.pivot(index='date', columns='Symbol', values='value')
    # Move 'date' from the index back to an ordinary column.
    return wide.reset_index()

In [69]:
def transform_panel(panel_df, remove_holidays=True, drop_sid_prefix=False):
    """Index a wide panel by its 'date' column and tidy it up.

    Args:
        panel_df: Wide frame with a 'date' column; it is not modified.
        remove_holidays: If true, drop rows in which every value is NaN.
        drop_sid_prefix: If true, strip the first character from every
            column label.

    Returns:
        A new DataFrame indexed by 'date' and sorted by that index.
    """
    indexed = panel_df.set_index('date').sort_index()

    if drop_sid_prefix:
        indexed.columns = [label[1:] for label in indexed.columns]

    if not remove_holidays:
        return indexed

    return indexed.dropna(axis=0, how='all')

In [74]:
# Build one date-indexed panel per item.
return_2d = transform_panel(
    get_panel_df(df=pricevolume_df, item_name='수익률(%)')
)
tradingmoneyvolume_2d = transform_panel(
    get_panel_df(df=pricevolume_df, item_name='거래대금(원)')
)
mktcap_2d = transform_panel(
    get_panel_df(df=mktcap_df, item_name='시가총액 (티커-상장예정주식수 미포함)(백만원)')
)
foreigner_2d = transform_panel(
    get_panel_df(df=foreigner_df, item_name='외국인보유비중(티커)(%)')
)


In [75]:
# Display the last 10 rows of the return panel.
return_2d.tail(10)

Symbol,A000010,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000090,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-01,,-0.84,,-0.2,-0.35,,0.0,1.58,,0.33,...,0.23,10.1,2.36,0.0,0.0,,1.66,-1.34,-0.5,0.73
2023-12-04,,-0.85,,1.38,1.05,,0.14,1.34,,-0.98,...,0.57,-1.48,0.32,-1.02,-0.51,,1.45,2.13,-1.32,5.12
2023-12-05,,2.14,,0.19,-0.57,,0.0,0.22,,3.94,...,-1.35,26.23,2.93,-0.91,-1.55,,-0.18,0.95,-0.82,7.81
2023-12-06,,0.52,,-2.72,1.85,,0.14,0.66,,1.42,...,1.37,-6.62,-0.69,-1.15,-2.09,,8.94,0.56,-0.1,-0.75
2023-12-07,,3.23,,-0.6,-1.02,,-1.58,0.0,,2.02,...,0.56,-0.54,-1.63,-0.12,-1.2,,-3.12,0.93,-1.66,1.29
2023-12-08,,-0.3,,1.0,0.92,,0.88,-0.22,,0.46,...,-0.45,-4.57,2.36,0.0,1.08,,-1.27,1.3,0.53,-0.4
2023-12-11,,-0.1,,-0.2,0.23,,0.43,1.09,,-0.61,...,-0.23,-5.26,4.54,1.17,0.4,,5.58,-1.83,0.0,-1.56
2023-12-12,,0.71,,0.2,-0.91,,-0.86,0.65,,0.15,...,6.67,-1.11,-3.46,-0.92,0.8,,-4.31,1.49,1.57,-1.59
2023-12-13,,-1.31,,-0.99,0.69,,-1.02,0.0,,-1.53,...,3.92,-6.03,-0.91,-0.47,-0.93,,-0.68,-5.5,-1.03,-1.67
2023-12-14,,0.0,,-26.91,-1.48,,0.59,0.21,,-0.47,...,-2.85,0.98,2.38,0.47,0.13,,0.86,2.14,0.1,5.59


In [76]:
# Checkpoint: pickle each panel into the FnGuide data directory.

return_2d.to_pickle(fnguide_dir.joinpath('return_2d.pkl'))
tradingmoneyvolume_2d.to_pickle(fnguide_dir.joinpath('tradingmoneyvolume_2d.pkl'))
mktcap_2d.to_pickle(fnguide_dir.joinpath('mktcap_2d.pkl'))
foreigner_2d.to_pickle(fnguide_dir.joinpath('foreigner_2d.pkl'))

## 3. Universe Filtering 

현실적인 포트폴리오 백테스팅을 위해 유동성 상위 2000 종목만으로 유니버스를 구성

- start_date = '2014-01-01'
- end_date = '2022-05-31'  
- subuniverse = 2000

## 4. Factor Portfolio 

Fama-French 3 Factor Portfolio를 만들어 단순 시가총액 가중 포트폴리오(벤치마크)와 수익률 비교

## 5. Factor Portfolio 성과분석 상세

`quantstats` 이용