# 고금계 과제 1 검토

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [6]:
# Absolute path of the directory the notebook is running in.
CWD = Path.cwd().resolve()
# Input files are expected in a 'data' folder next to the notebook.
DATA_DIR = CWD.joinpath('data')

## Load & Preprocess data

### Dataguide 추출 데이터 

In [8]:
# Path of the DataGuide CSV export inside DATA_DIR.
fn1 = DATA_DIR / '고금계과제1_v1.0_201301-202408.csv'

In [9]:
## Turn an extracted FnGuide (DataGuide) CSV into a long-format DataFrame

def preprocess_dataguide_csv(fn_file_path, cols, skiprows=8, encoding="cp949"):
    """Read a DataGuide CSV export and reshape it from wide to long format.

    Args:
        fn_file_path: Path of the CSV file to read.
        cols: Names of the identifier (non-date) columns; they are kept as-is
            while every other column is unpivoted.
        skiprows: Number of leading lines to skip before the header row.
        encoding: Text encoding of the file.

    Returns:
        A DataFrame with the ``cols`` columns plus ``date`` (the former
        column label) and ``value`` (the cell content).

    NOTE(review): a single ``value`` column receives every item, so text and
    numeric items may end up mixed in it -- confirm whether callers need
    ``pd.to_numeric`` on numeric items.
    """
    wide = pd.read_csv(
        fn_file_path,
        skiprows=skiprows,
        encoding=encoding,
        thousands=",",  # parse "1,234" style numbers as 1234
    )
    return pd.melt(wide, id_vars=cols, var_name="date", value_name="value")

In [10]:
# Identifier columns of the export, i.e. every column that is not a date.
cols = [
    'Symbol',
    'Symbol Name',
    'Kind',
    'Item',
    'Item Name ',  # the trailing space is part of the column name
    'Frequency',
]

In [12]:
# Load the export in long format; the encoding is overridden to utf-8
# because the helper's default is cp949.
fn1_df = preprocess_dataguide_csv(fn1, cols, encoding='utf-8')

In [13]:
# Display the melted long-format frame.
fn1_df

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value
0,A005930,삼성전자,SSC,S41000170F,수익률(%),DAILY,2013-01-31,0.56
1,A005930,삼성전자,COM,CP10000500,FnGuide Sector Code,,2013-01-31,FGSC.45
2,A005930,삼성전자,COM,CP10000600,FnGuide Sector,,2013-01-31,IT
3,A005930,삼성전자,COM,CP10000700,FnGuide Industry Group Code,,2013-01-31,FGSC.45.30
4,A005930,삼성전자,COM,CP10000800,FnGuide Industry Group,,2013-01-31,반도체
...,...,...,...,...,...,...,...,...
11598373,A950180,SNK,NFS-IFRS(C),6000903001,총자본(천원),ANNUAL,2024-09-06,
11598374,A950180,SNK,NFS-IFRS(C),6000903007,보통주자본금(천원),ANNUAL,2024-09-06,
11598375,A950180,SNK,NFS-IFRS(C),6000903008,우선주자본금(천원),ANNUAL,2024-09-06,
11598376,A950180,SNK,NFS-IFRS(C),6000911019,이연법인세부채(천원),ANNUAL,2024-09-06,


In [19]:
# univ_list = fn1_df['Symbol'].unique() # To be rebuilt later, after excluding symbols that did not exist during the period.

items = fn1_df['Item Name '].unique() # The DataGuide export itself has a trailing space in this column name, hence 'Item Name '.

In [18]:
# Display the distinct item names found in the export.
items

array(['수익률(%)', 'FnGuide Sector Code', 'FnGuide Sector',
       'FnGuide Industry Group Code', 'FnGuide Industry Group',
       'FnGuide Industry Code', 'FnGuide Industry', '수정PBR(배)',
       '수정PBR(평균,기말)(배)', '수정PBR(무형자산차감)(배)', '수정PBR(자사주차감)(배)',
       '시가총액 (상장예정주식수 포함)(백만원)', '시가총액 (보통-상장예정주식수 포함)(백만원)', 'PBR(배)',
       'PBR(평균,기말)(배)', 'PBR(무형자산차감)(배)', 'PBR(자사주차감)(배)', '총자본(천원)',
       '보통주자본금(천원)', '우선주자본금(천원)', '이연법인세부채(천원)', '이연법인세자산(천원)'],
      dtype=object)

### mapping 생성

In [22]:
# Lookup table: Symbol -> Symbol Name, built from the distinct pairs in the data.
symbol_to_name = (
    fn1_df[['Symbol', 'Symbol Name']]
    .drop_duplicates()
    .set_index('Symbol')['Symbol Name']
    .to_dict()
)

In [39]:
# Reverse lookup: Symbol Name -> Symbol.
# If two symbols share a name, the one appearing later in symbol_to_name wins.
name_to_symbol = dict(zip(symbol_to_name.values(), symbol_to_name.keys()))

### 존재하지 않았던 기업 처리

DataGuide에서 상장폐지 종목을 포함하여 추출하면, 주어진 기간에 존재하지 않았던 기업까지 함께 추출됨. (즉, 전체 기간의 모든 기업이 universe로 들어옴)

주어진 기간 동안 존재하지 않았던 종목들은 value 값이 모두 NaN으로 채워져 있음.

In [41]:
name_to_symbol['신한은행'] # Delisted on 2001-08-30 together with Shinhan Securities when Shinhan Financial Group was launched. It should not exist at all within our data period.

'A015580'

In [42]:
name_to_symbol['신한지주'] # Listed in September of the same year.

'A055550'

In [26]:
def get_panel_df(df, item_name):
    """Build a date-by-symbol panel for a single DataGuide item.

    Args:
        df: Long-format frame containing at least the columns
            'Item Name ' (with the trailing space), 'date', 'Symbol'
            and 'value'. It is not modified.
        item_name: Exact 'Item Name ' value whose rows are selected.

    Returns:
        A DataFrame indexed by 'date' in ascending order with one column
        per 'Symbol', holding the matching 'value' entries. (date, symbol)
        pairs absent from the input are NaN.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when a
            (date, Symbol) pair occurs more than once for the item.
    """
    selected = df.loc[df['Item Name '] == item_name]
    # pivot already yields a 'date' index, so no reset_index/set_index
    # round trip is needed; sort_index keeps the rows in ascending date order.
    panel_df = selected.pivot(index='date', columns='Symbol', values='value')
    return panel_df.sort_index()

In [28]:
# Panel of the '수익률(%)' item: one row per date, one column per symbol.
returns_df = get_panel_df(fn1_df, '수익률(%)')
# Preview the first rows.
returns_df.head()

Symbol,A000010,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000090,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,,-1.79,,-0.4,1.17,-2.92,-0.59,0.47,,-0.27,...,0.26,,,,,,,,,
2013-02-28,,-0.17,,-0.85,0.33,4.5,0.58,-1.16,,2.88,...,5.77,,,,,,,,,
2013-03-31,,0.99,,-0.22,0.32,-1.65,2.02,4.15,,0.53,...,0.49,,,,,,,,,
2013-04-30,,1.05,,0.0,1.74,1.7,1.85,-0.31,,-1.87,...,2.52,,,,,,,,,
2013-05-31,,-1.7,,-2.61,-2.5,0.76,0.0,-0.63,,-1.85,...,2.62,14.98,,,,,,,,


In [29]:
# (number of dates, number of symbols) before removing all-NaN symbols.
returns_df.shape

(141, 3739)

In [44]:
# Shape after dropping symbols whose column is NaN on every date
# (the result is only inspected here, not assigned).
returns_df.dropna(axis=1, how='all').shape

(141, 2926)

In [49]:
# Per-symbol flag: True when the return is NaN on every date of the panel.
nans = returns_df.isna().all(axis=0)
nan_tickers = [ticker for ticker, all_missing in nans.items() if all_missing]

# Company names of the all-NaN symbols, i.e. symbols with no return
# observation anywhere in the sample period.
[symbol_to_name[sym] for sym in nan_tickers]

['조흥은행',
 '두산상사',
 '제일은행',
 '한일은행',
 '대한중석',
 'LG금속',
 '대우중공업',
 '동아건설',
 '해태제과',
 '두산백화',
 '대아건설',
 '한주통산',
 '그린손해보험',
 '태화',
 '대한제지',
 '삼덕제지',
 '금호건설',
 '삼화실업',
 '제일화재',
 '서울교통공사',
 '한국벨트',
 '전진산업',
 '강원산업',
 '한화기계',
 '리젠트화재',
 '동서증권',
 '서통',
 '신원제이엠씨',
 '장은증권',
 '마이크로닉스',
 '우리증권',
 '대한모방',
 'FnC코오롱',
 '동산씨앤지',
 '신광기업',
 '서광건설',
 '두산종합식품',
 '경남모직',
 '두산테크팩',
 '공영토건',
 '동부일렉트로닉스',
 '영일화학',
 '한보철강',
 '효성물산',
 '남한제지',
 '대호',
 '자유건설',
 '삼성건설',
 '경향건설',
 '벽산개발',
 '피어리스',
 '삼익건설',
 '우주종합건설',
 '유성',
 '미원',
 '고제',
 'LGEI',
 '미주제강',
 '고려증권',
 '동양화학',
 '피죤',
 '삼능건설',
 '영풍산업',
 '하나은행',
 '진성레미컨',
 '삼도물산',
 '경동산업',
 '두산건설',
 '남양',
 '한일약품',
 '넥상스코리아',
 '대일화학',
 '엔케이텔레콤',
 '신한증권',
 '하나증권',
 '유진화학',
 '삼진화학',
 '광덕물산',
 '고려시멘트',
 '비락',
 '대우',
 '중원',
 '성담',
 '삼일공사',
 '해태유업',
 '부흥',
 '동부산업',
 'BHK',
 '롯데미도파',
 '대선주조',
 '스마텔',
 '한일방직',
 '세신',
 '대성자원',
 '아시아자동차',
 '아이케이',
 '오리온전기',
 '고합',
 '동방전자',
 '디와이홀딩스',
 '삼호건설',
 'SY',
 '대우송도개발',
 '두산음료',
 '캠브리지코오롱',
 '신동방CP',
 '진로종합식품',
 '삼우인다스',
 '조인에너지',
 '남선물산',


In [52]:
# Remove symbols that have no return on any date of the panel.
returns_df = returns_df.dropna(axis='columns', how='all')

# The remaining column labels (symbols) form the universe.
univ_list = returns_df.columns

In [58]:
# Display the symbols kept in the universe.
univ_list

Index(['A000020', 'A000030', 'A000040', 'A000050', 'A000060', 'A000070',
       'A000080', 'A000100', 'A000120', 'A000140',
       ...
       'A950110', 'A950130', 'A950140', 'A950160', 'A950170', 'A950180',
       'A950190', 'A950200', 'A950210', 'A950220'],
      dtype='object', name='Symbol', length=2926)

In [61]:
def filter_univ(univ_list, panel_df, is_copy=True):
    """Restrict a panel to the columns listed in ``univ_list``.

    Args:
        univ_list: Column labels (symbols) to keep, in the desired order.
        panel_df: Panel DataFrame whose columns are symbols.
        is_copy: When True (default) an explicit copy of the selection is
            returned; when False the selection is returned as-is.

    Returns:
        The selected columns of ``panel_df``.
    """
    selected = panel_df[univ_list]
    return selected.copy() if is_copy else selected

### 데이터셋 생성

#### 그룹

In [62]:
# Classification panels (sector / industry group / industry), restricted to the universe.
# NOTE(review): the original header called these "WICS Groups", but the items
# pulled here are the 'FnGuide ...' classifications -- confirm which standard is meant.
sector_df, industry_group_df, industry_df = (
    filter_univ(univ_list, get_panel_df(fn1_df, item))
    for item in ('FnGuide Sector', 'FnGuide Industry Group', 'FnGuide Industry')
)

In [63]:
# Preview the first rows of the sector panel.
sector_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000100,A000120,A000140,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,의료,금융,경기소비재,경기소비재,금융,산업재,필수소비재,의료,산업재,필수소비재,...,IT,,,,,,,,,
2013-02-28,의료,금융,경기소비재,경기소비재,금융,산업재,필수소비재,의료,산업재,필수소비재,...,IT,,,,,,,,,
2013-03-31,의료,금융,경기소비재,경기소비재,금융,산업재,필수소비재,의료,산업재,필수소비재,...,IT,,,,,,,,,
2013-04-30,의료,금융,경기소비재,경기소비재,금융,산업재,필수소비재,의료,산업재,필수소비재,...,IT,,,,,,,,,
2013-05-31,의료,금융,경기소비재,경기소비재,금융,산업재,필수소비재,의료,산업재,필수소비재,...,IT,의료,,,,,,,,


In [64]:
# Display the available item names again, for reference when building the panels below.
items

array(['수익률(%)', 'FnGuide Sector Code', 'FnGuide Sector',
       'FnGuide Industry Group Code', 'FnGuide Industry Group',
       'FnGuide Industry Code', 'FnGuide Industry', '수정PBR(배)',
       '수정PBR(평균,기말)(배)', '수정PBR(무형자산차감)(배)', '수정PBR(자사주차감)(배)',
       '시가총액 (상장예정주식수 포함)(백만원)', '시가총액 (보통-상장예정주식수 포함)(백만원)', 'PBR(배)',
       'PBR(평균,기말)(배)', 'PBR(무형자산차감)(배)', 'PBR(자사주차감)(배)', '총자본(천원)',
       '보통주자본금(천원)', '우선주자본금(천원)', '이연법인세부채(천원)', '이연법인세자산(천원)'],
      dtype=object)

#### PBR

In [65]:
# Price-to-book panels restricted to the universe: the plain 'PBR(배)' item
# and the '수정PBR(배)' item.
PBR_df, adj_PBR_df = (
    filter_univ(univ_list, get_panel_df(fn1_df, item))
    for item in ('PBR(배)', '수정PBR(배)')
)

#### 시장/재무

In [66]:
# Market-cap and total-equity panels restricted to the universe.
# Per the item names, the units differ: market cap is in million KRW (백만원)
# while total equity is in thousand KRW (천원).
mkt_cap_df, total_equity_df = (
    filter_univ(univ_list, get_panel_df(fn1_df, item))
    for item in ('시가총액 (보통-상장예정주식수 포함)(백만원)', '총자본(천원)')
)

In [68]:
# Preview the first rows of the market-cap panel.
mkt_cap_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000100,A000120,A000140,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,168427,,59074,215584,1286110,522399,2223287,2046492,2737481,375950,...,83005,,,,,,,,,
2013-02-28,168427,,55613,225055,1121720,534727,2369190,1990729,2623420,375950,...,92085,,,,,,,,,
2013-03-31,170661,,55255,231534,1155565,577652,2351821,2130136,2242453,392194,...,86850,,,,,,,,,
2013-04-30,188537,,55255,291599,1155565,764756,2240656,2342035,2418108,341139,...,130329,,,,,,,,,
2013-05-31,193565,,57881,340199,1337540,719001,2209391,2068797,2349671,333017,...,133747,309400.0,,,,,,,,
