# 고금계 과제 1 검토

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [72]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

## Load & Preprocess data

### Dataguide 추출 데이터 

In [73]:
fn1 = DATA_DIR / '고금계과제1_v1.1_201301-202408.csv'

In [74]:
## 추출한 fnguide 데이터를 dataframe으로 전처리

def preprocess_dataguide_csv(fn_file_path, cols, skiprows=8, encoding="cp949"):
    fn_df = pd.read_csv(fn_file_path, encoding=encoding, skiprows=skiprows, thousands=",")
    fn_df = fn_df.melt(id_vars=cols, var_name="date", value_name="value")

    return fn_df

In [75]:
# 날짜가 아닌 컬럼들
cols = ['Symbol', 'Symbol Name', 'Kind', 'Item', 'Item Name ', 'Frequency',]

In [76]:
fn1_df = preprocess_dataguide_csv(fn1, cols, encoding='utf-8')

  fn_df = pd.read_csv(fn_file_path, encoding=encoding, skiprows=skiprows, thousands=",")


In [77]:
fn1_df

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value
0,A005930,삼성전자,COM,CP10000600,FnGuide Sector,,2013-01-31,IT
1,A005930,삼성전자,COM,CP10000800,FnGuide Industry Group,,2013-01-31,반도체
2,A005930,삼성전자,COM,CP10001000,FnGuide Industry,,2013-01-31,반도체 및 관련장비
3,A005930,삼성전자,COM,CP10007100,FnGuide Industry Group 27,,2013-01-31,반도체
4,A005930,삼성전자,COM,CP10001650,거래정지여부,,2013-01-31,정상
...,...,...,...,...,...,...,...,...
7653728,A950180,SNK,NFS-IFRS(C),6000904001,매출액(천원),ANNUAL,2024-09-06,
7653729,A950180,SNK,NFS-IFRS(C),6000905001,매출원가(천원),ANNUAL,2024-09-06,
7653730,A950180,SNK,NFS-IFRS(C),6000909054,이자비용(천원),ANNUAL,2024-09-06,
7653731,A950180,SNK,NFS-IFRS(C),6000906001,영업이익(천원),ANNUAL,2024-09-06,


In [78]:
# univ_list = fn1_df['Symbol'].unique() # 나중에 기간 중 존재하지 않았던 종목들을 제외하고 다시 만들 것. 

items = fn1_df['Item Name '].unique() # 원래부터 DataGuide에 띄어쓰기가 들어가 있어서 이렇게 되어버림

In [79]:
items

array(['FnGuide Sector', 'FnGuide Industry Group', 'FnGuide Industry',
       'FnGuide Industry Group 27', '거래정지여부', '관리종목여부', '수정주가(원)',
       '수익률(%)', '수익률 (1개월)(%)', '시가총액 (보통-상장예정주식수 포함)(백만원)',
       '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)', '자기주식(천원)', '이연법인세부채(천원)',
       '종가(원)', '수정계수', '기말발행주식수 (보통)(주)', '매출액(천원)', '매출원가(천원)',
       '이자비용(천원)', '영업이익(천원)', '총자산(천원)', nan], dtype=object)

### mapping 생성

In [80]:
symbol_to_name = fn1_df[['Symbol', 'Symbol Name']].drop_duplicates().set_index('Symbol').to_dict()['Symbol Name']

In [81]:
name_to_symbol = {v:k for k, v in symbol_to_name.items()}

### 존재하지 않았던 기업 처리

Dataguide에서 상장폐지 종목 포함하여 불러오면 주어진 기간에 존재하지 않았던 기업까지 불러옴. (즉, 전체기간 모든 기업을 univ로 불러옴)

주어진 기간동안의 존재하지 않았던 주식들의 value 값에 대해선 모두 NaN을 줘버림. 

In [82]:
name_to_symbol['신한은행'] # 신한지주 출범으로 신한 증권과 함께 2001년 8월 30일 상장폐지. 우리의 데이터 기간엔 아예 존재하지 말았어야 함. 

'A015580'

In [83]:
name_to_symbol['신한지주'] # 동년 9월 상장됨 

'A055550'

In [84]:
def get_panel_df(df, item_name):
    panel_df = df.loc[df['Item Name '] == item_name].copy()
    panel_df = panel_df.pivot(index='date', columns='Symbol', values='value')
    panel_df = panel_df.reset_index()
    
    panel_df = panel_df.set_index('date', inplace=False)
    panel_df.sort_index(inplace=True)
    
    return panel_df 

In [85]:
returns_df = get_panel_df(fn1_df, '수익률(%)')
returns_df.head()

Symbol,A000010,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000090,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,,-1.79,,,1.17,,-0.59,0.47,,-0.27,...,,,,,,,,,,
2013-02-28,,-0.17,,,0.33,,0.58,-1.16,,2.88,...,,,,,,,,,,
2013-04-30,,1.05,,,1.74,,1.85,-0.31,,-1.87,...,,,,,,,,,,
2013-05-31,,-1.7,,,-2.5,,0.0,-0.63,,-1.85,...,,14.98,,,,,,,,
2013-07-31,,0.86,,,-0.38,,0.59,0.36,,-0.71,...,,2.58,,,,,,,,


In [98]:
get_panel_df(fn1_df, '수익률 (1개월)(%)').head() # 이걸 쓰는 것이 맞아보임. 위의 수익률은 일별 수익률인데 그냥 마지막날에 맞춘 것일 가능성이 높아보인다. 

Symbol,A000010,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000090,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,,-0.17,,,-3.46,,-4.51,5.26,,6.07,...,,,,,,,,,,
2013-02-28,,0.33,,,5.37,,2.51,7.91,,-0.28,...,,,,,,,,,,
2013-04-30,,10.47,,,25.94,,32.39,-4.73,,9.95,...,,,,,,,,,,
2013-05-31,,2.67,,,16.67,,-5.98,-1.4,,-11.67,...,,,,,,,,,,
2013-07-31,,5.42,,,1.15,,-1.27,-11.86,,14.4,...,,-1.49,,,,,,,,


In [86]:
returns_df.shape

(89, 3739)

In [87]:
returns_df.dropna(axis=1, how='all').shape 

# TODO: 왜 이렇게 shape 차이가 나는지 v1.0과 v1.1 비교해보기. 분명 이전에는 2700개가 넘었다. 

(89, 1444)

In [88]:
nans = returns_df.isnull().all()
nan_tickers = nans[nans].index.tolist()

[ symbol_to_name[ticker] for ticker in nan_tickers ] # 모든 값이 NaN인 종목들. 즉, 현재 존재하지 않는 종목들.

['조흥은행',
 '우리은행',
 'KR모터스',
 '메리츠화재',
 '두산상사',
 '제일은행',
 '한일은행',
 '대한중석',
 'LG금속',
 '대우중공업',
 '유유제약',
 '동아건설',
 '대유플러스',
 '해태제과',
 '두산백화',
 '삼환기업',
 '대아건설',
 '로케트전기',
 '한주통산',
 '그린손해보험',
 '태화',
 '대한제지',
 '삼덕제지',
 '금호건설',
 'CS홀딩스',
 '삼화실업',
 '제일화재',
 '천일고속',
 '서울교통공사',
 '이화산업',
 '삼성물산',
 '한국벨트',
 '화천기공',
 '전진산업',
 '보해양조',
 '강원산업',
 '유니온',
 '한화기계',
 '전방',
 '신라섬유',
 '리젠트화재',
 '대한방직',
 '동서증권',
 '국보',
 '서통',
 '신원제이엠씨',
 '장은증권',
 '마이크로닉스',
 '금호전기',
 '남광토건',
 '우리증권',
 '상상인증권',
 '제일모직',
 '대한모방',
 'FnC코오롱',
 '동산씨앤지',
 '태원물산',
 '조비',
 '제일연마',
 '신광기업',
 '서광건설',
 '두산종합식품',
 '케이비아이동국실업',
 '경남모직',
 '두산테크팩',
 '공영토건',
 'SHD',
 '무림SP',
 '동부일렉트로닉스',
 '이화공영',
 'DL건설',
 '영일화학',
 '한보철강',
 '효성물산',
 '남한제지',
 '대호',
 '자유건설',
 '한국유리',
 '삼성건설',
 '경향건설',
 '비비안',
 '벽산개발',
 '피어리스',
 '고려산업',
 '삼익건설',
 '우주종합건설',
 '한일철강',
 '피에스텍',
 '알보젠코리아',
 '롯데푸드',
 '삼일기업공사',
 '한국제지',
 '유성',
 'SH에너지화학',
 '범양건영',
 '세기상사',
 '미원',
 '벽산건설',
 '고제',
 'KB손해보험',
 'LGEI',
 '오리엔트바이오',
 '미주제강',
 '한탑',
 '동일제강',
 '고려증권',
 '동양화학',
 '피죤',
 '보락',
 

In [89]:
returns_df.dropna(axis=1, how='all', inplace=True)

univ_list = returns_df.columns

In [90]:
univ_list

Index(['A000020', 'A000050', 'A000070', 'A000080', 'A000100', 'A000120',
       'A000140', 'A000150', 'A000180', 'A000210',
       ...
       'A900140', 'A900250', 'A900290', 'A950130', 'A950140', 'A950160',
       'A950170', 'A950190', 'A950210', 'A950220'],
      dtype='object', name='Symbol', length=1444)

In [91]:
def filter_univ(univ_list, panel_df, is_copy=True):
    if is_copy:
        return panel_df[univ_list].copy()
    else:
        return panel_df[univ_list]

### 데이터셋 생성

#### 그룹

In [92]:
# WICS Groups

sector_df = filter_univ(univ_list, get_panel_df(fn1_df, 'FnGuide Sector') )
industry_group_df = filter_univ(univ_list, get_panel_df(fn1_df, 'FnGuide Industry Group') )
industry_df = filter_univ(univ_list, get_panel_df(fn1_df, 'FnGuide Industry') )

In [93]:
sector_df.head()

Symbol,A000020,A000050,A000070,A000080,A000100,A000120,A000140,A000150,A000180,A000210,...,A900140,A900250,A900290,A950130,A950140,A950160,A950170,A950190,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,의료,경기소비재,산업재,필수소비재,의료,산업재,필수소비재,산업재,소재,산업재,...,경기소비재,,,,,,,,,
2013-02-28,의료,경기소비재,산업재,필수소비재,의료,산업재,필수소비재,산업재,소재,산업재,...,경기소비재,,,,,,,,,
2013-04-30,의료,경기소비재,산업재,필수소비재,의료,산업재,필수소비재,산업재,소재,산업재,...,경기소비재,,,,,,,,,
2013-05-31,의료,경기소비재,산업재,필수소비재,의료,산업재,필수소비재,산업재,소재,산업재,...,경기소비재,,,의료,,,,,,
2013-07-31,의료,경기소비재,필수소비재,필수소비재,의료,산업재,필수소비재,산업재,소재,산업재,...,경기소비재,,,의료,,,,,,


In [94]:
items

array(['FnGuide Sector', 'FnGuide Industry Group', 'FnGuide Industry',
       'FnGuide Industry Group 27', '거래정지여부', '관리종목여부', '수정주가(원)',
       '수익률(%)', '수익률 (1개월)(%)', '시가총액 (보통-상장예정주식수 포함)(백만원)',
       '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)', '자기주식(천원)', '이연법인세부채(천원)',
       '종가(원)', '수정계수', '기말발행주식수 (보통)(주)', '매출액(천원)', '매출원가(천원)',
       '이자비용(천원)', '영업이익(천원)', '총자산(천원)', nan], dtype=object)

#### PBR

In [95]:
PBR_df = filter_univ(univ_list, get_panel_df(fn1_df, 'PBR(배)') )
adj_PBR_df = filter_univ(univ_list, get_panel_df(fn1_df, '수정PBR(배)') )

KeyError: "None of [Index(['A000020', 'A000050', 'A000070', 'A000080', 'A000100', 'A000120',\n       'A000140', 'A000150', 'A000180', 'A000210',\n       ...\n       'A900140', 'A900250', 'A900290', 'A950130', 'A950140', 'A950160',\n       'A950170', 'A950190', 'A950210', 'A950220'],\n      dtype='object', name='Symbol', length=1444)] are in the [columns]"

#### 시장/재무

In [96]:
mkt_cap_df = filter_univ(univ_list, get_panel_df(fn1_df, '시가총액 (보통-상장예정주식수 포함)(백만원)') )
total_equity_df = filter_univ(univ_list, get_panel_df(fn1_df, '총자본(천원)') )

KeyError: "None of [Index(['A000020', 'A000050', 'A000070', 'A000080', 'A000100', 'A000120',\n       'A000140', 'A000150', 'A000180', 'A000210',\n       ...\n       'A900140', 'A900250', 'A900290', 'A950130', 'A950140', 'A950160',\n       'A950170', 'A950190', 'A950210', 'A950220'],\n      dtype='object', name='Symbol', length=1444)] are in the [columns]"

In [68]:
mkt_cap_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000100,A000120,A000140,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,168427,,59074,215584,1286110,522399,2223287,2046492,2737481,375950,...,83005,,,,,,,,,
2013-02-28,168427,,55613,225055,1121720,534727,2369190,1990729,2623420,375950,...,92085,,,,,,,,,
2013-03-31,170661,,55255,231534,1155565,577652,2351821,2130136,2242453,392194,...,86850,,,,,,,,,
2013-04-30,188537,,55255,291599,1155565,764756,2240656,2342035,2418108,341139,...,130329,,,,,,,,,
2013-05-31,193565,,57881,340199,1337540,719001,2209391,2068797,2349671,333017,...,133747,309400.0,,,,,,,,
