# 고금계 과제 1 검토

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [2]:
# Absolute path of the directory the notebook is run from, and its data folder.
CWD = Path().resolve()
DATA_DIR = CWD.joinpath('data')

## Load & Preprocess data

### Dataguide 추출 데이터 

In [3]:
# DataGuide export used throughout this notebook.
fn1 = DATA_DIR.joinpath('고금계과제1_v3.0_201301-202408.csv')

In [4]:
# Preprocess the extracted FnGuide (DataGuide) data into a long-format DataFrame.

def preprocess_dataguide_csv(fn_file_path, cols, skiprows=8, encoding="cp949"):
    """Read a DataGuide CSV export and unpivot its date columns.

    Args:
        fn_file_path: Path or file-like object accepted by ``pd.read_csv``.
        cols: Names of the identifier (non-date) columns. Every other column
            is treated as one date and melted into rows.
        skiprows: Number of leading lines skipped before the header row
            (presumably the export's metadata preamble -- confirm against
            the actual file).
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        DataFrame holding the ``cols`` columns plus ``date`` (the former
        column label) and ``value`` (the cell content).
    """
    wide_df = pd.read_csv(
        fn_file_path,
        skiprows=skiprows,
        encoding=encoding,
        thousands=",",  # let "1,234"-style fields parse as numbers
    )
    return pd.melt(wide_df, id_vars=cols, var_name="date", value_name="value")

In [5]:
# Identifier (non-date) columns of the DataGuide export.
# 'Item Name ' really does end with a space in the exported header.
cols = [
    'Symbol',
    'Symbol Name',
    'Kind',
    'Item',
    'Item Name ',
    'Frequency',
]

In [6]:
# Load the export in long format. The file is read as UTF-8 (overriding the
# function's cp949 default); skiprows stays at its default of 8.
fn1_df = preprocess_dataguide_csv(fn1, cols, encoding='utf-8')

In [7]:
# Preview the first 30 long-format rows.
fn1_df.head(30)

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value
0,A000010,조흥은행,SSC,S41000060F,종가(원),DAILY,2013-01-31,
1,A000010,조흥은행,SSC,S410001600,수정계수,DAILY,2013-01-31,
2,A000010,조흥은행,SSC,S410000700,수정주가(원),DAILY,2013-01-31,
3,A000010,조흥은행,SSC,S41000170F,수익률(%),DAILY,2013-01-31,
4,A000010,조흥은행,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-01-31,
5,A000010,조흥은행,SSC,S420002100,시가총액 (상장예정주식수 포함)(백만원),DAILY,2013-01-31,
6,A000010,조흥은행,SSC,S420002300,시가총액 (보통-상장예정주식수 포함)(백만원),DAILY,2013-01-31,
7,A000010,조흥은행,SSC,S420003800,상장주식수 (보통)(주),DAILY,2013-01-31,
8,A000010,조흥은행,COM,CP10000600,FnGuide Sector,,2013-01-31,금융
9,A000010,조흥은행,COM,CP10000800,FnGuide Industry Group,,2013-01-31,은행


In [8]:
# Distinct item names among the rows whose Frequency is 'DAILY'.
fn1_df.loc[fn1_df['Frequency'] == 'DAILY', 'Item Name '].unique()

array(['종가(원)', '수정계수', '수정주가(원)', '수익률(%)', '수익률 (1개월)(%)',
       '시가총액 (상장예정주식수 포함)(백만원)', '시가총액 (보통-상장예정주식수 포함)(백만원)',
       '상장주식수 (보통)(주)', '기말발행주식수 (보통)(주)'], dtype=object)

In [9]:
# Distinct values of the 'Kind' column.
fn1_df['Kind'].unique()

array(['SSC', 'COM', 'NFS-IFRS(C)'], dtype=object)

In [10]:
# Rows whose 'Kind' is missing. Original note: there are cases where every
# field except the date comes out as NaN.
fn1_df[ fn1_df['Kind'].isna() ]

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value


In [11]:
# univ_list = fn1_df['Symbol'].unique() # To be rebuilt later, after excluding tickers that did not exist during the period.

# The key is 'Item Name ' with a trailing space because the DataGuide export
# itself contains that space in the header.
items = fn1_df['Item Name '].unique()

In [12]:
# Display the distinct item names.
items

array(['종가(원)', '수정계수', '수정주가(원)', '수익률(%)', '수익률 (1개월)(%)',
       '시가총액 (상장예정주식수 포함)(백만원)', '시가총액 (보통-상장예정주식수 포함)(백만원)',
       '상장주식수 (보통)(주)', 'FnGuide Sector', 'FnGuide Industry Group',
       'FnGuide Industry', 'FnGuide Industry Group 27', '거래정지여부',
       '관리종목여부', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)', '자기주식(천원)',
       '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)', '영업이익(천원)',
       '총자산(천원)', '기말발행주식수 (보통)(주)'], dtype=object)

### mapping 생성

In [13]:
# Ticker -> company name lookup built from the distinct (Symbol, Symbol Name) pairs.
_symbol_name_pairs = fn1_df[['Symbol', 'Symbol Name']].drop_duplicates()
symbol_to_name = _symbol_name_pairs.set_index('Symbol')['Symbol Name'].to_dict()

In [14]:
# Reverse lookup: company name -> ticker. If two tickers share a name, the one
# that comes later in symbol_to_name wins.
name_to_symbol = {name: symbol for symbol, name in symbol_to_name.items()}

### 존재하지 않았던 기업 처리

DataGuide에서 상장폐지 종목을 포함하여 불러오면 주어진 기간에 존재하지 않았던 기업까지 함께 불러온다. (즉, 전체 기간의 모든 기업이 univ로 들어온다.)

주어진 기간 동안 존재하지 않았던 주식들의 value 값은 모두 NaN으로 채워져 있다.

In [15]:
# Shinhan Bank: delisted on 2001-08-30 together with Shinhan Securities when
# Shinhan Financial Group was launched, so it should not exist at all in our
# data period.
name_to_symbol['신한은행']

'A015580'

In [16]:
# Shinhan Financial Group: listed in September of that same year (2001).
name_to_symbol['신한지주']

'A055550'

In [17]:
def get_panel_df(df, item_name):
    """Build a date x symbol panel for one item of the long-format data.

    Args:
        df: Long-format frame with 'Item Name ' (trailing space included),
            'date', 'Symbol' and 'value' columns.
        item_name: Exact 'Item Name ' value to select.

    Returns:
        A new DataFrame indexed by 'date' in ascending index order
        (lexicographic when the dates are strings), with one column per
        'Symbol' holding 'value'. ``df`` itself is not modified. An item
        name with no matching rows yields an empty panel.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when a (date, Symbol) pair
            occurs more than once for the selected item.
    """
    item_rows = df.loc[df['Item Name '] == item_name]
    # pivot() already returns a new frame indexed by 'date', so no defensive
    # copy or re-indexing is needed around it.
    panel_df = item_rows.pivot(index='date', columns='Symbol', values='value')
    return panel_df.sort_index()

In [21]:
# Date x symbol panel of the '수익률(%)' (return, %) item; preview the first rows.
returns_df = get_panel_df(fn1_df, '수익률(%)')
returns_df.head()

Symbol,A000010,A000011,A000012,A000020,A000021,A000022,A000023,A000030,A000031,A000032,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,,,,-1.79,,,,,,,...,0.26,,,,,,,,,
2013-02-28,,,,-0.17,,,,,,,...,5.77,,,,,,,,,
2013-03-31,,,,0.99,,,,,,,...,0.49,,,,,,,,,
2013-04-30,,,,1.05,,,,,,,...,2.52,,,,,,,,,
2013-05-31,,,,-1.7,,,,,,,...,2.62,14.98,,,,,,,,


In [19]:
# The 1-month return looks like the right series to use. The '수익률(%)' item
# above is most likely a daily return that is merely sampled on the last day.
get_panel_df(fn1_df, '수익률 (1개월)(%)').head()

Symbol,A000010,A000011,A000012,A000020,A000021,A000022,A000023,A000030,A000031,A000032,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,,,,-0.17,,,,,,,...,4.44,,,,,,,,,
2013-02-28,,,,0.33,,,,,,,...,14.17,,,,,,,,,
2013-03-31,,,,1.33,,,,,,,...,-5.68,,,,,,,,,
2013-04-30,,,,10.47,,,,,,,...,50.06,,,,,,,,,
2013-05-31,,,,2.67,,,,,,,...,2.62,,,,,,,,,


In [23]:
# (number of dates, number of symbols) of the returns panel.
returns_df.shape

(141, 5751)

In [24]:
# Shape after dropping symbols that have no return on any date
# (returns a new frame; returns_df itself is unchanged here).
returns_df.dropna(axis=1, how='all').shape

# When extracting from DataGuide with "exclude non-business days" selected,
# data goes missing whenever the month-end date falls on a weekend/holiday.

(141, 3101)

In [25]:
# Inspect the date index. The printed dtype is 'object', i.e. the dates are
# strings rather than datetimes.
# NOTE(review): the last label shown, '2024-09-10', is not a month end -- it
# looks like a partial month; confirm whether that row should be kept.
returns_df.index

Index(['2013-01-31', '2013-02-28', '2013-03-31', '2013-04-30', '2013-05-31',
       '2013-06-30', '2013-07-31', '2013-08-31', '2013-09-30', '2013-10-31',
       ...
       '2023-12-31', '2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30',
       '2024-05-31', '2024-06-30', '2024-07-31', '2024-08-31', '2024-09-10'],
      dtype='object', name='date', length=141)

In [27]:
# Flag the symbols whose return is missing on every date.
nans = returns_df.isna().all(axis=0)
nan_tickers = nans.index[nans].tolist()

# Names of those symbols, i.e. tickers with no return data anywhere in the
# sample window (the original note calls them stocks that no longer exist).
[symbol_to_name[symbol] for symbol in nan_tickers]

['조흥은행',
 '조흥은행(1신)',
 '조흥은행(2신)',
 '동화약품공업(1신)',
 '동화약품(전환1신)',
 '동화약품(전환2신)',
 '한빛은행(1신)',
 '한국상업은행(2신)',
 '효성기계공업(1신)',
 '효성기계공업(2신)',
 '동양화재(1신)',
 '동양화재(2신)',
 '삼양사(1신)',
 '삼양사(2신)',
 '삼양사(3신)',
 '삼양사(1우1신)',
 '진로(1신)',
 '진로(2신)',
 '진로(1우2신)',
 '진로우',
 '진로(1우1신)',
 '두산상사',
 '두산상사(1신)',
 '유한양행(1신)',
 '유한양행(2신)',
 '유한양행(1우1신)',
 '유한양행2우B',
 '제일은행',
 '제일은행(1신)',
 '제일은행(2신)',
 '대한통운(1신)',
 '대한통운(2신)',
 '한일은행',
 '한일은행(1신)',
 '한일은행(2신)',
 '하이트맥주(1신)',
 '조선맥주 (우선신)',
 '하이트맥주2우B',
 '하이트맥주3우B',
 '두산(1신)',
 '오비맥주(1우1신)',
 '대한중석',
 '대한중석(1신)',
 '대한중석 (2신)',
 'LG금속',
 'LG금속(1신)',
 '성창기업(1신)',
 '대우중공업',
 '대우중공업(1신)',
 '대우중공업(2신)',
 '대우중공업(1우2신)',
 '대우중공우',
 '대우중공업(1우1신)',
 '대림산업(1신)',
 '대림산업(전환1신)',
 '대림산업(1우1신)',
 '대림산업(2우1신)',
 '유유산업(1신)',
 '일동제약(1신)',
 '일동제약(2신)',
 '한국타이어제조(1신)',
 '한국타이어제조(2신)',
 '한국타이어제조(3신)',
 '기아자동차(1신)',
 '기아자동차(2신)',
 '기아자동차(전환1신)',
 '동아건설',
 '동아건설산업(1신)',
 '동아건설산업 (2신)',
 '두레에어메탈(1신)',
 '두레에어메탈(1우2신)',
 '삼선공업1우선',
 '두레에어메탈(1우1신)',
 '해태제과',
 '해태제과(1신)',
 '해태제과(전환1신)',


In [28]:
# Keep only symbols with at least one return observation; the surviving
# column labels define the investable universe used below.
returns_df = returns_df.dropna(axis='columns', how='all')
univ_list = returns_df.columns

In [29]:
# Display the universe (column labels of returns_df).
univ_list

Index(['A000020', 'A000030', 'A000040', 'A000050', 'A000060', 'A000070',
       'A000075', 'A000080', 'A000087', 'A000100',
       ...
       'A950110', 'A950130', 'A950140', 'A950160', 'A950170', 'A950180',
       'A950190', 'A950200', 'A950210', 'A950220'],
      dtype='object', name='Symbol', length=3101)

In [30]:
def filter_univ(univ_list, panel_df, is_copy=True):
    """Restrict a panel to the columns listed in ``univ_list``.

    Args:
        univ_list: Column labels (symbols) to keep, in the desired order.
        panel_df: Date x symbol DataFrame to select from.
        is_copy: When true, return an explicit ``.copy()`` of the selection;
            otherwise return the selection as produced by indexing.

    Returns:
        DataFrame containing only the requested columns.

    Raises:
        KeyError: From pandas indexing when a requested label is not a
            column of ``panel_df``.
    """
    selected = panel_df[univ_list]
    return selected.copy() if is_copy else selected

## 데이터셋 생성

In [None]:
# Item names available for building panels.
items

array(['종가(원)', '수정계수', '수정주가(원)', '수익률(%)', '수익률 (1개월)(%)',
       '시가총액 (상장예정주식수 포함)(백만원)', '시가총액 (보통-상장예정주식수 포함)(백만원)',
       '상장주식수 (보통)(주)', 'FnGuide Sector', 'FnGuide Industry Group',
       'FnGuide Industry', 'FnGuide Industry Group 27', '거래정지여부',
       '관리종목여부', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)', '자기주식(천원)',
       '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)', '영업이익(천원)',
       '총자산(천원)', '기말발행주식수 (보통)(주)'], dtype=object)

### 그룹

In [31]:
# Classification panels (sector / industry group / industry), restricted to
# the universe. The original note labels these "WICS Groups".
# NOTE(review): the item names say "FnGuide ...", so this looks like FnGuide's
# own classification rather than WICS -- confirm.
sector_df, industry_group_df, industry_df = (
    filter_univ(univ_list, get_panel_df(fn1_df, group_item))
    for group_item in (
        'FnGuide Sector',
        'FnGuide Industry Group',
        'FnGuide Industry',
    )
)

In [32]:
# Preview the sector panel.
sector_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-02-28,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-03-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-04-30,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-05-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,의료,,,,,,,,


### 시장

In [40]:
def _market_panel(item):
    """Return the date x symbol panel for *item*, limited to ``univ_list``."""
    return filter_univ(univ_list, get_panel_df(fn1_df, item))


# Prices and adjustment factor.
close_df = _market_panel('종가(원)')
adjclose_df = _market_panel('수정주가(원)')
adjfactor_df = _market_panel('수정계수')
# Returns are quoted in percent, so they still have to be divided by 100.
monthly_returns_df = _market_panel('수익률 (1개월)(%)')

# Market capitalisation (million KRW per the item names) and share counts.
all_mkt_cap_df = _market_panel('시가총액 (상장예정주식수 포함)(백만원)')
common_mkt_cap_df = _market_panel('시가총액 (보통-상장예정주식수 포함)(백만원)')
# NOTE(review): this uses the period-end issued-shares item, not the
# '상장주식수 (보통)(주)' (listed shares) item that is also in the data -- confirm.
common_shares_outstanding_df = _market_panel('기말발행주식수 (보통)(주)')

# Status flags, still text labels at this point.
is_under_supervision_df = _market_panel('관리종목여부')
is_trading_halt_df = _market_panel('거래정지여부')

In [42]:
# Preview the adjusted-price panel.
adjclose_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,6030,,8727,7874,13300,67800,31950,32000,16600,25873,...,3885,,,,,,,,,
2013-02-28,6030,,8215,8220,11600,69400,31250,34100,17600,25168,...,4310,,,,,,,,,
2013-03-31,6110,,8162,8456,11950,70700,32750,33850,19100,26930,...,4065,,,,,,,,,
2013-04-30,6750,,8162,10650,11950,93600,34500,32250,22450,29609,...,6100,,,,,,,,,
2013-05-31,6930,,8550,12425,13200,88000,36000,31800,20350,26155,...,6260,11900.0,,,,,,,,


### 재무

In [41]:
def _fundamental_panel(item):
    """Return the date x symbol panel for *item*, limited to ``univ_list``."""
    return filter_univ(univ_list, get_panel_df(fn1_df, item))


# Equity components (thousand KRW per the item names).
common_stock_df = _fundamental_panel('보통주자본금(천원)')
capital_surplus_df = _fundamental_panel('자본잉여금(천원)')
retained_earnings_df = _fundamental_panel('이익잉여금(천원)')
treasury_stock_df = _fundamental_panel('자기주식(천원)')
deferred_tax_liabilities_df = _fundamental_panel('이연법인세부채(천원)')

# Income-statement items and total assets (thousand KRW per the item names).
sales_revenue_df = _fundamental_panel('매출액(천원)')
cost_of_goods_sold_df = _fundamental_panel('매출원가(천원)')
interest_expense_df = _fundamental_panel('이자비용(천원)')
operating_profit_df = _fundamental_panel('영업이익(천원)')
total_assets_df = _fundamental_panel('총자산(천원)')

In [44]:
# Preview the total-assets panel.
total_assets_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-02-28,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-03-31,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-04-30,316165541,340690382000,116931646,1296989142,11052627578,2595067569,,3553146711,,1513968939,...,86879175,59189994,,,,,,,,
2013-05-31,316165541,340690382000,116931646,1296989142,11052627578,2595067569,,3553146711,,1513968939,...,86879175,59189994,,,,,,,,


## 데이터셋 추가 전처리

In [80]:
# Panels that are expected to hold numbers. The classification panels and the
# two status-flag panels are deliberately absent: they hold text labels.
numeric_data = [
    close_df, adjclose_df, adjfactor_df, monthly_returns_df, all_mkt_cap_df, common_mkt_cap_df, common_shares_outstanding_df,
    common_stock_df, capital_surplus_df, retained_earnings_df, treasury_stock_df, deferred_tax_liabilities_df,
    sales_revenue_df, cost_of_goods_sold_df, interest_expense_df, operating_profit_df, total_assets_df
]

In [88]:
# Convert every object-dtype column of each numeric panel to numbers, in place
# (the frames in numeric_data are the same objects as the named panels).
# errors='coerce' turns any value that cannot be parsed into NaN without
# raising, so unparsable text is silently lost.
for df in numeric_data:
    obj_cols = df.select_dtypes('object').columns
    df[obj_cols] = df[obj_cols].apply(pd.to_numeric, errors='coerce')


In [93]:
# Percent -> decimal returns. This rebinds the name to a new frame, so the
# entry inside numeric_data still holds the percent values, and re-running
# this cell divides by 100 again.
monthly_returns_df = monthly_returns_df.div(100)

In [100]:
# Text label -> numeric flag for the '관리종목여부' panel.
# NOTE(review): '정상' (normal) maps to 1 and '관리' (administrative issue) maps
# to 0, so a 1 in is_under_supervision_df means NOT under supervision -- the
# opposite of what the variable name suggests. Confirm how downstream code
# reads this flag.
is_under_supervision_mapping = {
    '정상': 1,
    '관리': 0,
}

In [98]:
# Text label -> numeric flag for the '거래정지여부' panel.
# NOTE(review): '정상' (normal) maps to 1 and '정지' (halted) maps to 0, so a 1
# in is_trading_halt_df means trading is NOT halted -- the opposite of what
# the variable name suggests. Confirm how downstream code reads this flag.
is_trading_halt_mapping = {
    '정상': 1,
    '정지': 0,
}

In [107]:
# Swap the text labels for the numeric flags defined in the mapping; cells
# whose value is not a key of the mapping (including NaN) are left as they are.
is_under_supervision_df = is_under_supervision_df.replace(is_under_supervision_mapping)

In [109]:
# Swap the text labels for the numeric flags defined in the mapping; cells
# whose value is not a key of the mapping (including NaN) are left as they are.
is_trading_halt_df = is_trading_halt_df.replace(is_trading_halt_mapping)