# 고금계 과제 1 데이터 정리 

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [30]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

## 데이터 로드

In [31]:
fn1 = DATA_DIR / '고금계과제1_v3.0_201301-202408.csv'

In [32]:
## 추출한 fnguide 데이터를 dataframe으로 전처리

def preprocess_dataguide_csv(
        fn_file_path, 
        cols=['Symbol', 'Symbol Name', 'Kind', 'Item', 'Item Name ', 'Frequency',], # 날짜가 아닌 컬럼들
        skiprows=8, 
        encoding="cp949",
        ):
    fn_df = pd.read_csv(fn_file_path, encoding=encoding, skiprows=skiprows, thousands=",")
    fn_df = fn_df.melt(id_vars=cols, var_name="date", value_name="value")

    return fn_df

In [33]:
fn1_df = preprocess_dataguide_csv(fn1, encoding='utf-8')

In [59]:
fn1_df

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value
0,A000010,조흥은행,SSC,S41000060F,종가(원),DAILY,2013-01-31,
1,A000010,조흥은행,SSC,S410001600,수정계수,DAILY,2013-01-31,
2,A000010,조흥은행,SSC,S410000700,수정주가(원),DAILY,2013-01-31,
3,A000010,조흥은행,SSC,S41000170F,수익률(%),DAILY,2013-01-31,
4,A000010,조흥은행,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-01-31,
...,...,...,...,...,...,...,...,...
20272270,A950220,네오이뮨텍,NFS-IFRS(C),6000905001,매출원가(천원),ANNUAL,2024-09-10,
20272271,A950220,네오이뮨텍,NFS-IFRS(C),6000909054,이자비용(천원),ANNUAL,2024-09-10,
20272272,A950220,네오이뮨텍,NFS-IFRS(C),6000906001,영업이익(천원),ANNUAL,2024-09-10,
20272273,A950220,네오이뮨텍,NFS-IFRS(C),6000901001,총자산(천원),ANNUAL,2024-09-10,


In [34]:
items = fn1_df['Item Name '].unique() # 원래부터 DataGuide에 띄어쓰기가 들어가 있어서 'Item Name '

In [35]:
# Mappings

symbol_to_name = fn1_df[['Symbol', 'Symbol Name']].drop_duplicates().set_index('Symbol').to_dict()['Symbol Name']
name_to_symbol = {v:k for k, v in symbol_to_name.items()}

In [36]:
def get_panel_df(molten_df, item_name):
    panel_df = molten_df.loc[molten_df['Item Name '] == item_name]
    panel_df = panel_df.pivot(index='date', columns='Symbol', values='value')
    panel_df = panel_df.reset_index()
    
    panel_df = panel_df.set_index('date', inplace=False)
    panel_df.sort_index(inplace=True)
    
    return panel_df 

## 전처리 (1차)

### 기간 내 존재하지 않는 기업 제외

In [37]:
adj_close_temp = get_panel_df(fn1_df, '수정주가(원)')

In [38]:
adj_close_temp.shape

(141, 5751)

In [39]:
adj_close_temp.dropna(axis=1, how='all', inplace=True)

In [40]:
adj_close_temp.shape

(141, 3101)

In [41]:
# 분석 기간 내 존재했던 종목들
univ_list = adj_close_temp.columns

### 기타 조건별 제외

In [42]:
def filter_univ(univ_list, panel_df, is_copy=True):
    if is_copy:
        return panel_df[univ_list].copy()
    else:
        return panel_df[univ_list]

#### 금융주 제외

In [43]:
sector_df = filter_univ(univ_list, get_panel_df(fn1_df, 'FnGuide Sector') )
sector_df.head()

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-02-28,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-03-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-04-30,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,,,,,,,,,
2013-05-31,의료,금융,경기소비재,경기소비재,금융,산업재,산업재,필수소비재,필수소비재,의료,...,IT,의료,,,,,,,,


In [44]:
# 섹터는 고정되어있지 않고 중간에 바뀌기도 함. 
sector_df.nunique()[sector_df.nunique() != 1].sort_values(ascending=False)

Symbol
A141020    4
A056730    4
A066980    4
A038530    4
A038340    4
          ..
A060570    2
A060540    2
A060300    2
A060280    2
A085910    2
Length: 472, dtype: int64

In [45]:
univ_df = ~sector_df.isnull() & (sector_df != '금융')

#### 관리종목, 거래정지 제외

In [46]:
is_under_supervision_df = filter_univ(univ_list, get_panel_df(fn1_df, '관리종목여부') )
is_trading_halt_df = filter_univ(univ_list, get_panel_df(fn1_df, '거래정지여부') )

In [47]:
is_under_supervision_mapping = {
    '정상': True,
    '관리': False,
}
is_trading_halt_mapping = {
    '정상': True,
    '정지': False,
}

In [48]:
is_under_supervision_df = is_under_supervision_df.replace(is_under_supervision_mapping).infer_objects(copy=False)
is_trading_halt_df = is_trading_halt_df.replace(is_trading_halt_mapping).infer_objects(copy=False)

  is_under_supervision_df = is_under_supervision_df.replace(is_under_supervision_mapping).infer_objects(copy=False)
  is_trading_halt_df = is_trading_halt_df.replace(is_trading_halt_mapping).infer_objects(copy=False)


In [49]:
univ_df = univ_df & is_under_supervision_df & is_trading_halt_df

In [50]:
# Update univ_list
univ_list = univ_df.columns

## 데이터셋 생성

### 시장

In [51]:
close_df = filter_univ(univ_list, get_panel_df(fn1_df, '종가(원)') ) 
adjclose_df = filter_univ(univ_list, get_panel_df(fn1_df, '수정주가(원)') )
adjfactor_df = filter_univ(univ_list, get_panel_df(fn1_df, '수정계수') )
monthly_returns_df = filter_univ(univ_list, get_panel_df(fn1_df, '수익률 (1개월)(%)') ) # 수익률은 %로 되어있어 뒤에서 /100 해줘야 함.

all_mkt_cap_df = filter_univ(univ_list, get_panel_df(fn1_df, '시가총액 (상장예정주식수 포함)(백만원)') )
common_mkt_cap_df = filter_univ(univ_list, get_panel_df(fn1_df, '시가총액 (보통-상장예정주식수 포함)(백만원)') )
common_shares_outstanding_df = filter_univ(univ_list, get_panel_df(fn1_df, '기말발행주식수 (보통)(주)') )

is_under_supervision_df = filter_univ(univ_list, get_panel_df(fn1_df, '관리종목여부') )
is_trading_halt_df = filter_univ(univ_list, get_panel_df(fn1_df, '거래정지여부') )

#### 재무

In [52]:
common_stock_df = filter_univ(univ_list, get_panel_df(fn1_df, '보통주자본금(천원)') )
capital_surplus_df = filter_univ(univ_list, get_panel_df(fn1_df, '자본잉여금(천원)') )
retained_earnings_df = filter_univ(univ_list, get_panel_df(fn1_df, '이익잉여금(천원)') )
treasury_stock_df = filter_univ(univ_list, get_panel_df(fn1_df, '자기주식(천원)') )
deferred_tax_liabilities_df = filter_univ(univ_list, get_panel_df(fn1_df, '이연법인세부채(천원)') )
sales_revenue_df = filter_univ(univ_list, get_panel_df(fn1_df, '매출액(천원)') )
cost_of_goods_sold_df = filter_univ(univ_list, get_panel_df(fn1_df, '매출원가(천원)') )
interest_expense_df = filter_univ(univ_list, get_panel_df(fn1_df, '이자비용(천원)') )
operating_profit_df = filter_univ(univ_list, get_panel_df(fn1_df, '영업이익(천원)') )
total_assets_df = filter_univ(univ_list, get_panel_df(fn1_df, '총자산(천원)') )

In [53]:
total_assets_df

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-02-28,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-03-31,316165541,340690382000,116931646,1296989142,10172764138,2595067569,,3553146711,,1513968939,...,126398485,59189994,,,,,,,,
2013-04-30,316165541,340690382000,116931646,1296989142,11052627578,2595067569,,3553146711,,1513968939,...,86879175,59189994,,,,,,,,
2013-05-31,316165541,340690382000,116931646,1296989142,11052627578,2595067569,,3553146711,,1513968939,...,86879175,59189994,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,,,,,,,,,,,...,,,,,,,,,,
2024-06-30,,,,,,,,,,,...,,,,,,,,,,
2024-07-31,,,,,,,,,,,...,,,,,,,,,,
2024-08-31,,,,,,,,,,,...,,,,,,,,,,


## 전처리 (2차)

### 형변환

In [54]:
numeric_data = [
    close_df, adjclose_df, adjfactor_df, monthly_returns_df, all_mkt_cap_df, common_mkt_cap_df, common_shares_outstanding_df,
    common_stock_df, capital_surplus_df, retained_earnings_df, treasury_stock_df, deferred_tax_liabilities_df,
    sales_revenue_df, cost_of_goods_sold_df, interest_expense_df, operating_profit_df, total_assets_df
]

In [55]:
for df in numeric_data:
    obj_cols = df.select_dtypes('object').columns
    df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False) 
    df[obj_cols] = df[obj_cols].apply(pd.to_numeric, errors='coerce')


  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
  df[obj_cols] = df[obj_cols].replace(',', '', regex=True).infer_

### 단위 통일

In [56]:
monthly_returns_df = monthly_returns_df / 100 # 수익률은 %로 되어있어 /100

In [57]:
all_mkt_cap_df = all_mkt_cap_df * 100 # 시가총액은 100만원 단위라 *100하여 천원 단위로 맞춰줌
common_mkt_cap_df = common_mkt_cap_df * 100

In [58]:
all_mkt_cap_df

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000075,A000080,A000087,A000100,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,16842700,,5907400,21558400,128611000.0,52950500,,224204700,,206940200,...,8300500,,,,,,,,,
2013-02-28,16842700,,5561300,22505500,112172000.0,54167800,,238908000,,201293100,...,9208500,,,,,,,,,
2013-03-31,17066100,,5525500,23153400,115556500.0,58761000,,237340700,,215262100,...,8685000,,,,,,,,,
2013-04-30,18853700,,5525500,29159900,115556500.0,77524600,,226602800,,236459100,...,13032900,,,,,,,,,
2013-05-31,19356500,,5788100,34019900,133754000.0,72994700,,223238900,,209222700,...,13374700,30940000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,23127300,,4191300,21027500,,61380500,,142770000,,559030300,...,7937300,24910600.0,46689500.0,105916500.0,27166800.0,,14204600.0,9146700.0,47776400.0,17499500.0
2024-06-30,22512800,,3932700,21329100,,60606700,,150898600,,656980700,...,7227800,24146900.0,45497400.0,109895800.0,28098300.0,,13579900.0,7559800.0,51983200.0,15818800.0
2024-07-31,23294800,,3547800,19464800,,61600900,,147704200,,765900700,...,6891100,24837900.0,39715900.0,137533900.0,29133200.0,,14055200.0,8781300.0,83113000.0,14484100.0
2024-08-31,22540700,,3577900,18560100,,63925500,,147014200,,1148425700,...,5580200,25310600.0,24338100.0,123967600.0,23932700.0,,14516900.0,8637000.0,84855800.0,14978400.0
