# 데이터 정리 v2

- long format 데이터를 기본으로 하고, panel 형태로도 불러올 수 있도록 처리한다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [2]:
# Absolute path of the directory the notebook is run from, and the `data`
# subdirectory underneath it.
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

## 데이터 로드

In [3]:
# CSV file that is passed to preprocess_dataguide_csv below.
fn1 = DATA_DIR / '고금계과제1_v3.3_201301-202408.csv'

In [127]:
## Preprocess the extracted FnGuide data into a DataFrame

def preprocess_dataguide_csv(
        fn_file_path,
        cols=['Symbol', 'Symbol Name', 'Kind', 'Item', 'Item Name ', 'Frequency',],  # the non-date columns
        skiprows=8,
        encoding="cp949",
        ):
    """Read a DataGuide CSV export and reshape it from wide to long format.

    Parameters
    ----------
    fn_file_path : path-like
        Location of the CSV file; forwarded to ``pd.read_csv``.
    cols : list of str
        Identifier (non-date) columns that are kept as-is. Every other
        column is unpivoted. Note that ``'Item Name '`` carries a trailing
        space.
    skiprows : int
        Number of leading lines skipped before the header row.
    encoding : str
        Text encoding of the file.

    Returns
    -------
    pandas.DataFrame
        The columns in ``cols`` followed by ``date`` (the former column
        header) and ``value`` (the former cell content).
    """
    wide_df = pd.read_csv(
        fn_file_path,
        skiprows=skiprows,
        encoding=encoding,
        thousands=",",  # parse "1,234" style numbers where a column allows it
    )
    long_df = wide_df.melt(
        id_vars=cols,
        var_name="date",
        value_name="value",
    )
    return long_df

In [128]:
# Load the export in long format. This file is read as UTF-8, overriding the
# function's cp949 default.
fn1_df = preprocess_dataguide_csv(fn1, encoding='utf-8')

In [129]:
# Distinct items contained in the export (the column name has a trailing space).
fn1_df['Item Name '].unique()

array(['종가(원)', '수정계수', '수정주가(원)', '수익률 (1개월)(%)', 'FnGuide Sector',
       '거래정지여부', '관리종목여부', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)',
       '자기주식(천원)', '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)',
       '영업이익(천원)', '총자산(천원)', '기말발행주식수 (보통)(주)'], dtype=object)

In [146]:
# Lookup tables between ticker symbols and company names, built from the
# distinct (Symbol, Symbol Name) pairs in the long data.
symbol_name_pairs = fn1_df[['Symbol', 'Symbol Name']].drop_duplicates()
symbol_to_name = symbol_name_pairs.set_index('Symbol')['Symbol Name'].to_dict()
# Reverse mapping; if several symbols share a name, the last one wins.
name_to_symbol = dict(zip(symbol_to_name.values(), symbol_to_name.keys()))

In [130]:
# FnGuide Sector holds string values, and pivot_table does not work for it,
# so this item is reshaped with pivot instead.
# (Author's note: fetching the FnGuide Sector Code would probably be much more useful.)

sector_rows = fn1_df[fn1_df['Item Name '].eq('FnGuide Sector')]
sectors = sector_rows.pivot(
    values='value',
    columns='Item Name ',
    index=['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency'],
).reset_index()


In [131]:
# Display the sector table (one row per date and symbol).
sectors

Item Name,date,Symbol,Symbol Name,Kind,Frequency,FnGuide Sector
0,2013-01-31,A000010,조흥은행,COM,,금융
1,2013-01-31,A000020,동화약품,COM,,의료
2,2013-01-31,A000030,우리은행,COM,,금융
3,2013-01-31,A000040,KR모터스,COM,,경기소비재
4,2013-01-31,A000050,경방,COM,,경기소비재
...,...,...,...,...,...,...
527194,2024-09-19,A950180,SNK,COM,,IT
527195,2024-09-19,A950190,고스트스튜디오,COM,,IT
527196,2024-09-19,A950200,소마젠,COM,,의료
527197,2024-09-19,A950210,프레스티지바이오파마,COM,,의료


In [150]:
# Rows whose sector is '금융' (financials).
sectors.loc[sectors['FnGuide Sector'].eq('금융')]

Item Name,date,Symbol,Symbol Name,Kind,Frequency,FnGuide Sector
0,2013-01-31,A000010,조흥은행,COM,,금융
2,2013-01-31,A000030,우리은행,COM,,금융
5,2013-01-31,A000060,메리츠화재,COM,,금융
10,2013-01-31,A000110,제일은행,COM,,금융
31,2013-01-31,A000370,한화손해보험,COM,,금융
...,...,...,...,...,...,...
527155,2024-09-19,A479880,한국제15호스팩,COM,,금융
527156,2024-09-19,A481890,엔에이치스팩31호,COM,,금융
527157,2024-09-19,A482520,교보16호스팩,COM,,금융
527158,2024-09-19,A482680,미래에셋비전스팩7호,COM,,금융


In [132]:
# Number of symbols with a non-null sector on each date.
sectors.groupby('date')['FnGuide Sector'].count()


date
2013-01-31    2309
2013-02-28    2313
2013-03-31    2315
2013-04-30    2318
2013-05-31    2321
              ... 
2024-05-31    3476
2024-06-30    3489
2024-07-31    3499
2024-08-31    3511
2024-09-19    3513
Name: FnGuide Sector, Length: 141, dtype: int64

In [133]:
# Total number of rows per date, counting rows whose sector is missing as well
# (unlike count(), which only counts non-null values).
sectors.groupby('date').size()

date
2013-01-31    3739
2013-02-28    3739
2013-03-31    3739
2013-04-30    3739
2013-05-31    3739
              ... 
2024-05-31    3739
2024-06-30    3739
2024-07-31    3739
2024-08-31    3739
2024-09-19    3739
Length: 141, dtype: int64

In [134]:
# Spread the long data so that every item becomes its own column, keyed by
# (date, Symbol, Symbol Name, Kind, Frequency). When a key has several values
# for one item, the first one is kept.
item_index_cols = ['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency']
new_df = fn1_df.pivot_table(
    values='value',
    index=item_index_cols,
    columns='Item Name ',
    aggfunc='first',
    dropna=True,  # setting this to False causes a memory error
)

In [135]:
# Move the pivot index levels (date, Symbol, ...) back into ordinary columns.
new_df = new_df.reset_index()
# The pivot leaves the label 'Item Name ' on the column axis, not on the row
# index (reset_index already yields an unnamed default index), so the label
# has to be cleared on `columns`.
new_df.columns.name = None

In [136]:
# Column list after the index levels were moved back into columns.
new_df.columns

Index(['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency', '기말발행주식수 (보통)(주)',
       '매출액(천원)', '매출원가(천원)', '보통주자본금(천원)', '수익률 (1개월)(%)', '수정계수', '수정주가(원)',
       '영업이익(천원)', '이연법인세부채(천원)', '이익잉여금(천원)', '이자비용(천원)', '자기주식(천원)',
       '자본잉여금(천원)', '종가(원)', '총자산(천원)'],
      dtype='object', name='Item Name ')

In [137]:
# Convert every column that is fully numeric once thousands separators are
# removed; leave every other column exactly as it is.
for col in new_df.columns:
    try:
        # Work on a temporary Series so that text columns (e.g. 'Symbol Name')
        # keep their commas when the numeric conversion fails.
        as_numeric = pd.to_numeric(new_df[col].replace(',', '', regex=True))
    except (ValueError, TypeError):
        # Not a numeric column (dates, symbols, names, ...): keep it untouched.
        continue
    new_df[col] = as_numeric

In [138]:
# Check dtypes and non-null counts after the numeric conversion.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639194 entries, 0 to 639193
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             639194 non-null  object 
 1   Symbol           639194 non-null  object 
 2   Symbol Name      639194 non-null  object 
 3   Kind             639194 non-null  object 
 4   Frequency        639194 non-null  object 
 5   기말발행주식수 (보통)(주)  298338 non-null  float64
 6   매출액(천원)          340856 non-null  float64
 7   매출원가(천원)         335939 non-null  float64
 8   보통주자본금(천원)       340784 non-null  float64
 9   수익률 (1개월)(%)     297080 non-null  float64
 10  수정계수             298265 non-null  float64
 11  수정주가(원)          298338 non-null  float64
 12  영업이익(천원)         340856 non-null  float64
 13  이연법인세부채(천원)      340784 non-null  float64
 14  이익잉여금(천원)        340784 non-null  float64
 15  이자비용(천원)         340856 non-null  float64
 16  자기주식(천원)         340784 non-null  floa

In [139]:
# Display the pivoted frame. The same (date, Symbol) pair can appear on more
# than one row because Kind and Frequency are part of the key.
new_df

Item Name,date,Symbol,Symbol Name,Kind,Frequency,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),수정계수,수정주가(원),영업이익(천원),이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원)
0,2013-01-31,A000010,조흥은행,NFS-IFRS(C),ANNUAL,,9.087373e+09,4.736825e+09,7.928078e+09,,,,1.719972e+09,10473000.0,9.806344e+09,4.736825e+09,0.0,403164000.0,,2.380457e+11
1,2013-01-31,A000020,동화약품,NFS-IFRS(C),ANNUAL,,2.202406e+08,1.165344e+08,2.793147e+07,,,,2.062220e+06,0.0,1.701473e+08,3.843350e+05,0.0,26919758.0,,3.161655e+08
2,2013-01-31,A000020,동화약품,SSC,DAILY,27931470.0,,,,-0.17,1.0,6030.0,,,,,,,6030.0,
3,2013-01-31,A000030,우리은행,NFS-IFRS(C),ANNUAL,,9.493383e+09,5.001361e+09,4.030077e+09,,,,2.395670e+08,49105000.0,1.311269e+10,6.622744e+09,-14000.0,176502000.0,,3.406904e+11
4,2013-01-31,A000040,KR모터스,NFS-IFRS(C),ANNUAL,,9.955327e+07,9.199679e+07,5.967069e+07,,,,-3.486600e+06,7181340.0,1.298918e+06,8.133330e+05,-5044.0,3884892.0,,1.169316e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639189,2024-09-19,A481890,엔에이치스팩31호,SSC,DAILY,6345000.0,,,,-0.98,1.0,2015.0,,,,,,,2015.0,
639190,2024-09-19,A487570,HS효성,SSC,DAILY,3725927.0,,,,-12.80,1.0,41200.0,,,,,,,41200.0,
639191,2024-09-19,A478780,대신밸런스제18호스팩,SSC,DAILY,7910000.0,,,,,1.0,2040.0,,,,,,,2040.0,
639192,2024-09-19,A482520,교보16호스팩,SSC,DAILY,6100000.0,,,,-0.49,1.0,2040.0,,,,,,,2040.0,


In [140]:
# Number of non-null 1-month returns on each date.
new_df.groupby('date')['수익률 (1개월)(%)'].count()

date
2013-01-31    1716
2013-02-28    1713
2013-03-31    1716
2013-04-30    1711
2013-05-31    1707
              ... 
2024-05-31    2533
2024-06-30    2537
2024-07-31    2545
2024-08-31    2554
2024-09-19    2558
Name: 수익률 (1개월)(%), Length: 141, dtype: int64

In [148]:
# Keep only the symbols that have at least one non-null closing price anywhere
# in the sample; all rows of those symbols are retained.
symbols_with_close = new_df.loc[new_df['종가(원)'].notnull(), 'Symbol'].unique()
existing = new_df[new_df['Symbol'].isin(symbols_with_close)]

# Universe of symbols, in order of first appearance in `existing`.
univ_list = existing['Symbol'].unique()

In [149]:
# Number of symbols that have at least one closing price.
len(univ_list)

2927

In [151]:
# Panel view of closing prices: one row per date, one column per symbol.
new_df.pivot_table(
    values='종가(원)',
    index='date',
    columns='Symbol',
    aggfunc='mean',  # the pivot_table default, spelled out
)

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000100,A000120,A000140,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,6030.0,,495.0,86500.0,13300.0,67800.0,32000.0,183500.0,120000.0,16200.0,...,3885.0,,,,,,,,,
2013-02-28,6030.0,,466.0,90300.0,11600.0,69400.0,34100.0,178500.0,115000.0,16200.0,...,4310.0,,,,,,,,,
2013-03-31,6110.0,,463.0,92900.0,11950.0,70700.0,33850.0,191000.0,98300.0,16900.0,...,4065.0,,,,,,,,,
2013-04-30,6750.0,,463.0,117000.0,11950.0,93600.0,32250.0,210000.0,106000.0,14700.0,...,6100.0,,,,,,,,,
2013-05-31,6930.0,,485.0,136500.0,13200.0,88000.0,31800.0,185500.0,103000.0,14350.0,...,6260.0,11900.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,8280.0,,697.0,7670.0,,69700.0,20100.0,68800.0,103700.0,9020.0,...,3300.0,6850.0,23500.0,14260.0,5250.0,,10460.0,4755.0,7950.0,1770.0
2024-06-30,8060.0,,654.0,7780.0,,68800.0,21250.0,80900.0,95100.0,9150.0,...,3005.0,6640.0,22900.0,13990.0,5430.0,,10000.0,3930.0,8650.0,1600.0
2024-07-31,8340.0,,590.0,7100.0,,70000.0,20800.0,94400.0,98500.0,9140.0,...,2865.0,6830.0,19990.0,17330.0,5630.0,,10350.0,4565.0,13830.0,1465.0
2024-08-31,8070.0,,595.0,6770.0,,72700.0,20700.0,141000.0,94300.0,9280.0,...,2320.0,6960.0,12250.0,15600.0,4625.0,,10690.0,4490.0,14120.0,1515.0


## 모듈화한 것 테스트

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from pathlib import Path

In [2]:
# Same path setup as at the top of the notebook.
# NOTE(review): execution counts restart at 1 in this section, so it appears to
# have been run in a fresh kernel — confirm.
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

In [3]:
# Same CSV file as in the first half of the notebook.
fn1 = DATA_DIR / '고금계과제1_v3.3_201301-202408.csv'

In [4]:
from fndata import FnData

In [5]:
# Build the FnData object (imported from the local `fndata` module) from the CSV path.
fnd = FnData(fn1)

In [6]:
# Item requested from FnData in the cells below ('종가(원)': closing price in won).
item = '종가(원)'

In [7]:
# Called without arguments; in the recorded run this returned the item columns
# indexed by (date, Symbol).
fnd.get_data()

Unnamed: 0_level_0,Item Name,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),수정계수,수정주가(원),영업이익(천원),이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원)
date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-01-31,A000010,,9087373000,4736825000,7928078000,,,,1719972000,10473000,9806344000,4736825000,0,403164000,,238045694000
2013-01-31,A000020,,220240575,116534418,27931470,,,,2062220,0,170147330,384335,0,26919758,,316165541
2013-01-31,A000020,27931470,,,,-0.17,1.000000,6030,,,,,,,6030,
2013-01-31,A000030,,9493383000,5001361000,4030077000,,,,239567000,49105000,13112690000,6622744000,-14000,176502000,,340690382000
2013-01-31,A000040,,99553272,91996790,59670690,,,,-3486600,7181340,1298918,813333,-5044,3884892,,116931646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-19,A481890,6345000,,,,-0.98,1.000000,2015,,,,,,,2015,
2024-09-19,A487570,3725927,,,,-12.80,1.000000,41200,,,,,,,41200,
2024-09-19,A478780,7910000,,,,,1.000000,2040,,,,,,,2040,
2024-09-19,A482520,6100000,,,,-0.49,1.000000,2040,,,,,,,2040,


In [8]:
# NOTE(review): passing the item as a plain string raised
# "TypeError: agg function failed [how->mean,dtype->object]" in the recorded run.
# The cause is inside FnData.get_data, which is not visible here — TODO fix in fndata.
fnd.get_data(item)

TypeError: agg function failed [how->mean,dtype->object]

In [9]:
# NOTE(review): passing the item inside a list does not raise, but the recorded
# output appears to contain no rows — TODO confirm against FnData.get_data.
fnd.get_data([item])

Unnamed: 0_level_0,Item Name,종가(원)
date,Symbol,Unnamed: 2_level_1


In [10]:
# Inspect the long-format frame held by the FnData object.
fnd.long_format_df

Item Name,date,Symbol,Symbol Name,Kind,Frequency,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),...,이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원),FnGuide Sector,관리종목여부,거래정지여부
