# 데이터 정리 v2

- long data를 기본으로, panel로도 불러올 수 있도록 처리 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

In [2]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

## 데이터 로드

In [3]:
fn1 = DATA_DIR / '고금계과제1_v3.3_201301-202408.csv'

In [127]:
## 추출한 fnguide 데이터를 dataframe으로 전처리

def preprocess_dataguide_csv(
        fn_file_path, 
        cols=['Symbol', 'Symbol Name', 'Kind', 'Item', 'Item Name ', 'Frequency',], # 날짜가 아닌 컬럼들
        skiprows=8, 
        encoding="cp949",
        ):
    fn_df = pd.read_csv(fn_file_path, encoding=encoding, skiprows=skiprows, thousands=",")
    fn_df = fn_df.melt(id_vars=cols, var_name="date", value_name="value")

    return fn_df

In [128]:
fn1_df = preprocess_dataguide_csv(fn1, encoding='utf-8')

In [129]:
fn1_df['Item Name '].unique()

array(['종가(원)', '수정계수', '수정주가(원)', '수익률 (1개월)(%)', 'FnGuide Sector',
       '거래정지여부', '관리종목여부', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)',
       '자기주식(천원)', '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)',
       '영업이익(천원)', '총자산(천원)', '기말발행주식수 (보통)(주)'], dtype=object)

In [146]:
symbol_to_name = fn1_df[['Symbol', 'Symbol Name']].drop_duplicates().set_index('Symbol').to_dict()['Symbol Name']
name_to_symbol = {v:k for k, v in symbol_to_name.items()}

In [130]:
# string value를 가진 FnGuide Sector의 경우 pivot_table이 안됨. 
# 이래서 차라리 FnGuide Sector Code 로 가져오는 것이 훨씬 유용한듯. 

sectors = fn1_df[ fn1_df['Item Name '] == 'FnGuide Sector' ].pivot(
    index=['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency',],
    columns='Item Name ',
    values='value',
).reset_index()


In [131]:
sectors

Item Name,date,Symbol,Symbol Name,Kind,Frequency,FnGuide Sector
0,2013-01-31,A000010,조흥은행,COM,,금융
1,2013-01-31,A000020,동화약품,COM,,의료
2,2013-01-31,A000030,우리은행,COM,,금융
3,2013-01-31,A000040,KR모터스,COM,,경기소비재
4,2013-01-31,A000050,경방,COM,,경기소비재
...,...,...,...,...,...,...
527194,2024-09-19,A950180,SNK,COM,,IT
527195,2024-09-19,A950190,고스트스튜디오,COM,,IT
527196,2024-09-19,A950200,소마젠,COM,,의료
527197,2024-09-19,A950210,프레스티지바이오파마,COM,,의료


In [150]:
sectors[ sectors['FnGuide Sector'] == '금융']

Item Name,date,Symbol,Symbol Name,Kind,Frequency,FnGuide Sector
0,2013-01-31,A000010,조흥은행,COM,,금융
2,2013-01-31,A000030,우리은행,COM,,금융
5,2013-01-31,A000060,메리츠화재,COM,,금융
10,2013-01-31,A000110,제일은행,COM,,금융
31,2013-01-31,A000370,한화손해보험,COM,,금융
...,...,...,...,...,...,...
527155,2024-09-19,A479880,한국제15호스팩,COM,,금융
527156,2024-09-19,A481890,엔에이치스팩31호,COM,,금융
527157,2024-09-19,A482520,교보16호스팩,COM,,금융
527158,2024-09-19,A482680,미래에셋비전스팩7호,COM,,금융


In [132]:
sectors.groupby('date').count()['FnGuide Sector']


date
2013-01-31    2309
2013-02-28    2313
2013-03-31    2315
2013-04-30    2318
2013-05-31    2321
              ... 
2024-05-31    3476
2024-06-30    3489
2024-07-31    3499
2024-08-31    3511
2024-09-19    3513
Name: FnGuide Sector, Length: 141, dtype: int64

In [133]:
sectors.groupby('date').size()

date
2013-01-31    3739
2013-02-28    3739
2013-03-31    3739
2013-04-30    3739
2013-05-31    3739
              ... 
2024-05-31    3739
2024-06-30    3739
2024-07-31    3739
2024-08-31    3739
2024-09-19    3739
Length: 141, dtype: int64

In [134]:
new_df = fn1_df.pivot_table(
    index=['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency',],
    columns='Item Name ',
    values='value',
    aggfunc='first',
    dropna=True, # False 로 하면 memory error 남. 
)

In [135]:
new_df.reset_index(inplace=True)
new_df.index.name = None

In [136]:
new_df.columns

Index(['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency', '기말발행주식수 (보통)(주)',
       '매출액(천원)', '매출원가(천원)', '보통주자본금(천원)', '수익률 (1개월)(%)', '수정계수', '수정주가(원)',
       '영업이익(천원)', '이연법인세부채(천원)', '이익잉여금(천원)', '이자비용(천원)', '자기주식(천원)',
       '자본잉여금(천원)', '종가(원)', '총자산(천원)'],
      dtype='object', name='Item Name ')

In [137]:
for col in new_df.columns:
    try:
        new_df[col] = new_df[col].replace(',', '', regex=True).infer_objects()
        new_df[col] = pd.to_numeric(new_df[col]) # Catch exception explicitly
    except:
        pass

In [138]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639194 entries, 0 to 639193
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             639194 non-null  object 
 1   Symbol           639194 non-null  object 
 2   Symbol Name      639194 non-null  object 
 3   Kind             639194 non-null  object 
 4   Frequency        639194 non-null  object 
 5   기말발행주식수 (보통)(주)  298338 non-null  float64
 6   매출액(천원)          340856 non-null  float64
 7   매출원가(천원)         335939 non-null  float64
 8   보통주자본금(천원)       340784 non-null  float64
 9   수익률 (1개월)(%)     297080 non-null  float64
 10  수정계수             298265 non-null  float64
 11  수정주가(원)          298338 non-null  float64
 12  영업이익(천원)         340856 non-null  float64
 13  이연법인세부채(천원)      340784 non-null  float64
 14  이익잉여금(천원)        340784 non-null  float64
 15  이자비용(천원)         340856 non-null  float64
 16  자기주식(천원)         340784 non-null  floa

In [139]:
new_df

Item Name,date,Symbol,Symbol Name,Kind,Frequency,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),수정계수,수정주가(원),영업이익(천원),이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원)
0,2013-01-31,A000010,조흥은행,NFS-IFRS(C),ANNUAL,,9.087373e+09,4.736825e+09,7.928078e+09,,,,1.719972e+09,10473000.0,9.806344e+09,4.736825e+09,0.0,403164000.0,,2.380457e+11
1,2013-01-31,A000020,동화약품,NFS-IFRS(C),ANNUAL,,2.202406e+08,1.165344e+08,2.793147e+07,,,,2.062220e+06,0.0,1.701473e+08,3.843350e+05,0.0,26919758.0,,3.161655e+08
2,2013-01-31,A000020,동화약품,SSC,DAILY,27931470.0,,,,-0.17,1.0,6030.0,,,,,,,6030.0,
3,2013-01-31,A000030,우리은행,NFS-IFRS(C),ANNUAL,,9.493383e+09,5.001361e+09,4.030077e+09,,,,2.395670e+08,49105000.0,1.311269e+10,6.622744e+09,-14000.0,176502000.0,,3.406904e+11
4,2013-01-31,A000040,KR모터스,NFS-IFRS(C),ANNUAL,,9.955327e+07,9.199679e+07,5.967069e+07,,,,-3.486600e+06,7181340.0,1.298918e+06,8.133330e+05,-5044.0,3884892.0,,1.169316e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639189,2024-09-19,A481890,엔에이치스팩31호,SSC,DAILY,6345000.0,,,,-0.98,1.0,2015.0,,,,,,,2015.0,
639190,2024-09-19,A487570,HS효성,SSC,DAILY,3725927.0,,,,-12.80,1.0,41200.0,,,,,,,41200.0,
639191,2024-09-19,A478780,대신밸런스제18호스팩,SSC,DAILY,7910000.0,,,,,1.0,2040.0,,,,,,,2040.0,
639192,2024-09-19,A482520,교보16호스팩,SSC,DAILY,6100000.0,,,,-0.49,1.0,2040.0,,,,,,,2040.0,


In [140]:
new_df.groupby('date')['수익률 (1개월)(%)'].count()

date
2013-01-31    1716
2013-02-28    1713
2013-03-31    1716
2013-04-30    1711
2013-05-31    1707
              ... 
2024-05-31    2533
2024-06-30    2537
2024-07-31    2545
2024-08-31    2554
2024-09-19    2558
Name: 수익률 (1개월)(%), Length: 141, dtype: int64

In [148]:
existing = new_df.groupby('Symbol').filter(
    lambda x: x['종가(원)'].notnull().any()
)

univ_list = existing['Symbol'].unique()

In [149]:
len(univ_list)

2927

In [151]:
new_df.pivot_table(
    index='date',
    columns='Symbol',
    values='종가(원)',
)

Symbol,A000020,A000030,A000040,A000050,A000060,A000070,A000080,A000100,A000120,A000140,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,6030.0,,495.0,86500.0,13300.0,67800.0,32000.0,183500.0,120000.0,16200.0,...,3885.0,,,,,,,,,
2013-02-28,6030.0,,466.0,90300.0,11600.0,69400.0,34100.0,178500.0,115000.0,16200.0,...,4310.0,,,,,,,,,
2013-03-31,6110.0,,463.0,92900.0,11950.0,70700.0,33850.0,191000.0,98300.0,16900.0,...,4065.0,,,,,,,,,
2013-04-30,6750.0,,463.0,117000.0,11950.0,93600.0,32250.0,210000.0,106000.0,14700.0,...,6100.0,,,,,,,,,
2013-05-31,6930.0,,485.0,136500.0,13200.0,88000.0,31800.0,185500.0,103000.0,14350.0,...,6260.0,11900.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,8280.0,,697.0,7670.0,,69700.0,20100.0,68800.0,103700.0,9020.0,...,3300.0,6850.0,23500.0,14260.0,5250.0,,10460.0,4755.0,7950.0,1770.0
2024-06-30,8060.0,,654.0,7780.0,,68800.0,21250.0,80900.0,95100.0,9150.0,...,3005.0,6640.0,22900.0,13990.0,5430.0,,10000.0,3930.0,8650.0,1600.0
2024-07-31,8340.0,,590.0,7100.0,,70000.0,20800.0,94400.0,98500.0,9140.0,...,2865.0,6830.0,19990.0,17330.0,5630.0,,10350.0,4565.0,13830.0,1465.0
2024-08-31,8070.0,,595.0,6770.0,,72700.0,20700.0,141000.0,94300.0,9280.0,...,2320.0,6960.0,12250.0,15600.0,4625.0,,10690.0,4490.0,14120.0,1515.0


모듈화한 것 테스트

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from pathlib import Path

In [19]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

In [20]:
fn1 = DATA_DIR / '고금계과제1_v3.3_201301-202408.csv'

In [21]:
from fndata import FnData

In [22]:
fnd = FnData(fn1)

In [23]:
item = '종가(원)'

In [24]:
fnd.get_data()

Unnamed: 0_level_0,Item Name,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),수정계수,수정주가(원),영업이익(천원),이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원),FnGuide Sector,관리종목여부,거래정지여부
date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2013-01-31,A000020,,220240575.0,116534418.0,27931470.0,,,,2062220.0,0.0,170147330.0,384335.0,0.0,26919758.0,,3.161655e+08,,,
2013-01-31,A000020,27931470.0,,,,-0.17,1.0,6030.0,,,,,,,6030.0,,,,
2013-01-31,A000040,,99553272.0,91996790.0,59670690.0,,,,-3486600.0,7181340.0,1298918.0,813333.0,-5044.0,3884892.0,,1.169316e+08,,,
2013-01-31,A000040,119341379.0,,,,0.20,1.0,8727.0,,,,,,,495.0,,,,
2013-01-31,A000050,,347189559.0,251468225.0,12461490.0,,,,36031739.0,90652345.0,612310562.0,13944625.0,0.0,13991400.0,,1.296989e+09,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-19,A460860,49608017.0,,,,-5.08,1.0,8220.0,,,,,,,8220.0,,,,
2024-09-19,A462520,11855168.0,,,,6.18,1.0,17870.0,,,,,,,17870.0,,,,
2024-09-19,A465770,7171032.0,,,,-18.76,1.0,9830.0,,,,,,,9830.0,,,,
2024-09-19,A472850,32343933.0,,,,0.57,1.0,5330.0,,,,,,,5330.0,,,,


In [25]:
fnd.get_data(item)

Symbol,A000020,A000040,A000050,A000070,A000080,A000100,A000120,A000140,A000150,A000180,...,A950110,A950130,A950140,A950160,A950170,A950180,A950190,A950200,A950210,A950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,6030.0,495.0,86500.0,67800.0,32000.0,183500.0,120000.0,16200.0,128500.0,16900.0,...,3885.0,,,,,,,,,
2013-02-28,6030.0,466.0,90300.0,69400.0,34100.0,178500.0,115000.0,16200.0,129000.0,18200.0,...,4310.0,,,,,,,,,
2013-03-31,6110.0,463.0,92900.0,70700.0,33850.0,191000.0,98300.0,16900.0,129500.0,21150.0,...,4065.0,,,,,,,,,
2013-04-30,6750.0,463.0,117000.0,93600.0,32250.0,210000.0,106000.0,14700.0,123000.0,23150.0,...,6100.0,,,,,,,,,
2013-05-31,6930.0,485.0,136500.0,88000.0,31800.0,185500.0,103000.0,14350.0,142000.0,24000.0,...,6260.0,11900.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-31,8280.0,697.0,7670.0,69700.0,20100.0,68800.0,103700.0,9020.0,206000.0,1906.0,...,3300.0,6850.0,23500.0,14260.0,5250.0,,10460.0,4755.0,7950.0,1770.0
2024-06-30,8060.0,654.0,7780.0,68800.0,21250.0,80900.0,95100.0,9150.0,217000.0,1791.0,...,3005.0,6640.0,22900.0,13990.0,5430.0,,10000.0,3930.0,8650.0,1600.0
2024-07-31,8340.0,590.0,7100.0,70000.0,20800.0,94400.0,98500.0,9140.0,175000.0,1676.0,...,2865.0,6830.0,19990.0,17330.0,5630.0,,10350.0,4565.0,13830.0,1465.0
2024-08-31,8070.0,595.0,6770.0,72700.0,20700.0,141000.0,94300.0,9280.0,149500.0,1694.0,...,2320.0,6960.0,12250.0,15600.0,4625.0,,10690.0,4490.0,14120.0,1515.0


In [29]:
fnd.get_data().info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 584354 entries, ('2013-01-31', 'A000020') to ('2024-09-19', 'A487570')
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   기말발행주식수 (보통)(주)  270484 non-null  float64
 1   매출액(천원)          313870 non-null  float64
 2   매출원가(천원)         313696 non-null  float64
 3   보통주자본금(천원)       313798 non-null  float64
 4   수익률 (1개월)(%)     269576 non-null  float64
 5   수정계수             270433 non-null  float64
 6   수정주가(원)          270484 non-null  float64
 7   영업이익(천원)         313870 non-null  float64
 8   이연법인세부채(천원)      313798 non-null  float64
 9   이익잉여금(천원)        313798 non-null  float64
 10  이자비용(천원)         313870 non-null  float64
 11  자기주식(천원)         313798 non-null  float64
 12  자본잉여금(천원)        313798 non-null  float64
 13  종가(원)            270484 non-null  float64
 14  총자산(천원)          313798 non-null  float64
 15  FnGuide Sector   0 non-null       object 


In [26]:
multi_items = ['종가(원)', '수익률 (1개월)(%)']

In [27]:
fnd.get_data(multi_items)

Unnamed: 0_level_0,Item Name,종가(원),수익률 (1개월)(%)
date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-31,A000020,,
2013-01-31,A000020,6030.0,-0.0017
2013-01-31,A000040,,
2013-01-31,A000040,495.0,0.0020
2013-01-31,A000050,,
...,...,...,...
2024-09-19,A460860,8220.0,-0.0508
2024-09-19,A462520,17870.0,0.0618
2024-09-19,A465770,9830.0,-0.1876
2024-09-19,A472850,5330.0,0.0057


In [28]:
fnd.long_format_df

Item Name,date,Symbol,Symbol Name,Kind,Frequency,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),수익률 (1개월)(%),...,이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원),FnGuide Sector,관리종목여부,거래정지여부
0,2013-01-31,A000020,동화약품,NFS-IFRS(C),ANNUAL,,220240575.0,116534418.0,27931470.0,,...,0.0,170147330.0,384335.0,0.0,26919758.0,,3.161655e+08,,,
1,2013-01-31,A000020,동화약품,SSC,DAILY,27931470.0,,,,-0.17,...,,,,,,6030.0,,,,
2,2013-01-31,A000040,KR모터스,NFS-IFRS(C),ANNUAL,,99553272.0,91996790.0,59670690.0,,...,7181340.0,1298918.0,813333.0,-5044.0,3884892.0,,1.169316e+08,,,
3,2013-01-31,A000040,KR모터스,SSC,DAILY,119341379.0,,,,0.20,...,,,,,,495.0,,,,
4,2013-01-31,A000050,경방,NFS-IFRS(C),ANNUAL,,347189559.0,251468225.0,12461490.0,,...,90652345.0,612310562.0,13944625.0,0.0,13991400.0,,1.296989e+09,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584349,2024-09-19,A460860,동국제강,SSC,DAILY,49608017.0,,,,-5.08,...,,,,,,8220.0,,,,
584350,2024-09-19,A462520,조선내화,SSC,DAILY,11855168.0,,,,6.18,...,,,,,,17870.0,,,,
584351,2024-09-19,A465770,STX그린로지스,SSC,DAILY,7171032.0,,,,-18.76,...,,,,,,9830.0,,,,
584352,2024-09-19,A472850,폰드그룹,SSC,DAILY,32343933.0,,,,0.57,...,,,,,,5330.0,,,,


## 디버깅

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from pathlib import Path

In [31]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

In [32]:
fn1 = DATA_DIR / '고금계과제1_v3.3_201301-202408.csv'

In [33]:

NUMERIC_DATA = [
    '종가(원)',
    '수정주가(원)',
    '수정계수',
    '수익률 (1개월)(%)',
    # '상장주식수 (보통)(주)',
    # '시가총액 (상장예정주식수 포함)(백만원)',
    # '시가총액 (보통-상장예정주식수 포함)(백만원)',
    '기말발행주식수 (보통)(주)',
    '보통주자본금(천원)',
    '자본잉여금(천원)',
    '이익잉여금(천원)',
    '자기주식(천원)',
    '이연법인세부채(천원)',
    '매출액(천원)',
    '매출원가(천원)',
    '이자비용(천원)',
    '영업이익(천원)',
    '총자산(천원)'
    ]

UNIV_REFERENCE_ITEMS = [
    '수정주가(원)',
    '종가(원)',
    '수익률 (1개월)(%)',
    '수익률 (%)'
    ]

DIV_BY_100 = [
    '수익률 (%)',
    '수익률 (1개월)(%)',
    ]

MULTIPLY_BY_1000 = [
    '보통주자본금(천원)',
    '자본잉여금(천원)',
    '이익잉여금(천원)',
    '자기주식(천원)',
    '이연법인세부채(천원)',
    '매출액(천원)',
    '매출원가(천원)',
    '이자비용(천원)',
    '영업이익(천원)',
    '총자산(천원)',
    ]

FN_INDEX_COLS = ['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency',]

In [123]:
import pandas as pd

# Constants (formerly class variables)
NUMERIC_DATA = [
    '종가(원)', '수정주가(원)', '수정계수', '수익률 (1개월)(%)',
    '기말발행주식수 (보통)(주)', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)',
    '자기주식(천원)', '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)',
    '이자비용(천원)', '영업이익(천원)', '총자산(천원)'
]

UNIV_REFERENCE_ITEMS = [
    '수익률 (1개월)(%)',
]

DIV_BY_100 = [
    '수익률 (%)', '수익률 (1개월)(%)'
]

MULTIPLY_BY_1000 = [
    '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)', '자기주식(천원)', 
    '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)', 
    '영업이익(천원)', '총자산(천원)'
]

# FN_INDEX_COLS = ['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency']
FN_INDEX_COLS = ['date', 'Symbol', 'Symbol Name', ]

def melt_dataguide_csv(fn_file_path, cols=['Symbol', 'Symbol Name', 'Kind', 'Item', 'Item Name ', 'Frequency'], skiprows=8, encoding="cp949"):
    fn_df = pd.read_csv(fn_file_path, encoding=encoding, skiprows=skiprows, thousands=",")
    fn_df = fn_df.melt(id_vars=cols, var_name="date", value_name="value")
    fn_df.drop(columns=['Kind', 'Item', 'Frequency'], inplace=True)
    return fn_df

def pivot_nonnumeric(fn1_df, item_name):
    nonnumeric_data = fn1_df[fn1_df['Item Name '] == item_name].pivot(
        index=FN_INDEX_COLS,
        columns='Item Name ',
        values='value'
    ).reset_index()
    return nonnumeric_data

def pivot_numerics(fn1_df):
    numeric_data = fn1_df.pivot_table(
        index=FN_INDEX_COLS,
        columns='Item Name ',
        values='value',
        aggfunc='first',
        dropna=True
    ).reset_index()
    return numeric_data

def preprocess_numerics(long_format_df):
    obj_cols = long_format_df.select_dtypes(include='object').columns
    obj_cols = [obj_col for obj_col in obj_cols if obj_col in NUMERIC_DATA]
    long_format_df[obj_cols] = long_format_df[obj_cols].replace(',', '', regex=True).infer_objects(copy=False)
    long_format_df[obj_cols] = long_format_df[obj_cols].apply(pd.to_numeric, errors='raise')
    return long_format_df

def make_filters(fn1_df):
    finance_sector = pivot_nonnumeric(fn1_df, 'FnGuide Sector')
    finance_sector = finance_sector[finance_sector['FnGuide Sector'] == '금융']

    is_under_supervision = pivot_nonnumeric(fn1_df, '관리종목여부')
    is_under_supervision = is_under_supervision[is_under_supervision['관리종목여부'] == '관리']

    is_trading_halted = pivot_nonnumeric(fn1_df, '거래정지여부') 
    is_trading_halted = is_trading_halted[is_trading_halted['거래정지여부'] == '정지']

    return [
        finance_sector,
        is_under_supervision,
        is_trading_halted,
    ]

def apply_filters(long_format_df, filter_dfs):
    for filter_df in filter_dfs:
        filter_df['_flag_right'] = 1
        long_format_df = long_format_df.merge(
            filter_df,
            on=['date', 'Symbol'],
            how='left',
            suffixes=('', '_right')
        )
        long_format_df = long_format_df[long_format_df['_flag_right'].isnull()] 
        long_format_df.drop(columns=[c for c in long_format_df.columns if c.endswith('_right')], inplace=True)
        long_format_df.reset_index(drop=True, inplace=True)
    return long_format_df

def get_univ_list(long_format_df, reference_item='수익률 (1개월)(%)'):
    assert reference_item in UNIV_REFERENCE_ITEMS, f"유니버스 구축을 위해 {UNIV_REFERENCE_ITEMS} 중 하나가 필요합니다."
    only_existing = long_format_df.groupby('Symbol').filter(
        lambda x: x[reference_item].notnull().any()
    )
    return only_existing['Symbol'].unique()

def get_wide_format_df(long_format_df, item_name):
    return long_format_df.pivot_table(
        index='date',
        columns='Symbol',
        values=item_name,
    )

def get_data(long_format_df, items, univ_list, item: list | str | None = None, multiindex: bool = True):
    if isinstance(item, str):
        assert item in items, f"{item} is not in the item list"
        assert item in NUMERIC_DATA, f"{item} is not a numeric data"
        data = get_wide_format_df(long_format_df, item)
        data = data.reindex(columns=univ_list)
        if item in DIV_BY_100:
            data = data / 100
        elif item in MULTIPLY_BY_1000:
            data = data * 1000
    elif isinstance(item, list):
        for i in item:
            assert i in items, f"{i} is not in the item list"
            assert i in NUMERIC_DATA, f"{i} is not a numeric data"
        data = long_format_df.loc[:, FN_INDEX_COLS + item]
        for col in data.columns:
            if col in DIV_BY_100:
                data[col] = data[col] / 100
            elif col in MULTIPLY_BY_1000:
                data[col] = data[col] * 1000
        if multiindex:
            data.drop(columns=['Symbol Name',], inplace=True)
            data.index.name = None
            data.set_index(['date', 'Symbol'], inplace=True)
        data = data.reindex(univ_list, level=1)
    elif item is None:
        data = long_format_df.copy()
        if multiindex:
            data.drop(columns=['Symbol Name',], inplace=True)
            data.index.name = None
            data.set_index(['date', 'Symbol'], inplace=True)
        data = data.reindex(univ_list, level=1)
    else:
        raise ValueError("""
                         item은 
                         - str (1개 item만 wide-format 반환) 
                         - list (선택한 item들 long-format 반환)
                         - None (전체 long-format 반환)
                         중 하나여야 합니다.
                         (numeric data만 선택 가능)
                         """)
    return data

def symbol_to_name(symbol_code, symbol_to_name_mapping):
    return symbol_to_name_mapping[symbol_code]

def name_to_symbol(symbol_name, name_to_symbol_mapping):
    return name_to_symbol_mapping[symbol_name]


In [124]:
fn1_df = melt_dataguide_csv(fn1, encoding='utf-8')
items = fn1_df['Item Name '].unique()

In [125]:
fn1_df.drop(columns=['Kind', 'Item', 'Frequency'], inplace=True)

KeyError: "['Kind', 'Item', 'Frequency'] not found in axis"

In [110]:
items

array(['종가(원)', '수정계수', '수정주가(원)', '수익률 (1개월)(%)', 'FnGuide Sector',
       '거래정지여부', '관리종목여부', '보통주자본금(천원)', '자본잉여금(천원)', '이익잉여금(천원)',
       '자기주식(천원)', '이연법인세부채(천원)', '매출액(천원)', '매출원가(천원)', '이자비용(천원)',
       '영업이익(천원)', '총자산(천원)', '기말발행주식수 (보통)(주)'], dtype=object)

In [113]:
long_format_df = pivot_numerics(fn1_df)


In [114]:
long_format_df[ long_format_df['Symbol'] == 'A000020']

Item Name,date,Symbol,Symbol Name,FnGuide Sector,거래정지여부,관리종목여부,기말발행주식수 (보통)(주),매출액(천원),매출원가(천원),보통주자본금(천원),...,수정계수,수정주가(원),영업이익(천원),이연법인세부채(천원),이익잉여금(천원),이자비용(천원),자기주식(천원),자본잉여금(천원),종가(원),총자산(천원)
1,2013-01-31,A000020,동화약품,의료,정상,정상,27931470,220240575,116534418,27931470,...,1.000000,6030,2062220,0,170147330,384335,0,26919758,6030,316165541
3740,2013-02-28,A000020,동화약품,의료,정상,정상,27931470,220240575,116534418,27931470,...,1.000000,6030,2062220,0,170147330,384335,0,26919758,6030,316165541
7479,2013-03-31,A000020,동화약품,의료,정상,정상,27931470,220240575,116534418,27931470,...,1.000000,6110,2062220,0,170147330,384335,0,26919758,6110,316165541
11218,2013-04-30,A000020,동화약품,의료,정상,정상,27931470,220240575,116534418,27931470,...,1.000000,6750,2062220,0,170147330,384335,0,26919758,6750,316165541
14957,2013-05-31,A000020,동화약품,의료,정상,정상,27931470,220240575,116534418,27931470,...,1.000000,6930,2062220,0,170147330,384335,0,26919758,6930,316165541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508505,2024-05-31,A000020,동화약품,의료,정상,정상,27931470,,,,...,1.000000,8280,,,,,,,8280,
512244,2024-06-30,A000020,동화약품,의료,정상,정상,27931470,,,,...,1.000000,8060,,,,,,,8060,
515983,2024-07-31,A000020,동화약품,의료,정상,정상,27931470,,,,...,1.000000,8340,,,,,,,8340,
519722,2024-08-31,A000020,동화약품,의료,정상,정상,27931470,,,,...,1.000000,8070,,,,,,,8070,


In [115]:

long_format_df = preprocess_numerics(long_format_df)



In [126]:

# Apply filters: e.g., for 금융 제거, 관리종목여부, 거래정지여부
filter_dfs = make_filters(fn1_df)
long_format_df = apply_filters(long_format_df, filter_dfs)


In [127]:
univ_list = get_univ_list(long_format_df, '수익률 (1개월)(%)')
print(univ_list)


['A000020' 'A000040' 'A000050' ... 'A001260' 'A019660' 'A023460']


In [128]:
len(univ_list)

2616

In [129]:
data = get_data(long_format_df, items, univ_list, item='수익률 (1개월)(%)') # wide는 정상작동

In [130]:
data = get_data(long_format_df, items, univ_list, item='이자비용(천원)') # wide는 정상작동

In [131]:
data.shape

(138, 2616)

In [132]:
data = get_data(long_format_df, items, univ_list, item=['수익률 (1개월)(%)', '이자비용(천원)'], multiindex=True) # long은 정상작동

In [133]:
data

Unnamed: 0_level_0,Item Name,수익률 (1개월)(%),이자비용(천원)
date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-31,A000020,-0.0017,3.843350e+08
2013-01-31,A000040,0.0020,8.133330e+08
2013-01-31,A000050,-0.0346,1.394462e+10
2013-01-31,A000070,-0.0451,1.717600e+10
2013-01-31,A000080,0.0526,5.643810e+10
...,...,...,...
2024-09-19,A037950,-0.0910,
2024-09-19,A038880,-0.1603,
2024-09-19,A001260,-0.0537,
2024-09-19,A019660,-0.0184,


In [87]:
item=['수익률 (1개월)(%)', '이자비용(천원)']
data = long_format_df.loc[:, FnData.FN_INDEX_COLS + item]

In [88]:
data.drop(columns=['Symbol Name', 'Kind', 'Frequency'], inplace=True)

In [89]:
data.index.name = None

In [90]:
univ = set(univ_list)
data_univ = set(data['Symbol'].unique())

In [91]:
univ - data_univ

set()

In [94]:
data[ data[['date', 'Symbol']].duplicated() ]

Item Name,date,Symbol,수익률 (1개월)(%),이자비용(천원)
1,2013-01-31,A000020,-0.17,
3,2013-01-31,A000040,0.20,
5,2013-01-31,A000050,-3.46,
7,2013-01-31,A000070,-4.51,
9,2013-01-31,A000080,5.26,
...,...,...,...,...
574236,2024-05-31,A097870,-0.82,
574529,2024-05-31,A189690,-7.71,
576072,2024-06-30,A050860,8.87,
576511,2024-06-30,A097870,-2.57,


In [96]:
check = data[ data['Symbol'] == 'A000020']
check

Item Name,date,Symbol,수익률 (1개월)(%),이자비용(천원)
0,2013-01-31,A000020,,384335.0
1,2013-01-31,A000020,-0.17,
3826,2013-02-28,A000020,,384335.0
3827,2013-02-28,A000020,0.33,
7650,2013-03-31,A000020,,384335.0
...,...,...,...,...
572939,2024-05-31,A000020,-5.05,
575215,2024-06-30,A000020,-4.05,
577496,2024-07-31,A000020,3.47,
579780,2024-08-31,A000020,-2.54,


In [102]:
check['date'].value_counts()

date
2013-01-31    2
2020-01-31    2
2021-03-31    2
2021-02-28    2
2021-01-31    2
             ..
2024-05-31    1
2024-06-30    1
2024-07-31    1
2024-08-31    1
2024-09-19    1
Name: count, Length: 141, dtype: int64

In [101]:
check2 = fn1_df[ fn1_df['Symbol'] == 'A000020']
check2 = check2[ check2['Item Name '] == '수익률 (1개월)(%)' ]
check2

Unnamed: 0,Symbol,Symbol Name,Kind,Item,Item Name,Frequency,date,value
13467,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-01-31,-0.17
80769,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-02-28,0.33
148071,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-03-31,1.33
215373,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-04-30,10.47
282675,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2013-05-31,2.67
...,...,...,...,...,...,...,...,...
9166539,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2024-05-31,-5.05
9233841,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2024-06-30,-4.05
9301143,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2024-07-31,3.47
9368445,A000020,동화약품,SSC,S41000180F,수익률 (1개월)(%),DAILY,2024-08-31,-2.54


In [86]:
data.set_index(['date', 'Symbol'], inplace=True)

In [80]:
data

Unnamed: 0_level_0,Item Name,수익률 (1개월)(%),이자비용(천원)
date,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-31,A000020,,384335.0
2013-01-31,A000020,-0.17,
2013-01-31,A000040,,813333.0
2013-01-31,A000040,0.20,
2013-01-31,A000050,,13944625.0
...,...,...,...
2024-09-19,A460860,-5.08,
2024-09-19,A462520,6.18,
2024-09-19,A465770,-18.76,
2024-09-19,A472850,0.57,


In [79]:
data.reindex(univ_list, level=1)

ValueError: cannot reindex on an axis with duplicate labels

In [63]:
dup = long_format_df[ long_format_df[['date', 'Symbol']].duplicated() ][['date', 'Symbol']]
dup

Item Name,date,Symbol
1,2013-01-31,A000020
3,2013-01-31,A000040
5,2013-01-31,A000050
7,2013-01-31,A000070
9,2013-01-31,A000080
...,...,...
574236,2024-05-31,A097870
574529,2024-05-31,A189690
576072,2024-06-30,A050860
576511,2024-06-30,A097870


In [64]:
long_format_df[  ]

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [40]:
right_df = filter_dfs[0].copy()
right_df['_flag'] = 1


In [41]:
right_df

Item Name,date,Symbol,Symbol Name,Kind,Frequency,FnGuide Sector,_flag
0,2013-01-31,A000010,조흥은행,COM,,금융,1
2,2013-01-31,A000030,우리은행,COM,,금융,1
5,2013-01-31,A000060,메리츠화재,COM,,금융,1
10,2013-01-31,A000110,제일은행,COM,,금융,1
31,2013-01-31,A000370,한화손해보험,COM,,금융,1
...,...,...,...,...,...,...,...
527155,2024-09-19,A479880,한국제15호스팩,COM,,금융,1
527156,2024-09-19,A481890,엔에이치스팩31호,COM,,금융,1
527157,2024-09-19,A482520,교보16호스팩,COM,,금융,1
527158,2024-09-19,A482680,미래에셋비전스팩7호,COM,,금융,1


In [42]:

dd = long_format_df.merge(
    right_df,
    on=['date', 'Symbol'],
    how='left',
    suffixes=('', '_right')
)

In [43]:
len(dd)

639194

In [44]:
dd['_flag'].value_counts()

_flag
1.0    35793
Name: count, dtype: int64

In [45]:
dd['_flag'].isnull().sum()

603401

In [32]:
dd.columns

Index(['date', 'Symbol', 'Symbol Name', 'Kind', 'Frequency', '기말발행주식수 (보통)(주)',
       '매출액(천원)', '매출원가(천원)', '보통주자본금(천원)', '수익률 (1개월)(%)', '수정계수', '수정주가(원)',
       '영업이익(천원)', '이연법인세부채(천원)', '이익잉여금(천원)', '이자비용(천원)', '자기주식(천원)',
       '자본잉여금(천원)', '종가(원)', '총자산(천원)', 'Symbol Name_right', 'Kind_right',
       'Frequency_right', 'FnGuide Sector', '_flag'],
      dtype='object', name='Item Name ')

In [27]:
dd['_merge'].unique()

['both']
Categories (3, object): ['left_only', 'right_only', 'both']