In [1]:
import pandas as pd 
import numpy as np

In [2]:
df_1 = pd.read_csv(r'csv/df_company_info.csv')
df_1.columns

Index(['stock_code', 'company_name', 'short_name', 'en_company_name',
       'en_short_name', 'combine_profile', 'industry', 'exchange',
       'foreign_percent', 'issue_share', 'no_shareholders', 'stock_rating',
       'website', 'stock_indices', 'is_bank', 'is_securities'],
      dtype='object')

In [3]:
df_2 = pd.read_parquet(r'csv/financial_statement.parquet')
df_2.head()

Unnamed: 0,stock_code,year,quarter,data,date_added,category_code
0,BID,2020,0,100687502.0,2020-12-30,IS_001
1,BID,2021,0,101007908.0,2021-12-30,IS_001
2,BID,2022,0,121110719.0,2022-12-30,IS_001
3,BID,2023,0,152761316.0,2023-12-30,IS_001
4,BID,2016,0,62957295.0,2016-12-30,IS_001


In [4]:
df_v3_report = pd.read_parquet(r'csv/financial_statement_v3.parquet')

# Financial report

## v2

In [5]:
# Merge df_2 with df_1 to get industry information
df_merged_report_v2 = pd.merge(df_2, df_1[['stock_code', 'industry']], on='stock_code', how='left')

industry_report_v2 = df_merged_report_v2.groupby(['industry', 'year', 'quarter', 'category_code'])['data'].mean().reset_index()

industry_report_v2

Unnamed: 0,industry,year,quarter,category_code,data
0,Automobiles and Parts,2016,0,BS_100,2110035.0
1,Automobiles and Parts,2016,0,BS_110,428688.0
2,Automobiles and Parts,2016,0,BS_111,84155.0
3,Automobiles and Parts,2016,0,BS_113,344533.0
4,Automobiles and Parts,2016,0,BS_120,134298.0
...,...,...,...,...,...
99637,"Utilities (Electricity, Water & Gas)",2024,3,IS_100,507387.0
99638,"Utilities (Electricity, Water & Gas)",2024,3,IS_103,438097.5
99639,"Utilities (Electricity, Water & Gas)",2024,3,IS_104,69289.5
99640,"Utilities (Electricity, Water & Gas)",2024,3,IS_130,594.5


In [7]:
industry_report_v2.to_parquet(r'D:\python\financial statement prj\chatbot_financial_statement\csv\industry_report_v2.parquet')

## v3

In [6]:
df_merged_report_v3 = pd.merge(df_v3_report, df_1[['stock_code', 'industry']], on='stock_code', how='left')

industry_report_v3 = df_merged_report_v3.groupby(['industry', 'year', 'quarter', 'category_code'])['data'].agg(['sum', 'mean']).reset_index()

industry_report_v3.rename(columns={'sum': 'data_sum', 'mean': 'data_mean'}, inplace=True)

industry_report_v3

Unnamed: 0,industry,year,quarter,category_code,data_sum,data_mean
0,Automobiles and Parts,2015,0,BS_100,1.975191e+06,1.975191e+06
1,Automobiles and Parts,2015,0,BS_110,5.768801e+05,5.768801e+05
2,Automobiles and Parts,2015,0,BS_111,2.956801e+05,2.956801e+05
3,Automobiles and Parts,2015,0,BS_113,2.812000e+05,2.812000e+05
4,Automobiles and Parts,2015,0,BS_120,4.524923e+04,4.524923e+04
...,...,...,...,...,...,...
146934,"Utilities (Electricity, Water & Gas)",2024,3,IS_100,3.598148e+06,8.995369e+05
146935,"Utilities (Electricity, Water & Gas)",2024,3,IS_103,3.403254e+06,8.508135e+05
146936,"Utilities (Electricity, Water & Gas)",2024,3,IS_104,1.948937e+05,4.872342e+04
146937,"Utilities (Electricity, Water & Gas)",2024,3,IS_130,2.242000e+03,5.605000e+02


In [7]:
industry_report_v3.to_parquet('csv/industry_report_v3.parquet')

# Financial ratio

In [4]:
df_3_v2 = pd.read_parquet(r'csv/financial_ratio.parquet')
df_3_v2.head()

Unnamed: 0,stock_code,year,quarter,data,ratio_name,ratio_code,date_added
0,ACV,2016,0,3495915.0,EBIT,EBIT,2016-12-30
1,ACV,2016,0,0.5292654,Equity Ratio,ER,2016-12-30
2,ACV,2016,0,1.551783,Long Term Asset Self Financing Ratio,LTASFR,2016-12-30
3,ACV,2016,0,1.741159,Fixed Asset Self Financing Ratio,FASFR,2016-12-30
4,ACV,2016,0,2.124339,General Solvency Ratio,GSR,2016-12-30


In [5]:
df_3_v3 = pd.read_parquet('csv/financial_ratio_v3.parquet')

In [11]:
top_20_stocks = df_merged_report_v2[df_merged_report_v2['category_code'] == 'BS_270'].groupby(['industry', 'year', 'quarter']).apply(
    lambda x: x.nlargest(20, 'data')
).reset_index(drop=True)

top_20_stock_codes = top_20_stocks['stock_code'].unique()

df_3_v2_filtered = df_3_v2[df_3_v2['stock_code'].isin(top_20_stock_codes)]

df_merged_ratio_v2 = pd.merge(df_3_v2_filtered, df_1[['stock_code', 'industry']], on='stock_code', how='left')

industry_ratio_v2 = df_merged_ratio_v2.groupby(['industry', 'year', 'quarter', 'ratio_code'])['data'].mean().reset_index()

industry_ratio_v2.rename(columns={'data': 'data_mean'}, inplace=True)
industry_ratio_v2


  top_20_stocks = df_merged_report_v2[df_merged_report_v2['category_code'] == 'BS_270'].groupby(['industry', 'year', 'quarter']).apply(


Unnamed: 0,industry,year,quarter,ratio_code,data_mean
0,Automobiles and Parts,2016,0,AFDAR,-0.004934
1,Automobiles and Parts,2016,0,AFDATTAR,-0.000405
2,Automobiles and Parts,2016,0,ATDR,1.373261
3,Automobiles and Parts,2016,0,BEP,0.072477
4,Automobiles and Parts,2016,0,BEPR,0.072477
...,...,...,...,...,...
29183,"Utilities (Electricity, Water & Gas)",2024,3,RTSR,1.541659
29184,"Utilities (Electricity, Water & Gas)",2024,3,RTTAR,0.119676
29185,"Utilities (Electricity, Water & Gas)",2024,3,STDTAR,0.268588
29186,"Utilities (Electricity, Water & Gas)",2024,3,STDTER,0.591628


In [13]:
industry_ratio_v2.to_parquet(r'D:\python\financial statement prj\chatbot_financial_statement\csv\industry_ratio_v2.parquet')

In [12]:
top_20_stocks_v3 = df_merged_report_v3[df_merged_report_v3['category_code'] == 'BS_270'].groupby(['industry', 'year', 'quarter']).apply(
    lambda x: x.nlargest(20, 'data')
).reset_index(drop=True)

top_20_stock_codes_v3 = top_20_stocks_v3['stock_code'].unique()

df_3_v3_filtered = df_3_v3[df_3_v3['stock_code'].isin(top_20_stock_codes_v3)]

df_merged_ratio_v3 = pd.merge(df_3_v3_filtered, df_1[['stock_code', 'industry']], on='stock_code', how='left')

industry_ratio_v3 = df_merged_ratio_v3.groupby(['industry', 'year', 'quarter', 'ratio_code'])['data'].mean().reset_index()

industry_ratio_v3.rename(columns={'data': 'data_mean'}, inplace=True)
industry_ratio_v3


  top_20_stocks_v3 = df_merged_report_v3[df_merged_report_v3['category_code'] == 'BS_270'].groupby(['industry', 'year', 'quarter']).apply(


Unnamed: 0,industry,year,quarter,ratio_code,data_mean
0,Automobiles and Parts,2016,0,AFDAR,-0.004935
1,Automobiles and Parts,2016,0,AFDATTAR,-0.000405
2,Automobiles and Parts,2016,0,ATDR,1.373261
3,Automobiles and Parts,2016,0,BEP,0.032609
4,Automobiles and Parts,2016,0,BEPR,0.032609
...,...,...,...,...,...
37925,"Utilities (Electricity, Water & Gas)",2024,3,RTSR,1.031842
37926,"Utilities (Electricity, Water & Gas)",2024,3,RTTAR,0.182978
37927,"Utilities (Electricity, Water & Gas)",2024,3,STDTAR,0.362961
37928,"Utilities (Electricity, Water & Gas)",2024,3,STDTER,0.974953


In [13]:
industry_ratio_v3.to_parquet('csv/industry_ratio_v3.parquet')

In [19]:
industry_report_v2 = pd.read_parquet(r'csv/industry_report_v2.parquet')
industry_report_v3 = pd.read_parquet(r'csv/industry_report_v3.parquet')

industry_report_v2.rename(columns={'universal_code': 'category_code'}, inplace=True)

industry_report_v3.rename(columns={'universal_code': 'category_code'}, inplace=True)
industry_report_v3.to_parquet(r'csv/industry_report_v3.parquet')
industry_report_v2.to_parquet(r'csv/industry_report_v2.parquet')