In [1]:
import pandas as pd
import numpy as np

In [2]:
excel_file = pd.ExcelFile(r"C:\Users\Admin\Downloads\fiin\year\FiinProX_DuLieuTaiChinh_BaoCaoTaiChinh_Yearly_VPB_20241218.xlsx", engine="openpyxl")

In [3]:
mapping_file = pd.ExcelFile('vietnames_to_fiin.xlsx')

In [4]:
df_map_bank = mapping_file.parse('fiin_bank')
df_map_sec = mapping_file.parse('fiin_sec')
df_map_corp = mapping_file.parse('fiin_corp')

In [5]:
df_map_sec.dropna(subset=['category_code'], inplace=True)
df_map_bank.dropna(subset=['category_code'], inplace=True)
df_map_corp.dropna(subset=['category_code'], inplace=True)

In [6]:
df_map_sec.to_csv('../csv/map_category_sec_v3.csv', index=False)
df_map_bank.to_csv('../csv/map_category_bank_v3.csv', index=False)
df_map_corp.to_csv('../csv/map_category_corp_v3.csv', index=False)

In [7]:
df_bs = mapping_file.parse('Standard BS')
df_is = mapping_file.parse('Standard IS')
df_cf = mapping_file.parse('Standard CF')

df_bs['Universal_caption'] = df_bs['Universal_caption'].apply(lambda x: f'(Balance sheet) {x}')
df_is['Universal_caption'] = df_is['Universal_caption'].apply(lambda x: f'(Income statement) {x}')
df_cf['Universal_caption'] = df_cf['Universal_caption'].apply(lambda x: f'(Cash flow) {x}')
df = pd.concat([df_bs, df_is, df_cf], ignore_index=True)

df.to_csv('../csv/map_category_code_universal_v3.csv', index=False)

In [8]:
def get_source(text):
    if isinstance(text, str):
        return text.split('_')[0]
    return text

df_map_bank['source'] = df_map_bank['category_code'].apply(get_source)
df_map_sec['source'] = df_map_sec['category_code'].apply(get_source)    
df_map_corp['source'] = df_map_corp['category_code'].apply(get_source)  

In [9]:
df_map_bank_bs = df_map_bank[df_map_bank['source'] == 'BS']
df_map_bank_is = df_map_bank[df_map_bank['source'] == 'IS']
df_map_bank_cf = df_map_bank[df_map_bank['source'] == 'CF']

df_map_corp_bs = df_map_corp[df_map_corp['source'] == 'BS']
df_map_corp_is = df_map_corp[df_map_corp['source'] == 'IS']
df_map_corp_cf = df_map_corp[df_map_corp['source'] == 'CF']

df_map_sec_bs = df_map_sec[df_map_sec['source'] == 'BS']
df_map_sec_is = df_map_sec[df_map_sec['source'] == 'IS']
df_map_sec_cf = df_map_sec[df_map_sec['source'] == 'CF']


In [10]:
bank_args = {
    'Kết quả kinh doanh': {
        'nrows': 30,
        'mapping': df_map_bank_is
    },
    'Bảng cân đối kế toán': {
        'nrows': 92,
        'mapping': df_map_bank_bs
    },
    'Lưu chuyển tiền tệ': {
        'nrows': 62,
        'mapping': df_map_bank_cf
    }
    
}

corp_args = {
    'Kết quả kinh doanh': {
        'nrows': 29,
        'mapping': df_map_corp_is
    },
    'Bảng cân đối kế toán': {
        'nrows': 126,
        'mapping': df_map_corp_bs
    },
    'Lưu chuyển tiền tệ': {
        'nrows': 45,
        'mapping': df_map_corp_cf
    }
    
}

sec_args = {
    'Kết quả kinh doanh': {
        'nrows': 93,
        'mapping': df_map_sec_is
    },
    'Bảng cân đối kế toán': {
        'nrows': 216,
        'mapping': df_map_sec_bs
    },
    'Lưu chuyển tiền tệ': {
        'nrows': 159,
        'mapping': df_map_sec_cf
    }
    
}

In [11]:

def get_data(excel_file, type_):
    
    dfs = []
    if type_ == 'bank':
        args = bank_args
    elif type_ == 'corp':
        args = corp_args
    else:
        args = sec_args
       
    sheet_names = list(args.keys())
    for sheet_name in sheet_names:

        df_bs = excel_file.parse(
            sheet_name = sheet_name,
            skiprows=10,
            nrows=args[sheet_name]['nrows'],
        )

        df_bs = df_bs.iloc[3:]
        fiin_cate = df_bs['Chỉ tiêuTriệu VND'].values.tolist()
        map_cate = args[sheet_name]['mapping'][['category_code', 'vi_caption']].values.tolist()

        # Map the corresponding category code

        map_index = 0
        cate_index = 0
        cate_code = []
        for i in range(len(fiin_cate)):
            cate_index = i
            if fiin_cate[i] == map_cate[map_index][1]:
                
                cate_code.append(map_cate[map_index][0])
                map_index += 1
            else:
                cate_code.append(np.nan)
            if map_index == len(map_cate):
                break
            
        cate_code.extend([np.nan]*(len(fiin_cate) - len(cate_code)))


        df_bs['category_code'] = cate_code
    #     dfs.append(df_bs)
    # return dfs
    
        # Convert to table

        table = []
        df_bs.dropna(subset=['category_code'], inplace=True)

        for index, row in df_bs.iterrows():
            
            for col in df_bs.columns:
                if 'VND' in col or col == 'category_code':
                    continue
                
                time = col 
                data = row[col]
                cate = row['category_code']
                
                table.append([cate, time, data])
                
        df = pd.DataFrame(table, columns=['category_code', 'time', 'data'])
        dfs.append(df)
        
    df = pd.concat(dfs)
    return df

In [12]:
non_bank_stock_code = ["HSG", "ELC", "VSC", "ACV", "REE", "SZC", "CSV", "PAN", "BSR", "SGP", "GMD", "ITD","FOX", "KDC", "SBT", "VGC", "HBC", "CTD", "DIG", "SCR", "KBC","MWG", "NHA", "VNM", "HPG", "VHM", "PNJ", "YEG", "FPT","MSN", "GAS", "VRE", "VJC", "VIC", "PLX", "SAB", "POW", "GVR", "BCM", "VPI", "DVM", "KDH", "HDC", "TCH", "CEO", "HUT", "NVL", "DBC", "SAF", "DHT", "VTP", "PVT", "FRT", "DGC", "DCM", "NKG", "CMG", "VGI", "PVC", "CAP", "DTD", "HLD", "L14", "L18", "LAS", "LHC", "NTP", "PLC", "PSD", "PVG", "PVS", "SLS", "TIG", "TMB", "TNG", "TVD", "VC3", "VCS", "DXG"]
bank_stock_code = ["BID", "EIB", "OCB", "CTG", "VCB", "ACB", "MBB", "HDB", "TPB", "VPB",  "STB", "TCB",  "SHB", "VIB", "CTG",  "ABB", "LPB", "NVB"]
securities_stock_code = ["MBS", "VND", "SSI", "VIX", "ORS"]

In [13]:
file_dir = r"C:\Users\Admin\Downloads\fiin\year\FiinProX_DuLieuTaiChinh_BaoCaoTaiChinh_Yearly_{code}_20241218.xlsx"

In [14]:
from tqdm import tqdm

In [15]:

dfs_corp = []
for code in tqdm(non_bank_stock_code):
    
    excel_file = pd.ExcelFile(file_dir.format(code=code), engine="openpyxl")
    df_corp_y = get_data(excel_file, 'corp')
    df_corp_y['stock_code'] = code
    dfs_corp.append(df_corp_y)
    
dfs_bank = []
for code in tqdm(bank_stock_code):
    excel_file = pd.ExcelFile(file_dir.format(code=code), engine="openpyxl")
    df_bank_y = get_data(excel_file, 'bank')
    df_bank_y['stock_code'] = code
    dfs_bank.append(df_bank_y)
    
dfs_sec = []
for code in securities_stock_code:
    excel_file = pd.ExcelFile(file_dir.format(code=code), engine="openpyxl")
    df_sec_y = get_data(excel_file, 'sec')
    df_sec_y['stock_code'] = code
    dfs_sec.append(df_sec_y)
    

100%|██████████| 79/79 [00:05<00:00, 15.60it/s]
100%|██████████| 18/18 [00:01<00:00, 14.17it/s]


In [16]:
df_bank_y

Unnamed: 0,category_code,time,data,stock_code
0,IS_003,2006,2.947621e+04,NVB
1,IS_003,2007,7.577242e+04,NVB
2,IS_003,2008,2.123786e+05,NVB
3,IS_003,2009,2.869541e+05,NVB
4,IS_003,2010,4.902644e+05,NVB
...,...,...,...,...
760,CF_045,2019,1.511652e+07,NVB
761,CF_045,2020,1.386262e+07,NVB
762,CF_045,2021,5.273563e+06,NVB
763,CF_045,2022,1.579241e+07,NVB


In [17]:
file_dir_quarter = r"C:\Users\Admin\Downloads\fiin\quarter\FiinProX_DuLieuTaiChinh_BaoCaoTaiChinh_Quarterly_{code}_20241218.xlsx"

for code in tqdm(non_bank_stock_code):
    excel_file = pd.ExcelFile(file_dir_quarter.format(code=code), engine="openpyxl")
    df_corp_q = get_data(excel_file, 'corp')
    df_corp_q['stock_code'] = code
    dfs_corp.append(df_corp_q)
    
for code in tqdm(bank_stock_code):
    excel_file = pd.ExcelFile(file_dir_quarter.format(code=code), engine="openpyxl")
    df_bank_q = get_data(excel_file, 'bank')
    df_bank_q['stock_code'] = code
    dfs_bank.append(df_bank_q)
    
for code in securities_stock_code:
    excel_file = pd.ExcelFile(file_dir_quarter.format(code=code), engine="openpyxl")
    df_sec_q = get_data(excel_file, 'sec')
    df_sec_q['stock_code'] = code
    dfs_sec.append(df_sec_q)


100%|██████████| 79/79 [00:10<00:00,  7.29it/s]
100%|██████████| 18/18 [00:02<00:00,  8.06it/s]


In [18]:
df_bank = pd.concat(dfs_bank)
df_corp = pd.concat(dfs_corp)
df_sec = pd.concat( dfs_sec)

In [19]:
df_bank

Unnamed: 0,category_code,time,data,stock_code
0,IS_003,2003,555890.0,BID
1,IS_003,2004,1913940.0,BID
2,IS_003,2005,2929368.0,BID
3,IS_003,2006,3350841.0,BID
4,IS_003,2007,4856449.0,BID
...,...,...,...,...
2470,CF_045,Q3/2023,10709418.0,NVB
2471,CF_045,Q4/2023,10996473.0,NVB
2472,CF_045,Q1/2024,11437155.0,NVB
2473,CF_045,Q2/2024,10333757.0,NVB


In [20]:
def get_quarter_time(text):
    if '/' not in text:
        return 0, int(text)
    
    quarter, year = text.split('/')
    return int(quarter[1]), int(year)

df_bank['quarter'], df_bank['year'] = zip(*df_bank['time'].apply(get_quarter_time))

In [21]:
df_corp['quarter'], df_corp['year'] = zip(*df_corp['time'].apply(get_quarter_time))
df_sec['quarter'], df_sec['year'] = zip(*df_sec['time'].apply(get_quarter_time))

In [22]:
df_bank.drop(columns=['time'], inplace=True)
df_corp.drop(columns=['time'], inplace=True)
df_sec.drop(columns=['time'], inplace=True)

In [23]:
quarter_to_month = {
    0: 12,  # Quarter 0 is December of the same year
    1: 3,
    2: 6,
    3: 9,
    4: 12
}

df_sec['date_added'] = pd.to_datetime(df_sec.apply(lambda row: f"{row['year']}-{quarter_to_month[row['quarter']]}-30", axis=1))
df_bank['date_added'] = pd.to_datetime(df_bank.apply(lambda row: f"{row['year']}-{quarter_to_month[row['quarter']]}-30", axis=1))
df_corp['date_added'] = pd.to_datetime(df_corp.apply(lambda row: f"{row['year']}-{quarter_to_month[row['quarter']]}-30", axis=1))

In [24]:
test_bank = df_bank[(df_bank['stock_code'] == 'VPB') & (df_bank['year'] == 2023)]
test_bank

Unnamed: 0,category_code,data,stock_code,quarter,year,date_added
17,IS_003,38174676.0,VPB,0,2023,2023-12-30
35,IS_001,76557377.0,VPB,0,2023,2023-12-30
53,IS_002,-38382701.0,VPB,0,2023,2023-12-30
71,IS_006,7211882.0,VPB,0,2023,2023-12-30
89,IS_004,12307787.0,VPB,0,2023,2023-12-30
...,...,...,...,...,...,...
2064,CF_044,0.0,VPB,4,2023,2023-12-30
2108,CF_045,75175465.0,VPB,1,2023,2023-03-30
2109,CF_045,107910073.0,VPB,2,2023,2023-06-30
2110,CF_045,96217984.0,VPB,3,2023,2023-09-30


In [25]:
pivot_df  = df_bank[df_bank['category_code'].isin(['BS_310', 'BS_320', 'BS_321', 'BS_322', 'BS_330', 'BS_340', 'BS_350', 'BS_360'])].pivot_table(
    index=["stock_code", "year", "quarter"], 
    columns="category_code", 
    values="data"
).reset_index()

pivot_df['BS_361'] = pivot_df['BS_310'] + pivot_df['BS_320'] + pivot_df['BS_321'] + pivot_df['BS_322'] + pivot_df['BS_330'] + pivot_df['BS_340'] + pivot_df['BS_350'] + pivot_df['BS_360']

new_rows = pivot_df.melt(
    id_vars=["stock_code", "year", "quarter"], 
    value_vars=["BS_361"], 
    var_name="category_code", 
    value_name="data"
)

df_bank = pd.concat([df_bank, new_rows], ignore_index=True)
# df_bank.to_parquet('../csv/bank_financial_report_v2_2.parquet', index=False)

In [None]:
pivot_df = df_sec[df_sec['category_code'].isin(['IS_040.1', 'IS_050', 'IS_060', 'IS_061', 'IS_062'])].pivot_table(
    index=["stock_code", "year", "quarter"], 
    columns="category_code", 
    values="data"
).reset_index()

pivot_df['IS_070'] = pivot_df['IS_040.1'] + pivot_df['IS_050'] - pivot_df['IS_060'] - pivot_df['IS_061'] - pivot_df['IS_062']

new_rows = pivot_df.melt(
    id_vars=["stock_code", "year", "quarter"], 
    value_vars=["IS_040.1"], 
    var_name="category_code", 
    value_name="data"
)
df_sec = pd.concat([df_sec, new_rows], ignore_index=True)

In [27]:
df_sec = df_sec[df_sec['year']>=2016]
df_bank = df_bank[df_bank['year']>=2016]
df_corp = df_corp[df_corp['year']>=2016]


In [28]:
df_sec.fillna(0, inplace=True)
df_bank.fillna(0, inplace=True)
df_corp.fillna(0, inplace=True)

  df_sec.fillna(0, inplace=True)
  df_bank.fillna(0, inplace=True)


In [29]:
df_sec.to_parquet('../csv/securities_financial_report_v3.parquet', index=False)
df_bank.to_parquet('../csv/bank_financial_report_v3.parquet', index=False)
df_corp.to_parquet('../csv/non_bank_financial_report_v3.parquet', index=False)

In [30]:
dftest = pd.read_parquet('../csv/non_bank_financial_report_v2_2.parquet')

In [31]:
df_corp

Unnamed: 0,category_code,data,stock_code,quarter,year,date_added
11,IS_001,1.985372e+07,HSG,0,2016,2016-12-30
12,IS_001,2.847371e+07,HSG,0,2017,2017-12-30
13,IS_001,3.419811e+07,HSG,0,2018,2018-12-30
14,IS_001,2.712138e+07,HSG,0,2019,2019-12-30
15,IS_001,3.034337e+07,HSG,0,2020,2020-12-30
...,...,...,...,...,...,...
2199,CF_070,6.438210e+05,DXG,3,2023,2023-09-30
2200,CF_070,2.760686e+05,DXG,4,2023,2023-12-30
2201,CF_070,1.399343e+06,DXG,1,2024,2024-03-30
2202,CF_070,1.069653e+06,DXG,2,2024,2024-06-30


In [37]:
df_bank = pd.read_parquet('../csv/bank_financial_report_v3.parquet')
df_sec = pd.read_parquet('../csv/securities_financial_report_v3.parquet')
df_corp = pd.read_parquet('../csv/non_bank_financial_report_v3.parquet')

In [38]:
df_bank.rename(columns={'category_code': 'bank_code'}, inplace=True)
df_sec.rename(columns={'category_code': 'sec_code'}, inplace=True)
df_corp.rename(columns={'category_code': 'corp_code'}, inplace=True)

In [39]:
df_sec = pd.merge(df_sec, df[['sec_code', 'universal_code']], how='left', on='sec_code')
df_corp = pd.merge(df_corp, df[['corp_code', 'universal_code']], how='left', on='corp_code')
df_bank = pd.merge(df_bank, df[['bank_code', 'universal_code']], how='left', on='bank_code')
# df_bank.drop(columns=['bank_code'], inplace=True)
# df_sec.drop(columns=['sec_code'], inplace=True)
# df_corp.drop(columns=['corp_code'], inplace=True)
df_bank.rename(columns={'bank_code': 'category_code'}, inplace=True)
df_sec.rename(columns={'sec_code': 'category_code'}, inplace=True)
df_corp.rename(columns={'corp_code': 'category_code'}, inplace=True)

In [40]:
df_fs = pd.concat([df_bank, df_sec, df_corp], ignore_index=True)

In [41]:
df_fs

Unnamed: 0,category_code,data,stock_code,quarter,year,date_added,universal_code
0,IS_003,2.339361e+07,BID,0,2016,2016-12-30,IS_020
1,IS_003,3.095533e+07,BID,0,2017,2017-12-30,IS_020
2,IS_003,3.472085e+07,BID,0,2018,2018-12-30,IS_020
3,IS_003,3.597781e+07,BID,0,2019,2019-12-30,IS_020
4,IS_003,3.579680e+07,BID,0,2020,2020-12-30,IS_020
...,...,...,...,...,...,...,...
755718,CF_070,6.438210e+05,DXG,3,2023,2023-09-30,CF_130
755719,CF_070,2.760686e+05,DXG,4,2023,2023-12-30,CF_130
755720,CF_070,1.399343e+06,DXG,1,2024,2024-03-30,CF_130
755721,CF_070,1.069653e+06,DXG,2,2024,2024-06-30,CF_130


In [42]:
df_fs.isna().sum()

category_code       0
data                0
stock_code          0
quarter             0
year                0
date_added          0
universal_code    215
dtype: int64

In [43]:
df_fs[df_fs['universal_code'].isnull()]

Unnamed: 0,category_code,data,stock_code,quarter,year,date_added,universal_code
115594,CF_002,72135.105403,MBS,0,2016,2016-12-30,
115595,CF_002,291083.138880,MBS,0,2017,2017-12-30,
115596,CF_002,267197.150144,MBS,0,2018,2018-12-30,
115597,CF_002,102138.983852,MBS,0,2019,2019-12-30,
115598,CF_002,126894.184265,MBS,0,2020,2020-12-30,
...,...,...,...,...,...,...,...
161137,CF_002,112484.132802,ORS,3,2023,2023-09-30,
161138,CF_002,126391.229291,ORS,4,2023,2023-12-30,
161139,CF_002,84385.968936,ORS,1,2024,2024-03-30,
161140,CF_002,128185.117730,ORS,2,2024,2024-06-30,


In [44]:
df_fs.dropna(subset=['universal_code'], inplace=True)

In [45]:
df_fs.to_parquet('../csv/financial_statement_v3.parquet', index=False)