In [1]:
import tabula
import os
import pandas as pd
%config IPCompleter.greedy=True
# Raise pandas' display limits so wide/long frames are shown without truncation
# (up to 50 columns and 100 rows).
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [2]:
source_folder = '../data/output/'

# os.listdir() returns directory entries in arbitrary order, so sort the
# listing before selecting files by position; otherwise the negative indices
# below may pick different files from one run (or machine) to the next.
# Note the sort is lexicographic on the file name.
list_of_files = sorted(os.listdir(path=source_folder))

# filepath1 is the page that holds the table; filepath2 is the following
# page, used only when the table continues there.  To process the last two
# files in the folder instead, use indices [-2] and [-1].
filepath1 = os.path.join(source_folder, list_of_files[-6])
filepath2 = os.path.join(source_folder, list_of_files[-5])

print('filepath1:',filepath1)
print('filepath2:',filepath2)

# Identifiers written into the 'company' and 'year' columns of the result.
# NOTE(review): the recorded output of this cell shows files named
# c172_2016_page_*.pdf while record_year is 2018 -- confirm which year is right.
company_prefix = 'c172'
record_year = 2018

filepath1: ../data/output/c172_2016_page_53.pdf
filepath2: ../data/output/c172_2016_page_54.pdf


In [3]:
# Extract the table from the first page.
# multiple_tables=False requests single-table extraction explicitly.
# tabula-py >= 2.0 wraps the result in a list even then, whereas older
# releases return the DataFrame directly, so unwrap when a list comes back.
_page1_tables = tabula.read_pdf(filepath1, multiple_tables=False)
df = _page1_tables[0] if isinstance(_page1_tables, list) else _page1_tables

In [4]:
# Target column labels, in left-to-right order; both rename maps below
# map their own source labels onto this same sequence.
_class_labels = [
    'mac', 'mahl', 'fire', 'motor', 'wic', 'pa', 'health',
    'pub_lia', 'bonds', 'cnstr_engr', 'prof_indm', 'cpr', 'others', 'total',
]

# Source labels of the frame read with a header row (string column names).
_header_labels = [
    'Unnamed: 3', 'Marine and', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
    'Unnamed: 8', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
    'Unnamed: 14', 'Misc -', 'Unnamed: 16', 'Unnamed: 18',
]

# Source labels of the frame read without a header row (integer positions).
_position_labels = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16]

to_rename1 = dict(zip(_header_labels, _class_labels))
to_rename2 = dict(zip(_position_labels, _class_labels))

In [5]:
# Column labels and row labels to discard from the raw extraction.
_unused_columns = ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 9', 'Unnamed: 17']
_unused_rows = [0, 1, 2, 3, 4, 5, 6, 7, 9, 14, 22, 24, 25, 26, 28, 33, 34, 43, 45]

# Drop them, then give the surviving columns their short class names.
df = (
    df.drop(columns=_unused_columns)
      .drop(labels=_unused_rows)
      .rename(columns=to_rename1)
)

In [6]:
# One label per remaining row, in row order (30 labels in total); the frame
# must have exactly this many rows or the assignment raises.
_row_labels = [
    # premiums
    'prem_acc_direct', 'prem_acc_rein_sg', 'prem_acc_rein_asean', 'prem_acc_rein_other',
    'prem_acc_total_rein',
    'prem_cede_rein_sg', 'prem_cede_rein_asean', 'prem_cede_rein_other', 'prem_cede_rein_total',
    'prem_write_net', 'prem_liab_begin', 'prem_liab_end', 'prem_earned',
    # claims
    'claim_set_direct', 'claim_set_rein_sg', 'claim_set_rein_asean', 'claim_set_rein_other',
    'claim_set_total_rein',
    'claim_rcv_rein_sg', 'claim_rcv_rein_asean', 'claim_rcv_rein_other', 'claim_rcv_rein_total',
    'claim_set_net', 'claim_liab_end', 'claim_liab_begin', 'claim_incur_net',
    # expenses
    'exp_management', 'exp_comm_paid', 'exp_comm_earned', 'exp_comm_incur_net',
]

# Attach the labels as a column and promote that column to the index.
df = df.assign(index_name=_row_labels).set_index('index_name')

In [7]:
# This code is needed only if the table is split across 2 pages
#
# Read the continuation page WITHOUT a header row, so the columns get integer
# labels (the drop/rename calls below rely on that).  header=None is the
# supported way to say "no header"; the former header=-1 raises ValueError on
# pandas >= 0.25.
# multiple_tables=False requests single-table extraction; tabula-py >= 2.0
# still wraps the result in a list, so unwrap when a list comes back.
_page2_tables = tabula.read_pdf(filepath2, multiple_tables=False,
                                pandas_options={'header': None})
df_next = _page2_tables[0] if isinstance(_page2_tables, list) else _page2_tables

# Discard the row labels and column positions that are not used.
df_next.drop(labels=[1,5],inplace=True)
df_next.drop(columns=[0,1,15],inplace=True)

# Label the four remaining rows and use the labels as the index.
df_next['index_name'] = ['exp_other','uw_gain','net_invest_income','operating_result']
df_next.set_index('index_name', inplace=True)

# Map the integer column positions onto the short class names.
df_next.rename(columns = to_rename2, inplace=True)

In [8]:
# Only required when the table continues onto a second page:
# stack the continuation rows underneath the first page's rows.
_page_frames = [df, df_next]
df = pd.concat(_page_frames)

In [9]:
def _to_float(cell):
    """Convert one table cell to float, removing ',' separators first (e.g. '1,234' -> 1234.0)."""
    return float(str(cell).replace(',', ''))


df = df.T # transpose table: the former columns (mac, fire, ...) become rows

# Element-wise conversion, done column by column with Series.map;
# DataFrame.applymap is deprecated in recent pandas releases.
df = df.apply(lambda col: col.map(_to_float))

# Move the row labels into an ordinary column named 'class'.
df = df.reset_index().rename(columns={'index': 'class'})

# After the transpose the leftover 'index_name' label sits on the columns
# axis, not on the index, so clear it there.  The former
# `del df.index.name` did not remove it and raises AttributeError on
# pandas >= 1.0.
df.columns.name = None

# Prepend the identifying columns; inserting 'company' first and 'year' second,
# both at position 0, yields the order: year, company, class, ...
df.insert(0, column='company', value=company_prefix)
df.insert(0, column='year', value=record_year)

In [10]:
# Show the final frame built by the cells above for a visual check.
df

index_name,year,company,class,prem_acc_direct,prem_acc_rein_sg,prem_acc_rein_asean,prem_acc_rein_other,prem_acc_total_rein,prem_cede_rein_sg,prem_cede_rein_asean,prem_cede_rein_other,prem_cede_rein_total,prem_write_net,prem_liab_begin,prem_liab_end,prem_earned,claim_set_direct,claim_set_rein_sg,claim_set_rein_asean,claim_set_rein_other,claim_set_total_rein,claim_rcv_rein_sg,claim_rcv_rein_asean,claim_rcv_rein_other,claim_rcv_rein_total,claim_set_net,claim_liab_end,claim_liab_begin,claim_incur_net,exp_management,exp_comm_paid,exp_comm_earned,exp_comm_incur_net,exp_other,uw_gain,net_invest_income,operating_result
0,2018,c172,mac,2134157.0,459437.0,0.0,0.0,459437.0,0.0,0.0,2431350.0,2431350.0,162244.0,297825.0,217931.0,242138.0,494296.0,48618.0,0.0,0.0,48618.0,0.0,0.0,478720.0,478720.0,64194.0,1639668.0,920272.0,783590.0,518869.0,169019.0,201901.0,-32882.0,0.0,-1027439.0,41998.0,-985441.0
1,2018,c172,mahl,69967.0,0.0,0.0,0.0,0.0,0.0,0.0,69967.0,69967.0,0.0,19600.0,0.0,19600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,170114.0,45461.0,124653.0,13997.0,7950.0,-161.0,8111.0,0.0,-127161.0,1133.0,-126028.0
2,2018,c172,fire,13865343.0,1880290.0,0.0,0.0,1880290.0,0.0,0.0,13474551.0,13474551.0,2271082.0,1650135.0,1735721.0,2185496.0,3777090.0,760133.0,0.0,0.0,760133.0,0.0,0.0,3061474.0,3061474.0,1475749.0,3005870.0,3874600.0,607019.0,3150036.0,801006.0,1505865.0,-704859.0,0.0,-866700.0,254975.0,-611725.0
3,2018,c172,motor,224587.0,0.0,0.0,0.0,0.0,0.0,0.0,215632.0,215632.0,8955.0,5958.0,8506.0,6407.0,327173.0,0.0,0.0,0.0,0.0,0.0,0.0,327173.0,327173.0,0.0,2088.0,0.0,2088.0,44930.0,23619.0,62983.0,-39364.0,0.0,-1247.0,3637.0,2390.0
4,2018,c172,wic,1747934.0,0.0,0.0,0.0,0.0,0.0,0.0,1747934.0,1747934.0,0.0,82514.0,60317.0,22197.0,669914.0,0.0,0.0,0.0,0.0,0.0,0.0,669914.0,669914.0,0.0,0.0,0.0,0.0,349688.0,112947.0,271027.0,-158080.0,0.0,-169411.0,28305.0,-141106.0
5,2018,c172,pa,4200125.0,401928.0,0.0,0.0,401928.0,0.0,0.0,744447.0,744447.0,3857606.0,769779.0,863142.0,3764243.0,2213168.0,132476.0,0.0,0.0,132476.0,0.0,0.0,5850.0,5850.0,2339794.0,2005757.0,1387672.0,2957879.0,920676.0,1043381.0,86356.0,957025.0,0.0,-1071337.0,74523.0,-996814.0
6,2018,c172,health,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2018,c172,pub_lia,6359008.0,41003.0,0.0,0.0,41003.0,0.0,0.0,5769931.0,5769931.0,630080.0,989031.0,557288.0,1061823.0,477818.0,0.0,0.0,0.0,0.0,0.0,0.0,262148.0,262148.0,215670.0,2835791.0,2355918.0,695543.0,1280372.0,713706.0,1429861.0,-716155.0,0.0,-197937.0,103638.0,-94299.0
8,2018,c172,bonds,1441186.0,0.0,0.0,0.0,0.0,0.0,0.0,1238045.0,1238045.0,203141.0,370218.0,339290.0,234069.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6994.0,4467.0,2527.0,288320.0,556229.0,441444.0,114785.0,0.0,-171563.0,23338.0,-148225.0
9,2018,c172,cnstr_engr,2389195.0,85538.0,0.0,0.0,85538.0,0.0,0.0,2468733.0,2468733.0,6000.0,916072.0,799582.0,122490.0,2145780.0,728988.0,0.0,0.0,728988.0,0.0,0.0,2856367.0,2856367.0,18401.0,142758.0,393485.0,-232326.0,495090.0,97911.0,293280.0,-195369.0,0.0,55095.0,40074.0,95169.0
