In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window
import datetime
import calendar
from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd

#riskpf_path = '/group/axa_malaysia/data/adm_riskpf'
#transv_pr_gr_psea_path = 'data/sas_402/transv_pr_gr_psea.parquet'
#transv_polhistory_psea_path = 'data/sas_401/transv_polhistory_psea.parquet'
#transv_pol_psea_path = 'data/sas_401/transv_pol_psea.parquet'
#acc_yrm = 201707

def format_date(strdate):
    try: 
        return datetime.strptime(str(strdate),'%Y%m%d').strftime('%Y-%m-%d')
    except: 
        return '2999-12-31'
_format_date = udf(format_date,StringType())

def pillar3(riskpf_path, transv_pr_gr_psea_path, transv_polhistory_psea_path, transv_pol_psea_path, acc_yrm, output_folder='data/sas_406/' ):
    # /****************************************************************************************
    # *																						*
    # *																						*
    # *						P3 TABLE: GWP & RISK EXPOSURE 									*
    # *			(KEY: chdrnum, rskno, tranno, zrenno, D_from, rskno) 						*
    # *																						*
    # *																						*
    # ****************************************************************************************/

    # /****************************************************************************************
    # *																						*
    # *						(1) SUMMARIZE PREMIUM TABLE BY rskno & tranno 					*
    # *																						*
    # ****************************************************************************************/
    # Remove 'future' rows - not in scope. The summary is by riskno and tranno only.
    # The addition variables is added just for additional information - for example tranno will change with trantype (at least) but 
    # not the other way round

    riskpf = spark.read.parquet(riskpf_path)
    transv_pr_gr_psea = spark.read.parquet(transv_pr_gr_psea_path)
    transv_polhistory_psea = spark.read.parquet(transv_polhistory_psea_path)
    transv_pol_psea = spark.read.parquet(transv_pol_psea_path)

    # Testing script - this one to be used for testing
    prem = transv_pr_gr_psea\
    .filter(col('yrm') <= acc_yrm)\
    .groupBy('chdrnum','tranno','rskno','d_eff','d_exp','trantype','yrm').sum('gwp','cwp','gwc','cwc')\
    .withColumnRenamed('sum(gwp)','gwp').withColumnRenamed('sum(cwp)','cwp')\
    .withColumnRenamed('sum(gwc)','gwc').withColumnRenamed('sum(cwc)','cwc')

    # Testing script - this one to be used for testing
    polhist = transv_polhistory_psea[['chdrnum','tranno','zrenno','d_cancel']].orderBy('chdrnum','tranno')
    polhist.cache()

    prem2 = prem.join(polhist, on=['chdrnum','tranno'], how='left')\
    .withColumn('gwp2',col('gwp')-col('cwp'))\
    .withColumn('gwc2',col('gwc')-col('cwc'))\
    .drop('gwp','gwc').withColumnRenamed('gwp2','gwp').withColumnRenamed('gwc2','gwc')\
    .groupBy('chdrnum','rskno','zrenno','d_eff','d_exp').sum('gwp','gwc')\
    .withColumnRenamed('sum(gwp)','gwp').withColumnRenamed('sum(gwc)','gwc')\
    .filter(abs(col('gwp'))>0.01)

    # /********************************************************************************************************************
    # *																													*
    # *	(3) IN THE NEWLY MERGED TABLE, INDEX THE DIFFERENT PERIODS (chdrnum * rskno * zrenno * d_from * d_to) 			*
    # *							AND CALCULATE THE DAILY RATES FOR EACH PERIOD 											*
    # *																													*
    # ********************************************************************************************************************/
    prem3 = prem2\
    .withColumn('daily_rate_p',(col('gwp')/(datediff(col('d_exp'),col('d_eff'))+1)))\
    .withColumn('daily_rate_c',(col('gwc')/(datediff(col('d_exp'),col('d_eff'))+1)))\
    .withColumn('period', row_number().over(Window.partitionBy('chdrnum','rskno','zrenno').orderBy('chdrnum','rskno','zrenno','d_eff','d_exp')))\
    .drop('gwp','gwc')
    prem3.cache()

    # /********************************************************************************************************************
    # *																													*
    # *		(4) TO SPLIT THE DIFFERENT PERIODS THE RIGHT AMOUNT OF TIMES, GET LIST OF ALL POSSIBLE 						*
    # *								EFFECTIVE DATES WITHIN EACH PERIOD 													*
    # *																													*
    # ********************************************************************************************************************/
    listdates = prem3[['chdrnum','rskno','zrenno','d_eff','d_exp']]\
    .withColumn('d_temp',explode(array(col('d_eff'),col('d_exp')))).drop('d_eff','d_exp')\
    .orderBy('chdrnum','rskno','zrenno','d_temp')\
    .dropDuplicates(['chdrnum','rskno','zrenno','d_temp'])
    listdates.cache()

    # /*********************************************************************************
    # *																				 *
    # *	(5) GET THE NUMBER OF PERIODS FOR EACH POLICY x RISK PERIOD (zrenno). 		 *
    # *	 		WHEN THERE IS ONLY 1, WE WON'T SPLIT THE DATES. 					 *
    # *				   MAP THE COUNT TO THE LISTDATES TABLE 	 					 *
    # *																				 *
    # *********************************************************************************/
    nbperiod = prem3[['chdrnum','rskno','zrenno','period']]\
    .orderBy('chdrnum','rskno','zrenno')\
    .withColumn('periodcount',col('period'))\
    .withColumn('last',row_number().over(Window.partitionBy('chdrnum','rskno','zrenno').orderBy(desc('chdrnum'),
                                                                                                desc('rskno'),
                                                                                                desc('zrenno'),
                                                                                                desc('periodcount'))))\
    .filter(col('last')==1).drop('last')

    nbperiod.cache()

    nbperiod_inter = nbperiod.select('chdrnum','rskno','zrenno','periodcount')
    listdates2 = listdates.join(nbperiod_inter,on=['chdrnum','rskno','zrenno'],how='left')

    # /*********************************************************************************
    # *																				 *
    # *	(6) IN THE PREMIUM SUMMARY TABLE, MAP THE DIFFERENT EFFECTIVE DATES ON 		 *
    # *		THE PERIODS WHERE THE TOTAL COUNT OF PERIODS PER zrenno IS MORE THAN 1 	 *
    # * 				(RQ: THIS WILL NATURALLY DUPLICATE THE RECORDS) 			 	 *
    # *																				 *
    # *********************************************************************************/
    cond = [prem3['chdrnum']==listdates2['chdrnum'],
            prem3['rskno']==listdates2['rskno'],
            prem3['zrenno']==listdates2['zrenno'], 
            (((prem3['d_eff'] <= listdates2['d_temp']) & (listdates2['d_temp'] <=prem3['d_exp']) ) & (listdates2['periodcount'] > 1))]
    prem4 = prem3.join(listdates2,cond,how='left').select([prem3[xx] for xx in prem3.columns]+[listdates2['d_temp']])\
    .orderBy('chdrnum','rskno','zrenno','period','d_temp')
    prem4.cache()

    # /*********************************************************************************
    # *																				 *
    # *		(7) SPLIT THE RECORDS BY ALL THE DIFFERENT EFFECTIVE DATES 				 *
    # *						WHICH WERE MAPPED IN STEP 6 		 					 *
    # *																				 *
    # *********************************************************************************/
    prem5 = prem4\
    .withColumn('d_temp_prev', lag(col('d_temp')).over(Window.partitionBy('chdrnum','rskno','zrenno','period').orderBy('d_temp')))\
    .withColumn('d_count', count(col('chdrnum')).over(Window.partitionBy('chdrnum','rskno','zrenno','period')))\
    .withColumn('d_from', when(col('d_temp')==date_add(col('d_temp_prev'),1), col('d_temp')).otherwise(date_add(col('d_temp_prev'),1)))\
    .withColumn('d_to',  when(col('d_temp')==date_add(col('d_temp_prev'),1), col('d_temp')).otherwise(date_add(col('d_temp'),-1)))\
    .withColumn('first', struct(col('d_from').alias('from'), col('d_to').alias('to')))\
    .withColumn('second', struct(col('d_temp').alias('from'), col('d_temp').alias('to')))\
    .withColumn('final', explode(array(col('first'), col('second'))))\
    .withColumn('d_from', when(col('d_count')==1, col('d_eff')).otherwise(col('final')['from']))\
    .withColumn('d_to', when(col('d_count')==1, col('d_exp')).otherwise(col('final')['to']))\
    .drop("first","second","final")\
    .filter(col('d_from').isNotNull())\
    .drop_duplicates(['chdrnum','rskno','zrenno','d_from','d_to','period'])\
    .groupBy('chdrnum','rskno','zrenno','d_from','d_to').sum('daily_rate_p','daily_rate_c')\
    .withColumnRenamed('sum(daily_rate_p)','daily_rate_p').withColumnRenamed('sum(daily_rate_c)','daily_rate_c')
    prem5.cache()

    # /************************************************************************************
    # *																				  	*
    # *  (8) FOR EACH RECORD IN OUR NEW PREM5, WE'D LIKE A tranno TO BE ABLE TO MERGE   	*
    # *							WITH ANY RISK TABLE LATER ON.							*
    # *		WE FIND THE CORRESPONDING tranno IN RISKPF BASED ON THE EFFECTIVE DATES	  	*
    # *  (d_from IN PREM5 MATCHING dteeff IN RISKPF (OR THE CLOSEST MATCH IN THE zrenno).	*
    # *			FOR THE ONES WHERE THERE IS NO POSSIBLE MATCHING tranno,				*
    # *					  WE OUTPUT THE RECORD IN ERROR TABLE.							*
    # *																				  	*
    # ************************************************************************************/
    riskpf2 = riskpf[['chdrno','tranno','rskno','dteatt','dteeff','dteter','datime','rsktyp','recformat']]\
    .orderBy(['chdrno','tranno','rskno','datime'],ascending=[1,1,1,0]).drop('datime')\
    .dropDuplicates(['chdrno','tranno','rskno'])

    polhist_risk = riskpf2.join(polhist.withColumnRenamed('chdrnum','chdrno'),on=['chdrno','tranno'],how='inner')\
    .withColumn('d_starteff', to_date(_format_date(col('dteeff'))))\
    .select('chdrno','rskno','zrenno','tranno','d_starteff','dteter','rsktyp','recformat')\
    .orderBy(['chdrno','rskno','zrenno','d_starteff','dteter','tranno'],ascending=[1,1,1,0,0,0])
    polhist_risk.cache()

    # /**************************************************************************************************************
    # *  We Map the closest transaction of RISKPF to our PREM TABLE. If not a perfect match (d_starteff NE d_from)  *
    # *							 then we move on to the previous d_starteff.									  *
    # * 		We also map the lob and date of cancellation (useful when we calculate exposure) from Policy Header   *
    # **************************************************************************************************************/

    # Read in pol_psea
    pol_psea = transv_pol_psea[['chdrnum','d_cancel','chdrstcdc']]
    # Have to define udf to handle dynamic substring function
    _substring_udf = udf(lambda x: x[0:len(x)-3])
    polhist_risk2 = polhist_risk.withColumnRenamed('chdrno','chdrnum').withColumn('rsktabl', _substring_udf(col('recformat'))).drop('recformat')


    big = prem5.join(polhist_risk2, on=(['chdrnum','rskno','zrenno']),how='inner')\
    .join(pol_psea, on=(['chdrnum']), how ='left')
    big.cache()

    prem6 = big.filter(col('d_from') >= col('d_starteff'))\
    .withColumn('indicator',row_number().over(
            Window.partitionBy('chdrnum','rskno','zrenno','d_from','d_to').orderBy(desc('d_starteff'),desc('dteter'),desc('tranno'))))\
    .filter(col('indicator') == 1)\
    .select('chdrnum','zrenno','rskno','d_from','d_to','d_cancel','tranno','chdrstcdc','daily_rate_p','daily_rate_c','rsktyp','rsktabl')

    error_inter = big.withColumn('condition',when(col('d_from') >= col('d_starteff'), lit(1)).otherwise(lit(0)))\
    .groupBy('chdrnum','rskno','zrenno').sum('condition').filter(col('sum(condition)')==0)

    error = big.join(error_inter,on=(['chdrnum','rskno','zrenno']),how='inner')\
    .select('chdrnum','zrenno','rskno','d_from','d_to','d_cancel','chdrstcdc','daily_rate_p','daily_rate_c','rsktyp','rsktabl')

    #/**********************************************************************************
    #*																				  *
    #*	  (9) FOR SEVERAL RECORDS, WE MAY NOT HAVE FOUND ANY POSSIBLE CANDIDATE 	  *
    #*						FOR A tranno WITHIN THE SAME POI. 						  *
    #*			WE THEN MAP THE INFO FROM THE FIRST RECORD OF POLHIST_RISK.			  *
    #*																				  *
    #**********************************************************************************/

    cond = [error['chdrnum']==polhist_risk['chdrno'],
            error['rskno'] == polhist_risk['rskno']]
    error2 = error.join(polhist_risk, cond, how='left')\
    .select([error[xx] for xx in error.columns] + [polhist_risk['tranno'],polhist_risk['d_starteff']])\
    .orderBy('chdrnum','rskno','zrenno','d_from','d_to',desc('d_starteff'))\
    .dropDuplicates(['chdrnum','rskno','zrenno','d_from','d_to']).drop('d_starteff')

    # /**********************************************************************************
    # *																				  *
    # * (10) FINAL TABLE IS THE CONCATENATION OF THE PREVIOUS AND THE CORRECTED ERRORS. *
    # *																				  *
    # **********************************************************************************/

    transv_p3_psea = prem6.union(error2.select(prem6.columns)).orderBy('chdrnum','rskno','zrenno','d_from')
    transv_p3_psea.write.parquet('{}transv_p3_psea.parquet'.format(output_folder))

In [2]:
#transv_p3_psea_path = 'data/sas_406/transv_p3_psea.parquet'
#acc_yr = 2017
#acc_mth = 7
#period = 5

def loop_yrm(transv_p3_psea_path, acc_yr, acc_mth, period, output_folder='data/sas_406/'):
    
    transv_p3_psea = spark.read.parquet(transv_p3_psea_path)
    transv_p3_psea.cache()

    # /* LOOP ON DIFFERENT YRMs */
    earn_start = datetime.datetime.strptime('{}-01-01'.format(acc_yr - (period + 2)), '%Y-%m-%d')
    earn_end = datetime.date(acc_yr, acc_mth, calendar.monthrange(acc_yr,acc_mth)[1])

    # Compute the number of years required to run all months
    n = relativedelta(earn_end, earn_start)
    n = n.years*12 + n.months + 1

    list_range = []
    for i in range(np.abs(n)):
        u_bound = earn_end - relativedelta(months=(i)) # Subtracting those months might push to different year
        l_bound = datetime.date(u_bound.year,u_bound.month, 1) # Same as u_bound, just beginning of month
        year_length = (datetime.date(u_bound.year,12,31) - datetime.date(u_bound.year,1,1)).days + 1
        yrm = int(u_bound.strftime('%Y%m'))
        list_range.append(
        {'u_bound':u_bound,
          'l_bound':l_bound,
          'year_length':year_length,
          'yrm':yrm})
    list_range_bc = sc.broadcast(list_range)


    def get_valid_range(d_from, d_to):
        return [d for d in list_range_bc.value if ((d_from <= d["u_bound"]) and (d_to >= d["l_bound"]))]
    _get_valid_range = udf(get_valid_range, ArrayType(
            StructType(
                [StructField('u_bound',DateType()),
                StructField('l_bound',DateType()),
                StructField('year_length',ShortType()),
                StructField('yrm',IntegerType())])))

    p3_psea_monthly = transv_p3_psea.withColumn('temp', _get_valid_range(col('d_from'),col('d_to')))\
    .withColumn('temp',explode('temp'))\
    .withColumn('u_bound',col('temp')['u_bound'])\
    .withColumn('l_bound',col('temp')['l_bound'])\
    .withColumn('year_length',col('temp')['year_length'])\
    .withColumn('yrm',col('temp')['yrm']).drop('temp')\
    .withColumn('gep',((datediff(least(col('d_to'),col('u_bound')), greatest(col('d_from'),col('l_bound')) )+1) * col('daily_rate_p')))\
    .withColumn('gec',((datediff(least(col('d_to'),col('u_bound')), greatest(col('d_from'),col('l_bound')) )+1) * col('daily_rate_c')))\
    .withColumn('exp', when(col('d_cancel') < col('l_bound'), lit(0))\
               .when(col('d_cancel') > col('u_bound'), datediff(col('u_bound'),greatest(col('d_from'),col('l_bound')))/col('year_length'))\
               .otherwise(datediff(least(col('d_to'),col('u_bound')), date_add(greatest(col('d_from'),col('l_bound')),1))/col('year_length')))\
    .withColumn('rif', when(col('d_cancel') < col('l_bound'), lit(0))\
                .when( col('d_cancel') > col('u_bound'), 
                     when((col('d_from') <= col('u_bound')) & (col('d_to') > col('u_bound')), lit(1)).otherwise(lit(0)))\
                .otherwise(lit(0))
               )\
    .groupBy('chdrnum','rskno','zrenno','tranno','rsktyp','rsktabl','chdrstcdc','yrm')\
    .sum('gep','gec','exp','rif')\
    .withColumnRenamed('sum(gep)','gep').withColumnRenamed('sum(gec)','gec')\
    .withColumnRenamed('sum(exp)','exp').withColumnRenamed('sum(rif)','rif')

    p3_psea_monthly.write.parquet('{}transv_p3_psea_monthly.parquet'.format(output_folder))

In [3]:
#transv_cl_quali_psea_path = 'data/sas_403/transv_cl_quali_psea.parquet'
#transv_cl_quanti_psea_path = 'data/sas_403/transv_cl_quanti_psea.parquet'
#transv_polhistory_psea_path = 'data/sas_401/transv_polhistory_psea.parquet'
#transv_p3_psea_path = 'data/sas_406/transv_p3_psea.parquet'
#transv_p3_psea_monthly_path = 'data/sas_406/transv_p3_psea_monthly.parquet'
#adm_mapping_path = 'ADM Mapping.xlsm'
#acc_yr = 2017
#acc_mth = 7
#period = 5
#delay = 2

def pillar3_psea_12rm(transv_cl_quali_psea_path, 
                      transv_cl_quanti_psea_path, 
                      transv_polhistory_psea_path,
                      transv_p3_psea_path, 
                      transv_p3_psea_monthly_path,
                      adm_mapping_path, 
                      acc_yr, acc_mth, period, delay, 
                      output_folder='data/sas_406/'):
    
    transv_cl_quali_psea = spark.read.parquet(transv_cl_quali_psea_path)
    transv_cl_quanti_psea = spark.read.parquet(transv_cl_quanti_psea_path)
    transv_polhistory_psea = spark.read.parquet(transv_polhistory_psea_path)
    transv_p3_psea = spark.read.parquet(transv_p3_psea_path)
    transv_p3_psea_monthly = spark.read.parquet(transv_p3_psea_monthly_path)
    act_claim_largeclaim = spark.createDataFrame(pd.read_excel(adm_mapping_path,sheetname = '112'))

    # Prepare claims
    claim_quali = transv_cl_quali_psea[['claim','chdrnum','rskno','d_occ','chdrstcdc','natcat']]\
    .orderBy('claim').dropDuplicates(['claim'])\
    .withColumn('loss_yrm',(year(col('d_occ')) * 100 + month(col('d_occ'))))

    # Get the mapping of trannos from P3
    trannomap = transv_p3_psea[['chdrnum','rskno','d_from','d_to','zrenno','tranno','rsktabl']]

    cond = [(claim_quali['chdrnum'] == trannomap['chdrnum']) &
            (claim_quali['rskno'] == trannomap['rskno']) &
            ((claim_quali['d_occ'] >= trannomap['d_from']) &
            (claim_quali['d_occ'] <= trannomap['d_to']))]

    claimquali2 = claim_quali.join(trannomap,cond,how='left')\
    .select([claim_quali[c] for c in claim_quali.columns] + 
            [trannomap['zrenno'], trannomap['tranno'], trannomap['d_from'], trannomap['rsktabl']])\
    .orderBy('claim','zrenno','d_from')\
    .dropDuplicates(['claim']).drop('d_from','d_to')

    claimquanti = transv_cl_quanti_psea[['claim','yrm','tranno','d_tran','clstat','gpay','gmov']]\
    .groupBy('claim','tranno','d_tran','yrm','clstat').sum('gpay','gmov')\
    .withColumnRenamed('sum(gpay)','gpay').withColumnRenamed('sum(gmov)','gmov').withColumnRenamed('tranno','tranno_cl')

    claim = claimquali2.join(claimquanti,on='claim',how='inner')

    # Start of loops
    p3_start = datetime.datetime.strptime('{}-01-01'.format(acc_yr - (period + 2)), '%Y-%m-%d')
    p3_end = datetime.date(acc_yr, acc_mth, calendar.monthrange(acc_yr,acc_mth)[1])

    # Computer the number of steps required to run all months
    n = relativedelta(p3_end, p3_start)
    n = n.years*12 + n.months + 1

    list_range = []
    for i in range(np.abs(n)):
        vision_yrm = int((p3_end - relativedelta(months=(i))).strftime('%Y%m'))
        l_bound_yrm = int((p3_end - relativedelta(months=(i)) - relativedelta(months=(11+delay))).strftime('%Y%m'))
        u_bound_yrm = int((p3_end - relativedelta(months=(i)) - relativedelta(months=(delay))).strftime('%Y%m'))
        list_range.append(
        {'vision_yrm':vision_yrm,
          'l_bound_yrm':l_bound_yrm,
          'u_bound_yrm':u_bound_yrm})
    list_range_bc = sc.broadcast(list_range)


    def get_valid_range_premium(yrm):
        return [d for d in list_range_bc.value if ((yrm >= d["l_bound_yrm"]) and (yrm <= d["u_bound_yrm"]))]
    def get_valid_range_claim(loss_yrm, yrm):
        return [d for d in list_range_bc.value if 
                ((loss_yrm >= d["l_bound_yrm"]) and (loss_yrm <= d["u_bound_yrm"]) and (yrm <= d["vision_yrm"]))]

    _get_valid_range_premium = udf(get_valid_range_premium, ArrayType(
            StructType(
                [StructField('vision_yrm',IntegerType()),
                StructField('l_bound_yrm',IntegerType()),
                StructField('u_bound_yrm',IntegerType())])))
    _get_valid_range_claim = udf(get_valid_range_claim, ArrayType(
            StructType(
                [StructField('vision_yrm',IntegerType()),
                StructField('l_bound_yrm',IntegerType()),
                StructField('u_bound_yrm',IntegerType())])))

    # 12 RM premium from p3_monthly
    premiums = transv_p3_psea_monthly.withColumn('temp', _get_valid_range_premium(col('yrm')))\
    .withColumn('temp',explode('temp'))\
    .withColumn('yrm',col('temp')['vision_yrm'])\
    .withColumn('l_bound_yrm',col('temp')['l_bound_yrm'])\
    .withColumn('u_bound_yrm',col('temp')['u_bound_yrm'])\
    .drop('temp')\
    .groupBy('chdrnum','rskno','zrenno','tranno','rsktabl','yrm').sum('gep','gec','exp','rif')\
    .withColumnRenamed('sum(gep)','gep').withColumnRenamed('sum(gec)','gec')\
    .withColumnRenamed('sum(exp)','exp').withColumnRenamed('sum(rif)','rif')

    # 12 RM claims - split between attritional/catnat/large + exclude claims closed @ nil from the count
    tempcl = claim.withColumn('temp', _get_valid_range_claim(col('loss_yrm'),col('yrm')))\
    .withColumn('temp',explode('temp'))\
    .withColumn('yrm',col('temp')['vision_yrm'])\
    .withColumn('l_bound_yrm',col('temp')['l_bound_yrm'])\
    .withColumn('u_bound_yrm',col('temp')['u_bound_yrm'])\
    .drop('temp')\
    .withColumn('lastclaim',row_number().over(Window.partitionBy('yrm','claim').orderBy(desc('tranno_cl'),desc('d_tran'))))\
    .withColumn('gpay_ult',sum('gpay').over(Window.partitionBy('yrm','claim','tranno','d_tran').orderBy('tranno_cl','d_tran')))\
    .withColumn('gmov_ult',sum('gmov').over(Window.partitionBy('yrm','claim','tranno','d_tran').orderBy('tranno_cl','d_tran')))\
    .filter(col('lastclaim')==1)

    tempcl = tempcl.join(act_claim_largeclaim, on='chdrstcdc',how='left')\
    .withColumn('threshold_breach', 
                when(col('threshold').isNotNull() & ((col('gpay_ult') + col('gmov_ult')) > col('threshold')),
                     True).otherwise(False))\
    .withColumn('ginc_nc', when(col('natcat')==1, col('gpay_ult')+col('gmov_ult')).otherwise(lit(0)) )\
    .withColumn('nbclaim_nc', when(col('natcat')==1, lit(1)).otherwise(lit(0)) )\
    .withColumn('ginc_l', 
                when((col('natcat') != 1) & (col('threshold_breach')==True), col('gpay_ult') + col('gmov_ult')).otherwise(lit(0)) )\
    .withColumn('nbclaim_l', when((col('natcat') != 1) & (col('threshold_breach')==True), lit(1)).otherwise(lit(0)) )\
    .withColumn('ginc_a',
                when((col('natcat') != 1) & (col('threshold_breach')==False), col('gpay_ult') + col('gmov_ult')).otherwise(lit(0)) )\
    .withColumn('nbclaim_a', when((col('natcat') != 1) & (col('threshold_breach')==False), lit(1)).otherwise(lit(0)) )\
    .withColumn('closed_notpaid', when(((col('clstat')==lit('2')) & (abs(col('gpay_ult') + col('gmov_ult')) < 0.01)), True).otherwise(False))\
    .withColumn('nbclaim_nc', when((col('closed_notpaid')==True), lit(0)).otherwise(col('nbclaim_nc')))\
    .withColumn('nbclaim_l', when((col('closed_notpaid')==True), lit(0)).otherwise(col('nbclaim_l')))\
    .withColumn('nbclaim_a', when((col('closed_notpaid')==True), lit(0)).otherwise(col('nbclaim_a')))\
    .select('chdrnum','rskno','zrenno','tranno','yrm','rsktabl','ginc_a','ginc_l','ginc_nc','nbclaim_a','nbclaim_l','nbclaim_nc')

    claims = tempcl.groupBy('chdrnum','rskno','zrenno','tranno','yrm','rsktabl')\
    .sum('ginc_a','ginc_l','ginc_nc','nbclaim_a','nbclaim_l','nbclaim_nc')\
    .withColumnRenamed('sum(ginc_a)','ginc_a').withColumnRenamed('sum(ginc_l)','ginc_l').withColumnRenamed('sum(ginc_nc)','ginc_nc')\
    .withColumnRenamed('sum(nbclaim_a)','nbclaim_a').withColumnRenamed('sum(nbclaim_l)','nbclaim_l').withColumnRenamed('sum(nbclaim_nc)','nbclaim_nc')


    # Setting up the premiums and claims dataset for merging
    cmissing = set(premiums.columns).difference(set(claims.columns))
    pmissing = set(claims.columns).difference(set(premiums.columns))

    for i in iter(pmissing):
        premiums = premiums.withColumn(i,lit(None))

    for i in iter(cmissing):
        claims = claims.withColumn(i,lit(None))

    p3_12rm = premiums.union(claims.select(premiums.columns))

    p3_12rm = p3_12rm.fillna(0)
    p3_psea_12rm = p3_12rm.groupBy('chdrnum','rskno','zrenno','tranno','yrm','rsktabl')\
    .sum('gep','gec','exp','rif','ginc_a','ginc_l','ginc_nc','nbclaim_a','nbclaim_l','nbclaim_nc')\
    .withColumnRenamed('sum(gep)','gep').withColumnRenamed('sum(gec)','gec').withColumnRenamed('sum(exp)','exp')\
    .withColumnRenamed('sum(rif)','rif').withColumnRenamed('sum(ginc_a)','ginc_a').withColumnRenamed('sum(ginc_l)','ginc_l')\
    .withColumnRenamed('sum(ginc_nc)','ginc_nc').withColumnRenamed('sum(nbclaim_a)','nbclaim_a')\
    .withColumnRenamed('sum(nbclaim_l)','nbclaim_l').withColumnRenamed('sum(nbclaim_nc)','nbclaim_nc')

    transv_p3_psea_12rm = p3_psea_12rm.join(transv_polhistory_psea[['chdrnum','tranno','cnttype','chdrstcdc','agentid']], 
                                      on=['chdrnum','tranno'], how = 'left')
    transv_p3_psea_12rm.write.parquet('{}transv_p3_psea_12rm.parquet'.format(output_folder))

In [4]:
riskpf_path = '/group/axa_malaysia/data/adm_riskpf'
transv_pr_gr_psea_path = 'data/sas_402/transv_pr_gr_psea.parquet'
transv_polhistory_psea_path = 'data/sas_401/transv_polhistory_psea.parquet'
transv_pol_psea_path = 'data/sas_401/transv_pol_psea.parquet'
acc_yrm = 201707
pillar3(riskpf_path, transv_pr_gr_psea_path, transv_polhistory_psea_path, transv_pol_psea_path, acc_yrm)

In [5]:
transv_p3_psea_path = 'data/sas_406/transv_p3_psea.parquet'
acc_yr = 2017
acc_mth = 7
period = 5
loop_yrm(transv_p3_psea_path, acc_yr, acc_mth, period)

In [6]:
transv_cl_quali_psea_path = 'data/sas_403/transv_cl_quali_psea.parquet'
transv_cl_quanti_psea_path = 'data/sas_403/transv_cl_quanti_psea.parquet'
transv_polhistory_psea_path = 'data/sas_401/transv_polhistory_psea.parquet'
transv_p3_psea_path = 'data/sas_406/transv_p3_psea.parquet'
transv_p3_psea_monthly_path = 'data/sas_406/transv_p3_psea_monthly.parquet'
adm_mapping_path = 'ADM Mapping.xlsm'
acc_yr = 2017
acc_mth = 7
period = 5
delay = 2
pillar3_psea_12rm(transv_cl_quali_psea_path, transv_cl_quanti_psea_path, transv_polhistory_psea_path,
                 transv_p3_psea_path, transv_p3_psea_monthly_path, adm_mapping_path,
                 acc_yr, acc_mth, period, delay)

In [18]:
spark.read.parquet('/user/cchin/data/sas_406/transv_p3_psea.parquet').count()

15742409

In [19]:
spark.read.parquet('/user/cchin/data/sas_406/transv_p3_psea_monthly.parquet').count()

118053853

In [20]:
spark.read.parquet('/user/cchin/data/sas_406/transv_p3_psea_12rm.parquet').count()

195550693