In [1]:
import re
import pandas as pd
import numpy as np
import string
from datetime import datetime
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window
"""
/* ***************************** */
/* Premium Tables for Policy Sea */
/* ***************************** */
"""

#adm_mapping_path = 'ADM Mapping.xlsm'
#ztrnpf_path = '/group/axa_malaysia/data/adm_ztrnpf'
#prempf_path = '/group/axa_malaysia/data/adm_prempf'
#rprmpf_co_path = '/group/axa_malaysia/data/adm_rprmpf_co'
#rprmpf_rp_path = '/group/axa_malaysia/data/adm_rprmpf_rp'
#monthend = 0



def PREMIUM_PSEA(adm_mapping_path, ztrnpf_path, prempf_path, rprmpf_co_path, rprmpf_rp_path, monthend=0, output_folder='data/sas_402/'):
    
    def format_date(strdate):
        try: 
            return datetime.strptime(str(strdate),'%Y%m%d').strftime('%Y-%m-%d')
        except: 
            return '2999-12-31'
    _format_date = udf(format_date,StringType())
    
    """
        /* 1) sacscode filter */
        /* Consider Gross Prem (FG) and Co-insurance Prem      */
        /* we will consider RP (Re-insurance premium) in  	   */
        /* the next section, since it only needs to run   	   */
        /* once a month - it is a massive table			       */
        /* 2) batctrcde filter */
        /* Only consider real financial transactions      	   */
    """

    """
        /* Map Transaction Types */
        /* Note - the mapping is limited to the filter, so the mapping is only needed for the
           transaction types applied in the filter */
        Transaction Types are read from the excel mapping files
    """
    transtype_map = pd.read_excel(adm_mapping_path, sheetname='300').set_index('batctrcde').to_dict()['trantype']
    transtype_map_bc = sc.broadcast(transtype_map)

    ztrnpf = spark.read.parquet(ztrnpf_path)[['batcactyr',
     'batcactmn',
     'rldgacct',
     'tranno',
     'ccdate',
     'effdate',
     'accnum',
     'expiry_date',
     'batctrcde',
     'sacscode',
     'trandate',
     'chdrstcdc',
     'tranamt01',
     'tranamt02',
     'tranamt03',
     'tranamt04',
     'tranamt05',
     'batcbrn']].filter( col("batctrcde").isin(["T405", "T409", "T413", "TA39", "T495", "T454"]) )#.limit(100000)
    ztrnpf_fgco = ztrnpf.filter(col("sacscode").isin("FG","CO"))
    ztrnpf_rp = ztrnpf.filter(col("sacscode")=='RP')

    def transformations(df):
        df = df.withColumn('yrm', col('batcactyr')*100+col('batcactmn'))

        #dates formatting
        df = df.withColumn('d_tran', to_date(_format_date(col('trandate'))))\
        .withColumn('d_eff', to_date(_format_date(col('effdate'))))\
        .withColumn('d_com', to_date(_format_date(col('ccdate'))))\
        .withColumn('d_exp', to_date(_format_date(col('expiry_date'))))

        df = df.drop('batcactyr','batcactmn','effdate','ccdate','expiry_date','trandate','tranamt01','tranamt02','tranamt03','tranamt04','tranamt05')

        #map transaction types
        df = df.withColumn('trantype', udf(lambda x: transtype_map_bc.value[x])(col('batctrcde')))
        return df

    df1 = ztrnpf_fgco.withColumnRenamed('rldgacct', 'chdrnum')\
                .withColumnRenamed('accnum', 'agentid')

    df2 = df1.withColumn('gwptotal', when(col('sacscode')=='FG', col('tranamt01') - col('tranamt03') ).otherwise(lit(0)) )\
    .withColumn('gwctotal', when(col('sacscode')=='FG', col('tranamt04') + col('tranamt05') ).otherwise(lit(0)) )\
    .withColumn('cwptotal', when(col('sacscode')!='FG', col('tranamt01') - col('tranamt03') ).otherwise(lit(0)) )\
    .withColumn('cwctotal', when(col('sacscode')!='FG', col('tranamt04') + col('tranamt05') ).otherwise(lit(0)) )

    df3 = transformations(df2)

    #FIRST GET BASE
    base_df = df3.filter(col('sacscode')=='FG').drop('gwptotal','gwctotal','cwptotal','cwctotal','sacscode').\
    sort('chdrnum', 'tranno').dropDuplicates(subset=['chdrnum', 'tranno'])
    base_df.cache()

    """THEN AGGREGATE ZTRNPF
        /* ****************************************************************** */
        /* Sums gwptotal gwctotal cwptotal cwctotal by chdrnum and tranno     */
        /* Since we took only FG transactions the filter is already leaving only one unique chdrnum and tranno combination, 
        this aggregation mainly used to ensure that there are only one combination.         */
        /* ****************************************************************** */
    """
    df4 = df3.groupBy('chdrnum','tranno').sum('gwptotal','gwctotal','cwptotal','cwctotal')\
    .withColumnRenamed('sum(gwptotal)','gwptotal')\
    .withColumnRenamed('sum(gwctotal)','gwctotal')\
    .withColumnRenamed('sum(cwptotal)','cwptotal')\
    .withColumnRenamed('sum(cwctotal)','cwctotal')

    #GET DETAIL OF GROSS TRANSACTIONS IN PREMPF
    prempf = spark.read.parquet(prempf_path)[["chdrno","rskno","tranno","premcl","extr01","extr02","extr03","extr04","extr05"]]
    prempf = prempf.withColumnRenamed('chdrno', 'chdrnum')\
    .withColumn('gwp',col('extr01')-col('extr03'))\
    .withColumn('gwc',col('extr04')+col('extr05'))\
    .drop("extr01","extr02","extr03","extr04","extr05")
    prempf = prempf.groupBy('chdrnum','tranno','rskno','premcl').sum('gwp','gwc')\
    .withColumnRenamed('sum(gwp)','gwp')\
    .withColumnRenamed('sum(gwc)','gwc')

    #GET DETAIL OF CO-INSURANCE TRANSACTIONS IN RPRMPF_CO
    rprmpf = spark.read.parquet(rprmpf_co_path)\
    [["chdrno","rskno","tranno","premcl","extr01","extr02","extr03","extr04","extr05"]].filter(col('sacscode') == "CO")
    rprmpf = rprmpf.withColumnRenamed('chdrno', 'chdrnum')\
    .withColumn('cwp',col('extr01')-col('extr03'))\
    .withColumn('cwc',col('extr04')+col('extr05'))\
    .drop("extr01","extr02","extr03","extr04","extr05")
    rprmpf = rprmpf.groupBy('chdrnum','tranno','rskno','premcl').sum('cwp','cwc')\
    .withColumnRenamed('sum(cwp)','cwp')\
    .withColumnRenamed('sum(cwc)','cwc')

    #MERGE GROSS AND CO-INSURANCE TABLES
    combined = prempf.join(rprmpf, on=['chdrnum','tranno','rskno','premcl'], how='left').fillna(0, subset=['cwp','cwc'])
    combined.cache()

    """
    /* Now we do a small check (not needed each time) to see if the number
       of rows in the original PREMPF containing the gross premium matches
       at least with the new combined data. Hence the number of rows in
       PREMPF should equal the new table COMBINED					      */

    /* Aggregate the Gross premium and commission on cdhrnum and tranno in
       order to finally recon with ztrnpf 								  */
    """
    #/* Since PREMPF is more granual than ZTRNPF, this aggregation should make it match with ZTRNPF
    combined_sum = combined.groupBy('chdrnum','tranno').sum("gwp","gwc","cwp","cwc")\
    .withColumnRenamed('sum(gwp)','gwp')\
    .withColumnRenamed('sum(gwc)','gwc')\
    .withColumnRenamed('sum(cwp)','cwp')\
    .withColumnRenamed('sum(cwc)','cwc')

    ##RECON ON GROSS SUB ACCOUNTS

    #This table will contain all the information will be used in the next step
    recon_pr_gr_psea_recon = df4.join(combined_sum, on=['chdrnum','tranno'], how='left').fillna(0, subset=['gwp','gwc','cwp','cwc'])\
    .filter( (abs(col('gwptotal')-col('gwp'))>0.01)|(abs(col('gwctotal')-col('gwc'))>0.01)|
             (abs(col('cwptotal')-col('cwp'))>0.01)|(abs(col('cwctotal')-col('cwc'))>0.01) )

    #This table will not contain the proportion info
    recon_pr_gr_psea_recon.cache()
    recon_pr_gr_psea_recon.write.parquet('{}recon_pr_gr_psea_recon.parquet'.format(output_folder))

    #Creating the final premium table and we are reallocating the portion of  the data that did not match
    recon = recon_pr_gr_psea_recon\
    .withColumn('propGWP',when( col('gwp')!=0, col('gwptotal')/col('gwp')).otherwise(0) )\
    .withColumn('propGWC',when( col('gwc')!=0, col('gwctotal')/col('gwc')).otherwise(0) )\
    .withColumn('propCWP',when( col('cwp')!=0, col('cwptotal')/col('cwp')).otherwise(0) )\
    .withColumn('propCWC',when( col('cwc')!=0, col('cwctotal')/col('cwc')).otherwise(0) )
    recon_gr = recon[["chdrnum","tranno","propGWP","propGWC","propCWP","propCWC"]].withColumn('recon_exist', lit(True))

    transv_pr_gr_psea = base_df.join(combined, on=['chdrnum', 'tranno'], how='inner').\
    join(recon_gr, on=['chdrnum', 'tranno'], how='left')

    transv_pr_gr_psea = transv_pr_gr_psea\
    .withColumn('gwp', when(col('recon_exist')==True, round(col('gwp')*col('propGWP'),2) ).otherwise(col('gwp')) )\
    .withColumn('gwc', when(col('recon_exist')==True, round(col('gwc')*col('propGWC'),2) ).otherwise(col('gwc')) )\
    .withColumn('cwp', when(col('recon_exist')==True, round(col('cwp')*col('propCWP'),2) ).otherwise(col('cwp')) )\
    .withColumn('cwc', when(col('recon_exist')==True, round(col('cwc')*col('propCWC'),2) ).otherwise(col('cwc')) )\
    .drop("propGWP","propGWC","propCWP","propCWC","recon_exist")

    #The number of rows in the combination missmatch (between BASE and COMBINED) because we did not apply the same filters on COMBINED (such as the financial transactions)

    transv_pr_gr_psea.write.parquet('{}transv_pr_gr_psea.parquet'.format(output_folder))

    if monthend==1:

        ztrnpf_rp1 = ztrnpf_rp.withColumnRenamed('rldgacct', 'chdrnum')\
                    .withColumnRenamed('accnum', 'accntid')

        ztrnpf_rp2 = ztrnpf_rp1.withColumn('rwptotal', col('tranamt01') - col('tranamt03') )\
        .withColumn('rwctotal', col('tranamt04') + col('tranamt05') )

        ztrnpf_rp3 = transformations(ztrnpf_rp2)

        ztrnpf_ri = ztrnpf_rp3.groupBy('chdrnum','tranno','accntid').sum('rwptotal','rwctotal')\
        .withColumnRenamed('sum(rwptotal)','rwptotal')\
        .withColumnRenamed('sum(rwctotal)','rwctotal')

        #GET DETAIL OF RE-INSURANCE TRANSACTIONS IN PREMPF_RP
        rprmpf_ri = spark.read.parquet(rprmpf_rp_path)\
        [["chdrno","rskno","tranno","premcl","extr01","extr02","extr03","extr04","extr05","racc","ritype"]].filter(col('sacscode') == "RP")
        rprmpf_ri = rprmpf_ri.withColumnRenamed('chdrno', 'chdrnum').withColumnRenamed('racc', 'accntid')\
        .withColumn('rwp',col('extr01')-col('extr03'))\
        .withColumn('rwc',col('extr04')+col('extr05'))\
        .drop("extr01","extr02","extr03","extr04","extr05")

        #Re-insurance Premium information at a more granular level specifically by accntid AND ritype as well
        rprmpf_ri = rprmpf_ri.groupBy('chdrnum','tranno','rskno','premcl','accntid','ritype').sum('rwp','rwc')\
        .withColumnRenamed('sum(rwp)','rwp')\
        .withColumnRenamed('sum(rwc)','rwc')

        #Above table aggregated in the same way as before in order to reconcile with ZTRNPF_RI grouping
        rprmpf_ri_sum = rprmpf_ri.groupBy('chdrnum','tranno','accntid').sum('rwp','rwc')\
        .withColumnRenamed('sum(rwp)','rwp')\
        .withColumnRenamed('sum(rwc)','rwc')

        #RECON ON GROSS SUB ACCOUNTS
        recon_pr_ri_psea_recon = ztrnpf_ri.join(rprmpf_ri_sum, on=['chdrnum','tranno','accntid'], how='left').fillna(0, subset=['rwp','rwc'])\
        .filter((abs(col('rwptotal')-col('rwp'))>0.01) | (abs(col('rwctotal')-col('rwc'))>0.01))

        recon_pr_ri_psea_recon.write.parquet('{}recon_pr_ri_psea_recon.parquet'.format(output_folder))

        recon_ri = recon_pr_ri_psea_recon\
        .withColumn('propRWP',when( col('rwp')!=0, col('rwptotal')/col('rwp')).otherwise(0) )\
        .withColumn('propRWC',when( col('rwc')!=0, col('rwctotal')/col('rwc')).otherwise(0) )

        #Creating the final premium table and we are reallocating the portion of the data that did not match
        recon_ri_subset = recon_ri[["chdrnum","tranno","propRWP","propRWC","accntid"]].withColumn('recon_exist', lit(True))\
                            .withColumnRenamed('accntid','recon_accntid')

        transv_pr_ri_psea = base_df.join(rprmpf_ri, on=['chdrnum', 'tranno'], how='inner').\
        join(recon_ri_subset, on=['chdrnum', 'tranno'], how='left') #ritype removed in RECON_RI 

        transv_pr_ri_psea = transv_pr_ri_psea\
        .withColumn('rwp', when(col('recon_exist')==True, round(col('rwp')*col('propRWP'),2) ).otherwise(col('rwp')) )\
        .withColumn('rwc', when(col('recon_exist')==True, round(col('rwc')*col('propRWC'),2) ).otherwise(col('rwc')) )\
        .drop("propRWP","propRWC")

        transv_pr_ri_psea.write.parquet('{}transv_pr_ri_psea.parquet'.format(output_folder))

In [3]:
adm_mapping_path = 'ADM Mapping.xlsm'
ztrnpf_path = '/group/axa_malaysia/data/adm_ztrnpf'
prempf_path = '/group/axa_malaysia/data/adm_prempf'
rprmpf_co_path = '/group/axa_malaysia/data/adm_rprmpf_co'
rprmpf_rp_path = '/group/axa_malaysia/data/adm_rprmpf_rp'
monthend = 1
PREMIUM_PSEA(adm_mapping_path,ztrnpf_path,prempf_path,rprmpf_co_path,rprmpf_rp_path,monthend)

In [3]:
spark.read.parquet(r'/user/cchin/data/sas_402/transv_pr_gr_psea.parquet').count()

33616782

In [4]:
spark.read.parquet(r'/user/cchin/data/sas_402/transv_pr_ri_psea.parquet').count()

34646234

In [None]:
spark.read.parquet(r'/user/cchin/data/sas_402/recon_pr_gr_psea_recon.parquet').count()

7

In [None]:
spark.read.parquet(r'/user/cchin/data/sas_402/recon_pr_ri_psea_recon.parquet').count()

In [4]:
import pickle

In [6]:
import pandas as pd

In [7]:
pr_gr_psea = pickle.load(open('pr_gr_psea.pickle','rb'))

ImportError: No module named 'pandas.core.indexes'

In [10]:
pr_gr_psea = spark.read.csv('pr_gr_psea.csv',header=True)

In [22]:
pr_gr_psea.filter(col('chdrnum')=='01141394').show(1000)

+-------+-------+---------+------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
|chdrnum|BATCBRN|BATCTRCDE|TRANNO|CHDRSTCDC|agentid|yrm|d_tran|d_eff|d_com|d_exp|trantype|RSKNO|PREMCL|gwp|gwc|cwp|cwc|
+-------+-------+---------+------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
+-------+-------+---------+------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+



In [23]:
transv_pr_gr_psea.filter(col('chdrnum')=='01141394').show(1000)

+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
|chdrnum|tranno|agentid|batctrcde|chdrstcdc|batcbrn|yrm|d_tran|d_eff|d_com|d_exp|trantype|rskno|premcl|gwp|gwc|cwp|cwc|
+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+



In [1]:
transv_pr_gr_psea = spark.read.parquet(r'/user/cchin/data/sas_402/transv_pr_gr_psea.parquet')

In [None]:
from pyspark.sql.functions import *

In [18]:
chdrpf = spark.read.parquet(r'/group/axa_malaysia/data/adm_chdrpf')
ztrnpf = spark.read.parquet(r'/group/axa_malaysia/data/adm_ztrnpf')

In [17]:
chdrpf.filter(col('chdrnum')=='04177419').show()

+-------+-------+--------+--------+-------+------+---------+--------+------+--------+---------+--------+--------+--------+------+------+-------+-------+-------+------+--------+---------+-------+-------+--------+--------+---------+---------+------+-----+------+-------+------+-------+------+------+--------+--------------------+
|CHDRPFX|CHDRCOY| CHDRNUM|SERVUNIT|CNTTYPE|TRANNO|VALIDFLAG|CURRFROM|CURRTO|STATCODE|STATREASN|STATDATE|STATTRAN| OCCDATE|CCDATE|CRDATE|RNLTYPE|RNLDURN|REPTYPE|REPNUM| COWNNUM|CNTBRANCH|AGNTNUM|PAYPLAN|CAMPAIGN|NOFRISKS|CHDRSTCDA|CHDRSTCDC|MPLNUM|COPPN|COTYPE|COVERNT|DTECAN|QUOTENO|ZRENNO|ZENDNO|ZREPOLNO|              datime|
+-------+-------+--------+--------+-------+------+---------+--------+------+--------+---------+--------+--------+--------+------+------+-------+-------+-------+------+--------+---------+-------+-------+--------+--------+---------+---------+------+-----+------+-------+------+-------+------+------+--------+--------------------+
|     CH|      1

In [26]:
"""01141394
01150395
01157348
01158389
01163859
01179543
01184372
01305843
01305848
01305860""".splitlines()

['01141394',
 '01150395',
 '01157348',
 '01158389',
 '01163859',
 '01179543',
 '01184372',
 '01305843',
 '01305848',
 '01305860']

In [27]:
ztrnpf.filter(col('RLDGACCT').isin(['01141394',
 '01150395',
 '01157348',
 '01158389',
 '01163859',
 '01179543',
 '01184372',
 '01305843',
 '01305848',
 '01305860']))[['BATCTRCDE']].distinct().show(100)

+---------+
|BATCTRCDE|
+---------+
|     T922|
|     BR9A|
|     T933|
|     T928|
|     T903|
|     T926|
+---------+



# Testing 

In [1]:
adm_mapping_path = 'ADM Mapping.xlsm'
ztrnpf_path = '/group/axa_malaysia/data/adm_ztrnpf'
prempf_path = '/group/axa_malaysia/data/adm_prempf'
rprmpf_co_path = '/group/axa_malaysia/data/adm_rprmpf_co'
rprmpf_rp_path = '/group/axa_malaysia/data/adm_rprmpf_rp'
monthend = 1
import re
import pandas as pd
import numpy as np
import string
from datetime import datetime
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window

def format_date(strdate):
    try: 
        return datetime.strptime(str(strdate),'%Y%m%d').strftime('%Y-%m-%d')
    except: 
        return '2999-12-31'
_format_date = udf(format_date,StringType())

"""
    /* 1) sacscode filter */
    /* Consider Gross Prem (FG) and Co-insurance Prem      */
    /* we will consider RP (Re-insurance premium) in  	   */
    /* the next section, since it only needs to run   	   */
    /* once a month - it is a massive table			       */
    /* 2) batctrcde filter */
    /* Only consider real financial transactions      	   */
"""

"""
    /* Map Transaction Types */
    /* Note - the mapping is limited to the filter, so the mapping is only needed for the
       transaction types applied in the filter */
    Transaction Types are read from the excel mapping files
"""
transtype_map = pd.read_excel(adm_mapping_path, sheetname='300').set_index('batctrcde').to_dict()['trantype']
transtype_map_bc = sc.broadcast(transtype_map)

ztrnpf = spark.read.parquet(ztrnpf_path)[['batcactyr',
 'batcactmn',
 'rldgacct',
 'tranno',
 'ccdate',
 'effdate',
 'accnum',
 'expiry_date',
 'batctrcde',
 'sacscode',
 'trandate',
 'chdrstcdc',
 'tranamt01',
 'tranamt02',
 'tranamt03',
 'tranamt04',
 'tranamt05',
 'batcbrn']].filter( col("batctrcde").isin(['T409','T44B','T922','T405','T903','T413','T927','T928']) )#.limit(100000)
ztrnpf_fgco = ztrnpf
"""
ztrnpf_fgco = ztrnpf.filter(col("sacscode").isin("FG","CO"))
ztrnpf_rp = ztrnpf.filter(col("sacscode")=='RP')
"""
def transformations(df):
    df = df.withColumn('yrm', col('batcactyr')*100+col('batcactmn'))

    #dates formatting
    df = df.withColumn('d_tran', to_date(_format_date(col('trandate'))))\
    .withColumn('d_eff', to_date(_format_date(col('effdate'))))\
    .withColumn('d_com', to_date(_format_date(col('ccdate'))))\
    .withColumn('d_exp', to_date(_format_date(col('expiry_date'))))

    df = df.drop('batcactyr','batcactmn','effdate','ccdate','expiry_date','trandate','tranamt01','tranamt02','tranamt03','tranamt04','tranamt05')

    #map transaction types
    df = df.withColumn('trantype', udf(lambda x: transtype_map_bc.value[x])(col('batctrcde')))
    return df

df1 = ztrnpf_fgco.withColumnRenamed('rldgacct', 'chdrnum')\
            .withColumnRenamed('accnum', 'agentid')

df2 = df1.withColumn('gwptotal', when(col('sacscode')=='FG', col('tranamt01') - col('tranamt03') ).otherwise(lit(0)) )\
.withColumn('gwctotal', when(col('sacscode')=='FG', col('tranamt04') + col('tranamt05') ).otherwise(lit(0)) )\
.withColumn('cwptotal', when(col('sacscode')!='FG', col('tranamt01') - col('tranamt03') ).otherwise(lit(0)) )\
.withColumn('cwctotal', when(col('sacscode')!='FG', col('tranamt04') + col('tranamt05') ).otherwise(lit(0)) )

df3 = transformations(df2)

#FIRST GET BASE
base_df = df3.drop('gwptotal','gwctotal','cwptotal','cwctotal','sacscode').\
sort('chdrnum', 'tranno').dropDuplicates(subset=['chdrnum', 'tranno'])
base_df.cache()
"""
#FIRST GET BASE
base_df = df3.filter(col('sacscode')=='FG').drop('gwptotal','gwctotal','cwptotal','cwctotal','sacscode').\
sort('chdrnum', 'tranno').dropDuplicates(subset=['chdrnum', 'tranno'])
base_df.cache()
"""

"""THEN AGGREGATE ZTRNPF
    /* ****************************************************************** */
    /* Sums gwptotal gwctotal cwptotal cwctotal by chdrnum and tranno     */
    /* Since we took only FG transactions the filter is already leaving only one unique chdrnum and tranno combination, 
    this aggregation mainly used to ensure that there are only one combination.         */
    /* ****************************************************************** */
"""
df4 = df3.groupBy('chdrnum','tranno').sum('gwptotal','gwctotal','cwptotal','cwctotal')\
.withColumnRenamed('sum(gwptotal)','gwptotal')\
.withColumnRenamed('sum(gwctotal)','gwctotal')\
.withColumnRenamed('sum(cwptotal)','cwptotal')\
.withColumnRenamed('sum(cwctotal)','cwctotal')

#GET DETAIL OF GROSS TRANSACTIONS IN PREMPF
prempf = spark.read.parquet(prempf_path)[["chdrno","rskno","tranno","premcl","extr01","extr02","extr03","extr04","extr05"]]
prempf = prempf.withColumnRenamed('chdrno', 'chdrnum')\
.withColumn('gwp',col('extr01')-col('extr03'))\
.withColumn('gwc',col('extr04')+col('extr05'))\
.drop("extr01","extr02","extr03","extr04","extr05")
prempf = prempf.groupBy('chdrnum','tranno','rskno','premcl').sum('gwp','gwc')\
.withColumnRenamed('sum(gwp)','gwp')\
.withColumnRenamed('sum(gwc)','gwc')

#GET DETAIL OF CO-INSURANCE TRANSACTIONS IN RPRMPF_CO
rprmpf = spark.read.parquet(rprmpf_co_path)\
[["chdrno","rskno","tranno","premcl","extr01","extr02","extr03","extr04","extr05"]].filter(col('sacscode') == "CO")
rprmpf = rprmpf.withColumnRenamed('chdrno', 'chdrnum')\
.withColumn('cwp',col('extr01')-col('extr03'))\
.withColumn('cwc',col('extr04')+col('extr05'))\
.drop("extr01","extr02","extr03","extr04","extr05")
rprmpf = rprmpf.groupBy('chdrnum','tranno','rskno','premcl').sum('cwp','cwc')\
.withColumnRenamed('sum(cwp)','cwp')\
.withColumnRenamed('sum(cwc)','cwc')

#MERGE GROSS AND CO-INSURANCE TABLES
combined = prempf.join(rprmpf, on=['chdrnum','tranno','rskno','premcl'], how='left').fillna(0, subset=['cwp','cwc'])
combined.cache()

"""
/* Now we do a small check (not needed each time) to see if the number
   of rows in the original PREMPF containing the gross premium matches
   at least with the new combined data. Hence the number of rows in
   PREMPF should equal the new table COMBINED					      */

/* Aggregate the Gross premium and commission on cdhrnum and tranno in
   order to finally recon with ztrnpf 								  */
"""
#/* Since PREMPF is more granual than ZTRNPF, this aggregation should make it match with ZTRNPF
combined_sum = combined.groupBy('chdrnum','tranno').sum("gwp","gwc","cwp","cwc")\
.withColumnRenamed('sum(gwp)','gwp')\
.withColumnRenamed('sum(gwc)','gwc')\
.withColumnRenamed('sum(cwp)','cwp')\
.withColumnRenamed('sum(cwc)','cwc')

##RECON ON GROSS SUB ACCOUNTS

#This table will contain all the information will be used in the next step
recon_pr_gr_psea_recon = df4.join(combined_sum, on=['chdrnum','tranno'], how='left').fillna(0, subset=['gwp','gwc','cwp','cwc'])\
.filter( (abs(col('gwptotal')-col('gwp'))>0.01)|(abs(col('gwctotal')-col('gwc'))>0.01)|
         (abs(col('cwptotal')-col('cwp'))>0.01)|(abs(col('cwctotal')-col('cwc'))>0.01) )

#This table will not contain the proportion info
recon_pr_gr_psea_recon.cache()

#Creating the final premium table and we are reallocating the portion of  the data that did not match
recon = recon_pr_gr_psea_recon\
.withColumn('propGWP',when( col('gwp')!=0, col('gwptotal')/col('gwp')).otherwise(0) )\
.withColumn('propGWC',when( col('gwc')!=0, col('gwctotal')/col('gwc')).otherwise(0) )\
.withColumn('propCWP',when( col('cwp')!=0, col('cwptotal')/col('cwp')).otherwise(0) )\
.withColumn('propCWC',when( col('cwc')!=0, col('cwctotal')/col('cwc')).otherwise(0) )
recon_gr = recon[["chdrnum","tranno","propGWP","propGWC","propCWP","propCWC"]].withColumn('recon_exist', lit(True))

transv_pr_gr_psea = base_df.join(combined, on=['chdrnum', 'tranno'], how='inner').\
join(recon_gr, on=['chdrnum', 'tranno'], how='left')

transv_pr_gr_psea = transv_pr_gr_psea\
.withColumn('gwp', when(col('recon_exist')==True, round(col('gwp')*col('propGWP'),2) ).otherwise(col('gwp')) )\
.withColumn('gwc', when(col('recon_exist')==True, round(col('gwc')*col('propGWC'),2) ).otherwise(col('gwc')) )\
.withColumn('cwp', when(col('recon_exist')==True, round(col('cwp')*col('propCWP'),2) ).otherwise(col('cwp')) )\
.withColumn('cwc', when(col('recon_exist')==True, round(col('cwc')*col('propCWC'),2) ).otherwise(col('cwc')) )\
.drop("propGWP","propGWC","propCWP","propCWC","recon_exist")

#The number of rows in the combination missmatch (between BASE and COMBINED) because we did not apply the same filters on COMBINED (such as the financial transactions)

In [2]:
adm_mapping_path = 'ADM Mapping.xlsm'
ztrnpf_path = '/river/axa_my/axa_aaro_psea/data/psea_ztrnpf/merge/yyyy=2017/mm=11/dd=04'
prempf_path = '/river/axa_my/axa_aaro_psea/data/psea_prempf/merge/yyyy=2017/mm=11/dd=04'
rprmpf_path = '/river/axa_my/axa_aaro_psea/data/psea_rprmpf/merge/yyyy=2017/mm=11/dd=04'
monthend = 1
import re
import pandas as pd
import numpy as np
import string
from datetime import datetime
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window
def format_date(strdate):
    try: 
        return datetime.strptime(str(strdate),'%Y%m%d').strftime('%Y-%m-%d')
    except: 
        return '2999-12-31'
_format_date = udf(format_date,StringType())

"""
    /* 1) sacscode filter */
    /* Consider Gross Prem (FG) and Co-insurance Prem      */
    /* we will consider RP (Re-insurance premium) in  	   */
    /* the next section, since it only needs to run   	   */
    /* once a month - it is a massive table			       */
    /* 2) batctrcde filter */
    /* Only consider real financial transactions      	   */
"""

"""
    /* Map Transaction Types */
    /* Note - the mapping is limited to the filter, so the mapping is only needed for the
       transaction types applied in the filter */
    Transaction Types are read from the excel mapping files
"""
transtype_map = pd.read_excel(adm_mapping_path, sheetname='300').set_index('batctrcde').to_dict()['trantype']
transtype_map_bc = sc.broadcast(transtype_map)

ztrnpf = spark.read.parquet(ztrnpf_path)[['batcactyr',
 'batcactmn',
 'rldgacct',
 'tranno',
 'ccdate',
 'effdate',
 'accnum',
 'expiry_date',
 'batctrcde',
 'sacscode',
 'trandate',
 'chdrstcdc',
 'tranamt01',
 'tranamt02',
 'tranamt03',
 'tranamt04',
 'tranamt05',
 'tranamt14',
 'batcbrn']].filter( col("batctrcde").isin(['TA39','T409','T44B','T922','T405','T903','T413','T927','T928',
                                            'BA25','T454','T467','T913','T914','T926','T930','T934','T931']) )#.limit(100000)
#'B470','T46B','T475','T840','B920','BR9A','T8A0','T933'
ztrnpf_fgco = ztrnpf
"""
ztrnpf_fgco = ztrnpf.filter(col("sacscode").isin("FG","CO"))
ztrnpf_rp = ztrnpf.filter(col("sacscode")=='RP')
"""
def transformations(df):
    df = df.withColumn('yrm', col('batcactyr')*100+col('batcactmn'))

    #dates formatting
    df = df.withColumn('d_tran', to_date(_format_date(col('trandate'))))\
    .withColumn('d_eff', to_date(_format_date(col('effdate'))))\
    .withColumn('d_com', to_date(_format_date(col('ccdate'))))\
    .withColumn('d_exp', to_date(_format_date(col('expiry_date'))))

    df = df.drop('batcactyr','batcactmn','effdate','ccdate','expiry_date','trandate','tranamt01','tranamt02','tranamt03','tranamt04',
                 'tranamt05','tranamt14')

    #map transaction types
    df = df.withColumn('trantype', udf(lambda x: transtype_map_bc.value.get(x,'NA'))(col('batctrcde')))
    return df

df1 = ztrnpf.withColumnRenamed('rldgacct', 'chdrnum')\
            .withColumnRenamed('accnum', 'agentid')

df2 = df1.withColumn('fg_premium_total', when(col('sacscode')=='FG', col('tranamt01') - col('tranamt03') + col('tranamt14')).otherwise(0) )\
.withColumn('fg_commission_total', when(col('sacscode')=='FG', col('tranamt04') + col('tranamt05')).otherwise(0) )\
.withColumn('co_premium_total', when(col('sacscode')=='CO', col('tranamt01') - col('tranamt03') + col('tranamt14')).otherwise(0) )\
.withColumn('co_commission_total', when(col('sacscode')=='CO', col('tranamt04') + col('tranamt05')).otherwise(0) )\
.withColumn('rp_premium_total', when(col('sacscode')=='RP', col('tranamt01') - col('tranamt03') + col('tranamt14')).otherwise(0) )\
.withColumn('rp_commission_total', when(col('sacscode')=='RP', col('tranamt04') + col('tranamt05')).otherwise(0) )

df3 = transformations(df2)

df4 = df3.groupBy('chdrnum','tranno', 'batcbrn', 'yrm').sum('fg_premium_total',
                                          'fg_commission_total',
                                          'co_premium_total',
                                          'co_commission_total',
                                          'rp_premium_total',
                                          'rp_commission_total')\
.withColumnRenamed('sum(fg_premium_total)','fg_premium_total')\
.withColumnRenamed('sum(fg_commission_total)','fg_commission_total')\
.withColumnRenamed('sum(co_premium_total)','co_premium_total')\
.withColumnRenamed('sum(co_commission_total)','co_commission_total')\
.withColumnRenamed('sum(rp_premium_total)','rp_premium_total')\
.withColumnRenamed('sum(rp_commission_total)','rp_commission_total')

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalArgumentException: java.net.UnknownHostException: laphdpmtr03.asia.bigdata.intraxa;laphdpmtr04.asia.bigdata.intraxa
	at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:374)
	at org.apache.hadoop.crypto.key.kms.KMSClientProvider.getDelegationTokenService(KMSClientProvider.java:823)
	at org.apache.hadoop.crypto.key.kms.KMSClientProvider.addDelegationTokens(KMSClientProvider.java:779)
	at org.apache.hadoop.crypto.key.KeyProviderDelegationTokenExtension.addDelegationTokens(KeyProviderDelegationTokenExtension.java:86)
	at org.apache.hadoop.hdfs.DistributedFileSystem.addDelegationTokens(DistributedFileSystem.java:2046)
	at org.apache.spark.deploy.yarn.security.HDFSCredentialProvider$$anonfun$obtainCredentials$2.apply(HDFSCredentialProvider.scala:50)
	at org.apache.spark.deploy.yarn.security.HDFSCredentialProvider$$anonfun$obtainCredentials$2.apply(HDFSCredentialProvider.scala:47)
	at scala.collection.immutable.Set$Set1.foreach(Set.scala:94)
	at org.apache.spark.deploy.yarn.security.HDFSCredentialProvider.obtainCredentials(HDFSCredentialProvider.scala:47)
	at org.apache.spark.deploy.yarn.security.ConfigurableCredentialManager$$anonfun$obtainCredentials$2.apply(ConfigurableCredentialManager.scala:82)
	at org.apache.spark.deploy.yarn.security.ConfigurableCredentialManager$$anonfun$obtainCredentials$2.apply(ConfigurableCredentialManager.scala:80)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.MapLike$DefaultValuesIterable.foreach(MapLike.scala:206)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.AbstractTraversable.flatMap(Traversable.scala:104)
	at org.apache.spark.deploy.yarn.security.ConfigurableCredentialManager.obtainCredentials(ConfigurableCredentialManager.scala:80)
	at org.apache.spark.deploy.yarn.Client.prepareLocalResources(Client.scala:403)
	at org.apache.spark.deploy.yarn.Client.createContainerLaunchContext(Client.scala:882)
	at org.apache.spark.deploy.yarn.Client.submitApplication(Client.scala:171)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:56)
	at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:156)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:509)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:236)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.UnknownHostException: laphdpmtr03.asia.bigdata.intraxa;laphdpmtr04.asia.bigdata.intraxa
	... 37 more


In [2]:
df4 = df4.withColumn('d_tran', col('d_tran').cast('string'))\
.withColumn('d_eff', col('d_eff').cast('string'))\
.withColumn('d_com', col('d_com').cast('string'))\
.withColumn('d_exp', col('d_exp').cast('string'))

AnalysisException: "cannot resolve '`d_tran`' given input columns: [fg_commission_total, fg_premium_total, rp_premium_total, tranno, co_premium_total, rp_commission_total, co_commission_total, chdrnum];;\n'Project [chdrnum#350, tranno#16, fg_premium_total#770, fg_commission_total#780, co_premium_total#790, co_commission_total#800, rp_premium_total#810, rp_commission_total#820, cast('d_tran as string) AS d_tran#830]\n+- Project [chdrnum#350, tranno#16, fg_premium_total#770, fg_commission_total#780, co_premium_total#790, co_commission_total#800, rp_premium_total#810, sum(rp_commission_total)#760 AS rp_commission_total#820]\n   +- Project [chdrnum#350, tranno#16, fg_premium_total#770, fg_commission_total#780, co_premium_total#790, co_commission_total#800, sum(rp_premium_total)#759 AS rp_premium_total#810, sum(rp_commission_total)#760]\n      +- Project [chdrnum#350, tranno#16, fg_premium_total#770, fg_commission_total#780, co_premium_total#790, sum(co_commission_total)#758 AS co_commission_total#800, sum(rp_premium_total)#759, sum(rp_commission_total)#760]\n         +- Project [chdrnum#350, tranno#16, fg_premium_total#770, fg_commission_total#780, sum(co_premium_total)#757 AS co_premium_total#790, sum(co_commission_total)#758, sum(rp_premium_total)#759, sum(rp_commission_total)#760]\n            +- Project [chdrnum#350, tranno#16, fg_premium_total#770, sum(fg_commission_total)#756 AS fg_commission_total#780, sum(co_premium_total)#757, sum(co_commission_total)#758, sum(rp_premium_total)#759, sum(rp_commission_total)#760]\n               +- Project [chdrnum#350, tranno#16, sum(fg_premium_total)#755 AS fg_premium_total#770, sum(fg_commission_total)#756, sum(co_premium_total)#757, sum(co_commission_total)#758, sum(rp_premium_total)#759, sum(rp_commission_total)#760]\n                  +- Aggregate [chdrnum#350, tranno#16], [chdrnum#350, tranno#16, sum(fg_premium_total#392) AS sum(fg_premium_total)#755, sum(fg_commission_total#414) AS sum(fg_commission_total)#756, sum(co_premium_total#437) AS sum(co_premium_total)#757, sum(co_commission_total#461) AS sum(co_commission_total)#758, sum(rp_premium_total#486) AS sum(rp_premium_total)#759, sum(rp_commission_total#512) AS sum(rp_commission_total)#760]\n                     +- Project [chdrnum#350, tranno#16, agentid#371, batctrcde#7, sacscode#28, chdrstcdc#20, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, rp_commission_total#512, yrm#539, d_tran#567, d_eff#596, d_com#626, d_exp#657, <lambda>(batctrcde#7) AS trantype#708]\n                        +- Project [chdrnum#350, tranno#16, agentid#371, batctrcde#7, sacscode#28, chdrstcdc#20, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, rp_commission_total#512, yrm#539, d_tran#567, d_eff#596, d_com#626, d_exp#657]\n                           +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, ... 6 more fields]\n                              +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, ... 5 more fields]\n                                 +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, ... 4 more fields]\n                                    +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, ... 3 more fields]\n                                       +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, ... 2 more fields]\n                                          +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, rp_premium_total#486, CASE WHEN (sacscode#28 = RP) THEN CheckOverflow((promote_precision(cast(tranamt04#58 as decimal(18,2))) + promote_precision(cast(tranamt05#59 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS rp_commission_total#512]\n                                             +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, co_commission_total#461, CASE WHEN (sacscode#28 = RP) THEN CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(tranamt01#55 as decimal(18,2))) - promote_precision(cast(tranamt03#57 as decimal(18,2)))), DecimalType(18,2)) as decimal(18,2))) + promote_precision(cast(tranamt14#68 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS rp_premium_total#486]\n                                                +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, co_premium_total#437, CASE WHEN (sacscode#28 = CO) THEN CheckOverflow((promote_precision(cast(tranamt04#58 as decimal(18,2))) + promote_precision(cast(tranamt05#59 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS co_commission_total#461]\n                                                   +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, fg_commission_total#414, CASE WHEN (sacscode#28 = CO) THEN CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(tranamt01#55 as decimal(18,2))) - promote_precision(cast(tranamt03#57 as decimal(18,2)))), DecimalType(18,2)) as decimal(18,2))) + promote_precision(cast(tranamt14#68 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS co_premium_total#437]\n                                                      +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, fg_premium_total#392, CASE WHEN (sacscode#28 = FG) THEN CheckOverflow((promote_precision(cast(tranamt04#58 as decimal(18,2))) + promote_precision(cast(tranamt05#59 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS fg_commission_total#414]\n                                                         +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4, CASE WHEN (sacscode#28 = FG) THEN CheckOverflow((promote_precision(cast(CheckOverflow((promote_precision(cast(tranamt01#55 as decimal(18,2))) - promote_precision(cast(tranamt03#57 as decimal(18,2)))), DecimalType(18,2)) as decimal(18,2))) + promote_precision(cast(tranamt14#68 as decimal(18,2)))), DecimalType(18,2)) ELSE cast(0 as decimal(18,2)) END AS fg_premium_total#392]\n                                                            +- Project [batcactyr#5, batcactmn#6, chdrnum#350, tranno#16, ccdate#139, effdate#27, accnum#138 AS agentid#371, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4]\n                                                               +- Project [batcactyr#5, batcactmn#6, rldgacct#11 AS chdrnum#350, tranno#16, ccdate#139, effdate#27, accnum#138, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4]\n                                                                  +- Filter batctrcde#7 IN (TA39,T409,T44B,T922,T405,T903,T413,T927,T928,BA25,T454,T467,T913,T914,T926,T930,T934,B470,T46B,T475,T840,B920,BR9A,T8A0,T931,T933)\n                                                                     +- Project [batcactyr#5, batcactmn#6, rldgacct#11, tranno#16, ccdate#139, effdate#27, accnum#138, expiry_date#140, batctrcde#7, sacscode#28, trandate#25, chdrstcdc#20, tranamt01#55, tranamt02#56, tranamt03#57, tranamt04#58, tranamt05#59, tranamt14#68, batcbrn#4]\n                                                                        +- Relation[rrn#0,ci_action#1,batcpfx#2,batccoy#3,batcbrn#4,batcactyr#5,batcactmn#6,batctrcde#7,batcbatch#8,rldgpfx#9,rldgcoy#10,rldgacct#11,origcurr#12,acctcurr#13,crate#14,cnttype#15,tranno#16,cntbranch#17,chdrstcda#18,chdrstcdb#19,chdrstcdc#20,chdrstcdd#21,chdrstcde#22,postmonth#23,... 140 more fields] parquet\n"

In [44]:
spark.read.parquet('/group/axa_malaysia/data/ztrnpf').count()

34860276

In [45]:
spark.read.parquet('/group/axa_malaysia/data/adm_ztrnpf').count()

36006855

In [42]:
spark.read.parquet('/group/axa_malaysia/data/ztrnpf').groupby('RLDGACCT').count().orderBy(desc('count')).show(100)

+--------+-----+
|RLDGACCT|count|
+--------+-----+
|03880496| 5308|
|03346245| 5100|
|03215661| 3571|
|02166079| 2997|
|03490116| 2578|
|01061775| 2410|
|01604404| 2006|
|01688916| 1667|
|03215666| 1644|
|03261762| 1628|
|03447819| 1603|
|01213099| 1569|
|03498017| 1494|
|03162381| 1454|
|01468111| 1294|
|02210115| 1227|
|03453716| 1170|
|01718314| 1151|
|01284295| 1103|
|01591605| 1098|
|03160978| 1070|
|03229060| 1067|
|03412413| 1058|
|04053983| 1042|
|03401062|  996|
|01452265|  992|
|02211082|  903|
|03412801|  900|
|02022914|  896|
|03011902|  880|
|03035771|  844|
|03481401|  834|
|01213019|  830|
|01468129|  814|
|03269684|  799|
|03399897|  798|
|03150581|  782|
|03035781|  750|
|03012814|  742|
|UE035329|  742|
|03171013|  741|
|03153258|  732|
|03718105|  729|
|02212089|  726|
|03057054|  693|
|01820222|  672|
|01029064|  670|
|03412813|  669|
|03487931|  668|
|03376777|  657|
|01783923|  657|
|01582552|  655|
|02024424|  650|
|03074737|  642|
|03257481|  640|
|01576642|  64

In [43]:
spark.read.parquet('/group/axa_malaysia/data/adm_ztrnpf').groupby('RLDGACCT').count().orderBy(desc('count')).show(100)

+--------+-----+
|RLDGACCT|count|
+--------+-----+
|03880496| 6304|
|03346245| 5280|
|03215661| 3637|
|02166079| 2997|
|03490116| 2734|
|01061775| 2410|
|01604404| 2006|
|03261762| 1680|
|01688916| 1667|
|03215666| 1646|
|03498017| 1612|
|03447819| 1603|
|01213099| 1569|
|04053983| 1536|
|03162381| 1468|
|01468111| 1294|
|02210115| 1245|
|03453716| 1222|
|01718314| 1151|
|01284295| 1103|
|03229060| 1103|
|01591605| 1098|
|03160978| 1086|
|03412413| 1058|
|03401062|  996|
|01452265|  992|
|03011902|  929|
|02211082|  918|
|03481401|  901|
|03412801|  900|
|02022914|  896|
|03269684|  871|
|03035771|  850|
|03399897|  834|
|01213019|  830|
|01468129|  814|
|03150581|  802|
|03035781|  784|
|UE035329|  742|
|03012814|  742|
|03171013|  741|
|03718105|  739|
|03153258|  732|
|02212089|  726|
|03057054|  693|
|03505085|  675|
|03376777|  675|
|01820222|  672|
|03087030|  671|
|01029064|  670|
|03412813|  669|
|03487931|  668|
|03257481|  660|
|01783923|  657|
|01582552|  655|
|02024424|  65

In [33]:
spark.read.parquet('/group/axa_malaysia/data/adm_ztrnpf').filter(col('RLDGACCT')=='01916268').show(100)

+-------+---------+---------+---------+--------+--------+--------+-----+-------+------+---------+---------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+------+--------+-----------+--------------------+--------+
|BATCBRN|BATCACTYR|BATCACTMN|BATCTRCDE|RLDGACCT|ORIGCURR|ACCTCURR|CRATE|CNTTYPE|TRANNO|CHDRSTCDA|CHDRSTCDC|TRANDATE| EFFDATE|SACSCODE|SACSTYP06|TRANAMT01|TRANAMT02|TRANAMT03|TRANAMT04|TRANAMT05|TRANAMT06|TRANAMT10|ACCNUM|  CCDATE|EXPIRY_DATE|              datime| RDOCNUM|
+-------+---------+---------+---------+--------+--------+--------+-----+-------+------+---------+---------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+------+--------+-----------+--------------------+--------+
|     89|     2009|        5|     T928|01916268|      RM|      RM|  1.0|    HID|     2|        L|      AHX|20090525|20090509|      GR|       NP|   953.35|     10.0|      0.0|    143

In [17]:
spark.read.parquet('/group/axa_malaysia/data/ztrnpf').filter(col('RLDGACCT')=='01865273').show()

+-------+-------+-------+---------+---------+---------+---------+-------+-------+--------+--------+--------+-----+-------+------+---------+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--

In [7]:
df4 = df4.withColumn('tranno',col('tranno').cast('string'))
df4.write.saveAsTable('axa_malaysia.pr_gr_psea14', mode='overwrite')

Py4JJavaError: An error occurred while calling o244.saveAsTable.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:147)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:121)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:101)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:484)
	at org.apache.spark.sql.execution.datasources.DataSource.writeAndRead(DataSource.scala:500)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:263)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:404)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:358)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 3, localhost, executor driver): java.io.FileNotFoundException: /tmp/blockmgr-0aeb9ab9-deff-4a6d-a636-f03ebfdfca33/3b/temp_shuffle_81b023bb-1741-4f9c-8d05-2cd4de7f5f57 (Too many open files)
	at java.io.FileOutputStream.open(Native Method)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:221)
	at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:102)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:115)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:235)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:152)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:127)
	... 43 more
Caused by: java.io.FileNotFoundException: /tmp/blockmgr-0aeb9ab9-deff-4a6d-a636-f03ebfdfca33/3b/temp_shuffle_81b023bb-1741-4f9c-8d05-2cd4de7f5f57 (Too many open files)
	at java.io.FileOutputStream.open(Native Method)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:221)
	at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:102)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:115)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:235)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:152)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	... 1 more


In [6]:
df4.filter(col('co_premium_total')>0).show()

+--------+------+----------------+-------------------+----------------+-------------------+----------------+-------------------+
| chdrnum|tranno|fg_premium_total|fg_commission_total|co_premium_total|co_commission_total|rp_premium_total|rp_commission_total|
+--------+------+----------------+-------------------+----------------+-------------------+----------------+-------------------+
|04226526|     1|          218.53|              32.78|          174.82|              29.94|           15.04|               2.98|
|01383769|     1|          859.96|             128.99|          515.97|              83.97|          159.63|              27.98|
|01349326|     1|          482.54|              72.38|          289.53|              47.13|           89.57|              15.71|
|01278446|     1|          260.78|              39.12|          156.47|              25.48|           24.78|               4.50|
|03345571|     1|         3136.07|               0.00|          553.42|             106.54|      

In [4]:
spark.read.parquet('/user/cchin/data/sas_403/transv_cl_quali_psea.parquet').write.saveAsTable('axa_malaysia.cl_quali_psea')

In [2]:
transv_pr_gr_psea.cache()

DataFrame[chdrnum: string, tranno: int, agentid: string, batctrcde: string, chdrstcdc: string, batcbrn: int, yrm: int, d_tran: date, d_eff: date, d_com: date, d_exp: date, trantype: string, rskno: int, premcl: string, gwp: double, gwc: double, cwp: double, cwc: double]

In [8]:
transv_pr_gr_psea.write.saveAsTable('axa_malaysia.pr_gr_psea2')

In [3]:
yrm_gwp_spark = transv_pr_gr_psea.groupby('yrm').sum('gwp').orderBy('yrm')

In [10]:
transv_pr_gr_psea.filter(col('chdrnum')=='01141394').show(100)

+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
|chdrnum|tranno|agentid|batctrcde|chdrstcdc|batcbrn|yrm|d_tran|d_eff|d_com|d_exp|trantype|rskno|premcl|gwp|gwc|cwp|cwc|
+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+
+-------+------+-------+---------+---------+-------+---+------+-----+-----+-----+--------+-----+------+---+---+---+---+



In [4]:
yrm_gwp_spark.cache()

DataFrame[yrm: int, sum(gwp): double]

In [5]:
yrm_gwp_spark.count()

182

In [6]:
yrm_gwp_spark.show(200)

+------+--------------------+
|   yrm|            sum(gwp)|
+------+--------------------+
|199902| -21835.879999999997|
|200207|1.6928779479999997E7|
|200208|2.2709575599999998E7|
|200209|2.5989185189999994E7|
|200210|1.9407926239999995E7|
|200211|       1.868669075E7|
|200212|2.4717630510000017E7|
|200301|1.6372270670000004E7|
|200302|3.1312582300000023E7|
|200303|       2.896180276E7|
|200304|       2.221948814E7|
|200305| 4.518518199529997E9|
|200306|2.1683238929999992E7|
|200307|        2.26903872E7|
|200308|1.8556353750000007E7|
|200309| 4.150809044999998E7|
|200310|1.8535794570000004E7|
|200311|1.7626995979999986E7|
|200312| 5.996708582999999E7|
|200401|1.9746300109999996E7|
|200402|3.2714467529999994E7|
|200403|       2.471955417E7|
|200404| 3.790156750000002E7|
|200405|1.7267052630000003E7|
|200406|       2.522090485E7|
|200407|2.1681164340000004E7|
|200408| 2.038332896999999E7|
|200409| 2.408401493999999E7|
|200410|2.3071639579999987E7|
|200411|       2.529502642E7|
|200412|4.

In [7]:
yrm_gwp_spark.coalesce(1).write.csv('yrm_gwp_spark.csv',header=True)

In [1]:
ztrnpf_path = '/river/axa_my/axa_aaro_psea/data/psea_ztrnpf/merge/yyyy=2017/mm=11/dd=04'
spark.read.parquet(ztrnpf_path)[['batcactyr',
 'batcactmn',
 'rldgacct',
 'tranno',
 'ccdate',
 'effdate',
 'accnum',
 'expiry_date',
 'batctrcde',
 'sacscode',
 'trandate',
 'chdrstcdc',
 'tranamt01',
 'tranamt02',
 'tranamt03',
 'tranamt04',
 'tranamt05',
 'tranamt14',
 'batcbrn']]

AnalysisException: 'Path does not exist: hdfs://bigplay-asia-nameservice/river/axa_my/axa_aaro_psea/data/psea_ztrnpf/merge/yyyy=2017/mm=11/dd=04;'