In [2]:
import re
import pandas as pd
import numpy as np
import string
import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *

#agntpf_path = '/group/axa_malaysia/data/adm_agntpf'
#zyampf_path = '/group/axa_malaysia/data/adm_zyampf'
#transv_clients_path = 'data/sas_407/transv_clients.parquet'

def format_date(strdate):
    try: 
        return datetime.strptime(str(strdate),'%Y%m%d').strftime('%Y-%m-%d')
    except: 
        return '2999-12-31'
_format_date = udf(format_date,StringType())


def agents(agntpf_path, zyampf_path, transv_clients_path, output_folder='data/sas_408/'):
    
    #read in the data
    agntpf = spark.read.parquet(agntpf_path)
    zyampf = spark.read.parquet(zyampf_path)
    transv_clients = spark.read.parquet(transv_clients_path)

    #perform cleaning/transformation to the various columns' values on the agntpf
    agent = agntpf[['AGNTNUM','CLNTNUM','AGNTBR','VALIDFLAG','START_DATE','DATEEND']]\
    .filter(col('VALIDFLAG')==1)\
    .withColumn('d_start', to_date(_format_date(col('START_DATE'))))\
    .withColumn('d_end', to_date(_format_date(col('DATEEND'))))\
    .withColumnRenamed('AGNTNUM','agentid')\
    .drop('VALIDFLAG','START_DATE','DATEEND')

    #join the agent table with clients table
    agent1 = agent.join(transv_clients, 'clntnum', 'left')\
    .select([agent[c] for c in agent.columns]+[transv_clients['client']])\
    .withColumnRenamed('client','agent_name')\
    .drop('clntnum')

    zyampf = zyampf.withColumnRenamed('agntnum','agentid')
    
    #join with zyampf (agents extra table)
    agent2 = agent1.join(zyampf, 'agentid', 'left')\
    .select([agent1[c] for c in agent1.columns]+[zyampf['STCDA']])\
    .withColumnRenamed('STCDA','SOURCE_CODE')\
    .withColumn('SOURCE_CODE',upper(col('SOURCE_CODE')))

    #read in the mapping table and join
    act_source_code = spark.createDataFrame(pd.read_excel('ADM Mapping.xlsm', sheetname = '400'))
    agent3 = agent2.join(act_source_code,'SOURCE_CODE','left')

    #map the distribution channels with the appropriate names
    transv_agents = agent3\
    .withColumn('CHANNEL',when(
            col('CHANNEL_GROUP').isin(['General Agents','Local Broker','TISCO','Direct Others','FLD','Other Agents']),
                (when(col('agntbr')==10, concat(trim(col('CHANNEL_GROUP')), lit(' (BKK)')))
                             .otherwise(concat(trim(col('CHANNEL_GROUP')), lit(' (UPC)')))))
           .otherwise(trim(col('CHANNEL_GROUP'))))\
    .withColumn('CHANNEL',when( col('agentid').isin(['BD039','VQ226']), lit('Digital')).otherwise(col('CHANNEL')))\
    .withColumnRenamed('AGNTBR','AGENT_BRANCH')\
    .orderBy('agentid')

    #write out the data
    transv_agents.write.parquet('{}transv_agents_test554.parquet'.format(output_folder))

In [3]:
agntpf_path = '/group/axa_malaysia/data/adm_agntpf'
zyampf_path = '/group/axa_malaysia/data/adm_zyampf'
transv_clients_path = 'data/sas_407/transv_clients.parquet'
agents(agntpf_path, zyampf_path, transv_clients_path)

In [4]:
agents = spark.read.parquet('data/sas_408/transv_agents_test554.parquet')

In [10]:
agents.write.saveAsTable('axa_malaysia.agents555', format='csv')

In [9]:
agents.show()

+-----------+-------+------------+----------+----------+--------------------+----------------+-------------+-----------+-------+
|SOURCE_CODE|agentid|AGENT_BRANCH|   d_start|     d_end|          agent_name|SOURCE_CODE_NAME|CHANNEL_GROUP|CHANNEL_QBR|CHANNEL|
+-----------+-------+------------+----------+----------+--------------------+----------------+-------------+-----------+-------+
|         02|  22463|          63|2999-12-31|2999-12-31|    MS  LEE PEI YUAN|            null|         null|       null|   null|
|         02|  22464|          61|2999-12-31|2999-12-31|  WEEZ RISK MANAG...|            null|         null|       null|   null|
|         02|  22465|          37|2999-12-31|2999-12-31|    MR  OH KOK LEONG|            null|         null|       null|   null|
|         02|  22466|          61|2999-12-31|2999-12-31|  ALADO AUTOMOBIL...|            null|         null|       null|   null|
|         01|  22467|          97|2999-12-31|2999-12-31|        DIRECT-STAFF|            null|   