# In [3]:
from pyspark.sql.types import * 
import datetime
from pyspark.sql.functions import *
import re
import pandas as pd
import numpy as np
import string
import datetime

#clntpf_path = '/group/axa_malaysia/data/adm_clntpf'
#clexpf_path = '/group/axa_malaysia/data/adm_clexpf'

def format_date(strdate):
    """Convert a ``YYYYMMDD`` value into an ISO ``YYYY-MM-DD`` string.

    Parameters
    ----------
    strdate : object
        Value whose ``str()`` form is expected to look like ``YYYYMMDD``
        (for example ``19850412`` or ``'19850412'``).

    Returns
    -------
    str
        The date formatted as ``'YYYY-MM-DD'``, or the sentinel
        ``'2999-12-31'`` when the value cannot be parsed as a date.
    """
    try:
        # ``datetime`` is imported as a module in this file, so the parser is
        # ``datetime.datetime.strptime``. ``datetime.strptime`` does not exist;
        # calling it raised AttributeError, which the old bare ``except``
        # swallowed, so every input came back as the sentinel.
        parsed = datetime.datetime.strptime(str(strdate), '%Y%m%d')
        return parsed.strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        # Unparseable input (wrong length, impossible month/day, None, ...).
        return '2999-12-31'
# Spark UDF wrapper around format_date; the resulting column is a string.
_format_date = udf(format_date, returnType=StringType())

def compressblankspace(s):
    """Collapse runs of spaces in ``s`` into single spaces.

    Leading and trailing spaces are removed as a side effect. Only the space
    character ``" "`` is treated as a separator; tabs and newlines are left
    untouched.

    Parameters
    ----------
    s : str or None
        Text to normalise.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``s`` is ``None``.
    """
    # Missing values arrive as None when this runs as a Spark UDF; pass them
    # through instead of failing on ``None.split``.
    if s is None:
        return None
    return " ".join(piece for piece in s.split(" ") if piece)
# Spark UDF wrapper around compressblankspace; StringType is udf's default
# return type, spelled out here for clarity.
_compressblankspace_udf = udf(compressblankspace, returnType=StringType())

def customers(clntpf_path, clexpf_path, output_folder='data/sas_407'):
    """Build the cleaned client table and write it out as parquet.

    Reads the ``clntpf`` and ``clexpf`` parquet datasets, cleans and derives
    columns on each, keeps one row per ``clntnum`` in each, left-joins the
    extra contact details onto the clients and writes the result to
    ``<output_folder>/transv_clients.parquet``.

    Uses the ambient ``spark`` session and the ``_replace_null_dates`` helper,
    neither of which is defined in this function.

    Parameters
    ----------
    clntpf_path : str
        Location of the ``clntpf`` parquet dataset.
    clexpf_path : str
        Location of the ``clexpf`` parquet dataset.
    output_folder : str, optional
        Folder that receives ``transv_clients.parquet``. A trailing ``/`` is
        optional.

    Returns
    -------
    None
    """
    # read in the data
    clntpf = spark.read.parquet(clntpf_path)
    clexpf = spark.read.parquet(clexpf_path)

    # perform cleaning/transformation to the various columns' values on the clntpf
    clients = (
        clntpf[['clntnum', 'clttype', 'secuityno', 'cltsex', 'cltaddr01', 'cltaddr02',
                'cltaddr03', 'cltaddr04', 'cltaddr05', 'cltpcode', 'cltphone01', 'cltphone02',
                'statcode', 'cltdob', 'marryd', 'salutl', 'natlty', 'ctrycode', 'givname', 'surname']]
        .filter(col('clntnum') != '')
        # Parse cltdob only when its year part (value / 10000) lies in
        # 1900-2100; every other row gets NULL at this step.
        .withColumn('D_birth', when(((col('cltdob') / 10000 >= 1900) & (col('cltdob') / 10000 <= 2100)),
                                    to_date(_format_date(col('cltdob')))))
        .withColumn('D_birth', _replace_null_dates(col('D_birth')))
        # Age is the current calendar year minus the birth year.
        .withColumn('age', datetime.datetime.today().year - year(col('D_birth')))
        # Clamp negative ages to 0. The otherwise() keeps every non-negative
        # age; a when() without it turns all unmatched rows into NULL.
        .withColumn('age', when(col('age') < 0, lit(0)).otherwise(col('age')))
        .withColumn('first_name', upper(ltrim(rtrim(_compressblankspace_udf(col('givname'))))))
        .withColumn('last_name', upper(ltrim(rtrim(_compressblankspace_udf(col('surname'))))))
        .withColumn('CLIENT', concat(col('salutl'), lit(" "), col('first_name'), lit(" "), col('last_name')))
        .withColumn('ADDRESS', concat(col('cltaddr01'), lit(" "), col('cltaddr02'), lit(" "),
                                      col('cltaddr03'), lit(" "), col('cltaddr04'), lit(" "), col('cltaddr05')))
        # Postal code: drop spaces, then keep it only if it is exactly five digits.
        .withColumn('POSTALCODE', translate('cltpcode', ' ', ''))
        .withColumn('POSTALCODE', when(regexp_extract('POSTALCODE', r'^\d{5}$', 0) != '',
                                       col('POSTALCODE')).otherwise(lit('NA')))
        # ID: strip whitespace and punctuation; values of six characters or fewer become 'NA'.
        .withColumn('ID', translate('secuityno', string.whitespace + string.punctuation, ''))
        .withColumn('ID', when(length('ID') <= 6, lit('NA')).otherwise(col('ID')))
        .withColumn('gender', when(col('cltsex') == 'M', lit('MALE'))
                    .when(col('cltsex') == 'F', lit('FEMALE'))
                    .otherwise(lit('UNKNOWN')))
        # Placeholder column: created as NULL, renamed to ADDRESS_GROUP below
        # and then filled with 'NA'.
        .withColumn('ADDRESS2', lit(None))
        .withColumnRenamed('marryd', 'marital_status')
        .withColumnRenamed('natlty', 'nationality')
        .withColumnRenamed('clttype', 'client_type')
        .withColumnRenamed('ctrycode', 'country')
        .withColumnRenamed('ADDRESS2', 'ADDRESS_GROUP')
        # Replace NULLs with explicit markers.
        .withColumn('cltsex', when(isnull(col('cltsex')), lit('N')).otherwise(col('cltsex')))
        .withColumn('cltphone01', when(isnull(col('cltphone01')), lit('NA')).otherwise(col('cltphone01')))
        .withColumn('cltphone02', when(isnull(col('cltphone02')), lit('NA')).otherwise(col('cltphone02')))
        .withColumn('statcode', when(isnull(col('statcode')), lit('NA')).otherwise(col('statcode')))
        .withColumn('ADDRESS_GROUP', when(isnull(col('ADDRESS_GROUP')), lit('NA')).otherwise(col('ADDRESS_GROUP')))
        .drop('salutl', 'surname', 'givname', 'cltaddr01', 'cltaddr02', 'cltaddr03', 'cltaddr04',
              'cltaddr05', 'cltpcode', 'secuityno', 'cltdob')
        # Keep a single row per client number.
        .orderBy('clntnum').dropDuplicates(['clntnum'])
    )

    # perform cleaning/transformation to the various columns' values on the clexpf
    clientsextra = (
        clexpf[['clntnum', 'rmblphone', 'rinternet']]
        # An e-mail value is kept only if it contains '@'.
        .withColumn('rinternet', when(locate('@', col('rinternet'), 1) > 0, col('rinternet')).otherwise(lit('NA')))
        # Strip whitespace and the listed characters from the mobile number.
        .withColumn('rmblphone', translate('rmblphone', string.whitespace + ".()-_@+DFM", ''))
        .withColumn('rmblphone', when(col('rmblphone') == '', lit('NA')).otherwise(col('rmblphone')))
        .withColumnRenamed('rmblphone', 'mobile')
        .withColumnRenamed('rinternet', 'email')
        .orderBy('clntnum').dropDuplicates(['clntnum'])
    )

    # merging of the client and clientextra tables
    # NOTE(review): 'address' below refers to the column created as 'ADDRESS';
    # this presumably relies on Spark's case-insensitive column resolution - confirm.
    transv_clients = (
        clients.join(clientsextra, on='clntnum', how='left')
        .withColumn('email', when(isnull(trim(col('email'))), lit('NA')).otherwise(col('email')))
        .withColumn('mobile', when(isnull(trim(col('mobile'))), lit('NA')).otherwise(col('mobile')))
        .withColumn('address', when(isnull(trim(col('address'))), lit('NA')).otherwise(col('address')))
    )

    # write the parquet out
    # Make sure exactly one '/' separates the folder from the file name, so
    # the default 'data/sas_407' no longer yields 'data/sas_407transv_clients.parquet'.
    if output_folder and not output_folder.endswith('/'):
        output_folder = output_folder + '/'
    transv_clients.write.parquet('{}transv_clients.parquet'.format(output_folder))

# In [4]:
# Input parquet locations passed to customers().
clntpf_path = '/group/axa_malaysia/data/adm_clntpf'
clexpf_path = '/group/axa_malaysia/data/adm_clexpf'

# Run the pipeline, leaving output_folder at its default.
customers(clntpf_path=clntpf_path, clexpf_path=clexpf_path)