In [1]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd 

# environment variables
from dotenv import load_dotenv 
import os

In [2]:
# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# MariaDB password, looked up from the environment (None when the variable is not set);
# it is passed to the JDBC writer in the customer loading cell
PASSWORD = os.environ.get('MariaDB_Password')

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Credit Card App")
    .getOrCreate()
)

Customer Extraction

In [4]:
# Extract: read the raw customer JSON file into a Spark DataFrame
customer_df = spark.read.format('json').load('cdw_files/cdw_sapp_custmer.json')

Customer Transformation

In [5]:
# name transformation
# FIRST_NAME / LAST_NAME -> title case, MIDDLE_NAME -> lower case
customer_df = (
    customer_df
    .withColumn('FIRST_NAME', initcap(col('FIRST_NAME')))
    .withColumn('MIDDLE_NAME', lower(col('MIDDLE_NAME')))
    .withColumn('LAST_NAME', initcap(col('LAST_NAME')))
)

In [6]:
# address transformation
# Join APT_NO and STREET_NAME with a single space into FULL_STREET_ADDRESS,
# then drop the two source columns since the combined column replaces them.
customer_df = (
    customer_df
    .withColumn('FULL_STREET_ADDRESS', concat_ws(' ', col('APT_NO'), col('STREET_NAME')))
    .drop('APT_NO', 'STREET_NAME')
)

In [7]:
# phone number transformation
# Render CUST_PHONE as "(781)XXX-XXXX": a fixed "(781)" prefix, the first three
# characters of the source value, a dash, then the following four characters.
# Spark's substring(str, pos, len) is 1-based, so the second slice has to start at
# position 4. Starting at 3 overlapped the first slice: for a 7-character value it
# repeated the third character and dropped the last one.
# NOTE(review): assumes the source CUST_PHONE is 7 characters long - confirm against the data.
# NOTE: CUST_PHONE is overwritten in place, so re-running this cell without reloading
# customer_df would format an already formatted value again.
customer_df = customer_df.withColumn(
    'CUST_PHONE',
    concat(
        lit('(781)'),
        substring(customer_df['CUST_PHONE'], 1, 3),
        lit('-'),
        substring(customer_df['CUST_PHONE'], 4, 4),
    ),
)

In [11]:
# convert data types
# SSN and CUST_ZIP become integers; LAST_UPDATED is parsed into a timestamp
customer_df = (
    customer_df
    .withColumn('SSN', col('SSN').cast('int'))
    .withColumn('CUST_ZIP', col('CUST_ZIP').cast('int'))
    .withColumn('LAST_UPDATED', to_timestamp(col('LAST_UPDATED')))
)

In [14]:
# rearrange columns
# Project the customer columns in the order in which they are written out
rearranged_customer_df = customer_df.select([
    'SSN',
    'FIRST_NAME',
    'MIDDLE_NAME',
    'LAST_NAME',
    'CREDIT_CARD_NO',
    'FULL_STREET_ADDRESS',
    'CUST_CITY',
    'CUST_STATE',
    'CUST_COUNTRY',
    'CUST_ZIP',
    'CUST_PHONE',
    'CUST_EMAIL',
    'LAST_UPDATED',
])

In [15]:
# Display the (column name, type) pairs of the reordered frame to confirm the casts applied above
rearranged_customer_df.dtypes

[('SSN', 'int'),
 ('FIRST_NAME', 'string'),
 ('MIDDLE_NAME', 'string'),
 ('LAST_NAME', 'string'),
 ('CREDIT_CARD_NO', 'string'),
 ('FULL_STREET_ADDRESS', 'string'),
 ('CUST_CITY', 'string'),
 ('CUST_STATE', 'string'),
 ('CUST_COUNTRY', 'string'),
 ('CUST_ZIP', 'int'),
 ('CUST_PHONE', 'string'),
 ('CUST_EMAIL', 'string'),
 ('LAST_UPDATED', 'timestamp')]

Customer Loading

In [13]:
# load/write data to MariaDB
# Append the customer rows to the CDW_SAPP_CUSTOMER table of the creditcard_capstone
# database over JDBC. Because the mode is "append", every run of this cell adds the
# rows again. The password comes from the PASSWORD variable set earlier.
(
    rearranged_customer_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="CDW_SAPP_CUSTOMER",
        user="root",
        password=PASSWORD,
    )
    .save()
)

Branch Extraction

In [None]:
# Extract: read the raw branch JSON file into a Spark DataFrame
branch_df = spark.read.format('json').load('cdw_files/cdw_sapp_branch.json')

Branch Transformation

Branch Loading

Credit Extraction

In [None]:
# Extract: read the raw credit JSON file into a Spark DataFrame
credit_df = spark.read.format('json').load('cdw_files/cdw_sapp_credit.json')

Credit Transformation

Credit Loading