In [2]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd 

# environment variables
from dotenv import load_dotenv 
import os

In [3]:
# load the environment variables (python-dotenv's load_dotenv) into the process environment
load_dotenv()

# read the MariaDB password that the JDBC writes further down pass as the "password" option
PASSWORD = os.getenv('MariaDB_Password')

# os.getenv returns None when the variable is unset; stop here with a clear
# message instead of letting the JDBC writes fail later on a missing password
if PASSWORD is None:
    raise RuntimeError("Environment variable 'MariaDB_Password' is not set; "
                       "define it in the environment or in a .env file.")

In [4]:
# get the Spark session named "Credit Card App" (getOrCreate returns an existing session if one is active)
spark = (
    SparkSession.builder
    .appName("Credit Card App")
    .getOrCreate()
)

Customer Extraction

In [4]:
# read the raw customer records from the JSON file into a Spark DataFrame
customer_json_path = 'cdw_files/cdw_sapp_custmer.json'
customer_df = spark.read.json(customer_json_path)

Customer Transformation

In [5]:
# name transformation: title-case first and last name, lower-case the middle name
customer_df = (
    customer_df
    .withColumn('FIRST_NAME', initcap(col('FIRST_NAME')))
    .withColumn('MIDDLE_NAME', lower(col('MIDDLE_NAME')))
    .withColumn('LAST_NAME', initcap(col('LAST_NAME')))
)

In [6]:
# address transformation: join APT_NO and STREET_NAME with a single space,
# then drop the two source columns now that FULL_STREET_ADDRESS replaces them
full_street_address = concat_ws(' ', col('APT_NO'), col('STREET_NAME'))
customer_df = (
    customer_df
    .withColumn('FULL_STREET_ADDRESS', full_street_address)
    .drop('APT_NO', 'STREET_NAME')
)

In [7]:
# phone number transformation: rewrite CUST_PHONE as (781)XXX-XXXX
# substring() positions are 1-based: the first group is characters 1-3, so the
# last group must start at position 4 (starting at 3 repeated the third digit
# in both groups and dropped the final digit)
# NOTE(review): assumes CUST_PHONE holds a 7-digit number — confirm against the source data
# NOTE(review): this cell overwrites CUST_PHONE in place, so re-running it reformats
# the already formatted value; re-run from the extraction cell instead
customer_df = customer_df.withColumn('CUST_PHONE', concat(lit('(781)'),
                                                          substring(customer_df['CUST_PHONE'], 1, 3),
                                                          lit('-'),
                                                          substring(customer_df['CUST_PHONE'], 4, 4)))

In [11]:
# convert data types: SSN and CUST_ZIP become int, LAST_UPDATED becomes a timestamp
for int_column in ('SSN', 'CUST_ZIP'):
    customer_df = customer_df.withColumn(int_column, col(int_column).cast('int'))
customer_df = customer_df.withColumn('LAST_UPDATED', to_timestamp(col('LAST_UPDATED')))

In [14]:
# rearrange columns: keep exactly these columns, in this order, for loading
customer_column_order = [
    'SSN',
    'FIRST_NAME',
    'MIDDLE_NAME',
    'LAST_NAME',
    'CREDIT_CARD_NO',
    'FULL_STREET_ADDRESS',
    'CUST_CITY',
    'CUST_STATE',
    'CUST_COUNTRY',
    'CUST_ZIP',
    'CUST_PHONE',
    'CUST_EMAIL',
    'LAST_UPDATED',
]
rearranged_customer_df = customer_df.select(*customer_column_order)

In [15]:
# display the (column name, data type) pairs to check the casts and the column order before loading
rearranged_customer_df.dtypes

[('SSN', 'int'),
 ('FIRST_NAME', 'string'),
 ('MIDDLE_NAME', 'string'),
 ('LAST_NAME', 'string'),
 ('CREDIT_CARD_NO', 'string'),
 ('FULL_STREET_ADDRESS', 'string'),
 ('CUST_CITY', 'string'),
 ('CUST_STATE', 'string'),
 ('CUST_COUNTRY', 'string'),
 ('CUST_ZIP', 'int'),
 ('CUST_PHONE', 'string'),
 ('CUST_EMAIL', 'string'),
 ('LAST_UPDATED', 'timestamp')]

Customer Loading

In [18]:
# load/write data to MariaDB: append the customer rows to CDW_SAPP_CUSTOMER over JDBC
customer_jdbc_options = {
    "url": "jdbc:mysql://localhost:3306/creditcard_capstone",
    "dbtable": "CDW_SAPP_CUSTOMER",
    "user": "root",
    "password": PASSWORD,
}
(
    rearranged_customer_df.write
    .format("jdbc")
    .mode("append")  # adds rows to the table's existing contents
    .options(**customer_jdbc_options)
    .save()
)

Branch Extraction

In [66]:
# read the raw branch records from the JSON file into a Spark DataFrame
branch_json_path = 'cdw_files/cdw_sapp_branch.json'
branch_df = spark.read.json(branch_json_path)

Branch Transformation

In [67]:
# zip code transformation: replace missing BRANCH_ZIP values with a placeholder
# NOTE(review): the placeholder has six digits — confirm it is not meant to be 99999
# NOTE(review): an integer fill value presumably only applies if BRANCH_ZIP was read
# as a numeric column — verify the inferred schema
MISSING_BRANCH_ZIP = 999999
branch_df = branch_df.fillna(value=MISSING_BRANCH_ZIP, subset=['BRANCH_ZIP'])

In [68]:
# phone number transformation: rewrite BRANCH_PHONE as (781)XXX-XXXX
# substring() positions are 1-based: the first group is characters 1-3, so the
# last group must start at position 4 (starting at 3 repeated the third digit
# in both groups)
# NOTE(review): this uses only the first 7 characters of BRANCH_PHONE; if the source
# holds 10-digit numbers the area code and grouping need a different rule — confirm
branch_df = branch_df.withColumn('BRANCH_PHONE', concat(lit('(781)'),
                                                        substring(branch_df['BRANCH_PHONE'], 1, 3),
                                                        lit('-'),
                                                        substring(branch_df['BRANCH_PHONE'], 4, 4)))

In [69]:
# convert data types: BRANCH_CODE and BRANCH_ZIP become int, LAST_UPDATED becomes a timestamp
for int_column in ('BRANCH_CODE', 'BRANCH_ZIP'):
    branch_df = branch_df.withColumn(int_column, col(int_column).cast('int'))
branch_df = branch_df.withColumn('LAST_UPDATED', to_timestamp(col('LAST_UPDATED')))

In [72]:
# rearrange columns: keep exactly these columns, in this order, for loading
branch_column_order = [
    'BRANCH_CODE',
    'BRANCH_NAME',
    'BRANCH_STREET',
    'BRANCH_CITY',
    'BRANCH_STATE',
    'BRANCH_ZIP',
    'BRANCH_PHONE',
    'LAST_UPDATED',
]
rearranged_branch_df = branch_df.select(*branch_column_order)

In [73]:
# display the (column name, data type) pairs to check the casts and the column order before loading
rearranged_branch_df.dtypes

[('BRANCH_CODE', 'int'),
 ('BRANCH_NAME', 'string'),
 ('BRANCH_STREET', 'string'),
 ('BRANCH_CITY', 'string'),
 ('BRANCH_STATE', 'string'),
 ('BRANCH_ZIP', 'int'),
 ('BRANCH_PHONE', 'string'),
 ('LAST_UPDATED', 'timestamp')]

Branch Loading

In [74]:
# load/write data to MariaDB: append the branch rows to CDW_SAPP_BRANCH over JDBC
branch_jdbc_options = {
    "url": "jdbc:mysql://localhost:3306/creditcard_capstone",
    "dbtable": "CDW_SAPP_BRANCH",
    "user": "root",
    "password": PASSWORD,
}
(
    rearranged_branch_df.write
    .format("jdbc")
    .mode("append")  # adds rows to the table's existing contents
    .options(**branch_jdbc_options)
    .save()
)

Credit Extraction

In [18]:
# read the raw credit card transaction records from the JSON file into a Spark DataFrame
credit_json_path = 'cdw_files/cdw_sapp_credit.json'
credit_df = spark.read.json(credit_json_path)

Credit Transformation

In [20]:
# date transformation: join YEAR, MONTH and DAY with hyphens and cast the result to a date,
# then remove all hyphens from TIMEID and drop the three source columns
timeid_as_date = concat_ws('-', col('YEAR'), col('MONTH'), col('DAY')).cast('date')
credit_df = (
    credit_df
    .withColumn('TIMEID', timeid_as_date)
    .withColumn('TIMEID', regexp_replace(col('TIMEID'), '-', ''))
    .drop('YEAR', 'MONTH', 'DAY')
)

In [22]:
# convert data types: the three identifier columns become int
for int_column in ('BRANCH_CODE', 'CUST_SSN', 'TRANSACTION_ID'):
    credit_df = credit_df.withColumn(int_column, col(int_column).cast('int'))

In [24]:
# rename column: CREDIT_CARD_NO becomes CUST_CC_NO, the name selected for loading
credit_df = credit_df.withColumnRenamed(existing='CREDIT_CARD_NO', new='CUST_CC_NO')

In [26]:
# rearrange columns: keep exactly these columns, in this order, for loading
credit_column_order = [
    'CUST_CC_NO',
    'TIMEID',
    'CUST_SSN',
    'BRANCH_CODE',
    'TRANSACTION_TYPE',
    'TRANSACTION_VALUE',
    'TRANSACTION_ID',
]
rearranged_credit_df = credit_df.select(*credit_column_order)

In [27]:
# display the (column name, data type) pairs to check the casts and the column order before loading
rearranged_credit_df.dtypes

[('CUST_CC_NO', 'string'),
 ('TIMEID', 'string'),
 ('CUST_SSN', 'int'),
 ('BRANCH_CODE', 'int'),
 ('TRANSACTION_TYPE', 'string'),
 ('TRANSACTION_VALUE', 'double'),
 ('TRANSACTION_ID', 'int')]

Credit Loading

In [None]:
# load/write data to MariaDB: append the credit card rows to CDW_SAPP_CREDIT_CARD over JDBC
credit_jdbc_options = {
    "url": "jdbc:mysql://localhost:3306/creditcard_capstone",
    "dbtable": "CDW_SAPP_CREDIT_CARD",
    "user": "root",
    "password": PASSWORD,
}
(
    rearranged_credit_df.write
    .format("jdbc")
    .mode("append")  # adds rows to the table's existing contents
    .options(**credit_jdbc_options)
    .save()
)