In [1]:
# NOTE: the wildcard import stays above the explicit imports so that nothing it
# exports can shadow the names imported below.
import pyspark
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import os
from datetime import datetime
from urllib.parse import unquote, urlparse

from dotenv import load_dotenv              # environment variables

In [2]:
# Pull variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# MariaDB password; this is None when 'MariaDB_Password' is not set anywhere.
PASSWORD = os.environ.get('MariaDB_Password')

In [3]:
# Reuse the active Spark session if one exists, otherwise start one for this app.
spark = (
    SparkSession.builder
    .appName("Home Loan App")
    .getOrCreate()
)

External API

In [4]:
# Location of the loan application dataset (JSON) that extract_api() downloads.
url = "https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json"

Extract Function

In [5]:
# extract api - https://stackoverflow.com/questions/41820977/how-to-save-json-data-fetched-from-url-in-pyspark
def extract_api(url):
    """Download the JSON file at ``url`` and load it into a Spark DataFrame.

    The file is fetched with ``SparkContext.addFile`` and read back from the
    local path that ``SparkFiles.get`` reports for it.

    Args:
        url: URL whose last path segment is the name of the JSON file
            (e.g. ``.../main/loan_data.json``).

    Returns:
        The DataFrame produced by ``spark.read.json`` for the downloaded file.

    Raises:
        ValueError: if ``url`` has no file name in its path (e.g. it ends in ``/``).
    """
    # The downloaded copy is looked up by file name, so take the name from the
    # URL itself instead of hard-coding it; this keeps the function usable for
    # any URL, not just the loan dataset. Query strings/fragments are ignored.
    filename = unquote(urlparse(url).path.rsplit('/', 1)[-1])
    if not filename:
        raise ValueError(f"Cannot determine a file name from url: {url!r}")

    spark.sparkContext.addFile(url)                     # download the file into Spark's files directory
    absolute_filepath = SparkFiles.get(filename)        # absolute local path of the downloaded copy
    return spark.read.json(absolute_filepath)           # json file -> pyspark dataframe

Loading Function

In [6]:
# load/write data to MariaDB
def load_to_db(dataframe, db_name, table_name, user, password):
    """Write ``dataframe`` to a database table through the JDBC data source.

    The writer targets ``jdbc:mysql://localhost:3306/<db_name>`` and uses
    save mode ``"append"``.

    Args:
        dataframe: DataFrame whose ``write`` interface is used for the export.
        db_name: Database name placed at the end of the JDBC URL.
        table_name: Value passed as the ``dbtable`` option.
        user: Database user name.
        password: Password for ``user``.
    """
    # Connection settings, applied to the writer in this order.
    jdbc_options = (
        ("url", f"jdbc:mysql://localhost:3306/{db_name}"),
        ("dbtable", table_name),
        ("user", user),
        ("password", password),
    )

    writer = dataframe.write.format("jdbc").mode("append")
    for key, value in jdbc_options:
        writer = writer.option(key, value)
    writer.save()

Logging Function

In [7]:
# logging
def log(message):
    """Append one timestamped line to ``loan_logfile.txt`` in the working directory.

    Each line has the form ``<timestamp>,<message>``, where the timestamp is
    Year-MonthAbbrev-Day-Hour:Minute:Second of the current local time
    (e.g. ``2024-Jan-05-13:45:10,Loan ELT Job Started``).

    Note: this definition replaces the ``log`` function brought in by
    ``from pyspark.sql.functions import *``. Re-running the import cell after
    this one rebinds ``log`` to the Spark function, so re-run this cell too.

    Args:
        message: Text to record; non-string values are converted with ``str()``.
    """
    # %b is the documented abbreviated-month directive; %h is only a
    # platform-specific alias of it and is not guaranteed to exist everywhere.
    timestamp = datetime.now().strftime('%Y-%b-%d-%H:%M:%S')

    # Explicit encoding so the file's bytes do not depend on the platform's locale default.
    with open('loan_logfile.txt', 'a', encoding='utf-8') as f:
        f.write(f'{timestamp},{message}\n')

ELT Pipeline
- Loan

In [8]:
# Loan ELT Pipeline: extract the loan JSON from `url`, then load it into the database.
log('Loan ELT Job Started')

# --- Extract ------------------------------------------------
log('Loan Extraction Started')
loan_df = extract_api(url)
log('Loan Extraction Ended')

# --- Load ---------------------------------------------------
log('Loan Loading Started')
load_to_db(
    dataframe=loan_df,
    db_name='creditcard_capstone',
    table_name='CDW_SAPP_LOAN_APPLICATION',
    user='root',
    password=PASSWORD,
)
log('Loan Loading Ended')

log('Loan ELT Job Ended')