In [148]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

import os
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jdk-18.0.2'
os.environ['SPARK_HOME'] = 'C:\Program Files\spark-3.3.0-bin-hadoop3'

In [149]:
#Create a spark session

spark = SparkSession.builder.appName('Read JSON File into DataFrame').getOrCreate()

#Read JSON and assign to pyspark dataframes

customer_df = spark.read.json('cdw_sapp_custmer.json')
credit_df = spark.read.json('cdw_sapp_credit.json')
branch_df = spark.read.json('cdw_sapp_branch.json')

In [150]:

customer_df.printSchema()

root
 |-- APT_NO: string (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_CITY: string (nullable = true)
 |-- CUST_COUNTRY: string (nullable = true)
 |-- CUST_EMAIL: string (nullable = true)
 |-- CUST_PHONE: long (nullable = true)
 |-- CUST_STATE: string (nullable = true)
 |-- CUST_ZIP: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- LAST_UPDATED: string (nullable = true)
 |-- MIDDLE_NAME: string (nullable = true)
 |-- SSN: long (nullable = true)
 |-- STREET_NAME: string (nullable = true)



In [151]:
from pyspark.sql.functions import concat_ws
#Format customer data
customer_df = customer_df.withColumnRenamed("FIRST_NAME","First_Name")
customer_df = customer_df.withColumnRenamed("MIDDLE_NAME", "Middle_Name")
customer_df = customer_df.withColumnRenamed("LAST_NAME", "Last_Name")

customer_df = customer_df.withColumn('FULL_STREET_ADDRESS', concat_ws(' ', customer_df.STREET_NAME, customer_df.APT_NO))
customer_df = customer_df.drop('APT_NO', 'STREET_NAME')

customer_df.withColumn("CUST_PHONE", concat_ws('-', customer_df.CUST_PHONE[0:3], customer_df.CUST_PHONE[3:7])).show(2)

customer_df = customer_df.withColumn("CUST_ZIP",customer_df.CUST_ZIP.cast("int"))
customer_df = customer_df.drop_duplicates()
customer_df.show(2)
customer_df.printSchema()

+----------------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+--------------------+
|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|         CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|First_Name|Last_Name|        LAST_UPDATED|Middle_Name|      SSN| FULL_STREET_ADDRESS|
+----------------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+--------------------+
|4210653310061055|     Natchez|United States|AHooper@example.com| 123-37818|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         Wm|123456100|Main Street North...|
|4210653310102868|Wethersfield|United States|EHolman@example.com| 123-38933|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    Brendan|123453023|   Redwood Drive 829|
+----------------+------------+-------------+-------------------+----------+----------+---

In [152]:
branch_df.printSchema()

root
 |-- BRANCH_CITY: string (nullable = true)
 |-- BRANCH_CODE: long (nullable = true)
 |-- BRANCH_NAME: string (nullable = true)
 |-- BRANCH_PHONE: string (nullable = true)
 |-- BRANCH_STATE: string (nullable = true)
 |-- BRANCH_STREET: string (nullable = true)
 |-- BRANCH_ZIP: long (nullable = true)
 |-- LAST_UPDATED: string (nullable = true)



In [153]:
#Format branch data

from pyspark.sql.functions import unix_timestamp, to_timestamp
from pyspark.sql.types import TimestampType

#branch_df.select("LAST_UPDATED").show(truncate=False)
branch_df.na.fill(value=00000,subset=["BRANCH_ZIP"])
branch_df.withColumn("BRANCH_PHONE", concat_ws('-', branch_df.BRANCH_PHONE[0:3], branch_df.BRANCH_PHONE[3:7]))

branch_df = branch_df.withColumn("LAST_UPDATED",to_timestamp("LAST_UPDATED"))
branch_df.printSchema()
branch_df = branch_df.drop_duplicates()
branch_df.show(3)


root
 |-- BRANCH_CITY: string (nullable = true)
 |-- BRANCH_CODE: long (nullable = true)
 |-- BRANCH_NAME: string (nullable = true)
 |-- BRANCH_PHONE: string (nullable = true)
 |-- BRANCH_STATE: string (nullable = true)
 |-- BRANCH_STREET: string (nullable = true)
 |-- BRANCH_ZIP: long (nullable = true)
 |-- LAST_UPDATED: timestamp (nullable = true)

+-----------+-----------+------------+------------+------------+-----------------+----------+-------------------+
|BRANCH_CITY|BRANCH_CODE| BRANCH_NAME|BRANCH_PHONE|BRANCH_STATE|    BRANCH_STREET|BRANCH_ZIP|       LAST_UPDATED|
+-----------+-----------+------------+------------+------------+-----------------+----------+-------------------+
|    Acworth|        197|Example Bank|  1234914637|          GA|  Briarwood Drive|     30101|2018-04-18 16:51:47|
|  Mundelein|         78|Example Bank|  1234164452|          IL|    Heather Court|     60060|2018-04-18 16:51:47|
|    Huntley|          2|Example Bank|  1234618993|          IL|Washington St

In [154]:
credit_df.show(1)
credit_df.printSchema()

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|    2|             1|       Education|             78.9|2018|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
only showing top 1 row

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)



In [155]:
#Format credit data
#Concat YEAR, MONTH, DAY for YYYYMMDD format
credit_df = credit_df.withColumn('TIMEID', concat_ws('', credit_df.YEAR, credit_df.MONTH, credit_df.DAY  ))
credit_df = customer_df.drop('DAY', 'MONTH', 'YEAR')

credit_df = credit_df.drop_duplicates()
credit_df.show(3)


AttributeError: 'DataFrame' object has no attribute 'setNullableStateOfColumn'