In [68]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, substring, lpad, col, initcap, lower, trim, concat_ws, when, length

# getOrCreate() hands back the already-running session when one exists, so
# re-running this cell does not build a second session.
spark = (
    SparkSession.builder
    .appName('capstone-prac')
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary as cell output.
spark

In [9]:
def extract():
    """Read the three source JSON files into Spark DataFrames.

    Relies on the notebook-level ``spark`` session. Every file is read with
    the ``multiLine`` option enabled, and the paths are relative to the
    current working directory.

    Returns:
        tuple: ``(df_branch, df_customer, df_credit)``, in that order.
    """

    def read_multiline_json(path):
        # A fresh reader per file, exactly as if each read were written out.
        return spark.read.option("multiLine", True).json(path)

    source_paths = (
        'origin_data/cdw_sapp_branch.json',
        'origin_data/cdw_sapp_customer.json',
        'origin_data/cdw_sapp_credit.json',
    )
    return tuple(read_multiline_json(path) for path in source_paths)

In [62]:
# Load the raw frames; extract() returns them in branch, customer, credit order.
df_branch, df_customer, df_credit = extract()


In [103]:
def transform(df_branch, df_customer, df_credit) -> tuple:
    """Reformat the branch, customer and credit DataFrames.

    Branch:
        * null ``BRANCH_ZIP`` values are filled with 99999;
        * ``BRANCH_PHONE`` is rewritten as ``(AAA)BBB-CCCC`` from its
          characters 1-3, 4-6 and 7-10.

    Credit:
        * ``TIMEID`` is added as the integer built from zero-padded
          ``YEAR`` (4), ``MONTH`` (2) and ``DAY`` (2);
        * ``YEAR``, ``MONTH`` and ``DAY`` are then dropped.

    Customer:
        * ``FIRST_NAME`` and ``LAST_NAME`` are trimmed and passed through
          ``initcap``; ``MIDDLE_NAME`` is trimmed and lower-cased;
        * ``FULL_STREET_ADDRESS`` is added as ``STREET_NAME`` and ``APT_NO``
          (both trimmed) joined with ``", "``;
        * ``CUST_PHONE`` is cast to string, left-padded with ``"X"`` to ten
          characters and rewritten as ``(AAA)BBB-CCCC``.

    NOTE(review): an earlier comment said the customer phone is formatted
    "only if it's exactly 10 digits", but there is no length check here:
    shorter values are padded with "X". Per the Spark docs, ``lpad`` also
    shortens input longer than the target length - confirm the intended
    handling of such values.

    Args:
        df_branch: DataFrame with ``BRANCH_ZIP`` and ``BRANCH_PHONE``.
        df_customer: DataFrame with ``FIRST_NAME``, ``MIDDLE_NAME``,
            ``LAST_NAME``, ``STREET_NAME``, ``APT_NO`` and ``CUST_PHONE``.
        df_credit: DataFrame with ``YEAR``, ``MONTH`` and ``DAY``.

    Returns:
        tuple: the transformed ``(df_branch, df_customer, df_credit)``.
    """

    def as_phone(digits):
        # Slice a ten-character value into (AAA)BBB-CCCC. `digits` may be a
        # column name or a Column expression; substring accepts either.
        return concat(
            lit("("), substring(digits, 1, 3), lit(")"),
            substring(digits, 4, 3), lit("-"),
            substring(digits, 7, 4),
        )

    # ---- branch --------------------------------------------------------
    branch_out = (
        df_branch
        .fillna({'BRANCH_ZIP': 99999})
        .withColumn('BRANCH_PHONE', as_phone("BRANCH_PHONE"))
    )

    # ---- credit --------------------------------------------------------
    # Zero-pad each date part to a fixed width so the concatenation is
    # always eight characters (YYYYMMDD) before the cast to int.
    date_parts = [
        lpad(col(part).cast("string"), width, "0")
        for part, width in (("YEAR", 4), ("MONTH", 2), ("DAY", 2))
    ]
    credit_out = (
        df_credit
        .withColumn("TIMEID", concat(*date_parts).cast("int"))
        .drop("YEAR", "MONTH", "DAY")
    )

    # ---- customer ------------------------------------------------------
    customer_out = df_customer
    for name_col, recase in (
        ("FIRST_NAME", initcap),
        ("MIDDLE_NAME", lower),
        ("LAST_NAME", initcap),
    ):
        customer_out = customer_out.withColumn(name_col, recase(trim(col(name_col))))

    # Built once and reused for all three slices of the phone number.
    padded_phone = lpad(col("CUST_PHONE").cast("string"), 10, "X")
    customer_out = (
        customer_out
        .withColumn(
            "FULL_STREET_ADDRESS",
            concat_ws(
                ", ",
                trim(col("STREET_NAME")),
                trim(col("APT_NO").cast("string")),
            ),
        )
        .withColumn("CUST_PHONE", as_phone(padded_phone))
    )

    return branch_out, customer_out, credit_out

In [104]:
# Rebinds the three frames to their transformed versions. transform() drops
# YEAR/MONTH/DAY from the credit frame, so re-run the extract cell before
# re-running this one.
df_branch, df_customer, df_credit = transform(df_branch, df_customer, df_credit)

In [None]:
def extract(df_branch, df_customer, df_credit) -> tuple:
    """Unimplemented placeholder: accepts the three DataFrames and returns None.

    NOTE(review): this reuses the name ``extract``, so running this cell
    replaces the zero-argument ``extract()`` defined earlier in the notebook.
    Presumably this was meant to be a separate (load?) step - confirm and
    rename before filling it in.
    """