In [None]:
# FindSpark simplifies the process of using Apache Spark with Python
import requests
import pyspark
import findspark

# Initializing FindSpark to locate Spark installation
findspark.init()

# Importing SparkSession from pyspark.sql module
from pyspark.sql import SparkSession

#import functions/Classes for sparkml
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType

# import functions/Classes for pipeline creation
from pyspark.ml import Pipeline

# import functions/Classes for metrics
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator, BinaryClassificationEvaluator, ClusteringEvaluator

In [20]:
spark = SparkSession.builder.appName("Classification using SparkML").getOrCreate()

In [21]:
api_url = "https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json"
response = requests.get(api_url)
loan_df = spark.read.json(spark.sparkContext.parallelize([response.json()]))
loan_df.show(5)

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    Graduate|  Male|medium|    Yes|        Rural|           No|
|      LP001005|                 Y|             1|         0|    Graduate|  Male|   low|    Yes|        Urban|          Yes|
|      LP001006|                 Y|             1|         0|Not Graduate|  Male|   low|    Yes|        Urban|           No|
|      LP001008|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|


In [4]:
customer_df = spark.read.json("data\cdw_sapp_customer.json", multiLine=True)
credit_df = spark.read.json("data\cdw_sapp_credit.json", multiLine=True)
branch_df = spark.read.json("data\cdw_sapp_branch.json", multiLine=True)

# all_df = spark.read.json(["data\cdw_sapp_branch.json",
                        # "data\cdw_sapp_credit.json",
                        # "data\cdw_sapp_customer.json"
                        # ], multiLine=True)
# all_df.printSchema()
# all_df.show()


In [None]:
customer_df.show(10)

In [None]:
credit_df.show(10)

In [None]:
branch_df.show(10)

In [5]:
credit_cust_joined = customer_df.join(credit_df, on='CREDIT_CARD_NO')
credit_cust_joined.show(5)

+----------------+------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+------------+-----------+---------+---+-----+--------------+----------------+-----------------+----+
|  CREDIT_CARD_NO|APT_NO|   CUST_CITY| CUST_COUNTRY|         CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN| STREET_NAME|BRANCH_CODE| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+----------------+------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+------------+-----------+---------+---+-----+--------------+----------------+-----------------+----+
|4210653349028689|   774|Harleysville|United States|ACampos@example.com|   1236163|        PA|   19438|    Amalia|   Campos|2018-04-21T12:49:...|        Hal|123459988|Valley Drive|        114|1234

In [6]:
joined = customer_df.join(credit_df, on='CREDIT_CARD_NO').join(branch_df, on='BRANCH_CODE')
joined.show(5)

+-----------+----------------+------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+------------+---------+---+-----+--------------+----------------+-----------------+----+------------+------------+------------+------------+---------------+----------+--------------------+
|BRANCH_CODE|  CREDIT_CARD_NO|APT_NO|   CUST_CITY| CUST_COUNTRY|         CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN| STREET_NAME| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR| BRANCH_CITY| BRANCH_NAME|BRANCH_PHONE|BRANCH_STATE|  BRANCH_STREET|BRANCH_ZIP|        LAST_UPDATED|
+-----------+----------------+------+------------+-------------+-------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+------------+---------+---+-----+--------------+----------------+-----------------+--

In [34]:
new_loan_df = loan_df.withColumn("Dependents", f.regexp_replace(f.col("Dependents"), "\+", "")\
                                 .cast(IntegerType()))

new_loan_df.printSchema()

root
 |-- Application_ID: string (nullable = true)
 |-- Application_Status: string (nullable = true)
 |-- Credit_History: long (nullable = true)
 |-- Dependents: integer (nullable = true)
 |-- Education: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Self_Employed: string (nullable = true)



In [11]:
new_loan_df.show()

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    Graduate|  Male|medium|    Yes|        Rural|           No|
|      LP001005|                 Y|             1|         0|    Graduate|  Male|   low|    Yes|        Urban|          Yes|
|      LP001006|                 Y|             1|         0|Not Graduate|  Male|   low|    Yes|        Urban|           No|
|      LP001008|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|


In [35]:
inputs = ["Application_Status", "Education", "Gender", "Income", "Married", "Property_Area", "Self_Employed"]
outputs = ["Application_Status_Index", "Education_Index", "Gender_Index", "Income_Index", "Married_Index", "Property_Area_Index", "Self_Employed_Index"]

In [36]:
indexer = StringIndexer(inputCols=inputs, outputCols=outputs)
indexed = indexer.fit(new_loan_df).transform(new_loan_df)

In [15]:
indexed.show()

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|Application_Status_Index|Education_Index|Gender_Index|Income_Index|Married_Index|Property_Area_Index|Self_Employed_Index|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|                     0.0|            0.0|         0.0|         1.0|          1.0|                1.0|                0.0|
|      LP001003|

In [37]:
dropped_index = indexed.drop("Application_ID", "Credit_History", "Application_Status", "Education", "Gender", "Income", "Married", "Property_Area", "Self_Employed")
dropped_index.show(5)

+----------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+
|Dependents|Application_Status_Index|Education_Index|Gender_Index|Income_Index|Married_Index|Property_Area_Index|Self_Employed_Index|
+----------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+
|         0|                     0.0|            0.0|         0.0|         1.0|          1.0|                1.0|                0.0|
|         1|                     1.0|            0.0|         0.0|         1.0|          0.0|                2.0|                0.0|
|         0|                     0.0|            0.0|         0.0|         0.0|          0.0|                1.0|                1.0|
|         0|                     0.0|            1.0|         0.0|         0.0|          0.0|                1.0|                0.0|
|         0|                     0.0|            0.0|         

### Define pipeline stages


In [38]:
dropped_index.printSchema()

root
 |-- Dependents: integer (nullable = true)
 |-- Application_Status_Index: double (nullable = false)
 |-- Education_Index: double (nullable = false)
 |-- Gender_Index: double (nullable = false)
 |-- Income_Index: double (nullable = false)
 |-- Married_Index: double (nullable = false)
 |-- Property_Area_Index: double (nullable = false)
 |-- Self_Employed_Index: double (nullable = false)



In [39]:
# Stage 1 - assemble the input columns into a single vector 
vectorAssembler = VectorAssembler(inputCols=["Dependents", "Education_Index", "Gender_Index", "Income_Index", "Married_Index", "Property_Area_Index", "Self_Employed_Index"], outputCol="features")
# Stage 2 - scale the features using standard scaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
# Stage 3 - create a linear regression instance
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="Application_Status_Index")

## Task 3 - Build the pipeline


In [40]:
# Build the pipeline
# All the stages of the pipeline are mentioned in the order of execution.
pipeline = Pipeline(stages=[vectorAssembler, scaler, lr])

## Task 4 - Split the data

In [41]:
# Split the data into training and testing sets
(trainingData, testData) = dropped_index.randomSplit([0.8, .02], seed=42)

## Task 5 - Fit the pipeline

In [42]:
# Fit the pipeline to the training data
# ignore any warnings. The warnings are due to the simplified settings and the security settings of the lab

model = pipeline.fit(trainingData)

## Task 6 - Evaluate the model

In [43]:
predictions = model.transform(testData)

Print the rmse value

In [44]:
evaluator = RegressionEvaluator(labelCol="Application_Status_Index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) =", rmse)

Root Mean Squared Error (RMSE) = 0.4282090528632675


In [18]:
spark.stop()

In [None]:
new_index = indexed.select("Dependents", "Education", "Education_Index", "Gender", "Gender_Index", "Income", "Income_Index", "Married", "Married_Index", "Property_Area", "Property_Area_Index", "Self_Employed", "Self_Employed_Index", "Application_Status", "Application_Status_Index")

