In [47]:
# FindSpark simplifies the process of using Apache Spark with Python
import requests
import pyspark
import findspark

# Initializing FindSpark to locate Spark installation
findspark.init()

# Importing SparkSession from pyspark.sql module
from pyspark.sql import SparkSession

#import functions/Classes for sparkml
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType

# import functions/Classes for pipeline creation
from pyspark.ml import Pipeline

# import functions/Classes for metrics
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator, BinaryClassificationEvaluator, ClusteringEvaluator

## Task 1 - Create a spark session

In [None]:
spark = SparkSession.builder.appName("Classification using SparkML").getOrCreate()

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52389)
Traceback (most recent call last):
  File "C:\Users\james.byers\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\james.byers\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\james.byers\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\james.byers\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 747, in __init__
    self.handle()
  File "c:\Users\james.byers\Data_Engineer\.data\lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "c:\Users\james.byers\Data_Engineer\.data\lib\sit

## Task 2 - Load data from a CSV file into a dataframe
Download data file

In [3]:
api_url = "https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json"
response = requests.get(api_url)
loan_df = spark.read.json(spark.sparkContext.parallelize([response.json()]))
loan_df.show(5)

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    Graduate|  Male|medium|    Yes|        Rural|           No|
|      LP001005|                 Y|             1|         0|    Graduate|  Male|   low|    Yes|        Urban|          Yes|
|      LP001006|                 Y|             1|         0|Not Graduate|  Male|   low|    Yes|        Urban|           No|
|      LP001008|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|


In [4]:
new_loan_df = loan_df.withColumn("Dependents", f.regexp_replace(f.col("Dependents"), "\+", "")\
                                 .cast(IntegerType()))

new_loan_df.printSchema()

root
 |-- Application_ID: string (nullable = true)
 |-- Application_Status: string (nullable = true)
 |-- Credit_History: long (nullable = true)
 |-- Dependents: integer (nullable = true)
 |-- Education: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Self_Employed: string (nullable = true)



In [None]:
new_loan_df.show()

In [46]:
idx_inputs = ["Application_Status", "Dependents", "Education", "Gender", "Income", "Married", "Property_Area", "Self_Employed"]
idx_outputs = ["Application_Status_Index", "Dependents_Index", "Education_Index", "Gender_Index", "Income_Index", "Married_Index", "Property_Area_Index", "Self_Employed_Index"]

Convert string column(s) into a numeric column

In [None]:
# Convert column(s) from string to numerical values
indexer = StringIndexer(inputCols=idx_inputs, outputCols=idx_outputs)
indexed = indexer.fit(new_loan_df).transform(new_loan_df)

In [None]:
# idx_encoded = OneHotEncoder(inputCol=indexer, outputCol='index_encoded')
# cols_encoded = idx_encoded.fit(new_loan_df).transform(new_loan_df)

AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead.

In [50]:
indexed.groupBy("Application_Status_Index").count().orderBy('count').show()

+------------------------+-----+
|Application_Status_Index|count|
+------------------------+-----+
|                     1.0|  164|
|                     0.0|  347|
+------------------------+-----+



In [51]:
indexed.groupBy("Application_Status").count().orderBy('count').show()

+------------------+-----+
|Application_Status|count|
+------------------+-----+
|                 N|  164|
|                 Y|  347|
+------------------+-----+



In [None]:
# dropped_index = indexer.drop("Application_ID", "Credit_History", "Application_Status", "Education", "Gender", "Income", "Married", "Property_Area", "Self_Employed")
# dropped_index.groupBy(dropped_index['Income_Index']).count().show()

+------------+-----+
|Income_Index|count|
+------------+-----+
|         0.0|  273|
|         1.0|  193|
|         2.0|   45|
+------------+-----+



## Task 3 - Identify the label column and the input columns
The VectorAssembler groups a bunch of inputCols as single column named "features"

In [None]:
# Prepare feature vector
vectorAssembler = VectorAssembler(inputCols=["Income_Index", "Property_Area_Index"], outputCol="features")

indexed_transformed = vectorAssembler.transform(indexed)

In [60]:
indexed_transformed.select("features", "Application_Status_Index").show()

+-----------------+------------------------+
|         features|Application_Status_Index|
+-----------------+------------------------+
|[0.0,0.0,1.0,1.0]|                     0.0|
|[2.0,0.0,1.0,2.0]|                     1.0|
|    (4,[3],[1.0])|                     0.0|
|[0.0,1.0,0.0,1.0]|                     0.0|
|[0.0,0.0,1.0,1.0]|                     0.0|
|[1.0,0.0,1.0,1.0]|                     0.0|
|[0.0,1.0,0.0,1.0]|                     0.0|
|    (4,[0],[3.0])|                     1.0|
|[1.0,0.0,1.0,1.0]|                     0.0|
|[2.0,0.0,2.0,0.0]|                     1.0|
|[1.0,0.0,0.0,1.0]|                     0.0|
|[1.0,0.0,0.0,1.0]|                     0.0|
|    (4,[3],[2.0])|                     1.0|
|[1.0,0.0,0.0,1.0]|                     0.0|
|[0.0,0.0,1.0,1.0]|                     0.0|
|    (4,[3],[1.0])|                     1.0|
|[0.0,1.0,1.0,2.0]|                     1.0|
|[0.0,1.0,1.0,1.0]|                     1.0|
|[2.0,0.0,1.0,1.0]|                     0.0|
|    (4,[1

## Taks 4 - Split the data
Split the data set in the ratio of 70:30. 70% training data, 30% testing data

In [None]:
# Split data into training and testing sets
(trainingData, testData) = indexed_transformed.randomSplit([0.7, 0.3], seed=42)

## Task 5 - Build and Train a Logistic Regression Model
Create a LR model and train the model using the training data set

In [None]:
# Stage 5 - create a linear regression instance
lr = LogisticRegression(featuresCol="features", labelCol="Application_Status_Index")
model = lr.fit(trainingData)

## Task 6 - Evaluate the model
Your model is now trained. Use the testing data to make predictions

In [97]:
# Make predictions on testing data
predictions = model.transform(testData)

In [None]:
predictions.show(20)

In [99]:
# Evaluate model performance
evaluator = MulticlassClassificationEvaluator(labelCol='Application_Status_Index', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Accuracy = ', accuracy)

Accuracy =  0.6967741935483871


In [100]:
evaluator = MulticlassClassificationEvaluator(labelCol='Application_Status_Index', predictionCol='prediction', metricName='weightedPrecision')
precision = evaluator.evaluate(predictions)
print('Precisioin = ', precision)

Precisioin =  0.7126361888485366


In [101]:
evaluator = MulticlassClassificationEvaluator(labelCol='Application_Status_Index', predictionCol='prediction', metricName='weightedRecall')
recall = evaluator.evaluate(predictions)
print('Recall = ', recall)

Recall =  0.6967741935483871


In [102]:
evaluator = MulticlassClassificationEvaluator(labelCol='Application_Status_Index', predictionCol='prediction', metricName='f1')
f1_score = evaluator.evaluate(predictions)
print('F1 score = ', f1_score)

F1 score =  0.5945932632486364


***

## Task 1 - 

### Define pipeline stages


In [None]:
# Stage 2 - scale the features using standard scaler
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

## Task 3 - Build the pipeline


In [40]:
# Build the pipeline
# All the stages of the pipeline are mentioned in the order of execution.
pipeline = Pipeline(stages=[vectorAssembler, lr])

## Task 4 - Split the data

In [None]:
# Split the data into training and testing sets
# (trainingData, testData) = dropped_index.randomSplit([0.8, .02], seed=42)

## Task 5 - Fit the pipeline

In [42]:
# Fit the pipeline to the training data
# ignore any warnings. The warnings are due to the simplified settings and the security settings of the lab

model = pipeline.fit(trainingData)

## Task 6 - Evaluate the model

In [43]:
predictions = model.transform(testData)

In [44]:
predictions.show(5)

+----------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+----------+
|Dependents|Application_Status_Index|Education_Index|Gender_Index|Income_Index|Married_Index|Property_Area_Index|Self_Employed_Index|           features|       rawPrediction|         probability|prediction|
+----------+------------------------+---------------+------------+------------+-------------+-------------------+-------------------+-------------------+--------------------+--------------------+----------+
|         0|                     0.0|            0.0|         0.0|         0.0|          0.0|                0.0|                0.0|          (7,[],[])|[1.54754994474389...|[0.82455958599846...|       0.0|
|         0|                     0.0|            0.0|         0.0|         0.0|          0.0|                0.0|                0.0|          (7,[],[])|[1.54754994474389..

Print the rmse value

In [45]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="Application_Status_Index")
ROC_AUC = evaluator.evaluate(predictions)
print("Results =", ROC_AUC)

Results = 0.5


In [None]:
evaluator.

In [None]:
spark.stop()

In [None]:
new_index = indexed.select("Dependents", "Education", "Education_Index", "Gender", "Gender_Index", "Income", "Income_Index", "Married", "Married_Index", "Property_Area", "Property_Area_Index", "Self_Employed", "Self_Employed_Index", "Application_Status", "Application_Status_Index")



In [None]:
customer_df = spark.read.json("data\cdw_sapp_customer.json", multiLine=True)
credit_df = spark.read.json("data\cdw_sapp_credit.json", multiLine=True)
branch_df = spark.read.json("data\cdw_sapp_branch.json", multiLine=True)

# all_df = spark.read.json(["data\cdw_sapp_branch.json",
                        # "data\cdw_sapp_credit.json",
                        # "data\cdw_sapp_customer.json"
                        # ], multiLine=True)
# all_df.printSchema()
# all_df.show()


In [None]:
credit_cust_joined = customer_df.join(credit_df, on='CREDIT_CARD_NO')
credit_cust_joined.show(5)

In [None]:
joined = customer_df.join(credit_df, on='CREDIT_CARD_NO').join(branch_df, on='BRANCH_CODE')
joined.show(5)