In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("Loan Prediction Classification")
    .getOrCreate()
)

In [2]:
# Load the training CSV: the first row supplies the column names and the
# column types are inferred from the values (see the schema printed below).
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/loan-prediction-train.csv")
)

data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [3]:
# Number of rows loaded from the CSV (before any null filtering).
data.count()

614

In [4]:
# Preview the first ten raw rows to eyeball the values and spot missing entries.
data.show(n=10)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [6]:
import pyspark.sql.functions as F

# One aggregate per column: `when` yields 1 only for null cells and `count`
# ignores the resulting nulls, so each output cell is that column's null count.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), 1)).alias(name)
    for name in data.columns
]

data.select(null_count_exprs).show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|      0|    13|      3|        15|        0|           32|              0|                0|        22|              14|            50|            0|          0|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [13]:
# Columns that the null-count table above reports as having missing values.
cols_with_nulls = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

# Drop every row that is missing a value in any of those columns.
filtered_data = data.dropna(subset=cols_with_nulls)

In [14]:
# 80/20 random split; the fixed seed keeps the split repeatable across runs.
train_data, test_data = filtered_data.randomSplit(weights=[0.8, 0.2], seed=24)

# Report how many rows landed in each split.
for split_label, split_df in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(split_label, split_df.count())

Train size:  398
Test size:  82


In [30]:
# Ordered list of ML pipeline stages; the following cells append to it.
# NOTE: those cells mutate this list in place, so re-run this cell first
# before re-running any of them, otherwise stages get added twice.
pipeline_stages = []

In [31]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder 

# String-typed feature columns that must be turned into numeric vectors.
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# For each categorical column add two stages: map the string levels to numeric
# indices, then one-hot encode the index. dropLast=False keeps a vector slot
# for every level instead of dropping the last one.
for cat_col in categorical_cols:
    string_indexer = StringIndexer(inputCol=cat_col, outputCol=f'{cat_col}_index')
    encoder = OneHotEncoder(
        inputCol=string_indexer.getOutputCol(),
        outputCol=f'{cat_col}_vec',
        dropLast=False,
    )

    # Log the column wiring so the stage inputs/outputs can be checked by eye.
    print(f'StringIndexer {string_indexer.getInputCol()} -> {string_indexer.getOutputCol()}')
    print(f'OneHotEncoder {encoder.getInputCol()} -> {encoder.getOutputCol()}')
    print()

    pipeline_stages.extend([string_indexer, encoder])

StringIndexer Gender -> Gender_index
OneHotEncoder Gender_index -> Gender_vec

StringIndexer Married -> Married_index
OneHotEncoder Married_index -> Married_vec

StringIndexer Dependents -> Dependents_index
OneHotEncoder Dependents_index -> Dependents_vec

StringIndexer Education -> Education_index
OneHotEncoder Education_index -> Education_vec

StringIndexer Self_Employed -> Self_Employed_index
OneHotEncoder Self_Employed_index -> Self_Employed_vec

StringIndexer Property_Area -> Property_Area_index
OneHotEncoder Property_Area_index -> Property_Area_vec



In [32]:
# Index the string target column into the numeric label the classifier trains on.
pipeline_stages.append(
    StringIndexer(inputCol='Loan_Status', outputCol='Loan_Status_index')
)

In [33]:
# Names of the one-hot output columns, in the same order as categorical_cols.
encoded_categorical_cols = [f"{name}_vec" for name in categorical_cols]
encoded_categorical_cols

['Gender_vec',
 'Married_vec',
 'Dependents_vec',
 'Education_vec',
 'Self_Employed_vec',
 'Property_Area_vec']

In [34]:
# Columns that are already numeric in the inferred schema and are fed to the
# model without encoding.
numeric_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [35]:
# Full feature list for the assembler: encoded categoricals first, then numerics.
input_columns = [*encoded_categorical_cols, *numeric_cols]
input_columns

['Gender_vec',
 'Married_vec',
 'Dependents_vec',
 'Education_vec',
 'Self_Employed_vec',
 'Property_Area_vec',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [36]:
from pyspark.ml.feature import VectorAssembler

# Concatenate the one-hot vectors and numeric columns into the single
# 'features' vector column that the classifier reads.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=input_columns,
)

pipeline_stages.append(assembler)

In [37]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree trained on the assembled feature vector against the indexed
# loan status; it is the final stage of the pipeline.
dtc = DecisionTreeClassifier(
    labelCol='Loan_Status_index',
    featuresCol='features',
)

pipeline_stages.append(dtc)

# Display the complete, ordered stage list as a sanity check.
pipeline_stages

[StringIndexer_60caf04fd26a,
 OneHotEncoder_fca94ac0e8f0,
 StringIndexer_56e548eb64e3,
 OneHotEncoder_bccbe2b70f75,
 StringIndexer_af1e2cacfa8b,
 OneHotEncoder_d6167fa7cdb8,
 StringIndexer_48adaea11096,
 OneHotEncoder_c45decdfddc0,
 StringIndexer_5b52062cd4cb,
 OneHotEncoder_0d9986449213,
 StringIndexer_57d93bf039a0,
 OneHotEncoder_006c3336acd5,
 StringIndexer_6d9a11af4c61,
 VectorAssembler_b8bce8799c82,
 DecisionTreeClassifier_824a45e6de4b]

In [38]:
from pyspark.ml import Pipeline

# Chain all collected stages (indexers, encoders, assembler, classifier) into
# one estimator so they are fitted and applied together.
pipeline = Pipeline(stages=pipeline_stages)

In [40]:
# Fit every stage on the training split only; the test split is not used here
# and is scored separately afterwards.
model = pipeline.fit(train_data)

In [41]:
# Run the fitted pipeline over the held-out split, then preview the assembled
# features next to the true label index and the predicted label index.
predictions = model.transform(test_data)
predictions.select(['features', 'Loan_Status_index', 'prediction']).show(n=10)

+--------------------+-----------------+----------+
|            features|Loan_Status_index|prediction|
+--------------------+-----------------+----------+
|(20,[0,2,6,8,10,1...|              0.0|       0.0|
|(20,[0,3,4,8,10,1...|              0.0|       0.0|
|(20,[0,2,4,9,10,1...|              1.0|       1.0|
|(20,[0,2,4,8,10,1...|              0.0|       0.0|
|(20,[1,3,4,8,10,1...|              1.0|       0.0|
|(20,[0,2,6,8,10,1...|              0.0|       0.0|
|(20,[1,3,4,8,10,1...|              1.0|       1.0|
|(20,[0,2,6,8,10,1...|              0.0|       0.0|
|(20,[0,3,4,8,10,1...|              0.0|       0.0|
|(20,[1,2,4,8,10,1...|              0.0|       0.0|
+--------------------+-----------------+----------+
only showing top 10 rows


In [43]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Overall accuracy on the test predictions: the share of rows whose predicted
# label index equals the true label index, reported as a percentage.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='Loan_Status_index',
    predictionCol='prediction',
    metricName='accuracy',
)
# Backward-compatible alias for the original, misspelled variable name, kept
# only in case another cell still refers to it; remove once nothing does.
accuracty_evaluator = accuracy_evaluator

accuracy = accuracy_evaluator.evaluate(predictions) * 100
print(f'Accuracy = {accuracy:.2f}%')

Accuracy = 85.37%


In [44]:
# Per-label precision. 'precisionByLabel' scores a single class, selected by
# metricLabel; 0.0 is the evaluator's default and is spelled out here so the
# reported class is explicit.
# NOTE(review): index 0.0 is presumably the most frequent Loan_Status value
# (StringIndexer's default ordering) - confirm against the fitted indexer's labels.
precision_evaluator = MulticlassClassificationEvaluator(
    metricName='precisionByLabel',
    metricLabel=0.0,
    labelCol='Loan_Status_index',
    predictionCol='prediction',
)
precision = 100 * precision_evaluator.evaluate(predictions)
print(f'Precision = {precision:.2f}%')

Precision = 85.07%


In [45]:
# Per-label recall. 'recallByLabel' scores a single class, selected by
# metricLabel; 0.0 is the evaluator's default and is spelled out here so the
# reported class is explicit.
# NOTE(review): index 0.0 is presumably the most frequent Loan_Status value
# (StringIndexer's default ordering) - confirm against the fitted indexer's labels.
recall_evaluator = MulticlassClassificationEvaluator(
    metricName='recallByLabel',
    metricLabel=0.0,
    labelCol='Loan_Status_index',
    predictionCol='prediction',
)
recall = 100 * recall_evaluator.evaluate(predictions)
print(f'Recall = {recall:.2f}%')

Recall = 96.61%
