In [1]:
import requests
import pandas as pd
from pyspark.sql import SparkSession
import json

import os
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jdk-18.0.2'
os.environ['SPARK_HOME'] = 'C:\Program Files\spark-3.3.0-bin-hadoop3'
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [2]:
#Create a Python program to GET (consume) data from the above API endpoint for the loan application dataset.
got = requests.get('https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json')

#Create a spark session

spark = SparkSession.builder.appName('Loan App').getOrCreate()

json_data = json.loads(got.content)
json_data[0]
df=pd.read_json(got.content)
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")

df.printSchema()



root
 |-- Application_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- Credit_History: long (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Application_Status: string (nullable = true)



In [3]:
#Find the status code of the above API endpoint.
print(got.status_code)

200


In [4]:
df.show()

+--------------+------+-------+----------+------------+-------------+--------------+-------------+------+------------------+
|Application_ID|Gender|Married|Dependents|   Education|Self_Employed|Credit_History|Property_Area|Income|Application_Status|
+--------------+------+-------+----------+------------+-------------+--------------+-------------+------+------------------+
|      LP001002|  Male|     No|         0|    Graduate|           No|             1|        Urban|medium|                 Y|
|      LP001003|  Male|    Yes|         1|    Graduate|           No|             1|        Rural|medium|                 N|
|      LP001005|  Male|    Yes|         0|    Graduate|          Yes|             1|        Urban|   low|                 Y|
|      LP001006|  Male|    Yes|         0|Not Graduate|           No|             1|        Urban|   low|                 Y|
|      LP001008|  Male|     No|         0|    Graduate|           No|             1|        Urban|medium|                 Y|


In [5]:
df.write.format("jdbc") \
  .mode("overwrite") \
  .option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
  .option("dbtable", "CDW_SAPP_loan_application") \
  .option("driver", "com.mysql.jdbc.Driver") \
  .option("user", "root") \
  .option("password", "db") \
  .save()

In [6]:

df1 = df.toPandas()
import plotly.express as px



In [7]:
#Find and plot the percentage of applications approved for self-employed applicants.
approved = df1[df1['Application_Status']=='Y']
percent_accepted = len(approved) / len(df1)
percent_accepted

fig = px.pie(values=[percent_accepted, 1.0], names=['Approved', 'Total'])
fig.show()


#df2 = px.gapminder().query("Application_Status" == "Y")
#df1.loc[df['pop'] < 2.e6, 'country'] = 'Other countries' # Represent only large countries

#df.filter(df.Application_Status == 'Y').show(truncate=False)

In [8]:
#Find the percentage of rejection for married male applicants.
import plotly.graph_objects as go

import numpy as np
#male = np.logical_and(df1['Gender']=='Male', df1['Married'=='Y'], df1['Application_Status']=='Y' )

male = df1.groupby(['Gender', 'Married'])['Application_Status'].apply(lambda x: (x=='Y').sum()).reset_index(name='count')
#& df1[df1['Application_Status']=='Y']

#d = pd.DataFrame(np.extract(male==True, male), np.extract(male==False, male) )


fig = px.bar(
    x=male['Gender']=='Male', 
    y=male['count'], 
    color=male['Gender'], 
    title="Application approval by Gender and Marital Status",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    labels={
        'x': "Gender",
        'y': "Count"
    },
    data_frame=male,
    text=round(male['count']/male['count'].sum()*100, 2),
    barmode='overlay'

)

"""trace2= px.bar(
    x=male['Gender']=='Female', 
    y=male['count'], 
    color=male['Married'], 
    title="Application approval by Gender and Marital Status",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    labels={
        'x': "Married",
        'y': "Count"
    }
    ,barmode='overlay'

)
fig.add_trace(trace2.data[0])"""
fig.show()

male.head()

Unnamed: 0,Gender,Married,count
0,Female,No,40
1,Female,Yes,17
2,Male,No,71
3,Male,Yes,219


In [9]:
#Reading data from database because we need the credit table

url = "jdbc:mysql://localhost:3306/creditcard_capstone"
driver = "com.mysql.jdbc.Driver"
user = "root"
password = "db"


credit_df =  spark.read\
    .format("jdbc")\
    .option("driver", driver)\
    .option("url", url)\
    .option("user", user)\
    .option("password", password)\
    .option("dbtable", "CDW_SAPP_CREDIT")\
    .load()

credit_df = credit_df.toPandas()



In [10]:
#Find and plot the top three months with the largest transaction data.
import calendar
dates = credit_df.loc[:,['TIMEID', 'TRANSACTION_VALUE']]
dates['TIMEID'] = pd.to_datetime(dates['TIMEID'], format='%Y%m%d').dt.month
dates


tgroup = dates.groupby('TIMEID')['TRANSACTION_VALUE'].sum().reset_index()
tgroup

fig = px.line(tgroup, x='TIMEID', y='TRANSACTION_VALUE', markers=True, line_shape='spline', text='TIMEID')
fig.update_traces(textposition="bottom right")
fig.show()

In [11]:
#Find and plot which branch processed the highest total dollar value of healthcare transactions.

a = np.where(credit_df['TRANSACTION_TYPE']=='Healthcare')
health = credit_df.iloc[a]
grouped = health.groupby('BRANCH_CODE')['TRANSACTION_VALUE'].sum().reset_index()

grouped = grouped.sort_values(by='TRANSACTION_VALUE', ascending=False)
grouped
fig = px.scatter(grouped, x='BRANCH_CODE', y='TRANSACTION_VALUE', color='TRANSACTION_VALUE', size='TRANSACTION_VALUE')
fig.show()

<h2> Image of Tableau Dashboard for this project, Workbook in Project Directory </h2>

<img src='ccc_Dashboard.png'/>
