<a href="https://colab.research.google.com/github/harsh359/harsh359/blob/DSA_ipynbs/loan_approval_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd


# Load the loan applications from the JSON file stored under /content.
df = pd.read_json(path_or_buf='/content/loan_data.json')
# Print the first five rows to get a first look at the columns and their values.
print(df.head(n=5))

  Application_ID Gender Married Dependents     Education Self_Employed  \
0       LP001002   Male      No          0      Graduate            No   
1       LP001003   Male     Yes          1      Graduate            No   
2       LP001005   Male     Yes          0      Graduate           Yes   
3       LP001006   Male     Yes          0  Not Graduate            No   
4       LP001008   Male      No          0      Graduate            No   

   Credit_History Property_Area  Income Application_Status  
0               1         Urban  medium                  Y  
1               1         Rural  medium                  N  
2               1         Urban     low                  Y  
3               1         Urban     low                  Y  
4               1         Urban  medium                  Y  


## 1. Find % of total applicants for each unique value of dependents.

In [None]:
# Relative frequency of each distinct 'Dependents' value, scaled from a fraction to a percentage.
# value_counts() skips missing values, so the shares are relative to the rows that have a value.
dependents_share = df['Dependents'].value_counts(normalize=True)
dependent_counts = dependents_share.mul(100)

print("Percentage of applicants for each unique value of dependents:")
print(dependent_counts.to_string(index=True))


Percentage of applicants for each unique value of dependents:
0.0    62.955032
2.0    18.843683
1.0    18.201285


In [None]:
# Number of applications (non-null Application_ID values) within each 'Dependents' group.
applications_by_dependents = df.groupby(by='Dependents')
grouped_data = applications_by_dependents['Application_ID'].count()

print("count of applicants for each unique value of dependents:")
print(grouped_data)

count of applicants for each unique value of dependents:
Dependents
0     294
1      85
2      88
3+     44
Name: Application_ID, dtype: int64


## 2. Find the % of applications approved for self-employed applicants.

In [None]:
# Status column restricted to the self-employed applicants.
self_employed_status = df.loc[df['Self_Employed'] == "Yes", 'Application_Status']
# The mean of a boolean mask is the fraction of True values, i.e. the approved share.
self_employed_approval_rate = self_employed_status.eq("Y").mean() * 100

print(f"Approval rate for self-employed applicants: {self_employed_approval_rate}")

Approval rate for self-employed applicants: 65.71428571428571


## 3. What is the % of rejections for married male applicants?

In [None]:
# Build the two conditions separately, then keep the rows that satisfy both.
is_married = df['Married'].eq("Yes")
is_male = df['Gender'].eq("Male")
married_male_data = df.loc[is_married & is_male]

# Share of those applicants whose status is "N" (rejected), as a percentage.
married_male_rejection_rate = married_male_data['Application_Status'].eq("N").mean() * 100

print(f"Rejection rate for married male applicants: {married_male_rejection_rate}")

Rejection rate for married male applicants: 28.431372549019606


## 4. Which property area has the maximum approval ratio?

In [None]:
# Encode the status as 1 (approved) / 0 (anything else). Accepting both the raw "Y"
# label and an already-encoded 1 keeps this cell idempotent: re-running it no longer
# turns every status into 0.
df['Application_Status'] = df['Application_Status'].isin(["Y", 1]).astype(int)

# Approval percentage per property area, as a frame with one row per area.
property_area_approval_ratios = (
    df.groupby('Property_Area')['Application_Status'].mean().mul(100).reset_index()
)

# idxmax() returns an index *label*, so look the row up with .loc rather than the
# positional .iloc (the two only coincide because of the fresh RangeIndex).
best_row_label = property_area_approval_ratios['Application_Status'].idxmax()
max_approval_property_area = property_area_approval_ratios.loc[best_row_label, 'Property_Area']
print(f'Property area with the maximum approval ratio: {max_approval_property_area}')

Property area with the maximum approval ratio: Semiurban


In [None]:
# Display the approval percentage of every property area for comparison.
property_area_approval_ratios

Unnamed: 0,Property_Area,Application_Status
0,Rural,60.402685
1,Semiurban,77.664975
2,Urban,63.030303


## 5. Find average number of dependents per income group.

In [None]:
# Work on a numeric copy instead of overwriting df['Dependents'], so cells that rely on
# the raw labels (including "3+") give the same result when they are re-run.
# "3+" is counted as 3 (a lower bound) rather than being coerced to NaN and silently
# dropped from the average; values that are genuinely non-numeric still become NaN.
dependents_numeric = pd.to_numeric(
    df['Dependents'].astype(str).str.rstrip('+'), errors='coerce'
)

# A mean is already "dependents per applicant"; it is not a percentage, so it must
# not be multiplied by 100.
average_dependents_per_income = dependents_numeric.groupby(df['Income']).mean()

print(f"Average number of dependents per income group: {average_dependents_per_income}")

Average number of dependents per income group: Income
high      62.500000
low       48.437500
medium    65.497076
Name: Dependents, dtype: float64


## 6. Find approval ratio for various combinations of Property_Area and marital status.

In [None]:
# Approval flag that works whether Application_Status still holds the raw "Y"/"N"
# labels or has already been encoded as 1/0 by an earlier cell.
approved_flag = df['Application_Status'].isin(["Y", 1])

# One row per (Property_Area, Married) combination, computed from the data instead of
# hand-copied counts:
#   '# of Applicants' - number of applications in the combination
#   'Approval_Ratio'  - approved applications / applications in that same combination
# All property areas are kept (including Semiurban); rows with a missing Married value
# are excluded by groupby, as before.
df_final = (
    df.assign(Approved=approved_flag)
    .groupby(['Property_Area', 'Married'])
    .agg(**{
        '# of Applicants': ('Application_ID', 'count'),
        'Approval_Ratio': ('Approved', 'mean'),
    })
)

# Express the ratio as a percentage, consistent with the other sections.
df_final['Approval_Ratio'] = df_final['Approval_Ratio'] * 100
df_final

Unnamed: 0_level_0,Unnamed: 1_level_0,# of Applicants,Approval_Ratio
Property_Area,Married,Unnamed: 2_level_1,Unnamed: 3_level_1
Rural,No,56,22.818792
Rural,Yes,93,37.583893
Urban,No,56,18.181818
Urban,Yes,109,44.848485


## 7. Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the train/test split, the forest, and therefore the reported accuracy
# are reproducible across "Restart & Run All".
RANDOM_STATE = 42

# Prepare features and target variable; get_dummies one-hot encodes the categorical columns.
features = pd.get_dummies(df[['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Income']])
target = df['Application_Status']

# Split data into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=RANDOM_STATE
)

# Random Forest Classifier model; shallow trees (max_depth=4) limit overfitting
model = RandomForestClassifier(max_depth=4, random_state=RANDOM_STATE)

# Training model
model.fit(X_train, y_train)

# Prediction
predictions = model.predict(X_test)

# Accuracy score: share of test rows predicted correctly, as a percentage.
# Reuses `predictions` instead of predicting a second time via model.score().
accuracy = (y_test == predictions).mean() * 100

# Print the accuracy score
print(f"Accuracy of the model: {accuracy}")


Accuracy of the model: 81.55339805825243
