## Housing Loan Approval 

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Import and read CSV
loan_df = pd.read_csv("Resources/loan-train.csv")
#loan_df.head()
loan_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
# Determine the number of unique values in each column
loan_df.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [4]:
# Drop Column 
new_df = loan_df.drop(["Credit_History"], axis = 1)

In [5]:
#Check for Null Values
new_df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
#New df
new_df.dropna()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,Urban,Y


## Preprocessing Data 

In [7]:
# Check the DataFrame data types
new_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Property_Area         object
Loan_Status           object
dtype: object

In [8]:
# Scaling the numeric columns
income_scaled = StandardScaler().fit_transform(new_df[["ApplicantIncome", "CoapplicantIncome", "LoanAmount","Loan_Amount_Term"]])

# Creating a DataFrame with with the scaled data
scaled_income_df = pd.DataFrame(income_scaled, columns=["ApplicantIncome", "CoapplicantIncome", "LoanAmount","Loan_Amount_Term"])

# Display sample data
scaled_income_df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,0.072991,-0.554487,,0.276642
1,-0.134412,-0.038732,-0.215309,0.276642
2,-0.393747,-0.554487,-0.940328,0.276642
3,-0.462062,0.25198,-0.30886,0.276642
4,0.097728,-0.554487,-0.063289,0.276642


In [9]:
# Transform the Loan_Status column using get_dummies()
LS_dummies = pd.get_dummies(new_df["Loan_Status"])

# Display sample data
LS_dummies.head()

Unnamed: 0,N,Y
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1


In [10]:
# Concatenate the scaled_income_df and the LS_dummies DataFrames
scaled_income_df = pd.concat([scaled_income_df, LS_dummies], axis=1)

# Display sample data
scaled_income_df.head(15)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,N,Y
0,0.072991,-0.554487,,0.276642,0,1
1,-0.134412,-0.038732,-0.215309,0.276642,1,0
2,-0.393747,-0.554487,-0.940328,0.276642,0,1
3,-0.462062,0.25198,-0.30886,0.276642,0,1
4,0.097728,-0.554487,-0.063289,0.276642,0,1
5,0.002218,0.8806,1.410137,0.276642,0,1
6,-0.503019,-0.035995,-0.601206,0.276642,0,1
7,-0.38785,0.301914,0.135506,0.276642,1,0
8,-0.228939,-0.032575,0.252445,0.276642,0,1
9,1.218457,3.196713,2.369033,0.276642,1,0


In [11]:
# Transform the Gender column using get_dummies()
gender_dummies = pd.get_dummies(new_df["Gender"], prefix='Gender')

#gender_dummies.rename(columns={'Gender_F': 'F','Gender_M': 'M'}, inplace=True)

print(gender_dummies)

     Gender_Female  Gender_Male
0                0            1
1                0            1
2                0            1
3                0            1
4                0            1
..             ...          ...
609              1            0
610              0            1
611              0            1
612              0            1
613              1            0

[614 rows x 2 columns]


In [12]:
# Concatenate the scaled_income_df and the gender_dummies DataFrames
scaled_income_df = pd.concat([scaled_income_df, gender_dummies], axis=1)

# Display sample data
scaled_income_df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,N,Y,Gender_Female,Gender_Male
0,0.072991,-0.554487,,0.276642,0,1,0,1
1,-0.134412,-0.038732,-0.215309,0.276642,1,0,0,1
2,-0.393747,-0.554487,-0.940328,0.276642,0,1,0,1
3,-0.462062,0.25198,-0.30886,0.276642,0,1,0,1
4,0.097728,-0.554487,-0.063289,0.276642,0,1,0,1


In [15]:
scaled_income_df.dropna()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,N,Y,Gender_Female,Gender_Male
1,-0.134412,-0.038732,-0.215309,0.276642,1,0,0,1
2,-0.393747,-0.554487,-0.940328,0.276642,0,1,0,1
3,-0.462062,0.251980,-0.308860,0.276642,0,1,0,1
4,0.097728,-0.554487,-0.063289,0.276642,0,1,0,1
5,0.002218,0.880600,1.410137,0.276642,0,1,0,1
...,...,...,...,...,...,...,...,...
609,-0.410130,-0.554487,-0.881859,0.276642,0,1,1,0
610,-0.212557,-0.554487,-1.244368,-2.489775,0,1,0,1
611,0.437174,-0.472404,1.246423,0.276642,0,1,0,1
612,0.357064,-0.554487,0.474628,0.276642,0,1,0,1


In [18]:
scaled_income_df.isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
N                     0
Y                     0
Gender_Female         0
Gender_Male           0
dtype: int64

In [19]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

In [20]:
# Fit the model for the df_stocks_scaled DataFrame
model.fit(scaled_income_df)

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values