In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
from pathlib import Path
import hvplot.pandas
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Read the data into a Pandas DataFrame
finance_loan = Path('C:/Users/maria/OneDrive/Final_project/Resources/finance_loanT.csv')
finance_loan_df = pd.read_csv(finance_loan)
finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Get a brief summary of the DataFrame
finance_loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# Combine Applicant Income and Co-applicant Income into a new variable
finance_loan_df['Total_Income'] = finance_loan_df['ApplicantIncome'] + finance_loan_df['CoapplicantIncome']

In [5]:
#Make bins for Applicant Income and create a new column
income_bins=[0,7000,1500,25000,40000,max(finance_loan_df['ApplicantIncome'])+1]
income_labels = ['Very_Low','Low','Average','High','Very_High']
#bins sorted
income_bins.sort()
# Use `pd.cut` to categorize income
finance_loan_df['Income_Category'] = pd.cut(finance_loan_df['ApplicantIncome'], bins=income_bins, labels=income_labels, include_lowest=True)
finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,Low
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low


In [6]:
finance_loan_df['Income_Category'] = finance_loan_df['Income_Category'].astype('str') 

In [7]:
#Get the shape of the dataset after the columns were added
finance_loan_df.shape

(614, 15)

In [8]:
#Get the data types of the data after the colimns were added
finance_loan_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
Total_Income         float64
Income_Category       object
dtype: object

In [9]:
# Check for any empty cells/ missing data
empty_cells = finance_loan_df.isna().any()
print("Empty Cells:")
print(empty_cells)

Empty Cells:
Loan_ID              False
Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
Total_Income         False
Income_Category      False
dtype: bool


In [10]:
# Check and Remove any Duplicates in the dataset
duplicate_rows = finance_loan_df[finance_loan_df.duplicated()]
print("Duplicate Rows:")
print(duplicate_rows)

Duplicate Rows:
Empty DataFrame
Columns: [Loan_ID, Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area, Loan_Status, Total_Income, Income_Category]
Index: []


In [11]:
# Renaming column for better understanding
renamed_df = finance_loan_df.rename(columns={"ApplicantIncome": "Applicant_Income", "CoapplicantIncome":"Co-applicant_Income", "LoanAmount":"Loan_Amount",
                                             "Loan_Amount_Term": "Terms_of_loan_in months", "Dependents": "Number_of_Dependents"})
renamed_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,Low
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low


In [12]:
# Drop rows with empty cells
new_finance_loan_df = renamed_df.dropna()
new_finance_loan_df

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,Low
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,Low
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,Average
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,Average


In [13]:
# Save new df to a file, that will be used for the tableau
new_finance_loan_df.to_csv('C:/Users/maria/OneDrive/Final_project/Resources/new_finance_loan_df.csv', index=False)

In [14]:
new_finance_loan_df['Number_of_Dependents']= new_finance_loan_df['Number_of_Dependents'].replace(['0', '1', '2', '3+'],['0','1','2','3'])
new_finance_loan_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_finance_loan_df['Number_of_Dependents']= new_finance_loan_df['Number_of_Dependents'].replace(['0', '1', '2', '3+'],['0','1','2','3'])


Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,Low
610,LP002979,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,Low
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,Average
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,Average


In [15]:
# Review for any empty cells/ missing data
empty_cells = new_finance_loan_df.isna().any()
print("Empty Cells:")
print(empty_cells)

Empty Cells:
Loan_ID                    False
Gender                     False
Married                    False
Number_of_Dependents       False
Education                  False
Self_Employed              False
Applicant_Income           False
Co-applicant_Income        False
Loan_Amount                False
Terms_of_loan_in months    False
Credit_History             False
Property_Area              False
Loan_Status                False
Total_Income               False
Income_Category            False
dtype: bool


## Transform "Education" column with get_dummies

In [16]:
# Verify the categories of the "education" column
new_finance_loan_df["Education"].value_counts()

Education
Graduate        383
Not Graduate     97
Name: count, dtype: int64

In [17]:
# Transform the education column using get_dummies
education_dummies = pd.get_dummies(new_finance_loan_df["Education"])

# Display the transformed data
education_dummies.tail()

Unnamed: 0,Graduate,Not Graduate
609,True,False
610,True,False
611,True,False
612,True,False
613,True,False


In [18]:
# Concatenate the finance_loan_df and the finance_loan_df_dummies DataFrames
new_finance_loan_df = pd.concat([new_finance_loan_df, education_dummies], axis=1)

# Drop the original education column
new_finance_loan_df = new_finance_loan_df.drop(columns=["Education"])

# Display the DataFrame
new_finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,LP001003,Male,Yes,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low,True,False
2,LP001005,Male,Yes,0,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low,True,False
3,LP001006,Male,Yes,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low,False,True
4,LP001008,Male,No,0,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low,True,False
5,LP001011,Male,Yes,2,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low,True,False


# Transform "Income_Category" column with get_dummies

In [19]:
# Verify the categories of the "education" column
new_finance_loan_df["Income_Category"].value_counts()

Income_Category
Low          388
Average       79
Very_Low       8
High           4
Very_High      1
Name: count, dtype: int64

In [20]:
# Transform the education column using get_dummies
incomecategory_dummies = pd.get_dummies(new_finance_loan_df["Income_Category"])

# Display the transformed data
incomecategory_dummies.tail()

Unnamed: 0,Average,High,Low,Very_High,Very_Low
609,False,False,True,False,False
610,False,False,True,False,False
611,True,False,False,False,False
612,True,False,False,False,False
613,False,False,True,False,False


In [21]:
# Concatenate the finance_loan_df and the finance_loan_df_dummies DataFrames
new_finance_loan_df = pd.concat([new_finance_loan_df, incomecategory_dummies], axis=1)

# Drop the original Income_Category column
new_finance_loan_df = new_finance_loan_df.drop(columns=["Income_Category"])

# Display the DataFrame
new_finance_loan_df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low
609,LP002978,Female,No,0,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,True,False,False,False,True,False,False
610,LP002979,Male,Yes,3,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,True,False,False,False,True,False,False
611,LP002983,Male,Yes,1,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,True,False,True,False,False,False,False
612,LP002984,Male,Yes,2,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,True,False,True,False,False,False,False
613,LP002990,Female,No,0,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N,4583.0,True,False,False,False,True,False,False


## Transform "Married" column with encoding function

In [22]:
def encode_Married(Married):
    """This function encodes married by setting Yes as 1 and No as 0."""
    if Married == "Yes":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Married"] = new_finance_loan_df["Married"].apply(encode_Married)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low
1,LP001003,Male,1,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,True,False,False,False,True,False,False
2,LP001005,Male,1,0,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,True,False,False,False,True,False,False
3,LP001006,Male,1,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,False,True,False,False,True,False,False
4,LP001008,Male,0,0,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,True,False,False,False,True,False,False
5,LP001011,Male,1,2,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,True,False,False,False,True,False,False


## Transform "Self_Employed" column with encoding function

In [23]:
def encode_Self_Employed(Self_Employed):
    """This function encodes the self employee status by setting Yes as 1 and No as 0."""
    if Self_Employed == "Yes":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Self_Employed"] = new_finance_loan_df["Self_Employed"].apply(encode_Self_Employed)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low
1,LP001003,Male,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,True,False,False,False,True,False,False
2,LP001005,Male,1,0,1,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,True,False,False,False,True,False,False
3,LP001006,Male,1,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,False,True,False,False,True,False,False
4,LP001008,Male,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,True,False,False,False,True,False,False
5,LP001011,Male,1,2,1,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,True,False,False,False,True,False,False


## Transform "Gender" column with encoding function

In [24]:
def encode_Gender(Gender):
    """This function encodes the gender by setting Male as 1, Female as 0 and ."""
    if Gender == "Male":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Gender"] = new_finance_loan_df["Gender"].apply(encode_Gender)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low
1,LP001003,1,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,True,False,False,False,True,False,False
2,LP001005,1,1,0,1,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,True,False,False,False,True,False,False
3,LP001006,1,1,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,False,True,False,False,True,False,False
4,LP001008,1,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,True,False,False,False,True,False,False
5,LP001011,1,1,2,1,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,True,False,False,False,True,False,False


## Transform "Property_Area" column with encoding function

In [25]:
# Verify the categories of the "education" column
new_finance_loan_df["Property_Area"].value_counts()

Property_Area
Semiurban    191
Urban        150
Rural        139
Name: count, dtype: int64

In [26]:
# Transform the education column using get_dummies
propertyarea_dummies = pd.get_dummies(new_finance_loan_df["Property_Area"])

# Display the transformed data
propertyarea_dummies.tail()

Unnamed: 0,Rural,Semiurban,Urban
609,True,False,False
610,True,False,False
611,False,False,True
612,False,False,True
613,False,True,False


In [27]:
# Concatenate the finance_loan_df and the finance_loan_df_dummies DataFrames
new_finance_loan_df = pd.concat([new_finance_loan_df, propertyarea_dummies], axis=1)

# Drop the original Property_Area column
new_finance_loan_df = new_finance_loan_df.drop(columns=["Property_Area"])

# Display the DataFrame
new_finance_loan_df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,...,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
609,LP002978,0,0,0,0,2900,0.0,71.0,360.0,1.0,...,True,False,False,False,True,False,False,True,False,False
610,LP002979,1,1,3,0,4106,0.0,40.0,180.0,1.0,...,True,False,False,False,True,False,False,True,False,False
611,LP002983,1,1,1,0,8072,240.0,253.0,360.0,1.0,...,True,False,True,False,False,False,False,False,False,True
612,LP002984,1,1,2,0,7583,0.0,187.0,360.0,1.0,...,True,False,True,False,False,False,False,False,False,True
613,LP002990,0,0,0,1,4583,0.0,133.0,360.0,0.0,...,True,False,False,False,True,False,False,False,True,False


## Transform "Loan_Status" column with encoding function

In [28]:
def encode_Loan_Status(Loan_Status):
    """This function encodes the loan status by setting Y as 1 and N as 0."""
    if Loan_Status == "Y":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Loan_Status"] = new_finance_loan_df["Loan_Status"].apply(encode_Loan_Status)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,...,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
1,LP001003,1,1,1,0,4583,1508.0,128.0,360.0,1.0,...,True,False,False,False,True,False,False,True,False,False
2,LP001005,1,1,0,1,3000,0.0,66.0,360.0,1.0,...,True,False,False,False,True,False,False,False,False,True
3,LP001006,1,1,0,0,2583,2358.0,120.0,360.0,1.0,...,False,True,False,False,True,False,False,False,False,True
4,LP001008,1,0,0,0,6000,0.0,141.0,360.0,1.0,...,True,False,False,False,True,False,False,False,False,True
5,LP001011,1,1,2,1,5417,4196.0,267.0,360.0,1.0,...,True,False,False,False,True,False,False,False,False,True


In [29]:
new_finance_loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  480 non-null    object 
 1   Gender                   480 non-null    int64  
 2   Married                  480 non-null    int64  
 3   Number_of_Dependents     480 non-null    object 
 4   Self_Employed            480 non-null    int64  
 5   Applicant_Income         480 non-null    int64  
 6   Co-applicant_Income      480 non-null    float64
 7   Loan_Amount              480 non-null    float64
 8   Terms_of_loan_in months  480 non-null    float64
 9   Credit_History           480 non-null    float64
 10  Loan_Status              480 non-null    int64  
 11  Total_Income             480 non-null    float64
 12  Graduate                 480 non-null    bool   
 13  Not Graduate             480 non-null    bool   
 14  Average                  480 no

In [30]:
# Logistic Regression is a classification algorithm. It is used to predict a binary outcome (1 / 0, Yes / No, True / False) given a set of independent variables
# drop Loan_ID
new_finance_loan_df = new_finance_loan_df.drop('Loan_ID', axis=1)
new_finance_loan_df

Unnamed: 0,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Loan_Status,...,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,...,True,False,False,False,True,False,False,True,False,False
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True
3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,1,...,False,True,False,False,True,False,False,False,False,True
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True
5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,2900,0.0,71.0,360.0,1.0,1,...,True,False,False,False,True,False,False,True,False,False
610,1,1,3,0,4106,0.0,40.0,180.0,1.0,1,...,True,False,False,False,True,False,False,True,False,False
611,1,1,1,0,8072,240.0,253.0,360.0,1.0,1,...,True,False,True,False,False,False,False,False,False,True
612,1,1,2,0,7583,0.0,187.0,360.0,1.0,1,...,True,False,True,False,False,False,False,False,False,True


## Split the Data into Training and Testing Sets

In [31]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = new_finance_loan_df["Loan_Status"]
# Separate the X variable, the features
X = new_finance_loan_df.drop(columns=["Loan_Status"])

In [32]:
# Review the y variable Series
display(y.head())
display(y.tail())

1    0
2    1
3    1
4    1
5    1
Name: Loan_Status, dtype: int64

609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, dtype: int64

In [33]:
# Review the X variable DataFrame
display(X.head())
display(X.tail())

Unnamed: 0,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,6091.0,True,False,False,False,True,False,False,True,False,False
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,3000.0,True,False,False,False,True,False,False,False,False,True
3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,4941.0,False,True,False,False,True,False,False,False,False,True
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,6000.0,True,False,False,False,True,False,False,False,False,True
5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,9613.0,True,False,False,False,True,False,False,False,False,True


Unnamed: 0,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
609,0,0,0,0,2900,0.0,71.0,360.0,1.0,2900.0,True,False,False,False,True,False,False,True,False,False
610,1,1,3,0,4106,0.0,40.0,180.0,1.0,4106.0,True,False,False,False,True,False,False,True,False,False
611,1,1,1,0,8072,240.0,253.0,360.0,1.0,8312.0,True,False,True,False,False,False,False,False,False,True
612,1,1,2,0,7583,0.0,187.0,360.0,1.0,7583.0,True,False,True,False,False,False,False,False,False,True
613,0,0,0,1,4583,0.0,133.0,360.0,0.0,4583.0,True,False,False,False,True,False,False,False,True,False


##  Split the data into training and testing datasets by using `train_test_split`.

In [71]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split and assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

## Create a Logistic Regression Model

In [72]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
# Fit the model using training data
lr_model.fit(X_train, y_train)

In [73]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Evaluation

In [74]:
# Make a prediction using the testing data
lr_predictions = lr_model.predict(X_test)
pd.DataFrame({'Predictions': lr_predictions, 'Actual': y_test})



Unnamed: 0,Predictions,Actual
286,1,0
538,1,0
167,1,1
574,1,0
106,1,1
...,...,...
534,0,1
562,1,1
123,1,1
258,1,0


In [75]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, lr_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)


In [76]:
# Print the classification report for the model
print(classification_report(y_test, lr_predictions))

              precision    recall  f1-score   support

           0       0.60      0.65      0.62        37
           1       0.84      0.81      0.82        83

    accuracy                           0.76       120
   macro avg       0.72      0.73      0.72       120
weighted avg       0.76      0.76      0.76       120



In [77]:
from sklearn.metrics import balanced_accuracy_score
print(f"The balanced accuracy score of the model is: {balanced_accuracy_score(y_test, lr_predictions)}")

The balanced accuracy score of the model is: 0.7279387821556497


# Decision Trees

In [58]:
#Imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [59]:
#loading the data

new_finance_loan_df.head()

Unnamed: 0,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Loan_Status,...,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,...,True,False,False,False,True,False,False,True,False,False
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True
3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,1,...,False,True,False,False,True,False,False,False,False,True
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True
5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,1,...,True,False,False,False,True,False,False,False,False,True


In [60]:
# Define features set

X = new_finance_loan_df.copy()
X.drop('Loan_Status', axis=1, inplace=True)
X.head()

Unnamed: 0,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Total_Income,Graduate,Not Graduate,Average,High,Low,Very_High,Very_Low,Rural,Semiurban,Urban
1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,6091.0,True,False,False,False,True,False,False,True,False,False
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,3000.0,True,False,False,False,True,False,False,False,False,True
3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,4941.0,False,True,False,False,True,False,False,False,False,True
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,6000.0,True,False,False,False,True,False,False,False,False,True
5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,9613.0,True,False,False,False,True,False,False,False,False,True


In [61]:
# Define target vector
y = new_finance_loan_df["Loan_Status"]
y[:5]



1    0
2    1
3    1
4    1
5    1
Name: Loan_Status, dtype: int64

In [62]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)



In [63]:
# Creating StandardScaler instance
scaler = StandardScaler()


In [64]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [65]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Fitting the Decision Tree Model

In [66]:
# Create the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [67]:
# Fit the model
model = model.fit(X_train_scaled, y_train)

# Making Predictions Using the Tree Model

In [68]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

# Model Evaluation

In [69]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)



In [70]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,20,15
Actual 1,19,66


Accuracy Score : 0.7166666666666667
Classification Report
              precision    recall  f1-score   support

           0       0.51      0.57      0.54        35
           1       0.81      0.78      0.80        85

    accuracy                           0.72       120
   macro avg       0.66      0.67      0.67       120
weighted avg       0.73      0.72      0.72       120

