In [137]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
from pathlib import Path
import hvplot.pandas
from sklearn.metrics import confusion_matrix, classification_report


In [138]:
# Read the data into a Pandas DataFrame
finance_loan = Path(r'C:\Users\laris\Documents\Project4-Group1\Project4-Group1\Resources\finance-loan.csv')
finance_loan_df = pd.read_csv(finance_loan)
finance_loan_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [139]:
# Get a brief summary of the DataFrame
finance_loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [140]:
# Combine Applicant Income and Co-applicant Income into a new variable
finance_loan_df['Total_Income'] = finance_loan_df['ApplicantIncome'] + finance_loan_df['CoapplicantIncome']

In [174]:
income_bins=[0,7000,1500,25000,40000,max(finance_loan_df['ApplicantIncome'])+1]
income_labels = ['Very_Low','Low','Average','High','Very_High']
#bins sorted
income_bins.sort()
# Use `pd.cut` to categorize income
finance_loan_df['Income_Category'] = pd.cut(finance_loan_df['ApplicantIncome'], bins=income_bins, labels=income_labels, include_lowest=True)
finance_loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,Low
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low


In [175]:
# Check for any empty cells/ missing data
empty_cells = finance_loan_df.isna().any()
print("Empty Cells:")
print(empty_cells)

Empty Cells:
Loan_ID              False
Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
Total_Income         False
Income_Category      False
dtype: bool


In [176]:
# Check and Remove any Duplicates in the dataset
duplicate_rows = finance_loan_df[finance_loan_df.duplicated()]
print("Duplicate Rows:")
print(duplicate_rows)

Duplicate Rows:
Empty DataFrame
Columns: [Loan_ID, Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area, Loan_Status, Total_Income, Income_Category]
Index: []


In [177]:
# Renaming column for better understanding
renamed_df = finance_loan_df.rename(columns={"ApplicantIncome": "Applicant_Income", "CoapplicantIncome":"Co-applicant_Income", "LoanAmount":"Loan_Amount",
                                             "Loan_Amount_Term": "Terms_of_loan_in months", "Dependents": "Number_of_Dependents"})
renamed_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,Low
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low


In [178]:
# Drop rows with empty cells
new_df = renamed_df.dropna()
new_df

Unnamed: 0,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,Low
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,Low
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,Average
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,Average


In [179]:
# Save new df to a file, that will be used for the tableau
new_df.to_csv(r"C:\Users\laris\Documents\Project4-Group1\Project4-Group1\Resources\new_df.csv", index=False)

In [180]:
# Adding a new column 'ID' with unique identifiers starting from 1
new_df['ID'] = range(1, len(new_df) + 1)

# Reordering teh DataFrame columns
columns = ['ID'] + [col for col in new_df.columns if col != 'ID']
new_df = new_df[columns]

# Print
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['ID'] = range(1, len(new_df) + 1)


Unnamed: 0,ID,Loan_ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low


In [181]:
# Drop 'Loan_ID' column
new_finance_loan_df = new_df.drop('Loan_ID', axis=1)
new_finance_loan_df

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,476,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,Low
610,477,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,Low
611,478,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,Average
612,479,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,Average


In [182]:
new_finance_loan_df['Number_of_Dependents']= new_finance_loan_df['Number_of_Dependents'].replace(['0', '1', '2', '3+'],['0','1','2','3'])
new_finance_loan_df

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Education,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category
1,1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low
2,2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low
3,3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low
4,4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low
5,5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,476,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,2900.0,Low
610,477,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,4106.0,Low
611,478,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,8312.0,Average
612,479,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,7583.0,Average


In [183]:
# Review for any empty cells/ missing data
empty_cells = new_finance_loan_df.isna().any()
print("Empty Cells:")
print(empty_cells)

Empty Cells:
ID                         False
Gender                     False
Married                    False
Number_of_Dependents       False
Education                  False
Self_Employed              False
Applicant_Income           False
Co-applicant_Income        False
Loan_Amount                False
Terms_of_loan_in months    False
Credit_History             False
Property_Area              False
Loan_Status                False
Total_Income               False
Income_Category            False
dtype: bool


## Transform "Education" column with get_dummies

In [184]:
# Verify the categories of the "education" column
new_finance_loan_df["Education"].value_counts()

Education
Graduate        383
Not Graduate     97
Name: count, dtype: int64

In [185]:
# Transform the education column using get_dummies
education_dummies = pd.get_dummies(new_finance_loan_df["Education"])

# Display the transformed data
education_dummies.tail()

Unnamed: 0,Graduate,Not Graduate
609,True,False
610,True,False
611,True,False
612,True,False
613,True,False


In [186]:
# Concatenate the finance_loan_df and the finance_loan_df_dummies DataFrames
new_finance_loan_df = pd.concat([new_finance_loan_df, education_dummies], axis=1)

# Drop the original education column
new_finance_loan_df = new_finance_loan_df.drop(columns=["Education"])

# Display the DataFrame
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,Male,Yes,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,Low,True,False
2,2,Male,Yes,0,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,Low,True,False
3,3,Male,Yes,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,Low,False,True
4,4,Male,No,0,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,Low,True,False
5,5,Male,Yes,2,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,Low,True,False


## Transform "Income_Category" column with encoding function

In [188]:
def encode_Income_Category(Income_Category):
    """This function encodes income category by setting Very_Low as 0, Low as 1, Average as 2, High as 3 and Very_high as 4."""
    if Income_Category == "Very_high":
        return 4
    else:
        if Income_Category == "High":
            return 3
        else:
            if Income_Category == "Average":
                return 2
            else:
                if Income_Category == "Low":
                    return 1
                else:
                    return 0

# Encode_Married function on the Married column
new_finance_loan_df["Income_Category"] = new_finance_loan_df["Income_Category"].apply(encode_Income_Category)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,Male,Yes,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0,True,False
2,2,Male,Yes,0,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0,True,False
3,3,Male,Yes,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0,False,True
4,4,Male,No,0,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0,True,False
5,5,Male,Yes,2,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,0,True,False


## Transform "Married" column with encoding function

In [189]:
def encode_Married(Married):
    """This function encodes married by setting Yes as 1 and No as 0."""
    if Married == "Yes":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Married"] = new_finance_loan_df["Married"].apply(encode_Married)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,Male,1,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0,True,False
2,2,Male,1,0,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0,True,False
3,3,Male,1,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0,False,True
4,4,Male,0,0,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0,True,False
5,5,Male,1,2,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,0,True,False


## Transform "Self_Employed" column with encoding function

In [190]:
def encode_Self_Employed(Self_Employed):
    """This function encodes the self employee status by setting Yes as 1 and No as 0."""
    if Self_Employed == "Yes":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Self_Employed"] = new_finance_loan_df["Self_Employed"].apply(encode_Self_Employed)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,Male,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0,True,False
2,2,Male,1,0,1,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0,True,False
3,3,Male,1,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0,False,True
4,4,Male,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0,True,False
5,5,Male,1,2,1,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,0,True,False


## Transform "Gender" column with encoding function

In [191]:
def encode_Gender(Gender):
    """This function encodes the gender by setting Male as 1, Female as 0 and ."""
    if Gender == "Male":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Gender"] = new_finance_loan_df["Gender"].apply(encode_Gender)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,0,True,False
2,2,1,1,0,1,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,0,True,False
3,3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,0,False,True
4,4,1,0,0,0,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,0,True,False
5,5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,Urban,Y,9613.0,0,True,False


## Transform "Property_Area" column with encoding function

In [192]:
def encode_Property_Area(Property_Area):
    """This function encodes the property area by setting Rural as 2, Semiurban as 1 Urban as 0."""
    if Property_Area == "Rural":
        return 2
    else:
        if Property_Area == "Semiurban":
            return 1
        else:
            return 0

# Encode_Married function on the Married column
new_finance_loan_df["Property_Area"] = new_finance_loan_df["Property_Area"].apply(encode_Property_Area)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,2,N,6091.0,0,True,False
2,2,1,1,0,1,3000,0.0,66.0,360.0,1.0,0,Y,3000.0,0,True,False
3,3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,0,Y,4941.0,0,False,True
4,4,1,0,0,0,6000,0.0,141.0,360.0,1.0,0,Y,6000.0,0,True,False
5,5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,0,Y,9613.0,0,True,False


## Transform "Loan_Status" column with encoding function

In [193]:
def encode_Loan_Status(Loan_Status):
    """This function encodes the loan status by setting Y as 1 and N as 0."""
    if Loan_Status == "Y":
        return 1
    else:
        return 0

# Encode_Married function on the Married column
new_finance_loan_df["Loan_Status"] = new_finance_loan_df["Loan_Status"].apply(encode_Loan_Status)

# Review the DataFrame 
new_finance_loan_df.head()

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Loan_Status,Total_Income,Income_Category,Graduate,Not Graduate
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,2,0,6091.0,0,True,False
2,2,1,1,0,1,3000,0.0,66.0,360.0,1.0,0,1,3000.0,0,True,False
3,3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,0,1,4941.0,0,False,True
4,4,1,0,0,0,6000,0.0,141.0,360.0,1.0,0,1,6000.0,0,True,False
5,5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,0,1,9613.0,0,True,False


In [194]:
new_finance_loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       480 non-null    int64  
 1   Gender                   480 non-null    int64  
 2   Married                  480 non-null    int64  
 3   Number_of_Dependents     480 non-null    object 
 4   Self_Employed            480 non-null    int64  
 5   Applicant_Income         480 non-null    int64  
 6   Co-applicant_Income      480 non-null    float64
 7   Loan_Amount              480 non-null    float64
 8   Terms_of_loan_in months  480 non-null    float64
 9   Credit_History           480 non-null    float64
 10  Property_Area            480 non-null    int64  
 11  Loan_Status              480 non-null    int64  
 12  Total_Income             480 non-null    float64
 13  Income_Category          480 non-null    int64  
 14  Graduate                 480 no

## Split the Data into Training and Testing Sets

In [195]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = new_finance_loan_df["Loan_Status"]
# Separate the X variable, the features
X = new_finance_loan_df.drop(columns=["Loan_Status"])

In [196]:
# Review the y variable Series
display(y.head())
display(y.tail())

1    0
2    1
3    1
4    1
5    1
Name: Loan_Status, dtype: int64

609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, dtype: int64

In [197]:
# Review the X variable DataFrame
display(X.head())
display(X.tail())

Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Total_Income,Income_Category,Graduate,Not Graduate
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,2,6091.0,0,True,False
2,2,1,1,0,1,3000,0.0,66.0,360.0,1.0,0,3000.0,0,True,False
3,3,1,1,0,0,2583,2358.0,120.0,360.0,1.0,0,4941.0,0,False,True
4,4,1,0,0,0,6000,0.0,141.0,360.0,1.0,0,6000.0,0,True,False
5,5,1,1,2,1,5417,4196.0,267.0,360.0,1.0,0,9613.0,0,True,False


Unnamed: 0,ID,Gender,Married,Number_of_Dependents,Self_Employed,Applicant_Income,Co-applicant_Income,Loan_Amount,Terms_of_loan_in months,Credit_History,Property_Area,Total_Income,Income_Category,Graduate,Not Graduate
609,476,0,0,0,0,2900,0.0,71.0,360.0,1.0,2,2900.0,0,True,False
610,477,1,1,3,0,4106,0.0,40.0,180.0,1.0,2,4106.0,0,True,False
611,478,1,1,1,0,8072,240.0,253.0,360.0,1.0,0,8312.0,0,True,False
612,479,1,1,2,0,7583,0.0,187.0,360.0,1.0,0,7583.0,0,True,False
613,480,0,0,0,1,4583,0.0,133.0,360.0,0.0,1,4583.0,0,True,False


### Split the data into training and testing datasets by using `train_test_split`.

In [198]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split and assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

## Create a Logistic Regression Model

In [199]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
# Fit the model using training data
lr_model.fit(X_train, y_train)

In [200]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Evaluation

In [201]:
# Make a prediction using the testing data
lr_predictions = lr_model.predict(X_test)
pd.DataFrame({'Predictions': lr_predictions, 'Actual': y_test})



Unnamed: 0,Predictions,Actual
286,0,0
538,1,0
167,1,1
574,1,0
106,1,1
...,...,...
534,0,1
562,1,1
123,1,1
258,1,0


In [202]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, lr_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)


In [203]:
# Print the classification report for the model
print(classification_report(y_test, lr_predictions))

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        37
           1       0.84      0.84      0.84        83

    accuracy                           0.78       120
   macro avg       0.75      0.75      0.75       120
weighted avg       0.78      0.78      0.78       120



In [204]:
from sklearn.metrics import balanced_accuracy_score
print(f"The balanced accuracy score of the model is: {balanced_accuracy_score(y_test, lr_predictions)}")

The balanced accuracy score of the model is: 0.7460110713122762
