In [1]:
# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing tools
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier

# Machine learning models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Metrics and model evaluation
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, classification_report, confusion_matrix
)

# Utilities for model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Clustering and dimensionality reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [7]:
df= pd.read_csv('Credit_Score.csv')

In [8]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y


In [9]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [10]:
df.nunique()

Loan_ID              381
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      322
CoapplicantIncome    182
LoanAmount           101
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [11]:
df.shape

(381, 13)

In [12]:
data = [df]
for dataset in data:
    #Filter categorical variables
    categorical_columns = [x for x in dataset.dtypes.index if dataset.dtypes[x]=='object']
    
for col in categorical_columns:
    print ('\nFrequency of Categories for variable %s'%col)
    print (df[col].value_counts())


Frequency of Categories for variable Loan_ID
LP001003    1
LP002281    1
LP002314    1
LP002308    1
LP002305    1
           ..
LP001698    1
LP001693    1
LP001692    1
LP001691    1
LP002990    1
Name: Loan_ID, Length: 381, dtype: int64

Frequency of Categories for variable Gender
Male      291
Female     85
Name: Gender, dtype: int64

Frequency of Categories for variable Married
Yes    228
No     153
Name: Married, dtype: int64

Frequency of Categories for variable Dependents
0     234
2      59
1      52
3+     28
Name: Dependents, dtype: int64

Frequency of Categories for variable Education
Graduate        278
Not Graduate    103
Name: Education, dtype: int64

Frequency of Categories for variable Self_Employed
No     325
Yes     35
Name: Self_Employed, dtype: int64

Frequency of Categories for variable Property_Area
Semiurban    149
Urban        126
Rural        106
Name: Property_Area, dtype: int64

Frequency of Categories for variable Loan_Status
Y    271
N    110
Name: Loan_S

In [13]:
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [14]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode().iloc[0])
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode().iloc[0]).astype(int)
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode().iloc[0]).astype(int)

df['Dependents'] = df['Dependents'].replace(['0', '1', '2', '3+'], [0,1,2,3,])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode().iloc[0])

df['CoapplicantIncome'] = df['CoapplicantIncome'].astype(int)
df['LoanAmount'] = df['LoanAmount'].astype(int)

In [15]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [16]:
df.drop(columns = ['Loan_ID'], inplace=True)

In [17]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [18]:
df['Gender']= le.fit_transform(df['Gender'])
df['Married']= le.fit_transform(df['Married'])
df['Dependents']= le.fit_transform(df['Dependents'])
df['Education']= le.fit_transform(df['Education'])
df['Self_Employed']= le.fit_transform(df['Self_Employed'])
df['ApplicantIncome']= le.fit_transform(df['ApplicantIncome'])
df['CoapplicantIncome']= le.fit_transform(df['CoapplicantIncome'])
df['LoanAmount']= le.fit_transform(df['LoanAmount'])
df['Loan_Amount_Term']= le.fit_transform(df['Loan_Amount_Term'])
df['Credit_History']= le.fit_transform(df['Credit_History'])
df['Property_Area']= le.fit_transform(df['Property_Area'])
df['Loan_Status']= le.fit_transform(df['Loan_Status'])



In [19]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,0,0,251,48,79,8,1,0,0
1,1,1,0,0,1,116,0,26,8,1,2,1
2,1,1,0,1,0,75,125,71,8,1,2,1
3,1,0,0,0,0,298,0,92,8,1,2,1
4,1,1,0,1,0,56,49,46,8,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
376,1,1,3,0,0,291,0,79,8,1,2,1
377,1,1,0,0,0,144,90,59,8,1,0,1
378,0,0,0,0,0,103,0,29,8,1,0,1
379,1,1,3,0,0,224,0,7,5,1,0,1


In [20]:
X = df.drop('Loan_Status', axis=1)
Y = df['Loan_Status']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [21]:
# Initialize Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # using 100 trees

# Train the model
rfc.fit(X_train, Y_train)

# Predict on the test data
predictions = rfc.predict(X_test)

# Check the accuracy
accuracy = accuracy_score(Y_test, predictions)
print(f"Random Forest Classifier Accuracy: {accuracy * 100:.2f}%")


Random Forest Classifier Accuracy: 80.52%


In [46]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Assuming encoders are already fitted to the training data
# For demonstration, let's create and fit encoders (in practice, use the pre-fitted encoders)
gender_encoder = LabelEncoder().fit(['Male', 'Female'])
married_encoder = LabelEncoder().fit(['Yes', 'No'])
dependents_encoder = LabelEncoder().fit(['0', '1', '2', '3+'])
education_encoder = LabelEncoder().fit(['Graduate', 'Not Graduate'])
self_employed_encoder = LabelEncoder().fit(['Yes', 'No'])
property_area_encoder = LabelEncoder().fit(['Urban', 'Semiurban', 'Rural'])

# Correcting the input data encoding
input_data = ['Male', 'Yes', '1','Graduate', 'Yes', 'Urban', 251, 48, 79, 8, 1]
encoded_input_data = [
    gender_encoder.transform([input_data[0]])[0], 
    married_encoder.transform([input_data[1]])[0], 
    dependents_encoder.transform([input_data[2]])[0], 
    education_encoder.transform([input_data[3]])[0],
    self_employed_encoder.transform([input_data[4]])[0],
    property_area_encoder.transform([input_data[5]])[0],
    input_data[6],  # Leave numerical features as is
    input_data[7],
    input_data[8],
    input_data[9],
    input_data[10],
]

# Convert the list to a numpy array and reshape for a single sample prediction
input_data_as_numpy_array = np.array(encoded_input_data).reshape(1, -1)

# Load your pre-trained Random Forest Classifier
# Here, for demonstration, we initialize and train a mock model
model = RandomForestClassifier()
X_train = np.random.rand(100, 11)  # Mock training data features
Y_train = np.random.randint(2, size=100)  # Mock training data labels
model.fit(X_train, Y_train)

# Making a prediction with the preprocessed input data
prediction = model.predict(input_data_as_numpy_array)
print("Prediction:", prediction)

if prediction[0] == 1:
    print('Loan Approved')
else:
    print('Loan Not Approved')


Prediction: [1]
Loan Approved


In [47]:
for column in X.columns:
  print(column)

Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
