In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset (assuming it's in a CSV file named 'loan_dataset.csv')
df = pd.read_csv('loan_train.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [None]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [None]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
df.dropna(inplace=True)

In [None]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,480.0,480.0,480.0,480.0,480.0
mean,5364.23125,1581.093583,144.735417,342.05,0.854167
std,5668.251251,2617.692267,80.508164,65.212401,0.353307
min,150.0,0.0,9.0,36.0,0.0
25%,2898.75,0.0,100.0,360.0,1.0
50%,3859.0,1084.5,128.0,360.0,1.0
75%,5852.5,2253.25,170.0,360.0,1.0
max,81000.0,33837.0,600.0,480.0,1.0


In [None]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
# Data preprocessing
# Assuming the dataset has been cleaned and preprocessed appropriately
# Handling missing values, encoding categorical variables, etc.

# Selecting features (X) and target variable (y)
X = df.drop('Loan_Status', axis=1)  # Features
y = df['Loan_Status']  # Target variable

# Encoding categorical variables (if necessary)
X = pd.get_dummies(X)

# Splitting the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Training the model
rf_classifier.fit(X_train, y_train)

# Making predictions
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Optionally, you can also inspect feature importances
feature_importances = pd.Series(rf_classifier.feature_importances_, index=X.columns)
print('\nFeature Importances:')
print(feature_importances)

Accuracy: 0.81

Confusion Matrix:
[[11 17]
 [ 1 67]]

Classification Report:
              precision    recall  f1-score   support

           N       0.92      0.39      0.55        28
           Y       0.80      0.99      0.88        68

    accuracy                           0.81        96
   macro avg       0.86      0.69      0.72        96
weighted avg       0.83      0.81      0.78        96


Feature Importances:
ApplicantIncome            0.077821
CoapplicantIncome          0.055018
LoanAmount                 0.086859
Loan_Amount_Term           0.034118
Credit_History             0.159512
                             ...   
Self_Employed_No           0.008457
Self_Employed_Yes          0.009313
Property_Area_Rural        0.014383
Property_Area_Semiurban    0.015257
Property_Area_Urban        0.012076
Length: 500, dtype: float64
