In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Task 1**: Develop a credit scoring model to predict the creditworthiness of individuals based on historical financial data. Utilize classification algorithms and assess the model's accuracy.

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the loan records from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/credit-risk-dataset/loan/loan.csv')
# Bare last expression: the frame is rendered as this cell's output.
df

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Keep only the columns used in this notebook; every other column is dropped.
columns_to_keep = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length',
    'home_ownership', 'annual_inc', 'verification_status', 'loan_status', 'purpose',
]
df = df.drop(columns=df.columns.difference(columns_to_keep))

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
# Replace missing annual incomes with 0, then show the remaining null count per column.
df["annual_inc"] = df["annual_inc"].fillna(0)
df.isna().sum()

**Creating a Label based on Observations**

In [None]:
# Pairs of (binary label, loan_status phrases that map to that label).
# 0 groups the paid / current statuses; 1 groups the late, grace-period,
# charged-off and default statuses.
label_categories = [
    (0, ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid', 'Current']),
    (1, ['Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period',
         'Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'])
]


def classify_label(text):
    """Map a raw ``loan_status`` value to a binary label.

    Categories in ``label_categories`` are checked in order, and a category
    matches when any of its phrases occurs as a substring of ``text``.

    Parameters
    ----------
    text : str
        The loan status to classify.

    Returns
    -------
    int or None
        The label of the first matching category, or ``None`` when nothing
        matches or when ``text`` is not a string (e.g. a missing value).
    """
    # A missing status arrives as NaN/None; `phrase in text` would raise
    # TypeError on it, so treat any non-string as unclassifiable instead.
    if not isinstance(text, str):
        return None
    for category, matches in label_categories:
        if any(match in text for match in matches):
            return category
    return None

# Derive the binary target from loan_status, then discard the raw status column.
df["label"] = df["loan_status"].apply(classify_label)
df = df.drop(columns="loan_status")

In [None]:
def SC_LabelEncoder1(text):
    """Encode a loan grade letter as an integer.

    "E" -> 1, "D" -> 2, "C" -> 3, "B" -> 4, "A" -> 5.
    Any other value (including letters not listed here) -> 0.
    """
    ranked_grades = ("E", "D", "C", "B", "A")
    # The 1-based position in the tuple is the code; unknown values fall back to 0.
    return ranked_grades.index(text) + 1 if text in ranked_grades else 0

In [None]:
def SC_LabelEncoder2(text):
    """Encode an employment-length string as an integer.

    "< 1 year" -> 1, "1 year" -> 2, "2 years" -> 3, ... "9 years" -> 10,
    "10 years" -> 11, "10+ years" -> 12. Any other value -> 0.
    """
    tenure_levels = (
        "< 1 year", "1 year", "2 years", "3 years", "4 years", "5 years",
        "6 years", "7 years", "8 years", "9 years", "10 years", "10+ years",
    )
    try:
        # Codes are the 1-based positions within the ordered tuple above.
        return tenure_levels.index(text) + 1
    except ValueError:
        # Not one of the known strings.
        return 0

In [None]:
def SC_LabelEncoder3(text):
    """Encode a home-ownership string as an integer.

    "RENT" -> 1, "MORTGAGE" -> 2, "OWN" -> 3. Any other value -> 0.
    """
    ownership_codes = (("RENT", 1), ("MORTGAGE", 2), ("OWN", 3))
    # The first pair whose name equals the input wins; 0 when none does.
    return next((code for status, code in ownership_codes if text == status), 0)

In [None]:
# Replace each text column with the integer codes produced by its encoder function.
column_encoders = {
    "grade": SC_LabelEncoder1,
    "emp_length": SC_LabelEncoder2,
    "home_ownership": SC_LabelEncoder3,
}
for column, encoder in column_encoders.items():
    df[column] = df[column].apply(encoder)

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
df.isnull().sum()

**Exploratory Data Analysis**

In [None]:
# Set the style before any axes exist: seaborn styles only affect axes created
# after the call, so setting it after plt.subplots() leaves that figure unstyled.
sns.set_style('darkgrid')

# One figure per hue variable, each showing the grade and term count distributions.
hue_variables = [
    ('home_ownership', 'Home Ownership'),
    ('verification_status', 'Verification Status'),
]
for hue_column, hue_title in hue_variables:
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    sns.countplot(data=df, x='grade', hue=hue_column, ax=ax[0],
                  palette='Set2').set_title(f"Grade/{hue_title} distribution")
    sns.countplot(data=df, x='term', hue=hue_column, ax=ax[1],
                  palette='Set2').set_title(f"Term/{hue_title} distribution")

In [None]:
# Four side-by-side panels showing how features are distributed across the label.
fig, ax = plt.subplots(1, 4, figsize=(20, 5))
amount_ax, term_ax, ownership_ax, verification_ax = ax

sns.histplot(df, x='loan_amnt', hue="label", bins=30, ax=amount_ax, palette='Set2')
amount_ax.set_title("Loan Ammount distribution")

sns.countplot(data=df, x='term', hue="label", ax=term_ax, palette='Set2')
term_ax.set_title("Term distribution")

sns.countplot(data=df, hue='home_ownership', x='label', ax=ownership_ax, palette='Set2')
ownership_ax.set_title("Home ownership with loan_status")

sns.countplot(data=df, x='verification_status', hue='label', ax=verification_ax, palette='Set2')
verification_ax.set_title("Verification Status Distribution with loan_status");

In [None]:
sns.set(rc={'figure.figsize': (10, 5)})

# Pairwise correlations between the numeric features and the label, drawn as an annotated heatmap.
heatmap_columns = ['loan_amnt', 'int_rate', 'grade', 'emp_length',
                   'home_ownership', 'annual_inc', 'label']
correlations = df[heatmap_columns].corr()
heatmap_ax = sns.heatmap(correlations, cbar=True, annot=True,
                         linecolor='white', linewidths=1.5, cmap="mako")
heatmap_ax.set_title("Pearson Correlations Heatmap");

**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the remaining text columns, using a fresh encoder for each column.
for col in ("verification_status", "purpose", "term"):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# A missing label means the loan status matched none of the known categories,
# so the outcome of that loan is unknown. Drop those rows rather than filling
# them with 1, which would silently count them as defaults. The label is then
# cast to int so the target holds plain 0/1 classes.
df = (
    df.dropna(subset=["label"])
      .assign(label=lambda frame: frame["label"].astype(int))
)

**Splitting into Testing and Training**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# The target is the binary label; the features are every other column.
y = df["label"]
X = df.drop(columns="label")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
# One list per metric (accuracy, precision, F1, recall); the model cells below append to them.
acc, pre, f1, rec = [], [], [], []

**Model Evaluation**

K Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
knn.fit(X_train_scaled,y_train)

In [None]:
pred_knn = knn.predict(X_test_scaled)

In [None]:
# Heading line followed by the per-class report for the KNN predictions.
print("Classification Report :", classification_report(y_test, pred_knn), sep="\n")

In [None]:
print("Accuracy = ",accuracy_score(y_test,pred_knn))

In [None]:
# Append KNN's test-set score to each metric list.
for scores, metric in (
    (acc, accuracy_score),
    (pre, precision_score),
    (rec, recall_score),
    (f1, f1_score),
):
    scores.append(metric(y_test, pred_knn))

Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so its bootstrap samples and feature choices, and therefore the
# reported metrics, are identical on every run (same seed as the train/test split).
rftree = RandomForestClassifier(n_estimators=10, random_state=42)

In [None]:
rftree.fit(X_train_scaled,y_train)

In [None]:
rftree_pred = rftree.predict(X_test_scaled)

In [None]:
# Heading line followed by the per-class report for the random-forest predictions.
print("Classification Report :", classification_report(y_test, rftree_pred), sep="\n")

In [None]:
print("Accuracy = ",accuracy_score(y_test,rftree_pred))

In [None]:
# Append the random forest's test-set score to each metric list.
for scores, metric in (
    (acc, accuracy_score),
    (pre, precision_score),
    (rec, recall_score),
    (f1, f1_score),
):
    scores.append(metric(y_test, rftree_pred))

Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so tie-breaking between equally good splits is repeatable and
# the reported metrics do not change between runs (same seed as the split).
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
dtree.fit(X_train_scaled,y_train)

In [None]:
# The tree was fitted on the standardized training features, so it must predict
# on the standardized test features; raw X_test values are on a different scale
# from the thresholds the tree learned.
pred_dtree = dtree.predict(X_test_scaled)

In [None]:
# Heading line followed by the per-class report for the decision-tree predictions.
print("Classification Report :", classification_report(y_test, pred_dtree), sep="\n")

In [None]:
print("Accuracy = ",accuracy_score(y_test,pred_dtree))

In [None]:
# Append the decision tree's test-set score to each metric list.
for scores, metric in (
    (acc, accuracy_score),
    (pre, precision_score),
    (rec, recall_score),
    (f1, f1_score),
):
    scores.append(metric(y_test, pred_dtree))

**Performance Evaluation**

In [None]:
labels = ['KNN','Random Forest','Decision Tree']

In [None]:
# (panel title, scores) for each of the four panels; every score list holds one
# value per model, plotted against the model names in `labels`.
metric_panels = [("Accuracy", acc), ("Recall", rec), ("Precision", pre), ("F1 Score", f1)]

# Bar charts: one panel per metric, one bar per model.
fig, ax = plt.subplots(1, 4, figsize=(20, 5))
sns.set_style('darkgrid')
for panel, (title, scores) in zip(ax, metric_panels):
    sns.barplot(x=labels, y=scores, palette='Set2', ax=panel).set_title(title)

# Line charts of the same scores. `palette` is not passed here: lineplot only
# uses it together with `hue`, so the colours come from the per-panel kwargs.
line_styles = [
    {'color': 'red'},
    {'markerfacecolor': 'blue'},
    {'color': 'green'},
    {'color': 'purple'},
]
fig, ax = plt.subplots(1, 4, figsize=(20, 5))
sns.set_style('darkgrid')
for panel, (title, scores), style in zip(ax, metric_panels, line_styles):
    sns.lineplot(x=labels, y=scores, ax=panel, marker='s', **style).set_title(title)

In conclusion, this notebook compared three machine learning algorithms for credit risk prediction: K-Nearest Neighbors (KNN), Random Forest, and Decision Tree. Based on the accuracy, precision, recall, and F1 scores measured on the held-out test set, Random Forest was the most effective of the three at predicting credit risk.