In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from collections import Counter


In [2]:
data = pd.read_csv(r'H:\DATA\MY\practice\Practice-37\train.csv')
data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [3]:
print(data.shape)
print(data.info())

(58645, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int6

In [4]:
print(data.isnull().sum())

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64


In [5]:
for column in data.columns:
  if data[column].dtype == object:
    print(column)
    print(Counter(data[column]))

person_home_ownership
Counter({'RENT': 30594, 'MORTGAGE': 24824, 'OWN': 3138, 'OTHER': 89})
loan_intent
Counter({'EDUCATION': 12271, 'MEDICAL': 10934, 'PERSONAL': 10016, 'VENTURE': 10011, 'DEBTCONSOLIDATION': 9133, 'HOMEIMPROVEMENT': 6280})
loan_grade
Counter({'A': 20984, 'B': 20400, 'C': 11036, 'D': 5034, 'E': 1009, 'F': 149, 'G': 33})
cb_person_default_on_file
Counter({'N': 49943, 'Y': 8702})


In [6]:
# we need to remove some the labels since they do not have as many occurrence in the data 
data = data.drop('id', axis = 1)
data = data[~data['loan_grade'].isin(['F', 'G', 'E']) & ~data['person_home_ownership'].isin(['OTHER'])]
for column in data.columns:
  if data[column].dtype == object:
    print(column)
    print(Counter(data[column]))

person_home_ownership
Counter({'RENT': 29832, 'MORTGAGE': 24443, 'OWN': 3092})
loan_intent
Counter({'EDUCATION': 12007, 'MEDICAL': 10714, 'PERSONAL': 9819, 'VENTURE': 9770, 'DEBTCONSOLIDATION': 8981, 'HOMEIMPROVEMENT': 6076})
loan_grade
Counter({'A': 20963, 'B': 20360, 'C': 11020, 'D': 5024})
cb_person_default_on_file
Counter({'N': 49285, 'Y': 8082})


In [7]:
from sklearn.preprocessing import TargetEncoder, StandardScaler

scaler = StandardScaler()
encoder = TargetEncoder()

X = data.drop('loan_status', axis = 1)
Y = data['loan_status']

X_encode = encoder.fit_transform(X, Y)
X_scaled = scaler.fit_transform(X_encode)

In [8]:
model = RandomForestClassifier()

x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size= 0.2, random_state= 42)

model.fit(x_train, y_train)

y_pred_train = model.predict(x_train)
y_pred = model.predict(x_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Training Accuracy : {accuracy_train*100:.2f}%')
print(f'Testing Accuracy : {accuracy*100:.2f}%')
print(f'F1-score : {f1*100:.2f}%')
print(report)


Training Accuracy : 100.00%
Testing Accuracy : 95.15%
F1-score : 78.92%
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      9995
           1       0.90      0.70      0.79      1479

    accuracy                           0.95     11474
   macro avg       0.93      0.85      0.88     11474
weighted avg       0.95      0.95      0.95     11474



In [9]:
cv_score = cross_val_score(model, X_scaled, Y, scoring = 'accuracy')

print('Cross validation score : ', cv_score)

Cross validation score :  [0.95049678 0.94788217 0.94996949 0.95345594 0.95057962]


In [10]:
models = [XGBClassifier(), KNeighborsClassifier(), LogisticRegression()]

for model in models:
  model.fit(x_train, y_train)

  y_pred_train = model.predict(x_train)
  y_pred = model.predict(x_test)

  accuracy_train = accuracy_score(y_train, y_pred_train)
  accuracy = accuracy_score(y_test, y_pred)
  report = classification_report(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  print(model.__class__.__name__)
  print(f'Training Accuracy : {accuracy_train*100:.2f}%')
  print(f'Testing Accuracy : {accuracy*100:.2f}%')
  print(f'F1-score : {f1*100:.2f}%')
  print('.....')

XGBClassifier
Training Accuracy : 97.29%
Testing Accuracy : 95.15%
F1-score : 79.41%
.....
KNeighborsClassifier
Training Accuracy : 95.31%
Testing Accuracy : 94.36%
F1-score : 76.08%
.....
LogisticRegression
Training Accuracy : 93.65%
Testing Accuracy : 93.65%
F1-score : 72.62%
.....


In [11]:
model = XGBClassifier()
model.fit(x_train, y_train)

In [12]:
test_data = pd.read_csv(r'H:\DATA\MY\practice\Practice-37\test.csv')
test_data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [13]:
data = test_data[~data['loan_grade'].isin(['F', 'G', 'E']) & ~test_data['person_home_ownership'].isin(['OTHER'])]

data = test_data
data = data.drop('id', axis = 1)

data.head()

  data = test_data[~data['loan_grade'].isin(['F', 'G', 'E']) & ~test_data['person_home_ownership'].isin(['OTHER'])]


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [14]:
test_data_encode = encoder.transform(data)
test_data_scaled = scaler.transform(test_data_encode)

condidates = [XGBClassifier(), RandomForestClassifier()]

submission = pd.DataFrame(test_data['id'])
for model in condidates:  
  model.fit(x_train, y_train)
  
  test_pred = model.predict(test_data_scaled)
  submission[model.__class__.__name__] = test_pred


In [15]:
total_records = len(submission)
different_records_count = len(submission[submission['XGBClassifier'] != submission['RandomForestClassifier']])
percentage_different = (different_records_count / total_records) * 100
print(f"Percentage of differing records: {percentage_different:.2f}%")


Percentage of differing records: 1.34%


## Mlflow envirement

In [21]:
import mlflow
import mlflow.sklearn  
from sklearn.metrics import f1_score
with mlflow.start_run():
  model = XGBClassifier()
  
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)

  mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
  mlflow.log_metric("f1_score", f1_score(y_test, y_pred))
  mlflow.sklearn.log_model(model, "XGBClassifier")
    

