# Creditworthiness Predictor

### Importing libraries

In [67]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

### Loading the dataset

In [68]:
# Location of the raw dataset. Kept as a single named constant so the path can
# be changed in one place (the '/content/' prefix is environment-specific).
DATA_PATH = '/content/credit_dataset.csv'

# Load the whole CSV into a DataFrame; every later cell works on `df`.
df = pd.read_csv(DATA_PATH)

In [69]:
# Quick look at the raw data: a sample of rows, the row count with the column
# names, and the number of missing values per column.
overview_sections = [
    ("\nFirst 5 rows of dataset", (df.head(),)),
    ("\nInfo", (len(df), df.columns)),
    ("\nNull Values", (df.isnull().sum(),)),
]

for heading, values in overview_sections:
    print(heading)
    print(*values)


First 5 rows of dataset
   income_yearly  debt_total  payment_history_score  credit_limit  \
0      2243839.0   2212461.0                   89.0      866791.0   
1      1092486.0    760389.0                   94.0      226047.0   
2       573637.0    118147.0                   73.0       95509.0   
3      1762532.0   1636321.0                   40.0      634277.0   
4      3470650.0   3917237.0                   36.0      744828.0   

   credit_used  credit_utilization_ratio credit_worthiness  
0          NaN                      0.10              good  
1     159434.0                      0.71               bad  
2      64490.0                      0.68              good  
3     614385.0                      0.97               bad  
4      39344.0                      0.05               bad  

Info
20000 Index(['income_yearly', 'debt_total', 'payment_history_score', 'credit_limit',
       'credit_used', 'credit_utilization_ratio', 'credit_worthiness'],
      dtype='object')

Null Val

### Handling missing values

In [None]:
# Impute missing numeric values column by column.
#
# `df[col].fillna(..., inplace=True)` acts on a temporary object selected from
# the frame. Recent pandas versions warn about that chained pattern and, with
# Copy-on-Write enabled, it no longer updates `df` at all. Assigning the filled
# column back is the supported form and is safe to re-run.
median_filled_columns = ['payment_history_score', 'debt_total']
mean_filled_columns = ['credit_limit', 'credit_used', 'income_yearly']

for column in median_filled_columns:
    df[column] = df[column].fillna(df[column].median())

for column in mean_filled_columns:
    df[column] = df[column].fillna(df[column].mean())

# Rows without a utilization ratio are dropped instead of imputed, because this
# is an important column which should not be filled with misleading values.
# (A whole-frame in-place drop is not chained assignment, so it is kept as is.)
df.dropna(subset=['credit_utilization_ratio'], inplace=True)

In [71]:
# Re-count missing values per column to confirm the cleaning step left none.
remaining_nulls = df.isna().sum()
print(remaining_nulls)

income_yearly               0
debt_total                  0
payment_history_score       0
credit_limit                0
credit_used                 0
credit_utilization_ratio    0
credit_worthiness           0
dtype: int64


In [48]:
# Selecting features
# These are the columns of information the model should learn from.
feature_columns = [
    'income_yearly',
    'debt_total',
    'credit_limit',
    'credit_used',
    'credit_utilization_ratio',
]
features = df[feature_columns]
X = features  # The input

# This is the output the model will actually predict.
y = df['credit_worthiness']  # Good or bad

# Splitting the dataset: 80% of the rows are used for training and the
# remaining 20% are held back for testing. The fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [49]:
# Display the target column to inspect the raw 'credit_worthiness' labels.
y

Unnamed: 0,credit_worthiness
0,good
1,bad
2,good
3,bad
4,bad
...,...
19995,good
19996,good
19997,bad
19998,bad


### Encoding the data

### Alphabetical -> Numeric data

In [50]:
# Turn the text labels into integers for the classifier.
# The encoder learns its mapping from the training labels, and the same mapping
# is then applied to both the training and the test labels.
la = LabelEncoder()
la.fit(y_train)

y_train = la.transform(y_train)  # Good -> 1 and bad -> 0
y_test = la.transform(y_test)

In [51]:
# Show the encoded test labels next to the first original text labels
# to compare the numeric encoding with the raw values.
y_test, y.head()

(array([1, 0, 0, ..., 1, 1, 0]),
 0    good
 1     bad
 2    good
 3     bad
 4     bad
 Name: credit_worthiness, dtype: object)

### Making the initial model

In [56]:
# We use an algorithm called a Random Forest — which you can imagine as a bunch
# of tiny decision-making trees voting together.
#
# Fitting is the part where the model learns. We feed it examples:
#   X_train: the financial numbers
#   y_train: whether each person was credit worthy or not -> 1 or 0 -> good or bad
# The model studies these examples and figures out patterns.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Give the trained model data it has not seen and measure how often it is right.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {test_accuracy:.2f}")

Accuracy: 0.75


## Hyperparameter tuning

In [53]:
# A Random Forest has a bunch of settings (hyperparameters) that influence how
# well it learns. Changing these settings can noticeably change accuracy.
# Searching for good values is called hyperparameter tuning.

para_grid = {
    'n_estimators': [10, 100, 200, 300],     # how many trees the forest has
    'max_depth': [3, 5, 10, 20, None],       # how tall each tree can grow
    'max_features': ["sqrt", "log2", None],  # how many features are considered at each split
    'min_samples_leaf': [1, 2, 4],           # how many data points are needed to make a final "leaf" decision
    'bootstrap': [True, False],              # whether each tree samples the data with replacement
}

# A fresh, untrained forest for the search to tune.
# It deliberately gets its own name: rebinding `model` here would replace the
# already-fitted initial model with an unfitted one (the search only fits
# clones of the estimator it is given), and any later cell that evaluates
# `model` would then fail on a clean top-to-bottom run.
search_estimator = RandomForestClassifier(random_state=42)

# RandomizedSearchCV is a hyperparameter tuning technique provided by scikit-learn.
# Instead of trying every possible combination, it randomly samples a handful
# of them (n_iter) and cross-validates each one. random_state fixes which
# combinations are sampled, so the search result is reproducible.
rand_search = RandomizedSearchCV(
    search_estimator,
    para_grid,
    n_iter=10,
    cv=10,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rand_search.fit(X_train, y_train)

# Best configuration found, refitted on the full training set by the search.
best_model = rand_search.best_estimator_
print(f"Best Model: {best_model}")
print(f"Best Score: {rand_search.best_score_:.2f}")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Model: RandomForestClassifier(max_depth=10, max_features=None, min_samples_leaf=2,
                       random_state=42)
Best Score: 0.78


### Predicting creditworthiness from user input


In [72]:
def predict_user():
    """Prompt for a user's financial details and print a creditworthiness prediction.

    Reads yearly income, current debt, credit card limit and credit card usage
    from standard input, derives the credit utilization ratio
    (usage / limit) and passes the five values to the global ``best_model``.

    Nothing is returned; the outcome is printed. Entries that are not numbers,
    or that are not finite (``nan`` / ``inf``), produce the generic
    "Invalid input" message. A credit limit of zero or less is rejected with
    its own message, because the utilization ratio cannot be computed from it.
    """
    import math  # local import: only needed for the finiteness check below

    try:
        print("Credit Worthiness Predictor")
        print("\nNote: Currency is INR")

        # We collect required financial information from the user.
        # 'float()' converts the input (which is text) into a number.
        income = float(input("Enter your Income (Yearly): "))
        debt = float(input("Enter your current Debt: "))
        credit_limit = float(input("Enter your credit card limit: "))
        credit_used = float(input("Enter your credit card usage: "))

        # float() happily parses "nan" and "inf"; those are not usable amounts,
        # so route them to the same handler as any other bad number.
        entered_values = (income, debt, credit_limit, credit_used)
        if not all(math.isfinite(value) for value in entered_values):
            raise ValueError("inputs must be finite numbers")

        # Dividing by a zero limit would raise ZeroDivisionError, and a
        # negative limit would give a meaningless ratio.
        if credit_limit <= 0:
            print("Invalid input. Credit card limit must be greater than zero.")
            return

        # This tells us how much of the available credit the user has used.
        credit_utilization = credit_used / credit_limit

        # One-row DataFrame whose column names match the training features.
        user_df = pd.DataFrame([{
            'income_yearly': income,
            'debt_total': debt,
            'credit_limit': credit_limit,
            'credit_used': credit_used,
            'credit_utilization_ratio': credit_utilization
        }])

        # The trained model predicts whether the user is credit worthy (1) or not (0).
        prediction = best_model.predict(user_df)[0]

        print("\nPrediction:", "You are credit worthy" if prediction == 1 else "You are not credit worthy")
    except ValueError:
        print("Invalid input. Please enter valid numerical values.")
# Run the interactive predictor once; it prompts for the four amounts via input().
predict_user()

Credit Worthiness Predictor

Note: Currency is INR
Enter your Income (Yearly): 1200000
Enter your current Debt: 50000
Enter your credit card limit: 200000
Enter your credit card usage: 30000

Prediction: You are credit worthy


### Evaluating how good the model is

In [57]:

# Metric glossary (the positive class is 1, i.e. "deserves a credit card"):
# Precision: Out of everyone the model said deserves a credit card, how many actually deserved it?
# Recall: Out of everyone who truly deserved a credit card, how many did the model correctly identify?
# F1-Score: A single score that balances both precision and recall for the "deserves a credit card" class.
# ROC-AUC: A score that measures how well the model separates people who deserve a credit card
# from those who don't; a higher value means the model is better at telling the two groups apart.


def evaluate_classifier(estimator, X_eval, y_true):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_eval : feature rows to score.
    y_true : true encoded labels for ``X_eval``.

    Returns
    -------
    tuple
        ``(predictions, positive_proba, accuracy, report, roc_auc)`` where
        ``positive_proba`` is the second column of ``predict_proba`` and
        ``report`` is the text produced by ``classification_report``.
    """
    predictions = estimator.predict(X_eval)
    positive_proba = estimator.predict_proba(X_eval)[:, 1]
    return (
        predictions,
        positive_proba,
        accuracy_score(y_true, predictions),
        classification_report(y_true, predictions),
        roc_auc_score(y_true, positive_proba),
    )


def print_evaluation(accuracy_value, roc_auc_value, report_text):
    """Print the accuracy, ROC AUC and classification report for one model."""
    print(f"\nAccuracy: {accuracy_value:.2f}")
    print(f"ROC AUC Score: {roc_auc_value:.2f}")
    print("\nClassification Report:")
    print(report_text)


print("\nModel Evaluation Metrics using the initial model:")

# `model` must be the *fitted* initial forest when this cell runs; an unfitted
# estimator makes predict / predict_proba raise.
y_pred, y_pred_proba, accuracy, classification_rep, roc_auc = evaluate_classifier(
    model, X_test, y_test
)
print_evaluation(accuracy, roc_auc, classification_rep)


print("\nModel Evaluation Metrics using the best model:")
y_pred_1, y_pred_proba_1, accuracy_1, classification_rep_1, roc_auc_1 = evaluate_classifier(
    best_model, X_test, y_test
)
print_evaluation(accuracy_1, roc_auc_1, classification_rep_1)


Model Evaluation Metrics using the initial model:

Accuracy: 0.75
ROC AUC Score: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.56      0.64      1455
           1       0.76      0.88      0.81      2239

    accuracy                           0.75      3694
   macro avg       0.75      0.72      0.73      3694
weighted avg       0.75      0.75      0.74      3694


Model Evaluation Metrics using the best model:

Accuracy: 0.78
ROC AUC Score: 0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.51      0.65      1455
           1       0.75      0.97      0.84      2239

    accuracy                           0.78      3694
   macro avg       0.83      0.74      0.75      3694
weighted avg       0.81      0.78      0.77      3694

