In [1]:
import pandas as pd
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [2]:
uploaded = files.upload()

Saving credit_risk_dataset.csv to credit_risk_dataset.csv


In [3]:
data = pd.read_csv("credit_risk_dataset.csv")

In [4]:
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [10]:
X = data[["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "person_home_ownership", "loan_intent","cb_person_default_on_file"]]
y = data["loan_status"]
X.drop(columns=["loan_amnt"], inplace=True)
numeric_features = ["person_age","person_income","person_emp_length","loan_int_rate"]
categorical_features = ["person_home_ownership","loan_intent","cb_person_default_on_file"]
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers= [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
X_processed = preprocessor.fit_transform(X)
categorical_columns = preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(categorical_features)
all_columns = numeric_features + list(categorical_columns)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
logistic_classifier = LogisticRegression(random_state=42)
logistic_classifier.fit(X_train,y_train)
y_pred = logistic_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test,y_pred))
print("\nClassification Report:")
print(classification_report(y_test,y_pred))
categories = {
    "person_home_ownership": ['RENT', 'OWN', 'MORTGAGE', 'OTHER'],
    "loan_intent": ['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT',
       'DEBTCONSOLIDATION'],
    "cb_person_default_on_file": ['Y', 'N']
}
def predict_loan_default():
  print("\nEnter applicant details for credit risk assessment:")
  age = float(input("Applicant's age:"))
  income = float(input("Applicant's income:"))
  emp_length = float(input("Applicant's employment length (in years):"))
  loan_int_rate = float(input("Loan interest rate:"))
  loan_amnt = float(input("Loan amount requested:"))
  loan_percent_income = loan_amnt/income
  home_ownership = input("Home ownership (RENT/MORTGAGE/OWN/OTHER):").upper()
  loan_intent = input("Loan intent (DEBTCONSOLIDATION/EDUCATION/HOMEIMPROVEMENT/MEDICAL/PERSONAL/VENTURE):").upper()
  default_history = input("Has the applicant defaulted before (Y/N):").upper()

  input_data = {
      "person_age": [age],
      "person_income": [income],
      "person_emp_length": [emp_length],
      "loan_int_rate": [loan_int_rate]
  }
  for category, value in zip(["person_home_ownership","loan_intent","cb_person_default_on_file"],[home_ownership,loan_intent,default_history]):
    for val in categories[category]:
      input_data[f"{category}_{val}"] = [1 if value == val else 0]
  input_data = pd.DataFrame(input_data)

  for col in all_columns:
    if col not in input_data.columns:
      input_data[col] = 0
  input_data = input_data[all_columns]
  prediction = logistic_classifier.predict(input_data)[0]
  if prediction == 1:
    print("\nBased on the information provided, the applicant is predicted to be more likely to default on the loan")
  else:
    print("\nBased on the information provided, the applicant is predicted to be less likely to default on the loan")

predict_loan_default()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=["loan_amnt"], inplace=True)


Accuracy: 0.8198557618536136

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.89      5072
           1       0.69      0.34      0.46      1445

    accuracy                           0.82      6517
   macro avg       0.76      0.65      0.67      6517
weighted avg       0.80      0.82      0.80      6517


Enter applicant details for credit risk assessment:
Applicant's age:23
Applicant's income:58000
Applicant's employment length (in years):5
Loan interest rate:2
Loan amount requested:780
Home ownership (RENT/MORTGAGE/OWN/OTHER):OWN
Loan intent (DEBTCONSOLIDATION/EDUCATION/HOMEIMPROVEMENT/MEDICAL/PERSONAL/VENTURE):EDUCATION
Has the applicant defaulted before (Y/N):N

Based on the information provided, the applicant is predicted to be less likely to default on the loan


