In [9]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the credit-risk data, drop incomplete rows, and one-hot encode the
# categorical columns in a single chain, so `df` is built without any
# in-place mutation of an intermediate frame.
df = (
    pd.read_csv('credit_risk_dataset.csv')
    .dropna()  # remove every row that has at least one missing value
    .pipe(
        pd.get_dummies,  # expand each listed column into indicator columns
        columns=['person_home_ownership', 'loan_intent', 'cb_person_default_on_file'],
    )
)

# Separate the target column from the predictors.
X = df.drop(columns=['loan_status'])  # Features
y = df['loan_status']  # Target variable

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then reused unchanged on the test split, so no test-set statistics
# feed into the transformation the models are trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the models on the standardized training split.
#
# The two tree ensembles have randomized fitting procedures, so they are
# seeded with the same value used for the train/test split; without a seed
# their metrics differ from one run to the next.
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_scaled, y_train)

random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_scaled, y_train)

gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train_scaled, y_train)

# Score the held-out split with each fitted model, in the order
# logistic regression, random forest, gradient boosting.
y_pred_lr, y_pred_rf, y_pred_gb = (
    fitted.predict(X_test_scaled)
    for fitted in (logistic_regression, random_forest, gradient_boosting)
)


def _print_metric(metric, y_true, sections):
    """Print each heading followed by `metric(y_true, predictions)`.

    `sections` is an iterable of `(heading, predictions)` pairs and is
    reported in the order given.
    """
    for heading, predictions in sections:
        print(heading)
        print(metric(y_true, predictions))


# Per-class precision / recall / F1 for every model.
_print_metric(classification_report, y_test, [
    ("Logistic Regression:", y_pred_lr),
    ("Random Forest:", y_pred_rf),
    ("Gradient Boosting:", y_pred_gb),
])

# Confusion matrices, printed after all of the reports.
_print_metric(confusion_matrix, y_test, [
    ("Confusion Matrix - Logistic Regression:", y_pred_lr),
    ("Confusion Matrix - Random Forest:", y_pred_rf),
    ("Confusion Matrix - Gradient Boosting:", y_pred_gb),
])

Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      4443
           1       0.75      0.46      0.57      1285

    accuracy                           0.84      5728
   macro avg       0.80      0.71      0.74      5728
weighted avg       0.83      0.84      0.83      5728

Random Forest:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      4443
           1       0.94      0.69      0.79      1285

    accuracy                           0.92      5728
   macro avg       0.93      0.84      0.87      5728
weighted avg       0.92      0.92      0.92      5728

Gradient Boosting:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      4443
           1       0.94      0.67      0.78      1285

    accuracy                           0.92      5728
   macro avg       0.93      0.83      0.87      5728
weighted avg       