In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

In [36]:
# Read the German credit CSV and discard the 'Unnamed: 0' column in a single
# chain, so re-running this cell always rebuilds the frame from the file.
german_credit = (
    pd.read_csv('../data/german_credit.csv')
    .drop(columns=['Unnamed: 0'])
)
german_credit

Unnamed: 0,Account_status,Duration,Credit_history,Purpose,Credit_amount,Savings_bonds,Present_employment_since,Installment_rate,Other_debtors_guarantors,Resident_since,...,Age,Other_installment_plans,Housing,Existing_credits,Job,People_maintenance_for,Telephone,Foreign_worker,Credit_risk,Gender
0,< 0 DM,6,critical account / other credits existing (not...,radio / television,1169,unknown / no savings account,>= 7 years,4,none,4,...,67,none,own,2,skilled employee / official,1,yes,yes,1,Female
1,0 < ... < 200 DM,48,existing credits paid back duly till now,radio / television,5951,< 100 DM,1 <= ... < 4 years,2,none,2,...,22,none,own,1,skilled employee / official,1,none,yes,0,Male
2,no checking account,12,critical account / other credits existing (not...,education,2096,< 100 DM,4 <= ... < 7 years,2,none,3,...,49,none,own,1,unskilled - resident,2,none,yes,1,Female
3,< 0 DM,42,existing credits paid back duly till now,furniture / equipment,7882,< 100 DM,4 <= ... < 7 years,2,guarantor,4,...,45,none,for free,1,skilled employee / official,2,none,yes,1,Female
4,< 0 DM,24,delay in paying off in the past,car (new),4870,< 100 DM,1 <= ... < 4 years,3,none,4,...,53,none,for free,2,skilled employee / official,2,none,yes,0,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking account,12,existing credits paid back duly till now,furniture / equipment,1736,< 100 DM,4 <= ... < 7 years,3,none,4,...,31,none,own,1,unskilled - resident,1,none,yes,1,Male
996,< 0 DM,30,existing credits paid back duly till now,car (used),3857,< 100 DM,1 <= ... < 4 years,4,none,4,...,40,none,own,1,management / self-employed / highly qualified ...,1,yes,yes,1,Female
997,no checking account,12,existing credits paid back duly till now,radio / television,804,< 100 DM,>= 7 years,4,none,4,...,38,none,own,1,skilled employee / official,1,none,yes,1,Female
998,< 0 DM,45,existing credits paid back duly till now,radio / television,1845,< 100 DM,1 <= ... < 4 years,4,none,4,...,23,none,for free,1,skilled employee / official,1,yes,yes,0,Female


In [37]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns for each k-level category.
german_credit_encoded = pd.get_dummies(german_credit, drop_first=True)

# Separate the features from the Credit_risk target.
X = german_credit_encoded.drop(columns='Credit_risk')
y = german_credit_encoded['Credit_risk']

# Split the data into training and test sets (80% training, 20% test),
# stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Mean-impute missing values. The imputer is fitted on the training rows only
# and then applied to the test rows, so no test-set statistics reach training.
# The imputer output is wrapped back into a DataFrame with the ORIGINAL row
# index passed explicitly: without index=..., the frames would get a fresh
# 0..n-1 index that no longer lines up with y_train / y_test or german_credit.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Fit a Random Forest classifier with default hyperparameters (fixed seed).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Show the fitted estimator and its settings.
print(rf_model)

# Inference phase: predicted probability of the positive class
# (second column of predict_proba) for every test row.
predictions_test_prob = rf_model.predict_proba(X_test)[:, 1]

# Attach the predictions to a copy of the test features for further processing.
german_credit_test = X_test.copy()
german_credit_test['prob'] = predictions_test_prob

# Create Credit_decision column (1 if prob > 0.5, else 0).
german_credit_test['Credit_decision'] = np.where(german_credit_test['prob'] > 0.5, 1, 0)

# Invariant: the scored test rows carry the same labels as y_test, so the two
# can be joined or compared by index later on.
assert german_credit_test.index.equals(y_test.index)

# AUC of the predicted probabilities against the actual test labels.
auc_score = roc_auc_score(y_test, predictions_test_prob)
print(f"AUC: {auc_score}")

RandomForestClassifier(random_state=42)
AUC: 0.7749404761904762


# Tune Hyperparameters

`RandomizedSearchCV` samples a fixed number of parameter settings from the specified distributions. It therefore explores only a subset of the possible combinations, which can be much faster than an exhaustive search.

In [38]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each Random Forest hyperparameter.
param_grid = dict(
    n_estimators=[100, 200, 300],       # number of trees in the forest
    max_depth=[10, 20, 30, None],       # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5],           # minimum samples needed to split a node
    min_samples_leaf=[1, 2],            # minimum samples allowed in a leaf
    max_features=['sqrt', 'log2'],      # features considered at every split
    bootstrap=[True, False],            # whether each tree sees a bootstrap sample
)

# Fresh, unfitted forest used as the template estimator for the search.
rf_model = RandomForestClassifier(random_state=42)

# Randomized search: 100 sampled settings, each scored by 3-fold
# cross-validated ROC AUC, run on all available cores with progress output.
rf_random_search = RandomizedSearchCV(
    rf_model,
    param_grid,
    n_iter=100,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=3,
)

# Run the search on the training data only.
rf_random_search.fit(X_train, y_train)

# Report the winning setting and keep the corresponding refitted estimator.
print("Best Parameters:", rf_random_search.best_params_)
best_rf_model = rf_random_search.best_estimator_

# Positive-class probabilities of the tuned model on the held-out test set.
predictions_test_prob = best_rf_model.predict_proba(X_test)[:, 1]

# Test-set AUC of the tuned model.
auc_score = roc_auc_score(y_true=y_test, y_score=predictions_test_prob)
print(f"Final AUC after hyperparameter tuning: {auc_score}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
Final AUC after hyperparameter tuning: 0.7973809523809524


`GridSearchCV` searches through every combination of the specified hyperparameters. When several parameters each have multiple options, the number of combinations grows very quickly (the grid above already yields 192 candidates).

In [39]:
from sklearn.model_selection import GridSearchCV

# Exhaustive counterpart of the randomized search: every combination in
# param_grid is scored by 3-fold cross-validated ROC AUC on all cores.
# error_score='raise' makes a failing fit raise instead of being recorded
# as a score.
rf_grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=3,
    error_score='raise',
)
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


In [40]:
# Report the best setting found by the grid search and keep its refitted estimator.
print("Best Parameters:", rf_grid_search.best_params_)
best_rf_model = rf_grid_search.best_estimator_

# Positive-class probabilities (second predict_proba column) on the test set.
predictions_test_prob = best_rf_model.predict_proba(X_test)[:, 1]

# Test-set AUC of the grid-search winner.
auc_score = roc_auc_score(y_true=y_test, y_score=predictions_test_prob)
print(f"Final AUC after hyperparameter tuning: {auc_score}")

Best Parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Final AUC after hyperparameter tuning: 0.7941666666666667
