<a href="https://colab.research.google.com/github/kbghub56/grocery_store_credit_analysis/blob/main/Calculate_AUC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json

import joblib
import pandas as pd
from google.colab import drive
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Mount Google Drive at /content/drive; the model, dataset and results paths
# used below all live under this mount point.
drive.mount('/content/drive') # Needed to access files in drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Restore the grid-search object that was saved to Drive.
# NOTE(review): joblib files are pickle-based, so only load artefacts from a
# source you trust.
grid_search_path = '/content/drive/My Drive/Colab Notebooks/Credit_Scoring/quick_grid_search.joblib'
grid_search = joblib.load(grid_search_path)


In [None]:
# Pull the winning estimator out of the loaded grid search
best_pipeline = grid_search.best_estimator_

# Read the cohort sample and discard the two columns that are not used as
# predictors or target, in a single chained expression
data = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/Credit_Scoring/231019_sampledata_cohort3.csv"
).drop(columns=['Unnamed: 0', 'person_id'])


In [None]:
# Separate the target column from the predictors
y = data["y"]
X = data.drop(columns="y")

# Ten shuffled, stratified folds with a fixed seed; the (train, test) index
# pairs are materialised up front so they can be iterated more than once
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = list(skf.split(X, y))

# Per-fold results are collected column-wise: one independent list per key
results = {
    column: []
    for column in ('fold', 'in_sample_auc', 'out_sample_auc', 'feature_importances')
}

In [None]:
# Cross-validate the selected pipeline: for every fold, train a fresh copy on
# the training split and compute AUC on both the training and held-out splits.
for i, (train_index, test_index) in enumerate(folds):
    # Split the data (fold indices are integer positions, hence .iloc)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit an unfitted clone instead of best_pipeline itself. Refitting the
    # shared object would overwrite grid_search.best_estimator_ with a model
    # trained on the last fold only, and could carry fitted state between folds.
    fold_pipeline = clone(best_pipeline)
    fold_pipeline.fit(X_train, y_train)

    # Predicted probability of the positive class (column 1) for both splits
    y_train_pred = fold_pipeline.predict_proba(X_train)[:, 1]
    y_test_pred = fold_pipeline.predict_proba(X_test)[:, 1]

    # Calculate AUC. The in-sample value is measured on the data the model was
    # fitted on, so it is an optimistic figure; the out-of-sample value is the
    # held-out estimate.
    in_sample_auc = roc_auc_score(y_train, y_train_pred)
    out_sample_auc = roc_auc_score(y_test, y_test_pred)

    # Extract feature importances from the 'model' step of this fold's pipeline
    feature_importances = fold_pipeline.named_steps['model'].feature_importances_

    # Store results (folds are reported 1-based)
    results['fold'].append(i + 1)
    results['in_sample_auc'].append(in_sample_auc)
    results['out_sample_auc'].append(out_sample_auc)
    results['feature_importances'].append(feature_importances)

    # Print results for the current fold
    print(f"Fold {i+1}: In-Sample AUC = {in_sample_auc}, Out-of-Sample AUC = {out_sample_auc}")




Fold 1: In-Sample AUC = 0.9233086966103675, Out-of-Sample AUC = 0.6698952377910504




Fold 2: In-Sample AUC = 0.9283538179196457, Out-of-Sample AUC = 0.6735875999446379
Fold 3: In-Sample AUC = 0.9295611258116305, Out-of-Sample AUC = 0.6589345981453683
Fold 4: In-Sample AUC = 0.9295345212010252, Out-of-Sample AUC = 0.6841559786674902
Fold 5: In-Sample AUC = 0.9268532224229106, Out-of-Sample AUC = 0.672909283582328
Fold 6: In-Sample AUC = 0.9284647280415617, Out-of-Sample AUC = 0.6620366795623801
Fold 7: In-Sample AUC = 0.9278996304191047, Out-of-Sample AUC = 0.6401789779459339
Fold 8: In-Sample AUC = 0.9235831943534838, Out-of-Sample AUC = 0.6435591103821587
Fold 9: In-Sample AUC = 0.9239123450457669, Out-of-Sample AUC = 0.6497415975951669
Fold 10: In-Sample AUC = 0.9248794604117615, Out-of-Sample AUC = 0.6736148645950097


In [None]:
# Convert results dictionary to a DataFrame (one row per fold)
results_df = pd.DataFrame(results)

# Save the results to a CSV file.
# Each feature_importances cell holds an array. Written as-is, to_csv stores
# NumPy's printed form (space-separated, rounded, elided for long arrays),
# which cannot be read back reliably. Serialise each array as a JSON list of
# floats instead; the column name and the in-memory results_df are unchanged.
results_export_df = results_df.assign(
    feature_importances=results_df['feature_importances'].map(
        lambda importances: json.dumps([float(value) for value in importances])
    )
)
results_export_df.to_csv("/content/drive/My Drive/Colab Notebooks/Credit_Scoring/results.csv", index=False)

# Average feature importances: one row per fold, one column per feature, so the
# column-wise mean is the per-feature average across folds
avg_feature_importances = pd.DataFrame(results['feature_importances']).mean().tolist()
print("Average Feature Importances:", avg_feature_importances)

Average Feature Importances: [0.022092502564191818, 0.0, 0.011500829830765724, 0.0034316927194595337, 0.002758628921583295, 5.9059926570625976e-05, 0.0018633443396538496, 0.00497386185452342, 0.0014216606505215168, 0.0032156340312212706, 0.00384765793569386, 0.0013243717839941382, 0.01122719794511795, 0.0, 0.0015487518394365907, 0.0025903922505676746, 0.0009507783688604832, 0.004335375968366861, 0.0, 0.0017420526128262281, 0.0031790193170309067, 0.0005014491034671664, 0.0006019592983648181, 0.00025507149985060096, 0.000667556538246572, 0.0002830935991369188, 0.0024658716283738613, 0.000745109748095274, 0.004716106690466404, 0.0004175876674707979, 0.0012376864906400442, 0.01592564955353737, 0.07649186998605728, 0.018315443769097328, 0.0050264811143279076, 0.009072807617485523, 0.007162280380725861, 0.007765217684209347, 0.008505018427968025, 0.0019234387436881661, 0.007042825222015381, 0.02698875591158867, 9.624061931390315e-05, 0.0019370332593098283, 0.016362525522708893, 0.02633408829