<a href="https://colab.research.google.com/github/kbghub56/grocery_store_credit_analysis/blob/main/Extract_Params.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from google.colab import drive
import joblib
drive.mount('/content/drive') # Mount Google Drive at /content/drive; the CSV read and the joblib dump in this notebook both use paths under this mount point

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Load data ##
# Read the cohort sample CSV from the mounted Drive folder into a DataFrame.
csv_path = "/content/drive/My Drive/Colab Notebooks/Credit_Scoring/231019_sampledata_cohort3.csv"
data = pd.read_csv(csv_path)

In [None]:
## Split data ##

# Drop columns that are not used as model features.
# errors='ignore' keeps this cell idempotent: `data` is reassigned from itself,
# so re-running the cell after the columns are already gone would otherwise
# raise a KeyError.
data = data.drop(columns=['Unnamed: 0', 'person_id'], errors='ignore')

# 10-fold stratified splitter; shuffling with a fixed seed keeps the folds
# reproducible across runs.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Materialise the splits. Each entry is a (train_indices, test_indices) pair of
# positional row indices into `data`, stratified on the target column "y".
folds = list(skf.split(data.drop(columns="y"), data["y"]))

# Verify the class distribution of the first fold's test partition
# (displayed as the cell output).
train_index, test_index = folds[0]
data.iloc[test_index]['y'].value_counts()

0    2868
1     262
Name: y, dtype: int64

In [None]:
# Define X and y for the training set: the first 9 folds combined.
#
# Each entry of `folds` is a (train_indices, test_indices) pair and the ten
# test partitions are disjoint, so the union of the first nine test partitions
# is exactly the train side of the last split, i.e. folds[-1][0].
#
# The previous version concatenated fold[0] (the *train* side) of the first
# nine splits, which repeated every row 8-9 times and also pulled the tenth
# fold's held-out rows into the training data.
train_indices, holdout_indices = folds[-1]

# Sanity checks: every training row appears once and none belongs to the
# held-out tenth fold.
assert len(set(train_indices)) == len(train_indices), "duplicate rows in training set"
assert set(train_indices).isdisjoint(holdout_indices), "hold-out rows leaked into training set"

X_train = data.iloc[train_indices].drop(columns="y")
y_train = data.iloc[train_indices]['y']

# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = list(X_train.select_dtypes(include='object').columns)
numerical_cols = list(X_train.select_dtypes(exclude='object').columns)

# One-hot encode the categorical columns. The first level of each feature is
# dropped and categories unseen during fit are ignored at transform time.
# NOTE(review): with drop='first', an ignored unknown category is encoded as
# all zeros, which is the same encoding as the dropped first level.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')

# Fill missing numerical values with the column mean.
# NOTE(review): the original comment questioned whether this step is needed;
# the SMOTE step that follows in the pipeline presumably requires NaN-free
# input - confirm before removing.
mean_imputer = SimpleImputer(strategy='mean')

# Combine both transformers; any column not listed above is passed through
# unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_cols),
        ('imputer', mean_imputer, numerical_cols),
    ],
    remainder='passthrough',
)

# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(sampling_strategy='auto', random_state=42)

# XGBoost binary classifier evaluated with AUC. n_jobs=1 limits the model's own
# parallelism.
model = xgb.XGBClassifier(objective='binary:logistic', n_jobs=1, eval_metric="auc")

# Full imblearn pipeline: preprocess -> oversample -> classify.
# The step names are referenced by the 'model__*' keys of the parameter grid.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('model', model),
])

In [None]:
'''
What parameters were considered when extracting hyperparameters
# Reduced hyperparameter grid
param_grid = {
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [4, 6, 8],
    'model__min_child_weight': [1, 3, 5],
    'model__gamma': [0, 0.1, 0.2],
    'model__subsample': [0.7, 0.8, 0.9],
    'model__colsample_bytree': [0.6, 0.7, 0.8],
    'model__scale_pos_weight': [1, 10, 20]
}
'''

# Single-candidate grid: every hyperparameter is pinned to one value, so the
# search below only cross-validates and refits this one configuration
# (presumably the winner of the wider grid recorded in the string above).
param_grid = dict(
    model__colsample_bytree=[0.8],
    model__gamma=[0.1],
    model__learning_rate=[0.2],
    model__max_depth=[8],
    model__min_child_weight=[1],
    model__scale_pos_weight=[1],
    model__subsample=[0.7],
)

# 3-fold cross-validated grid search scored by ROC AUC, using all available
# cores for the search. fit() returns the fitted search object itself, so it
# can be chained onto the constructor.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Report the selected hyperparameters
best_params = grid_search.best_params_
print(best_params)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
{'model__colsample_bytree': 0.8, 'model__gamma': 0.1, 'model__learning_rate': 0.2, 'model__max_depth': 8, 'model__min_child_weight': 1, 'model__scale_pos_weight': 1, 'model__subsample': 0.7}


In [None]:
# Persist the fitted grid search object to Drive. joblib.dump is left as the
# last expression so the cell displays the list of written file names.
grid_search_path = '/content/drive/My Drive/Colab Notebooks/Credit_Scoring/quick_grid_search.joblib'
joblib.dump(grid_search, grid_search_path)

['/content/drive/My Drive/Colab Notebooks/Credit_Scoring/quick_grid_search.joblib']