In [1]:
import os

import db_dtypes  # noqa: F401 -- imported only so a missing db-dtypes install fails here, before the query runs
from google.cloud import bigquery

# Location of the service-account key file.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence so the notebook can run on
# another machine without editing code; the original local path is kept as the
# fallback so existing setups keep working unchanged.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"C:\Advanced Analytics\JSON KEY\master-charmer-472608-g7-5a81523269bb.json",
)

# Initialize the BigQuery client
client = bigquery.Client.from_service_account_json(key_path)

# Query table
query = """
    SELECT *
    FROM `bigquery-public-data.ml_datasets.credit_card_default`
    LIMIT 10000
"""

# Run the query and create a DataFrame
df = client.query(query).to_dataframe()

# Display the first few rows
print(df.head())

        id  limit_balance sex education_level marital_status   age  pay_0  \
0  27502.0        80000.0   1               6              1  54.0    0.0   
1  26879.0       200000.0   1               4              1  49.0    0.0   
2  18340.0        20000.0   2               6              2  22.0    0.0   
3  13692.0       260000.0   2               4              2  33.0    0.0   
4  20405.0       150000.0   1               4              2  32.0    0.0   

   pay_2  pay_3  pay_4  ... bill_amt_5 bill_amt_6  pay_amt_1  pay_amt_2  \
0    0.0    0.0    0.0  ...    26210.0    17643.0     2545.0     2208.0   
1    0.0    0.0    0.0  ...    50235.0    48984.0     1689.0     2164.0   
2    0.0    0.0    0.0  ...      500.0        0.0     4641.0     1019.0   
3    0.0    0.0    0.0  ...    30767.0    29890.0     5000.0     5000.0   
4    0.0    0.0   -1.0  ...   143375.0   146411.0     4019.0   146896.0   

   pay_amt_3  pay_amt_4  pay_amt_5  pay_amt_6  default_payment_next_month  \
0     133

In [4]:
# Initial inspection of the loaded frame
print("Dataset shape:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(df.head())
print("\nSummary Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())
print(type(df))
# Python type of the first value stored in each column
for col in df.columns:
    print(col, type(df[col].iloc[0]))
# duplicated() is run on a frame without this column; the variable name suggests
# the column holds ndarray values -- TODO confirm
df_no_ndarray = df.drop(columns=["predicted_default_payment_next_month"])
print("Duplicated Rows:", df_no_ndarray.duplicated().sum())
print("\nData Types:")
print(df.dtypes)

Dataset shape: (2965, 26)

Summary Statistics:
                 id  limit_balance          age        pay_0        pay_2  \
count   2965.000000    2965.000000  2965.000000  2965.000000  2965.000000   
mean   14945.556155  163369.308600    35.193255     0.005059    -0.122428   
std     8700.288152  125030.415472     9.109439     1.114395     1.180784   
min       29.000000   10000.000000    21.000000    -2.000000    -2.000000   
25%     7499.000000   50000.000000    28.000000    -1.000000    -1.000000   
50%    14782.000000  140000.000000    34.000000     0.000000     0.000000   
75%    22571.000000  230000.000000    41.000000     0.000000     0.000000   
max    29995.000000  800000.000000    69.000000     8.000000     7.000000   

             pay_3        pay_4     bill_amt_1     bill_amt_2     bill_amt_3  \
count  2965.000000  2965.000000    2965.000000    2965.000000    2965.000000   
mean     -0.141653    -0.185160   52118.305228   50649.153120   48239.757504   
std       1.183630 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Count of rows per `sex` value, split by the default flag.
plt.figure(figsize=(6, 4))
sex_ax = sns.countplot(data=df, x='sex', hue='default_payment_next_month')
sex_ax.set_title("Default Payment by Sex")
sex_ax.set_xlabel("sex (1 = Male, 2 = Female)")
sex_ax.set_ylabel("Count")
sex_ax.legend(title='Default')
plt.show()


In [None]:
# Count of rows per `education_level` value, split by the default flag.
plt.figure(figsize=(8, 4))
education_ax = sns.countplot(data=df, x='education_level', hue='default_payment_next_month')
education_ax.set_title("Default Payment by Education Level")
education_ax.set_xlabel("Education (1 = Grad School, 2 = University, 3 = High School, 4+ = Others)")
education_ax.set_ylabel("Count")
education_ax.legend(title='Default')
plt.show()


In [None]:
# Count of rows per `marital_status` value, split by the default flag.
plt.figure(figsize=(8, 4))
marital_ax = sns.countplot(data=df, x='marital_status', hue='default_payment_next_month')
marital_ax.set_title("Default Payment by Marital Status")
marital_ax.set_xlabel("Marriage (1 = Married, 2 = Single, 3 = Others)")
marital_ax.set_ylabel("Count")
marital_ax.legend(title='Default')
plt.show()


In [3]:
import numpy as np
import pandas as pd

# Function to clean values
def clean_binary_column(val):
    """Normalise one raw cell value to an ``int``, or ``np.nan`` if impossible.

    The value is unwrapped first when it arrives in a container:

    * a NumPy array holding exactly one element (any shape, including 0-d),
    * a list with exactly one item,
    * a dict that has a ``'value'`` key.

    The (possibly unwrapped) value is then converted with ``int(float(val))``,
    so numeric strings such as ``"1"`` or ``"0.0"`` are accepted and any
    fractional part is truncated.

    Parameters
    ----------
    val : object
        Raw cell value.

    Returns
    -------
    int or float
        The integer value, or ``np.nan`` when the value cannot be converted
        (non-numeric text, ``None``, NaN, infinities, multi-element containers).
    """
    if isinstance(val, np.ndarray):
        # `.size` is safe on 0-d arrays, where `len()` raises TypeError.
        if val.size == 1:
            val = val.item()
    elif isinstance(val, list) and len(val) == 1:
        val = val[0]
    elif isinstance(val, dict) and 'value' in val:
        val = val['value']

    try:
        return int(float(val))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() of +/-inf (e.g. float('inf') or the string "1e999").
        return np.nan

# Apply cleaning
# Apply cleaning: run every raw target value through clean_binary_column, then
# coerce anything that is still non-numeric to NaN.
df['default_payment_next_month'] = df['default_payment_next_month'].apply(clean_binary_column)
df['default_payment_next_month'] = pd.to_numeric(df['default_payment_next_month'], errors='coerce')

# Drop invalid values and cast to int.
# When rows are removed, dropna() returns a frame that pandas tracks as derived
# from the original; the explicit .copy() makes it independent so the column
# assignment below does not raise SettingWithCopyWarning.
df = df.dropna(subset=['default_payment_next_month']).copy()
df['default_payment_next_month'] = df['default_payment_next_month'].astype(int)

# Confirm it worked
print(df['default_payment_next_month'].unique())
print(df['default_payment_next_month'].dtype)


[1 0]
int64


In [None]:
# Data Cleaning and Feature Selection

# Keep numeric columns only, without the ID column (if present) and without
# rows that contain missing values. The trailing .copy() keeps df_numeric
# independent of df.
df_numeric = (
    df.select_dtypes(include=[np.number])
    .drop(columns=['id'], errors='ignore')
    .dropna()
    .copy()
)

# Compute the correlation matrix once and reuse it for the heatmap, the target
# ranking and the multicollinearity filter.
corr = df_numeric.corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

# Correlation with target variable
target_col = "default_payment_next_month"
if target_col in df_numeric.columns:
    correlation = corr[target_col].sort_values(ascending=False)
    print(f"\nCorrelation with '{target_col}':")
    print(correlation)
else:
    print(f"\nColumn '{target_col}' not found in df_numeric.")

# Drop highly correlated features (multicollinearity filter)
# Threshold can be adjusted (e.g., 0.8 or 0.9)
corr_threshold = 0.8
corr_matrix = corr.abs()
# Keep only the strict upper triangle so each pair of columns is examined once.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# The filter is about redundancy between features, so the target is removed from
# both axes: it can never be dropped itself, and a feature is never dropped just
# because it correlates strongly with the target.
feature_upper = upper_tri.drop(index=target_col, columns=target_col, errors='ignore')
to_drop = [
    column
    for column in feature_upper.columns
    if (feature_upper[column] > corr_threshold).any()
]

print(f"\nDropping highly correlated features:\n{to_drop}")
df_reduced = df_numeric.drop(columns=to_drop)


# Box plots for outlier detection
print("\nBoxplots for numeric columns (checking for outliers)...")
for column in df_reduced.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df_reduced, x=column, ax=ax)
    ax.set_title(f"Boxplot of {column}")
    fig.tight_layout()
    plt.show()
    # One figure is opened per column; close it so they do not accumulate.
    plt.close(fig)


In [None]:
# Model Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb


# Rename the target column, drop the ID column (if present) and drop rows with
# missing values. `df` is rebound to the result instead of being mutated in place.
# NOTE: from here on the target is called "default", so cells that refer to
# "default_payment_next_month" cannot be re-run without reloading the data.
df = (
    df.rename(columns={"default_payment_next_month": "default"})
    .drop(columns=["id"], errors="ignore")
    .dropna()
)

# Split features and target (y stays a one-column DataFrame)
X = df.drop(columns="default")
y = df[["default"]]

# Keep only numeric features
X = X.select_dtypes(include=["number"])

# Split dataset: 80/20, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics are fitted on the training split only and then
# applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# y_train is a one-column DataFrame; pass it as a flat 1-D array, the same way
# the grid-search cell does, instead of as a column vector.
model.fit(X_train_scaled, y_train.values.ravel())

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Base estimator whose hyper-parameters are searched below
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Candidate values for each hyper-parameter
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Search settings: accuracy scoring (could be 'roc_auc', 'f1', etc.),
# 5-fold cross-validation, progress output, all CPU cores.
search_options = dict(scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search = GridSearchCV(xgb_clf, param_grid, **search_options)

# Run the search on the scaled training data with a flat 1-D target
y_train_flat = y_train.values.ravel()
grid_search.fit(X_train_scaled, y_train_flat)

# Report the winning combination and its cross-validated score
print("Best Parameters:\n", grid_search.best_params_)
print("\nBest Accuracy Score:", grid_search.best_score_)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Estimator selected by the grid search
best_model = grid_search.best_estimator_

# Predictions on the held-out test split
y_pred = best_model.predict(X_test_scaled)

# Print each metric under its heading
reports = (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for heading, metric in reports:
    print(heading, metric(y_test, y_pred))


In [None]:
import joblib

# Persist the tuned estimator chosen by the grid search. The original cell saved
# `model`, the untuned baseline, so the tuning result was never written out.
joblib.dump(best_model, "model.pkl")

# Both estimators were fitted on StandardScaler-transformed features, so the
# fitted scaler is saved alongside the model; new data must go through the same
# transform before predict().
joblib.dump(scaler, "scaler.pkl")