In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training CSV into `df` and preview its first rows.
TRAIN_CSV = 'train_dataset_final1.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# Structural overview: dimensions, per-column dtype/non-null report, summary stats.
print(df.shape)
info_result = df.info()  # info() writes its own report and returns None
print(info_result)
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Count missing values per column before any imputation.
print(df.isnull().sum())

# Impute missing ages with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the `df['age']` selection is a chained in-place update, which pandas
# deprecates and which does not modify `df` when Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Target variable distribution: class counts as a bar chart, then class shares.
target_col = 'next_month_default'
sns.countplot(data=df, x=target_col)
plt.title("Class Distribution")
plt.show()

class_shares = df[target_col].value_counts(normalize=True)
print(class_shares)

# Author's observation from the output: roughly 20% of users default next month.

In [None]:
categorical_cols = ['sex', 'education', 'marriage']

# Report the dataset size so the per-category counts below have context.
print("Total rows in the dataset:", len(df))

for col in categorical_cols:
    sns.countplot(data=df, x=col)
    plt.title(f'{col} distribution')
    plt.show()
    levels = df[col]
    print(f"{col} unique values:", levels.unique())
    print(levels.value_counts(normalize=True) * 100)  # shares in percent
    print(levels.value_counts())                      # absolute counts

# Author's observations from the output above:
# 297 outliers for education
# 273 outliers for marriage
# Very few married people in the dataset -> only 53 out of 25247

# Recode the marriage value 0 as 3.
df['marriage'] = df['marriage'].replace({0: 3})

# Recode the education values 0, 5 and 6 as 4.
df['education'] = df['education'].replace([0, 5, 6], 4)


In [None]:
# Boxplot for age
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, y='age')
plt.title('Boxplot of Age')
plt.ylabel('Age')
plt.grid(axis='y')
plt.show()

# IQR fences for age: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers.
Q1, Q3 = df['age'].quantile(0.25), df['age'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

below_fence = df['age'] < lower_bound
above_fence = df['age'] > upper_bound
outliers = df[below_fence | above_fence]
print(f'Number of outliers in age: {len(outliers)}')

# Cap (rather than drop) the outliers at the fences; the shape printed before
# and after shows that no rows are removed.
print(df.shape)
df['age'] = df['age'].clip(lower=lower_bound, upper=upper_bound)
print(df.shape)



In [None]:

# KDE plot: age distribution for each default status.
# Both classes are drawn, each estimated from its own subset, so the figure
# shows the comparison its title announces (previously only the default==1
# curve was plotted).
plt.figure(figsize=(8, 4))
for status, label, color in [(0, 'No Default', '#1f77b4'), (1, 'Default', '#ff7f0e')]:
    sns.kdeplot(
        data=df[df['next_month_default'] == status],
        x='age',
        label=label,
        fill=True,
        color=color,
    )
plt.title("KDE Plot: Age Distribution by Default Status")
plt.xlabel("Age")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# EDA for LIMIT_BAL: a compact horizontal boxplot followed by a histogram.
limit_bal = df['LIMIT_BAL']

plt.figure(figsize=(6, 1.5))
sns.boxplot(x=limit_bal)
plt.title("Boxplot of Credit Limits")
plt.tight_layout()
plt.show()

# Histogram (50 bins) with a KDE overlay of the same column.
plt.figure(figsize=(8, 4))
sns.histplot(limit_bal, bins=50, kde=True)
plt.title("Histogram of Credit Limits")
plt.xlabel("Credit Limit")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()


In [None]:
# Count LIMIT_BAL outliers with the IQR rule, then cap them at the fences.
Q1, Q3 = (df['LIMIT_BAL'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# The lower fence is floored at 0; the upper fence is the usual Q3 + 1.5*IQR.
lower_bound = max(Q1 - 1.5 * IQR, 0)
upper_bound = Q3 + 1.5 * IQR

too_low = df['LIMIT_BAL'] < lower_bound
too_high = df['LIMIT_BAL'] > upper_bound
outliers = df[too_low | too_high]
print(f'Lower bound: {lower_bound}, Upper bound: {upper_bound}')
print(f'Number of outliers in LIMIT_BAL: {len(outliers)}')

# Capping keeps every row; the shapes before and after are printed as a check.
print(df.shape)
df['LIMIT_BAL'] = df['LIMIT_BAL'].clip(lower=lower_bound, upper=upper_bound)
print("Shape after capping LIMIT_BAL outliers:", df.shape)

In [None]:
# KDE plot: credit limit distribution for each default status.
# Both classes are drawn (each from its own subset) so the curves can be
# compared; previously only the default==1 subset was plotted.
plt.figure(figsize=(8, 4))
for status, label, color in [(0, 'No Default', '#1f77b4'), (1, 'Default', '#ff7f0e')]:
    sns.kdeplot(
        data=df[df['next_month_default'] == status],
        x='LIMIT_BAL',
        label=label,
        fill=True,
        color=color,
    )
plt.title("KDE Plot: Credit Limit Distribution by Default Status")
plt.xlabel("Limit Balance")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Repayment-status columns; the source data has no pay_1, the sequence jumps from pay_0 to pay_2.
pay_status_cols = ['pay_0'] + [f'pay_{k}' for k in range(2, 7)]

for col in pay_status_cols:
    status_values = df[col]
    sns.countplot(data=df, x=col)
    plt.title(f'{col} distribution')
    plt.show()
    print(f"{col} unique values:", status_values.unique())
    print(f"{col} value counts:\n", status_values.value_counts())
    

In [None]:
# 1) Reshape the pay-status columns to long form: one row per (customer, month).
pay_status_cols = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
df_long = df.melt(
    id_vars=['next_month_default'],
    value_vars=pay_status_cols,
    var_name='month',
    value_name='pay_status',
)

# The target must be integer so that its mean is the default rate.
df_long['next_month_default'] = df_long['next_month_default'].astype(int)

# 2) Default rate for every (pay_status, month) pair:
#    rows = pay status code, columns = month column name.
default_rate_by_cell = df_long.groupby(['month', 'pay_status'])['next_month_default'].mean()
pivot = default_rate_by_cell.unstack('month')

# 3) Heatmap of the default rates.
plt.figure(figsize=(8, 6))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    cmap="YlOrBr",
    cbar_kws={'label': 'Default Rate'},
)
plt.title("Default Rate by PAY Status and Month")
plt.xlabel("Month (0 = most recent)")
plt.ylabel("PAY Status Code")
plt.yticks(rotation=0)
plt.show()

In [None]:

# Bill amounts: floor negative values at 0, then inspect each month's distribution.
# Only the negative-value clipping is applied; no IQR/upper capping is done on
# these columns, and the titles and messages below say so.
bill_cols = [f'Bill_amt{i}' for i in range(1, 7)]

for col in bill_cols:
    # Negative bill amounts are replaced with 0.
    df[col] = df[col].clip(lower=0)
    # Clipping never drops rows; the shape is printed as a sanity check.
    print(f"Shape after clipping negative values in {col}:", df.shape)

    plt.figure(figsize=(8, 4))
    sns.histplot(df[col], binwidth=10000)
    plt.title(f'Distribution of {col} after clipping negative values')
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()

    # KDE of the bill amount for each default status (both classes drawn,
    # each estimated from its own subset).
    plt.figure(figsize=(8, 4))
    for status, label, color in [(0, 'No Default', '#1f77b4'), (1, 'Default', '#ff7f0e')]:
        sns.kdeplot(
            data=df[df['next_month_default'] == status],
            x=col,
            label=label,
            fill=True,
            color=color,
        )
    plt.title(f"KDE Plot: {col} Distribution by Default Status")
    plt.xlabel(col)
    plt.ylabel("Density")
    plt.legend()
    plt.tight_layout()
    plt.show()


# Payment amounts: same treatment (negatives floored at 0, no upper capping).
pay_cols = [f'pay_amt{i}' for i in range(1, 7)]
for col in pay_cols:
    df[col] = df[col].clip(lower=0)

    plt.figure(figsize=(8, 4))
    sns.histplot(df[col], bins=50, kde=True)
    plt.title(f"{col} after clipping negative values")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()


In [None]:
eng_cols = ['AVG_Bill_amt', 'PAY_TO_BILL_ratio']

# Floor negative values in the provided engineered columns at 0 and show the
# resulting distributions.
for col in eng_cols:
    df[col] = df[col].clip(lower=0)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col} after clipping negative values')
    plt.show()

# Percentile capping is currently disabled, so the data is NOT modified here.
# The 5th/95th percentiles are only reported as candidate bounds; the
# distribution is not re-plotted because it is unchanged from the plot above.
# TODO (original note: "see later"): decide whether to cap or log-transform.
for col in eng_cols:
    lower = df[col].quantile(0.05)
    upper = df[col].quantile(0.95)
    print(f"{col}: 5th percentile = {lower}, 95th percentile = {upper} "
          f"(capping not applied), shape: {df[col].shape}")

In [None]:
# Relation between the provided engineered columns and default status.
# One KDE figure per column, with both classes drawn (each estimated from its
# own subset). The columns are plotted on their original scale: no log
# transform is applied to them in this notebook.
for feature in ['AVG_Bill_amt', 'PAY_TO_BILL_ratio']:
    plt.figure(figsize=(8, 4))
    for status, label, color in [(0, 'No Default', '#1f77b4'), (1, 'Default', '#ff7f0e')]:
        sns.kdeplot(
            data=df[df['next_month_default'] == status],
            x=feature,
            label=label,
            fill=True,
            color=color,
        )
    plt.title(f"KDE Plot: {feature} Distribution by Default Status")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.tight_layout()
    plt.show()




In [None]:
# EDA for bill amounts: average monthly bill per default status.
# Columns are listed from Bill_amt6 to Bill_amt1 so the x-axis runs oldest -> newest.
bill_cols = ['Bill_amt6', 'Bill_amt5', 'Bill_amt4', 'Bill_amt3', 'Bill_amt2', 'Bill_amt1']

# Mean of every bill column per class, transposed so months are rows.
# Class keys are mapped to labels explicitly (0 -> Non-Defaulters,
# 1 -> Defaulters) rather than by column position, so a label can never end
# up attached to the wrong class.
bill_means = (
    df.groupby('next_month_default')[bill_cols]
    .mean()
    .T
    .rename(columns={0: 'Non-Defaulters', 1: 'Defaulters'})
    .rename_axis(columns=None)  # no legend title, as before
)

# Plot
bill_means.plot(figsize=(8, 4), marker='o')
plt.title("Average Monthly Bill Amounts\nDefaulters vs Non-Defaulters")
plt.xlabel("Month (6 = Oldest, 1 = Most Recent)")
plt.ylabel("Average Bill Amount")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Payment columns, listed from pay_amt6 to pay_amt1 so the x-axis runs oldest -> newest.
pay_cols_amt = ['pay_amt6', 'pay_amt5', 'pay_amt4', 'pay_amt3', 'pay_amt2', 'pay_amt1']

# Mean payment per class, transposed so months are rows.
# Class keys are mapped to labels explicitly (0 -> Non-Defaulters,
# 1 -> Defaulters) rather than by column position.
pay_means = (
    df.groupby('next_month_default')[pay_cols_amt]
    .mean()
    .T
    .rename(columns={0: 'Non-Defaulters', 1: 'Defaulters'})
    .rename_axis(columns=None)  # no legend title, as before
)

# Plot
pay_means.plot(figsize=(8, 4), marker='o')
plt.title("Average Monthly Payment Amounts\nDefaulters vs Non-Defaulters")
plt.xlabel("Month (6 = Oldest, 1 = Most Recent)")
plt.ylabel("Average Payment")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Define delay_bucket function
def delay_bucket(x):
    """Map a repayment-status code to one of three delay bucket labels.

    Returns 'On Time to 1 Month Delay' for x <= 1, the "2 to 4 months" label
    for 2 <= x <= 4, and '4 Months+ Delay' for every other value. Despite its
    wording, that last bucket therefore covers values above 4, values
    strictly between 1 and 2, and NaN.
    """
    if x <= 1:
        return 'On Time to 1 Month Delay'
    in_mid_range = (x >= 2) and (x <= 4)
    return '2  Months Delay to 4 Months Delay' if in_mid_range else '4 Months+ Delay'

# For every pay-status column: derive its delay bucket, compute the default
# rate per bucket and draw it as a bar chart with a fixed bucket order.
bucket_order = [
    'On Time to 1 Month Delay',
    '2  Months Delay to 4 Months Delay',
    '4 Months+ Delay',
]

for col in pay_status_cols[:]:
    bucket_col = f'{col}_bucket'
    df[bucket_col] = df[col].map(delay_bucket)

    # One row per bucket: mean of the target (default rate) and the row count.
    grouped = (
        df.groupby(bucket_col)['next_month_default']
        .agg(default_rate='mean', count='count')
        .reset_index()
    )

    plt.figure(figsize=(8, 4))
    sns.barplot(data=grouped, x=bucket_col, y='default_rate', order=bucket_order)
    plt.xticks(rotation=90)
    plt.title(f'Default Rate vs {col} Delay Bucket')
    plt.ylabel('Default Rate')
    plt.xlabel(f'{col} Bucket')
    plt.show()

In [None]:
# Create LIMIT_BAL buckets of width 50K up to 500K, plus an open-ended top bucket.
# The edge before np.inf is 500000 so that it matches the '450K-500K' label
# (it was previously 5000000, which made that bucket span 450K-5M and left
# '500K+' starting at 5M).
bins = [0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000, np.inf]
labels = ['0-50K', '50K-100K', '100K-150K', '150K-200K', '200K-250K', '250K-300K', '300K-350K', '350K-400K', '400K-450K', '450K-500K', '500K+']

df['LIMIT_BAL_bucket'] = pd.cut(df['LIMIT_BAL'], bins=bins, labels=labels)

# Default rate and row count for each LIMIT_BAL bucket.
# observed=False keeps empty buckets in the result (the behaviour this cell
# already relied on) and states that choice explicitly.
grouped_limit = (
    df.groupby('LIMIT_BAL_bucket', observed=False)['next_month_default']
    .agg(['mean', 'count'])
    .reset_index()
)
grouped_limit.columns = ['LIMIT_BAL_bucket', 'default_rate', 'count']

# Plot default rate for LIMIT_BAL buckets
plt.figure(figsize=(10, 5))
sns.barplot(x='LIMIT_BAL_bucket', y='default_rate', data=grouped_limit)
plt.xticks(rotation=90)
plt.title('Default Rate vs LIMIT_BAL Bucket')
plt.ylabel('Default Rate')
plt.xlabel('LIMIT_BAL Bucket')
plt.show()


# Continuous view of the same relationship: KDE of LIMIT_BAL per default status.
# Each class is drawn from its own subset and carries its own label (the
# default==1 subset was previously the only curve and was labelled 'No Default').
plt.figure(figsize=(8, 4))
for status, label, color in [(0, 'No Default', '#1f77b4'), (1, 'Default', '#ff7f0e')]:
    sns.kdeplot(
        data=df[df['next_month_default'] == status],
        x='LIMIT_BAL',
        label=label,
        fill=True,
        color=color,
    )

plt.title("KDE Plot: Credit Limit Distribution by Default Status")
plt.xlabel("Limit Balance")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()




In [None]:
# Default rate per level of each coded demographic variable.
# barplot shows the mean of the target for every level of the x variable.
categorical_vars = ['education', 'sex', 'marriage']

for var in categorical_vars:
    pretty_name = var.capitalize()
    plt.figure(figsize=(5, 3))
    sns.barplot(data=df, x=var, y='next_month_default')
    plt.title(f'Default Rate by {pretty_name}')
    plt.ylabel('Default Rate')
    plt.xlabel(pretty_name)
    plt.show()


In [None]:

# Remove the identifier column from the working frame.
# errors='ignore' makes the cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Customer_ID'], errors='ignore')


In [None]:
# Store the coded demographic fields and the target with pandas' category dtype.
for categorical_name in ('sex', 'education', 'marriage', 'next_month_default'):
    df[categorical_name] = df[categorical_name].astype('category')


In [None]:

# Drop the helper bucket columns that were only needed for the EDA plots.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
eda_bucket_cols = [
    'pay_0_bucket', 'pay_2_bucket', 'pay_3_bucket',
    'pay_4_bucket', 'pay_5_bucket', 'pay_6_bucket',
    'LIMIT_BAL_bucket',
]
df = df.drop(columns=eda_bucket_cols, errors='ignore')

# Dimensions of the dataset after the drop.
print("Final dataset shape:", df.shape)

# Remaining column names.
print("Columns in the final dataset:")
print(df.columns.tolist())


In [None]:

# Preview the first rows of the current working frame.
df.head()

In [None]:
import numpy as np

# Column groups and row totals shared by the features below.
bill_amt_frame = df[[f'Bill_amt{i}' for i in range(1, 7)]]
pay_amt_frame = df[[f'pay_amt{i}' for i in range(1, 7)]]
bill_total = bill_amt_frame.sum(axis=1)
pay_total = pay_amt_frame.sum(axis=1)

# 1. UTILIZATION FEATURES
# Billed amount relative to the credit limit, over all six months and for
# Bill_amt1 alone. The 1e-6 term guards against division by zero.
df['utilization'] = bill_total / (6 * df['LIMIT_BAL'] + 1e-6)
df['recent_utilization'] = df['Bill_amt1'] / (df['LIMIT_BAL'] + 1e-6)

# 2. PAYMENT TO BILL RATIOS
df['avg_pay_ratio'] = pay_total / (bill_total + 1e-6)
df['recent_payment_ratio'] = df['pay_amt1'] / (df['Bill_amt1'] + 1e-6)

# 3. BILL AND PAYMENT STATS (row-wise mean and standard deviation)
df['bill_mean'] = bill_amt_frame.mean(axis=1)
df['bill_std'] = bill_amt_frame.std(axis=1)
df['pay_mean'] = pay_amt_frame.mean(axis=1)
df['pay_std'] = pay_amt_frame.std(axis=1)

# 4. DELINQUENCY STREAKS & PATTERNS
# NOTE: `pay_cols` is rebound here to the pay *status* columns.
pay_cols = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
pay_status_frame = df[pay_cols]
df['overdue_count'] = pay_status_frame.ge(1).sum(axis=1)    # months with status >= 1
df['on_time_count'] = pay_status_frame.eq(-1).sum(axis=1)   # months with status == -1
# Statuses -2 and -1 are treated as 0 before averaging.
df['avg_delinquency'] = pay_status_frame.replace([-2, -1], 0).mean(axis=1)
# Most recent status with negative codes floored at 0.
df['recent_delinquency'] = df['pay_0'].clip(lower=0)

# Max consecutive months of delay
def max_consec(arr):
    """Return the length of the longest run of consecutive values >= 1 in `arr`.

    Returns 0 for an empty sequence or when no value is >= 1.
    """
    longest = 0
    current = 0
    for value in arr:
        # A value below 1 breaks the run; anything else extends it.
        current = current + 1 if value >= 1 else 0
        if current > longest:
            longest = current
    return longest
# Longest overdue streak per customer, computed row by row over the pay status columns.
df['max_overdue_streak'] = [max_consec(status_row) for status_row in df[pay_cols].to_numpy()]

# Months since last overdue
def last_overdue(row):
    """Return the position of the first value >= 1 in `row`, or len(row) if none.

    With the status columns passed most-recent-first (as the caller's column
    list suggests), position 0 means the most recent month was overdue.
    """
    return next(
        (position for position, status in enumerate(row) if status >= 1),
        len(row),
    )
# Position of the first overdue month per customer (len(pay_cols) when never overdue).
df['months_since_overdue'] = df[pay_cols].apply(lambda r: last_overdue(r.values), axis=1)

# 5. PAYMENT CONSISTENCY
# Share of the six months with a strictly positive payment (vectorised count
# instead of a Python-level row-wise apply).
df['repayment_consistency'] = (df[[f'pay_amt{i}' for i in range(1, 7)]] > 0).sum(axis=1) / 6

# 6. SHORTFALL & BEHAVIOR RATIOS
# Number of months in which the payment was smaller than the bill of the same index.
df['shortfall_count'] = sum(df[f'pay_amt{i}'] < df[f'Bill_amt{i}'] for i in range(1, 7))
# Count of status-0 months relative to status -1 months (1e-6 avoids division by zero).
df['rev_to_ontime'] = (df[pay_cols] == 0).sum(axis=1) / ((df[pay_cols] == -1).sum(axis=1) + 1e-6)
# Assigned back rather than fillna(inplace=True) on a column selection, which
# is deprecated and does not modify `df` under Copy-on-Write.
df['rev_to_ontime'] = df['rev_to_ontime'].fillna(0)



In [None]:
# Correlation heatmap over the numeric columns only; columns with category
# dtype are excluded by select_dtypes.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

heatmap_options = dict(annot=False, cmap='coolwarm', center=0, linewidths=0.5)
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix (Numeric Features)', fontsize=18)
plt.show()

In [None]:
# Model inputs, grouped by theme. The concatenation order defines the column
# order of X, so it must stay as written.
demographic_features = ['age', 'education', 'marriage', 'sex']
ratio_features = ['utilization', 'recent_utilization', 'avg_pay_ratio', 'recent_payment_ratio']
amount_stat_features = ['bill_mean', 'bill_std', 'pay_std', 'pay_mean']
delinquency_features = [
    'overdue_count', 'on_time_count', 'avg_delinquency', 'recent_delinquency',
    'max_overdue_streak', 'months_since_overdue',
]
behaviour_features = ['repayment_consistency', 'shortfall_count', 'rev_to_ontime']
provided_features = ['LIMIT_BAL', 'AVG_Bill_amt', 'PAY_TO_BILL_ratio']

selected_features = (
    demographic_features
    + ratio_features
    + amount_stat_features
    + delinquency_features
    + behaviour_features
    + provided_features
)

# Feature matrix and target vector.
X = df[selected_features]
y = df['next_month_default']


In [None]:
# # Step 2: Stratified Train-Test Split
from sklearn.model_selection import train_test_split
# 70/30 split, stratified on the target so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score, roc_auc_score
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd

# --- Resample with SMOTETomek ---
# Only the training split is resampled; the test split is left untouched.
smk = SMOTETomek(random_state=42)
X_train_res, y_train_res = smk.fit_resample(X_train, y_train)


def _as_float32_frame(data, column_source):
    """Return `data` as a float32 DataFrame, with non-numeric entries and NaNs set to 0.

    Column names are taken from `column_source` when it has a `columns` attribute.
    """
    columns = column_source.columns if hasattr(column_source, 'columns') else None
    frame = pd.DataFrame(data, columns=columns)
    return frame.apply(pd.to_numeric, errors='coerce').fillna(0).astype(np.float32)


def _as_1d_array(target):
    """Return `target` as a flat numpy array (first column if it is a DataFrame)."""
    if isinstance(target, pd.DataFrame):
        target = target.iloc[:, 0]
    return np.array(target).ravel()


# Features as float32 frames, targets as 1-D arrays.
X_train_res = _as_float32_frame(X_train_res, X_train)
X_test = _as_float32_frame(X_test, X_test)
y_train_res = _as_1d_array(y_train_res)
y_test = _as_1d_array(y_test)


# --- Class Imbalance Weight for XGBoost ---
# Negative-to-positive ratio of the ORIGINAL (pre-resampling) training target.
# NOTE(review): the models below are fitted on the resampled data, so this
# weight is applied on top of the resampling -- confirm that is intended.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])
pos_weight = n_negative / n_positive

# --- Model Definitions ---
logreg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
rf = RandomForestClassifier(class_weight='balanced', n_estimators=200, max_depth=10, random_state=42)
# NOTE(review): `use_label_encoder` may be deprecated or ignored in the installed xgboost -- verify.
xgb = XGBClassifier(scale_pos_weight=pos_weight, use_label_encoder=False, eval_metric='logloss', random_state=42)

# Display name -> estimator, in the order they are evaluated.
models = dict(zip(
    ['Logistic Regression', 'Random Forest', 'XGBoost'],
    [logreg, rf, xgb],
))

# --- Evaluation Function ---
def evaluate_model(name, model):
    """Fit `model` on the resampled training data and score it on the test split.

    The decision threshold is the one in [0.10, 0.90) (step 0.01) that
    maximises the F2-score on the test split. The metrics are printed and
    returned as a dict keyed 'Model', 'Accuracy', 'Precision', 'Recall',
    'F1-score', 'F2-score' and 'AUC-ROC'.

    Reads the notebook globals X_train_res, y_train_res, X_test and y_test.
    NOTE: the threshold is selected on the same test split the metrics are
    computed on, so the threshold-dependent scores are optimistic.
    """
    model.fit(X_train_res, y_train_res)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Sweep the candidate thresholds and keep the one with the highest F2.
    thresholds = np.arange(0.1, 0.9, 0.01)
    f2_scores = []
    for t in thresholds:
        f2_scores.append(fbeta_score(y_test, y_proba >= t, beta=2))
    best_thresh = thresholds[np.argmax(f2_scores)]

    y_pred = (y_proba >= best_thresh).astype(int)

    # Each metric is computed once and reused for printing and for the return value.
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-score': fbeta_score(y_test, y_pred, beta=1),
        'F2-score': fbeta_score(y_test, y_pred, beta=2),
        'AUC-ROC': roc_auc_score(y_test, y_proba),
    }

    print(f"\n{name} — Best Threshold for F2: {best_thresh:.2f}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}:", metric_value)

    return {'Model': name, **metrics}

# --- Train and Evaluate Models ---
# One metrics dict per model, in the order the models were defined.
results = [evaluate_model(model_name, estimator) for model_name, estimator in models.items()]

# --- Display Results ---
# One row per model, best F2-score first.
results_df = (
    pd.DataFrame(results)
    .set_index('Model')
    .sort_values('F2-score', ascending=False)
)
print("\nFinal Model Comparison:\n")
print(results_df)


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score

# Step 1: Retrain the random forest on the full resampled training data.
best_rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=200,
    max_depth=10,
    random_state=42
)
best_rf.fit(X_train_res, y_train_res)

# Step 2: Predicted probability of the positive class on X_test.
y_proba_test = best_rf.predict_proba(X_test)[:, 1]

# Step 3: For every candidate threshold record the F2-score and the number of
# predicted positives. zero_division=0 scores a threshold that predicts no
# positives as 0 instead of emitting a warning.
thresholds = np.arange(0.1, 0.9, 0.01)
records = []
for t in thresholds:
    preds = (y_proba_test >= t).astype(int)
    f2 = fbeta_score(y_test, preds, beta=2, zero_division=0)
    positives = preds.sum()
    records.append((t, f2, positives))

# Step 4: Build the threshold table.
# NOTE: this rebinds `results_df`, replacing the model comparison table.
results_df = pd.DataFrame(records, columns=['Threshold', 'F2-Score', 'Predicted_1s'])

print("\nTop 5 thresholds by F2 score:")
print(results_df.nlargest(5, 'F2-Score').to_string(index=False))

print("\nFull threshold table:")
print(results_df.to_string(index=False))

# Step 5: Display best threshold summary
best_row = results_df.loc[results_df['F2-Score'].idxmax()]
print("\n🏆 Best threshold for F2:")
print(best_row.to_frame().T.to_string(index=False))



In [None]:
# --- Step 1: Load validation dataset ---
validate_df = pd.read_csv("validate_dataset_final.csv")

# Remember the customer identifiers now: the frame is later reduced to the
# model's feature columns, after which only a positional index would be left.
if 'Customer_ID' in validate_df.columns:
    validate_ids = validate_df['Customer_ID'].to_numpy()
else:
    validate_ids = validate_df.index.to_numpy()

# --- Step 2: Preprocess validation dataset ---
# 2a. Apply the same cleaning the training frame `df` received, so the
#     features below are computed on comparably prepared inputs.

# Same recoding of rare category codes as in training.
validate_df['marriage'] = validate_df['marriage'].replace(0, 3)
validate_df['education'] = validate_df['education'].replace([0, 5, 6], 4)

# Missing ages get the mean age of the cleaned training frame (instead of
# ending up as 0 in the final fillna); age and LIMIT_BAL are then limited to
# the value range present in the cleaned (capped) training frame.
validate_df['age'] = (
    validate_df['age']
    .fillna(df['age'].mean())
    .clip(lower=df['age'].min(), upper=df['age'].max())
)
validate_df['LIMIT_BAL'] = validate_df['LIMIT_BAL'].clip(
    lower=df['LIMIT_BAL'].min(), upper=df['LIMIT_BAL'].max()
)

# Negative amounts are floored at 0, as in training.
bill_amt_cols = [f'Bill_amt{i}' for i in range(1, 7)]
pay_amt_cols = [f'pay_amt{i}' for i in range(1, 7)]
non_negative_cols = bill_amt_cols + pay_amt_cols + ['AVG_Bill_amt', 'PAY_TO_BILL_ratio']
validate_df[non_negative_cols] = validate_df[non_negative_cols].clip(lower=0)

# 2b. Feature engineering to match the training data. `max_consec` and
#     `last_overdue` are the helpers defined in the training feature cell.
bill_total = validate_df[bill_amt_cols].sum(axis=1)
pay_total = validate_df[pay_amt_cols].sum(axis=1)

# 1. UTILIZATION FEATURES
validate_df['utilization'] = bill_total / (6 * validate_df['LIMIT_BAL'] + 1e-6)
validate_df['recent_utilization'] = validate_df['Bill_amt1'] / (validate_df['LIMIT_BAL'] + 1e-6)

# 2. PAYMENT TO BILL RATIOS
validate_df['avg_pay_ratio'] = pay_total / (bill_total + 1e-6)
validate_df['recent_payment_ratio'] = validate_df['pay_amt1'] / (validate_df['Bill_amt1'] + 1e-6)

# 3. BILL AND PAYMENT STATS
validate_df['bill_mean'] = validate_df[bill_amt_cols].mean(axis=1)
validate_df['bill_std'] = validate_df[bill_amt_cols].std(axis=1)
validate_df['pay_mean'] = validate_df[pay_amt_cols].mean(axis=1)
validate_df['pay_std'] = validate_df[pay_amt_cols].std(axis=1)

# 4. DELINQUENCY STREAKS & PATTERNS
pay_cols = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
validate_df['overdue_count'] = (validate_df[pay_cols] >= 1).sum(axis=1)
validate_df['avg_delinquency'] = validate_df[pay_cols].replace([-2, -1], 0).mean(axis=1)
validate_df['on_time_count'] = (validate_df[pay_cols] == -1).sum(axis=1)
validate_df['recent_delinquency'] = validate_df['pay_0'].clip(lower=0)
validate_df['max_overdue_streak'] = validate_df[pay_cols].apply(lambda row: max_consec(row.values), axis=1)
validate_df['months_since_overdue'] = validate_df[pay_cols].apply(lambda r: last_overdue(r.values), axis=1)

# 5. PAYMENT CONSISTENCY
validate_df['repayment_consistency'] = (validate_df[pay_amt_cols] > 0).sum(axis=1) / 6

# 6. SHORTFALL & BEHAVIOR RATIOS
validate_df['shortfall_count'] = sum(validate_df[f'pay_amt{i}'] < validate_df[f'Bill_amt{i}'] for i in range(1, 7))
validate_df['rev_to_ontime'] = (validate_df[pay_cols] == 0).sum(axis=1) / ((validate_df[pay_cols] == -1).sum(axis=1) + 1e-6)
validate_df['rev_to_ontime'] = validate_df['rev_to_ontime'].fillna(0)

# Select the columns in the same order as X_train_res.
validate_df = validate_df[X_train_res.columns]

# Convert to float32 and handle any non-numeric entries.
validate_df = validate_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(np.float32)

# --- Step 3: Make predictions using the trained Random Forest model ---
# Use the F2-optimal threshold found by the threshold sweep (`best_row`)
# instead of a number copied by hand from an earlier run's output.
best_threshold = float(best_row['Threshold'])

# Predict probabilities. `rf` was fitted on the resampled training data with
# the same configuration and seed as the model used for the threshold sweep.
validate_probs = rf.predict_proba(validate_df)[:, 1]

# Apply threshold
validate_preds = (validate_probs >= best_threshold).astype(int)

# --- Step 4: Save predictions ---
output_df = pd.DataFrame({
    'ID': validate_ids,  # Customer_ID when the CSV provides it, else the row index
    'Default_Prediction': validate_preds
})

output_df.to_csv("validation_predictions.csv", index=False)

print("✅ Predictions saved to validation_predictions.csv")
