In [17]:
# baseline_submission.py
# Requirements:
# pip install pandas numpy scikit-learn lightgbm category_encoders

import os
import gc
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# -------------------------
# Config
# -------------------------
# Input CSVs; both are read with pd.read_csv relative to the working directory.
TRAIN_PATH = "Train_set.csv"
TEST_PATH = "Test_set.csv"
# Label column: dropped from the feature matrix and used as y.
TARGET = "default"
ID_COL = "ID"            # adjust if your id column name differs
# Number of StratifiedKFold splits; also the divisor when averaging test predictions.
N_FOLDS = 5
# Seed shared by the fold splitter and the LightGBM "seed" parameter.
RANDOM_STATE = 42
# Destination path for the submission CSV.
SUBMISSION_FILE = "submission.csv"

# -------------------------
# Load data
# -------------------------
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Fail fast with a readable message: every later step indexes these columns,
# and a typo in the config would otherwise surface as a bare KeyError far
# from its cause.
if TARGET not in train.columns:
    raise KeyError(f"Target column {TARGET!r} not found in {TRAIN_PATH}")
for _label, _frame in (("train", train), ("test", test)):
    if ID_COL not in _frame.columns:
        raise KeyError(f"ID column {ID_COL!r} not found in the {_label} set")

print("Train rows:", len(train))
print("Test rows:", len(test))

# Quick check: user expects 39933 entries in submission
# If test has different shape you may need to confirm with organisers.
EXPECTED_TEST_ROWS = 39933
print(f"Test rows (should be {EXPECTED_TEST_ROWS}):", len(test))
if len(test) != EXPECTED_TEST_ROWS:
    # Actually compare instead of only printing the expectation, so a wrong
    # test file is noticed before training rather than at submission time.
    print(
        f"WARNING: expected {EXPECTED_TEST_ROWS} test rows but found {len(test)}; "
        "the submission will not have the expected length."
    )

# -------------------------
# Basic EDA (quick)
# -------------------------
print("Train columns:", train.columns.tolist())
# dropna=False so that unlabeled training rows show up as their own bucket.
print(train[TARGET].value_counts(dropna=False))

# -------------------------
# Preprocessing helpers
# -------------------------
def reduce_mem(df):
    """Shrink 64-bit numeric columns of ``df`` to smaller dtypes.

    Columns selected as ``int64`` are passed through ``pd.to_numeric`` with
    ``downcast="integer"``; columns selected as ``float64`` are passed through
    it with ``downcast="float"``. All other columns are left alone.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or as an expression.
    """
    # (dtype to select, downcast mode handed to pd.to_numeric), integers first.
    downcast_plan = (("int64", "integer"), ("float64", "float"))
    for source_dtype, downcast_mode in downcast_plan:
        for name in df.select_dtypes(include=[source_dtype]).columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

train = reduce_mem(train)
test = reduce_mem(test)

# -------------------------
# Feature selection / engineering
# -------------------------
# Heuristic: treat object columns as categorical. Numeric columns keep as-is.
# Drop any columns you know are leakage or not available at inference.

# Stack train (without the target) on top of test so categorical encodings are
# shared. ignore_index gives the stacked frame a unique 0..n-1 index; without
# it the train and test indices collide and any later index-aligned operation
# becomes ambiguous.
n_train = len(train)
all_data = pd.concat(
    [train.drop(columns=[TARGET]), test], axis=0, sort=False, ignore_index=True
)
# The ID column is never a feature, so keep it out of both lists even when it
# is stored as text (otherwise it would be label-encoded only to be discarded).
cat_cols = [
    c for c in all_data.select_dtypes(include=["object"]).columns if c != ID_COL
]
num_cols = [c for c in all_data.columns if c not in cat_cols and c != ID_COL]

print("Categorical cols:", cat_cols)
print("Numeric cols:", num_cols)

# Basic imputation:
# - Numeric -> median
# - Categorical -> fill "__MISSING__" then label-encode
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="__MISSING__")

all_num = all_data[num_cols]
# Keep real NaN in the categorical frame here. Casting to str *before*
# imputing turns NaN into the literal string "nan", after which the imputer
# finds nothing to fill and "__MISSING__" is never used.
all_cat = all_data[cat_cols].astype(object)

# Medians are learned from the training rows only, then applied to every row,
# so test-set statistics do not leak into the training features.
if num_cols:
    num_imputer.fit(all_num.iloc[:n_train])
    all_num_imputed = pd.DataFrame(
        num_imputer.transform(all_num), columns=num_cols, index=all_num.index
    )
else:
    all_num_imputed = pd.DataFrame(index=all_data.index)

# The constant fill learns nothing from the data, so fitting on the stacked
# frame is harmless. The guard matters: the imputer rejects a frame with zero
# columns.
if cat_cols:
    all_cat_imputed = pd.DataFrame(
        cat_imputer.fit_transform(all_cat), columns=cat_cols, index=all_cat.index
    )
else:
    all_cat_imputed = pd.DataFrame(index=all_data.index)

# Label encode categoricals. Encoders are fitted on train+test together so a
# category that appears only in the test set still receives a code.
label_encoders = {}
for c in cat_cols:
    le = LabelEncoder()
    all_cat_imputed[c] = le.fit_transform(all_cat_imputed[c].astype(str))
    label_encoders[c] = le

# Reassemble processed data
processed = pd.concat([all_num_imputed, all_cat_imputed], axis=1)
processed[ID_COL] = all_data[ID_COL].values
# Split back to train/test (train rows were stacked first)
proc_train = processed.iloc[:n_train].reset_index(drop=True)
proc_test = processed.iloc[n_train:].reset_index(drop=True)

X = proc_train.drop(columns=[ID_COL])
y = train[TARGET].values
X_test = proc_test.drop(columns=[ID_COL])

# Cheap invariants: one label per training row, identical feature layout.
assert len(X) == len(y), "feature/label row count mismatch"
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

print("X shape:", X.shape, "X_test shape:", X_test.shape)
# -------------------------
# LightGBM training with StratifiedKFold
# -------------------------
# LightGBM settings for a binary classifier. The reported metric is the 0/1
# error rate; accuracy is computed separately with scikit-learn.
params = dict(
    objective="binary",
    metric="binary_error",
    boosting="gbdt",
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=RANDOM_STATE,
    verbosity=-1,
)

# Accumulators filled in by the cross-validation loop: one slot per training
# row, one slot per test row, and a frame collecting per-fold importances.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
feature_importance_df = pd.DataFrame()

# Shuffled, seeded stratified splitter so folds are reproducible.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)


Train rows: 93174
Test rows: 39933
Test rows (should be 39933): 39933
Train columns: ['ID', 'loan_amnt', 'loan_term', 'interest_rate', 'loan_grade', 'loan_subgrade', 'job_experience', 'home_ownership', 'annual_income', 'income_verification_status', 'loan_purpose', 'state_code', 'debt_to_income', 'delinq_2yrs', 'public_records', 'revolving_balance', 'total_acc', 'interest_receive', 'application_type', 'last_week_pay', 'total_current_balance', 'total_revolving_limit', 'default']
default
0    71045
1    22129
Name: count, dtype: int64
Categorical cols: ['loan_term', 'loan_grade', 'loan_subgrade', 'job_experience', 'home_ownership', 'income_verification_status', 'loan_purpose', 'state_code', 'application_type']
Numeric cols: ['loan_amnt', 'interest_rate', 'annual_income', 'debt_to_income', 'delinq_2yrs', 'public_records', 'revolving_balance', 'total_acc', 'interest_receive', 'last_week_pay', 'total_current_balance', 'total_revolving_limit']
X shape: (93174, 21) X_test shape: (39933, 21)


In [18]:
# Re-create the accumulators in this cell so it is idempotent. test_preds is
# built up with `+=`, so re-running (or resuming after an interrupt) against
# arrays left over from a previous run would double-count the test
# predictions and duplicate the importance rows.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
fold_importances = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    # Optionally set scale_pos_weight for imbalanced classes:
    # pos = sum(y_train==1); neg = sum(y_train==0)
    # params['scale_pos_weight'] = neg / (pos+1e-9)

    # Train with a generous round budget; early stopping halts once the
    # validation metric has not improved for 100 rounds, and the metrics are
    # logged every 100 rounds.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
    )

    # Predict on the held-out fold. Keep the raw probabilities in oof_preds;
    # they are thresholded once, after the loop. Storing hard labels here
    # would throw away the scores and make that final threshold redundant.
    val_pred_prob = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds[val_idx] = val_pred_prob
    val_pred = (val_pred_prob >= 0.5).astype(int)

    # Each fold contributes an equal share to the averaged test probability.
    test_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds += test_pred_prob / N_FOLDS

    # feature importance (gain) for this fold; concatenated once after the
    # loop instead of growing a DataFrame on every iteration.
    fold_importances.append(pd.DataFrame({
        "feature": X.columns,
        "importance": model.feature_importance(importance_type="gain"),
        "fold": fold
    }))

    acc = accuracy_score(y_val, val_pred)
    print(f"Fold {fold} accuracy: {acc:.5f}")
    gc.collect()

feature_importance_df = pd.concat(fold_importances, axis=0, ignore_index=True)

# OOF accuracy
oof_pred_labels = (oof_preds >= 0.5).astype(int)
oof_acc = accuracy_score(y, oof_pred_labels)
print("OOF accuracy:", oof_acc)


Fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's binary_error: 0.131837	valid_1's binary_error: 0.137913
[200]	training's binary_error: 0.114061	valid_1's binary_error: 0.125731
[300]	training's binary_error: 0.102725	valid_1's binary_error: 0.121277
[400]	training's binary_error: 0.0934947	valid_1's binary_error: 0.118809
[500]	training's binary_error: 0.0849757	valid_1's binary_error: 0.118218
[600]	training's binary_error: 0.076631	valid_1's binary_error: 0.11795
Early stopping, best iteration is:
[513]	training's binary_error: 0.0834194	valid_1's binary_error: 0.117789
Fold 1 accuracy: 0.88221
Fold 2
Training until validation scores don't improve for 100 rounds
[100]	training's binary_error: 0.130361	valid_1's binary_error: 0.13861
[200]	training's binary_error: 0.111901	valid_1's binary_error: 0.127663
[300]	training's binary_error: 0.100994	valid_1's binary_error: 0.123477
[400]	training's binary_error: 0.0924617	valid_1's binary_error: 0.1234

In [19]:
# Turn the fold-averaged test probabilities into hard 0/1 labels (cut-off 0.5).
final_test_labels = (test_preds >= 0.5).astype(int)

# -------------------------
# Submission
# -------------------------
# Start from the test IDs, then attach the predicted label as the second
# column, giving a two-column frame in (ID, target) order.
submission = pd.DataFrame({ID_COL: proc_test[ID_COL].values})
submission[TARGET] = final_test_labels



In [20]:
# OPTIONAL: report the row count before writing (problem expects 39933)
print("Submission rows:", submission.shape[0])
submission.to_csv(SUBMISSION_FILE, index=False)
print(f"Wrote {SUBMISSION_FILE}")

# -------------------------
# Helpful extras (feature importance)
# -------------------------
# Average each feature's importance over the folds, then rank high to low.
mean_importance = feature_importance_df.groupby("feature")["importance"].mean()
imp_mean = mean_importance.sort_values(ascending=False)
print("Top features:\n", imp_mean.head(20))

Submission rows: 39933
Wrote submission.csv
Top features:
 feature
last_week_pay                 113263.194933
interest_rate                  86248.466365
loan_subgrade                  42218.741955
total_current_balance          37168.161636
interest_receive               22001.352601
total_revolving_limit          19074.209397
debt_to_income                 16939.106927
loan_term                      14793.241593
loan_amnt                      11918.528693
revolving_balance              11692.348105
annual_income                  10612.132073
loan_grade                     10056.002086
total_acc                       9783.303072
state_code                      6806.067883
income_verification_status      4568.518214
job_experience                  2375.447043
loan_purpose                    2111.137820
delinq_2yrs                     1598.723091
public_records                  1238.601614
home_ownership                  1219.048389
Name: importance, dtype: float64
