# Tabular Classification Template – EDA → Imbalance → Modeling

This notebook is a reusable template for **tabular classification problems** (binary or multiclass).  
It mirrors the structure of the regression workflow, but with **classification-specific twists**:

- Class balance & target leakage checks
- Metrics and evaluation suited to classification (accuracy, ROC-AUC, F1, etc.)
- Handling label imbalance (class weights, resampling)
- Same modular model factory (trees, linear, neural nets)

---

## 🔁 High-Level Workflow (Classification)

1. **Imports & config**
2. **Load data**
3. **Structure & column typing**
4. **EDA**
   - target distribution (class balance)
   - numeric & categorical feature analysis
5. **Skewness & outliers (on numeric predictors)**
6. **Feature engineering** (date parts, ratios, bins, etc.)
7. **Missingness & imputation**
8. **Class imbalance strategy**
9. **Model family selection & baseline training**
10. **Evaluation & error analysis (confusion matrix, ROC, PR curves)**

You can duplicate this notebook for any new **classification** competition and adjust only the config.


In [None]:
# ========== 1. Imports & Config (Classification) ==========

import os
from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Optional: gradient boosting libs
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None

try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBMClassifier = None

# Optional: Keras/TensorFlow
try:
    import tensorflow as tf
    from tensorflow import keras
except ImportError:
    tf = None
    keras = None

# Notebook-wide display defaults: show up to 100 rows/columns of a DataFrame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Plot styling; the figure size/dpi are applied after the seaborn theme.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 100})

# ---- Config (edit per dataset) ----
# Location and names of the input files.
DATA_DIR = Path("../input")
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"      # set to None if no test

# Column roles.
TARGET_COL = "target"       # classification label
ID_COL = "id"               # optional ID

# Seed shared by the split and the seeded models.
RANDOM_STATE = 42


In [None]:
# ========== 2. Load Data & Typing Helpers ==========

def load_data(
    data_dir: Path = DATA_DIR,
    train_file: str = TRAIN_FILE,
    test_file: Optional[str] = TEST_FILE,
):
    """Read the training CSV and, if present, the test CSV.

    Parameters
    ----------
    data_dir : Path
        Directory that holds the CSV files.
    train_file : str
        File name of the training set inside ``data_dir``.
    test_file : str or None
        File name of the test set inside ``data_dir``; ``None`` skips it.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` where ``test_df`` is ``None`` when no test
        file was requested or the file does not exist.

    Raises
    ------
    FileNotFoundError
        If the training file does not exist.
    """
    train_csv = data_dir / train_file
    if not train_csv.exists():
        raise FileNotFoundError(f"Train file not found: {train_csv}")
    train_frame = pd.read_csv(train_csv)

    # A missing test file is reported but is not an error.
    test_frame = None
    if test_file is not None:
        test_csv = data_dir / test_file
        if not test_csv.exists():
            print(f"Test file not found: {test_csv} (continuing without test_df)")
        else:
            test_frame = pd.read_csv(test_csv)

    print("Train shape:", train_frame.shape)
    if test_frame is not None:
        print("Test shape :", test_frame.shape)
    return train_frame, test_frame


def get_numeric_features(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> List[str]:
    """List the numeric columns of ``df``, minus any names in ``exclude``.

    Column order follows the DataFrame. ``exclude`` may be ``None`` or empty,
    in which case every numeric column is returned.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    if not exclude:
        return numeric_names
    return [name for name in numeric_names if name not in exclude]


def get_categorical_features(df: pd.DataFrame) -> List[str]:
    """List the columns of ``df`` whose dtype is ``object`` or ``category``."""
    categorical_like = df.select_dtypes(include=["object", "category"])
    return categorical_like.columns.tolist()


def get_boolean_features(df: pd.DataFrame) -> List[str]:
    """Return the columns of ``df`` that act as boolean flags.

    A column qualifies when its dtype is ``bool``, or when it has any integer
    dtype (signed, unsigned, any width, including pandas nullable integers)
    and its non-missing values are all 0 or 1. An integer column with no
    non-missing values also qualifies, because its value set is empty.

    Returns
    -------
    list of str
        ``bool``-dtype columns first, then qualifying integer columns in
        DataFrame order, without duplicates.
    """
    flag_cols = df.select_dtypes(include=["bool"]).columns.tolist()
    for col, dtype in df.dtypes.items():
        # Check every integer width: 0/1 flags are often downcast to int8/uint8.
        if not pd.api.types.is_integer_dtype(dtype):
            continue
        observed = set(df[col].dropna().unique())
        if observed <= {0, 1}:
            flag_cols.append(col)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(flag_cols))


def summarize_dataframe(df: pd.DataFrame, name: str = "df"):
    """Print a quick overview of ``df``: shape, head, dtypes and missing-value share.

    ``name`` is only used in the banner line. Relies on the notebook's
    ``display`` function for rich rendering.
    """
    print(f"===== {name} summary =====")
    print("Shape:", df.shape)
    display(df.head())

    print("\nDtypes:")
    display(df.dtypes)

    # Share of missing values per column, in percent, worst columns first.
    missing_pct = df.isna().mean().mul(100).sort_values(ascending=False)
    print("\nMissing (%):")
    display(missing_pct)


train_df, test_df = load_data()
summarize_dataframe(train_df, "train_df")

# The label and the row identifier are not features: keep them out of all three
# column lists (a string label would otherwise show up as categorical, and a
# 0/1 label as boolean).
non_feature_cols = [c for c in (TARGET_COL, ID_COL) if c in train_df.columns]

num_cols = get_numeric_features(train_df, exclude=non_feature_cols)
cat_cols = [c for c in get_categorical_features(train_df) if c not in non_feature_cols]
bool_cols = [c for c in get_boolean_features(train_df) if c not in non_feature_cols]

# Only the first ten numeric names are printed to keep the output short.
print("Numeric cols:", num_cols[:10], "..." if len(num_cols) > 10 else "")
print("Categorical cols:", cat_cols)
print("Boolean cols:", bool_cols)


### 3️⃣ Target / Class Distribution

Key classification-specific checks:

- Is the problem **binary** or **multiclass**?
- Are the classes **balanced** or highly skewed?
- Is there any suspicious pattern suggesting **label leakage** (e.g. IDs perfectly predicting target)?


In [None]:
# Target distribution
# Missing labels are kept in both tables so that counts and proportions
# describe the same set of rows.
print("Target value counts:")
display(train_df[TARGET_COL].value_counts(dropna=False))
print("\nTarget value proportions:")
display(train_df[TARGET_COL].value_counts(normalize=True, dropna=False))

# Bar chart of rows per class.
sns.countplot(x=TARGET_COL, data=train_df)
plt.title("Target class distribution")
plt.show()


### 4️⃣ Simple Preprocessing & Baseline Classification Models

For classification, we can start with:

- Simple imputation (median for numerics, most frequent for categoricals)
- One-hot encoding for categoricals
- No scaling required for tree models; scaling useful for logistic regression / neural nets.

We'll define a small **model factory** and a single function to train/evaluate a few baselines.


In [None]:
# Shared imputers used by the preprocessing pipelines:
# median for numeric columns, most frequent value for categorical columns.
numeric_imputer, categorical_imputer = (
    SimpleImputer(strategy=strategy) for strategy in ("median", "most_frequent")
)

def build_preprocessor(df: pd.DataFrame):
    """Build the ColumnTransformer that imputes, scales and one-hot encodes ``df``.

    Numeric columns go through ``numeric_imputer`` and a ``StandardScaler``;
    ``object``/``category`` columns go through ``categorical_imputer`` and a
    ``OneHotEncoder``. ``TARGET_COL`` and ``ID_COL`` are never treated as
    features, whatever their dtype. Columns that are neither numeric nor
    object/category (for example datetimes) are dropped by ``remainder="drop"``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose dtypes decide the column lists. It may or may not still
        contain the target and ID columns.

    Returns
    -------
    ColumnTransformer
        Unfitted preprocessor.
    """
    # Exclude the label/ID from BOTH lists. If a string label or ID stayed in
    # the categorical list, the transformer would reference a column that is
    # no longer present once the features are separated from the target.
    non_features = [TARGET_COL, ID_COL]
    num_cols = get_numeric_features(df, exclude=non_features)
    cat_cols = [c for c in get_categorical_features(df) if c not in non_features]

    numeric_pipeline = Pipeline(steps=[
        ("imputer", numeric_imputer),
        ("scaler", StandardScaler(with_mean=False)),  # with_mean=False for sparse safety
    ])

    categorical_pipeline = Pipeline(steps=[
        ("imputer", categorical_imputer),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Only register a branch when it has at least one column to work on.
    transformers = []
    if num_cols:
        transformers.append(("num", numeric_pipeline, num_cols))
    if cat_cols:
        transformers.append(("cat", categorical_pipeline, cat_cols))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder="drop",
    )
    return preprocessor


def build_classifier(model_type: str):
    """Return an unfitted baseline classifier for ``model_type``.

    Parameters
    ----------
    model_type : str
        One of ``"logreg"``, ``"rf"``, ``"xgb"`` or ``"lgbm"``.

    Returns
    -------
    estimator
        A scikit-learn compatible classifier with baseline hyper-parameters.

    Raises
    ------
    ImportError
        If ``"xgb"`` or ``"lgbm"`` is requested but the library is not installed.
    ValueError
        If ``model_type`` is not one of the supported keys.
    """
    if model_type == "logreg":
        return LogisticRegression(
            max_iter=1000,
            n_jobs=-1,
            class_weight="balanced",  # helps with imbalance
        )
    if model_type == "rf":
        return RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            random_state=RANDOM_STATE,
            class_weight="balanced_subsample",
        )
    if model_type == "xgb":
        if XGBClassifier is None:
            raise ImportError("xgboost not installed")
        return XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method="hist",
            random_state=RANDOM_STATE,
        )
    if model_type == "lgbm":
        if LGBMClassifier is None:
            raise ImportError("lightgbm not installed")
        return LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            # LightGBM only applies row subsampling when the bagging frequency
            # is non-zero; with the default of 0, subsample=0.8 is ignored.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
        )
    # Keras MLP option could be added later similarly
    raise ValueError(f"Unknown model_type: {model_type}")


In [None]:
def evaluate_classifiers(
    df: pd.DataFrame,
    target_col: str = TARGET_COL,
    id_col: Optional[str] = ID_COL,
    model_types: Optional[List[str]] = None,
    test_size: float = 0.2,
    random_state: int = RANDOM_STATE,
):
    """Train baseline classifiers on a stratified hold-out split and compare them.

    For each model the function prints accuracy, weighted F1 and (binary
    targets only) ROC-AUC, and plots a confusion matrix labelled with the
    original class values. A model that raises during construction, fitting
    or scoring is reported and skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Training data including ``target_col``.
    target_col : str
        Name of the label column.
    id_col : str or None
        Identifier column to drop from the features when present.
    model_types : list of str or None
        Keys understood by ``build_classifier``; defaults to
        ``["logreg", "rf", "xgb", "lgbm"]``.
    test_size : float
        Fraction of rows held out for validation.
    random_state : int
        Seed for the train/validation split.

    Returns
    -------
    pd.DataFrame
        One row per successfully evaluated model with columns ``model_type``,
        ``accuracy``, ``f1_weighted`` and ``roc_auc``, sorted by
        ``f1_weighted`` descending. Empty (same columns) if every model failed.
    """
    if model_types is None:
        model_types = ["logreg", "rf", "xgb", "lgbm"]

    drop_cols = [target_col]
    if id_col is not None and id_col in df.columns:
        drop_cols.append(id_col)

    # Rows without a label cannot be used for supervised training.
    labelled = df[df[target_col].notna()]
    n_unlabelled = len(df) - len(labelled)
    if n_unlabelled:
        print(f"Dropping {n_unlabelled} rows with missing '{target_col}'")

    X = labelled.drop(columns=drop_cols)

    # Encode labels as 0..K-1 in sorted class order so every model family
    # accepts them (XGBoost rejects string or non-contiguous labels).
    # `classes` maps each code back to the original label for display.
    codes, classes = pd.factorize(labelled[target_col], sort=True)
    y = pd.Series(codes, index=labelled.index, name=target_col)
    class_ids = np.arange(len(classes))

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # ROC-AUC needs a binary problem with both classes present in validation.
    is_binary = len(classes) == 2 and y_valid.nunique() == 2

    result_columns = ["model_type", "accuracy", "f1_weighted", "roc_auc"]
    results = []

    for mt in model_types:
        print(f"\n=== Model: {mt} ===")
        try:
            pipe = Pipeline(
                steps=[
                    # Built from X (target/ID already removed) so the column
                    # lists match the data passed to fit; a fresh instance
                    # per model avoids sharing fitted state between pipelines.
                    ("preprocessor", build_preprocessor(X)),
                    ("model", build_classifier(mt)),
                ]
            )
            pipe.fit(X_train, y_train)

            y_pred = pipe.predict(X_valid)
            acc = accuracy_score(y_valid, y_pred)
            f1 = f1_score(y_valid, y_pred, average="weighted")

            # ROC-AUC for binary only
            auc = np.nan
            if is_binary and hasattr(pipe, "predict_proba"):
                y_proba = pipe.predict_proba(X_valid)[:, 1]
                auc = roc_auc_score(y_valid, y_proba)

            auc_text = "N/A" if np.isnan(auc) else f"{auc:.4f}"
            print(f"Accuracy: {acc:.4f} | F1 (weighted): {f1:.4f} | ROC-AUC: {auc_text}")

            # Fix the label order so the matrix lines up with the class names.
            cm = confusion_matrix(y_valid, y_pred, labels=class_ids)
            ConfusionMatrixDisplay(cm, display_labels=np.asarray(classes)).plot()
            plt.title(f"Confusion Matrix – {mt}")
            plt.show()

            results.append({"model_type": mt, "accuracy": acc, "f1_weighted": f1, "roc_auc": auc})
        except Exception as e:
            # Best effort: a missing optional library or a failing model must
            # not stop the remaining baselines from being evaluated.
            print("Error:", e)

    # Explicit columns keep the sort valid even when no model succeeded.
    results_df = pd.DataFrame(results, columns=result_columns).sort_values(
        "f1_weighted", ascending=False
    )
    display(results_df)
    return results_df


# Run the baseline comparison on the training data. To restrict the run to the
# libraries you have installed, pass an explicit list, e.g.
# model_types=["logreg", "rf"].
classification_results = evaluate_classifiers(df=train_df)
