# Ordinal Regression Template – Ordered Categories

This template is for **ordinal regression** problems, where labels are discrete but **ordered**:

- Star ratings (1–5)  
- Draft grades (mapped A,B,C,D → 3,2,1,0)  
- Severity levels (none, mild, moderate, severe)  

We cover:

1. Treating ordinal as numeric (regression)  
2. Treating ordinal as multiclass classification  
3. Simple cumulative ordinal model using multiple logistic regressions  


In [None]:
# ========== 1. Imports & Config (Ordinal Regression) ==========

from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# ---- Display defaults ----
# Let pandas render up to 100 rows / columns before truncating output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Apply the seaborn theme first and the figure overrides afterwards, keeping
# the original ordering in case the theme touches the same rcParams.
sns.set(style="whitegrid")
plt.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 100,
})

# ---- Config ----
# Location of the training CSV: DATA_DIR / TRAIN_FILE.
DATA_DIR = Path("../input")
TRAIN_FILE = "ordinal_data.csv"

# Column holding the ordered label (e.g. 1..5) and an optional row identifier.
TARGET_COL = "rating"
ID_COL = "id"

# Seed shared by the train/validation split and the random forest.
RANDOM_STATE = 42


In [None]:
# ========== 2. Load Data & Feature Types ==========

def load_data(data_dir: Path = DATA_DIR, train_file: str = TRAIN_FILE) -> pd.DataFrame:
    """Read the training CSV and show a short preview of it.

    Args:
        data_dir: Directory containing the input file.
        train_file: Name of the CSV file inside ``data_dir``.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        FileNotFoundError: If ``data_dir / train_file`` does not exist.
    """
    path = data_dir / train_file
    if not path.exists():
        raise FileNotFoundError(f"Train file not found: {path}")

    frame = pd.read_csv(path)
    preview = frame.head()

    # Notebook-style feedback: shape on stdout, first rows through display().
    print("Data shape:", frame.shape)
    display(preview)
    return frame


def get_numeric_features(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> List[str]:
    """List the numeric columns of ``df``, optionally leaving some out.

    Args:
        df: Frame whose columns are inspected.
        exclude: Column names to omit from the result; ``None`` or an empty
            list keeps every numeric column.

    Returns:
        Numeric column names in their original column order.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    if not exclude:
        return numeric_names
    return [name for name in numeric_names if name not in exclude]


def get_categorical_features(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> List[str]:
    """List the object/category-typed columns of ``df``, optionally leaving some out.

    Mirrors ``get_numeric_features`` so that non-feature columns (for example a
    string-typed ID or a target stored as text) can be kept out of the result.

    Args:
        df: Frame whose columns are inspected.
        exclude: Column names to omit from the result; ``None`` or an empty
            list keeps every matching column.

    Returns:
        Matching column names in their original column order.
    """
    cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    if exclude:
        cols = [c for c in cols if c not in exclude]
    return cols


df = load_data()
print("Target distribution:")
display(df[TARGET_COL].value_counts().sort_index())

# The target and the optional ID column are never model inputs.
exclude_cols = [TARGET_COL]
if ID_COL in df.columns:
    exclude_cols.append(ID_COL)

num_cols = get_numeric_features(df, exclude=exclude_cols)
# Apply the same exclusion to categoricals. Without it, a string-typed ID
# (or a target stored as text) would be listed as a feature even though it is
# dropped from X later, and the ColumnTransformer would fail on the missing
# column at fit time.
cat_cols = [c for c in get_categorical_features(df) if c not in exclude_cols]

print("Numeric cols:", num_cols[:10], "..." if len(num_cols) > 10 else "")
print("Categorical cols:", cat_cols)


### 3️⃣ Preprocessing Pipeline

Standard tabular preprocessing:

- Median imputation for numeric  
- Most-frequent imputation for categoricals  
- Scaling numeric features  
- One-hot encoding categoricals  


In [None]:
# Imputers: median for numeric columns, most frequent value for categoricals.
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Numeric branch: impute, then scale without centering (with_mean=False).
numeric_pipeline = Pipeline(steps=[
    ("imputer", numeric_imputer),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical branch: impute, then one-hot encode; categories unseen during
# fit are ignored at transform time instead of raising.
categorical_pipeline = Pipeline(steps=[
    ("imputer", categorical_imputer),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Register a branch only when it has at least one column to work on.
candidate_transformers = [
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
]
transformers = [spec for spec in candidate_transformers if spec[2]]

# Columns not claimed by either branch are dropped.
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")


### 4️⃣ Baseline 1 – Treat Ordinal as Numeric Regression

Map the ordered labels to integers and use regression, then round back to nearest class.

Pros: very simple.  
Cons: treats the categories as equally spaced and ignores their discrete nature, so it may over- or under-penalize large versus small errors.


In [None]:
# Features are everything except the target and the optional ID column.
drop_cols = [TARGET_COL]
if ID_COL in df.columns:
    drop_cols.append(ID_COL)

X = df.drop(columns=drop_cols)
y = df[TARGET_COL].astype(int)

# Stratify so every rating keeps the same share in train and validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

reg_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

reg_pipe.fit(X_train, y_train)
y_pred_reg = reg_pipe.predict(X_valid)

# Turn continuous predictions back into labels: round to the nearest integer
# and clamp to the observed label range.
# NOTE(review): this assumes the labels are consecutive integers; with gaps
# (e.g. 1, 2, 4) a rounded prediction can be a value that is not a real class.
unique_classes = np.sort(y.unique())
y_pred_reg_rounded = np.clip(np.rint(y_pred_reg), unique_classes.min(), unique_classes.max()).astype(int)

# Take the square root explicitly instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this
# form works on every version.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred_reg))
f1w = f1_score(y_valid, y_pred_reg_rounded, average="weighted")
acc = accuracy_score(y_valid, y_pred_reg_rounded)
print("Regression baseline on ordinal:")
print(f"RMSE (continuous): {rmse:.4f}")
print(f"Accuracy (rounded): {acc:.4f}, F1-weighted (rounded): {f1w:.4f}")


### 5️⃣ Baseline 2 – Treat Ordinal as Multiclass Classification

Ignore ordering and train a standard multiclass classifier.

Pros: uses standard classification tooling.  
Cons: does not exploit ordering structure.


In [None]:
# Random forest that treats each rating as an unrelated (nominal) class.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
clf_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", forest),
])

# fit() returns the pipeline itself, so prediction can be chained onto it.
y_pred_clf = clf_pipe.fit(X_train, y_train).predict(X_valid)

# Headline metrics first, then the per-class breakdown.
f1w_clf = f1_score(y_valid, y_pred_clf, average="weighted")
acc_clf = accuracy_score(y_valid, y_pred_clf)
print("Multiclass classification treating ordinal as nominal:")
print(f"Accuracy: {acc_clf:.4f}, F1-weighted: {f1w_clf:.4f}")
print(classification_report(y_valid, y_pred_clf, digits=4))


### 6️⃣ Simple Cumulative Ordinal Model

We create K-1 binary problems for ordered classes 1..K:

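- For each threshold c in 1..K-1 (every class except the last), model `P(y > c | x)` with its own logistic regression.  
- Recover class probabilities by differencing the cumulative ones: `P(y = 1) = 1 - P(y > 1)`, `P(y = k) = P(y > k-1) - P(y > k)` for 1 < k < K, and `P(y = K) = P(y > K-1)`.  
- The K-1 models are fitted independently (this is not a proportional-odds model), so the differences can come out slightly negative; the code clips them to a small positive value and renormalizes each row.  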


In [None]:
def fit_ordinal_cumulative(X_train_t, y_train, X_valid_t, base_clf, classes):
    """Fit one binary "is y above this threshold?" model per cut and decode classes.

    For every class ``c`` except the largest, a fresh clone of ``base_clf`` is
    trained on the target ``y_train > c`` and used to estimate ``P(y > c | x)``
    on the validation rows. Per-class probabilities are the differences between
    consecutive cumulative estimates.

    Args:
        X_train_t: Already-preprocessed training features.
        y_train: Training labels, comparable with the values in ``classes``.
        X_valid_t: Already-preprocessed validation features; must expose
            ``.shape[0]``.
        base_clf: Unfitted classifier exposing ``fit`` and ``predict_proba``;
            it is cloned for each threshold and never fitted itself.
        classes: All ordered class labels (any order; sorted internally).

    Returns:
        Tuple ``(y_pred_ord, class_probs)``: the predicted label per validation
        row, and an ``(n_valid, n_classes)`` array of class probabilities whose
        rows sum to 1, with columns in sorted class order.

    Raises:
        ValueError: If fewer than two classes are supplied.
    """
    classes = np.sort(np.asarray(classes))
    n_classes = len(classes)
    if n_classes < 2:
        # Without at least one threshold there is nothing to fit or decode.
        raise ValueError(
            "fit_ordinal_cumulative needs at least 2 ordered classes, "
            f"got {n_classes}"
        )

    y_arr = np.asarray(y_train)
    n_samples = X_valid_t.shape[0]

    proba_gt = []
    for c in classes[:-1]:
        y_bin = (y_arr > c).astype(int)
        observed = np.unique(y_bin)
        if observed.size == 1:
            # Every training label lies on the same side of this threshold, so
            # a binary classifier cannot be fitted (or would expose a single
            # probability column). Use the constant the data implies instead.
            p = np.full(n_samples, float(observed[0]))
        else:
            clf = clone(base_clf)
            clf.fit(X_train_t, y_bin)
            # Column 1 is the positive class, i.e. P(y > c | x).
            p = clf.predict_proba(X_valid_t)[:, 1]
        proba_gt.append(p)

    proba_gt = np.vstack(proba_gt)

    # Pad with P(y > "below lowest") = 1 and P(y > highest) = 0 so that each
    # class probability is one consecutive difference:
    #   P(y = k) = P(y > k-1) - P(y > k)
    cumulative = np.vstack([
        np.ones((1, n_samples)),
        proba_gt,
        np.zeros((1, n_samples)),
    ])
    class_probs = -np.diff(cumulative, axis=0).T

    # The thresholds are fitted independently, so consecutive estimates may
    # cross and produce negative differences; floor them and renormalise.
    class_probs = np.clip(class_probs, 1e-6, 1.0)
    class_probs = class_probs / class_probs.sum(axis=1, keepdims=True)

    y_pred_ord = classes[np.argmax(class_probs, axis=1)]
    return y_pred_ord, class_probs


# Preprocess once up front: every per-threshold model shares these matrices.
X_train_t = preprocessor.fit_transform(X_train)
X_valid_t = preprocessor.transform(X_valid)

# Template estimator; fit_ordinal_cumulative clones it for each threshold.
logit = LogisticRegression(max_iter=2000, n_jobs=-1)

y_pred_ord, class_probs = fit_ordinal_cumulative(
    X_train_t=X_train_t,
    y_train=y_train,
    X_valid_t=X_valid_t,
    base_clf=logit,
    classes=unique_classes,
)

f1w_ord = f1_score(y_valid, y_pred_ord, average="weighted")
acc_ord = accuracy_score(y_valid, y_pred_ord)
print("Ordinal cumulative logit:")
print(f"Accuracy: {acc_ord:.4f}, F1-weighted: {f1w_ord:.4f}")


### 7️⃣ Decision Guide

- Use **regression baseline** when labels are many and almost continuous.  
- Use **multiclass classification** when ordering is less critical.  
- Use **ordinal cumulative model** when ordering is important and you care about "distance" between categories.

You can feed ordinal probabilities or scores into downstream models.
