In [2]:
# ============================================================
# 1. Imports
# ============================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# ============================================================
# 2. Load data
#    (make sure the filename matches what you downloaded)
# ============================================================
df = pd.read_csv("medical_insurance.csv")
print(df.head())
print(df.columns)

# ============================================================
# 3. Choose a target and turn it into classes
#    Here we assume 'annual_premium' is the cost column.
#    If your file uses another name (e.g. 'PremiumPrice'),
#    change TARGET_COL accordingly.
# ============================================================
TARGET_COL = "annual_premium"   # change if needed

# create 3 classes based on quantiles: low / medium / high
n_bins = 3
df["premium_class"] = pd.qcut(
    df[TARGET_COL],
    q=n_bins,
    labels=["low", "medium", "high"]
)

# This will be our classification target
y = df["premium_class"]

# ============================================================
# 4. Define feature matrix X
#    Drop the target and any obvious leakage columns
#    (you already did something similar in your regression code)
# ============================================================
cols_to_drop = [
    TARGET_COL,
    "annual_medical_cost",
    "monthly_premium",
    "avg_claim_amount",
    "total_claims_paid",
    "claims_count"
]

cols_to_drop = [c for c in cols_to_drop if c in df.columns]

X = df.drop(columns=cols_to_drop + ["premium_class"])

print("Features used:", X.columns.tolist())

# ============================================================
# 5. Preprocessing: numeric + categorical
# ============================================================
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(exclude=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols),
    ],
    remainder="drop",
)

# ============================================================
# 6. Train / test split
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Class distribution (train):")
print(y_train.value_counts(normalize=True))

# ============================================================
# 7. Helper function for evaluation
# ============================================================
def evaluate_classifier(name, model, X_train, X_test, y_train, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)

    print(f"\n{name}")
    print("-" * len(name))
    print(f"Train accuracy: {acc_train:.3f}")
    print(f"Test  accuracy: {acc_test:.3f}")

    print("\nClassification report (test):")
    print(classification_report(y_test, y_pred_test))

    print("Confusion matrix (test):")
    print(confusion_matrix(y_test, y_pred_test))

# ============================================================
# 8. Baseline classifier (predicts majority class)
# ============================================================
baseline_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DummyClassifier(strategy="most_frequent"))
])

baseline_clf.fit(X_train, y_train)
evaluate_classifier("Baseline (most_frequent)", baseline_clf, X_train, X_test, y_train, y_test)

# ============================================================
# 9. Logistic Regression classifier
# ============================================================
log_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, multi_class="multinomial"))
])

log_reg.fit(X_train, y_train)
evaluate_classifier("Logistic Regression", log_reg, X_train, X_test, y_train, y_test)

# ============================================================
# 10. Random Forest classifier
# ============================================================
rf_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    ))
])

rf_clf.fit(X_train, y_train)
evaluate_classifier("Random Forest", rf_clf, X_train, X_test, y_train, y_test)

# ============================================================
# 11. K-Nearest Neighbors classifier
# ============================================================
knn_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=7))
])

knn_clf.fit(X_train, y_train)
evaluate_classifier("KNN (k=7)", knn_clf, X_train, X_test, y_train, y_test)

# ============================================================
# 12. Support Vector Machine classifier
# ============================================================
svm_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", SVC(kernel="rbf", probability=True, random_state=42))
])

svm_clf.fit(X_train, y_train)
evaluate_classifier("SVM (RBF kernel)", svm_clf, X_train, X_test, y_train, y_test)

# ============================================================
# 13. Decision Tree classifier
# ============================================================
tree_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42
    ))
])

tree_clf.fit(X_train, y_train)
evaluate_classifier("Decision Tree", tree_clf, X_train, X_test, y_train, y_test)


   person_id  age     sex   region urban_rural   income     education  \
0      75722   52  Female    North    Suburban  22700.0     Doctorate   
1      80185   79  Female    North       Urban  12800.0         No HS   
2      19865   68    Male    North       Rural  40700.0            HS   
3      76700   15    Male    North    Suburban  15600.0  Some College   
4      92992   53    Male  Central    Suburban  89600.0     Doctorate   

  marital_status employment_status  household_size  ...  liver_disease  \
0        Married           Retired               3  ...              0   
1        Married          Employed               3  ...              0   
2        Married           Retired               5  ...              0   
3        Married     Self-employed               5  ...              0   
4        Married     Self-employed               2  ...              0   

   arthritis mental_health proc_imaging_count  proc_surgery_count  \
0          1             0                  1  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression
-------------------
Train accuracy: 0.495
Test  accuracy: 0.496

Classification report (test):
              precision    recall  f1-score   support

        high       0.54      0.59      0.57      6667
         low       0.51      0.65      0.58      6666
      medium       0.38      0.25      0.30      6667

    accuracy                           0.50     20000
   macro avg       0.48      0.50      0.48     20000
weighted avg       0.48      0.50      0.48     20000

Confusion matrix (test):
[[3939 1361 1367]
 [1034 4346 1286]
 [2287 2740 1640]]

Random Forest
-------------
Train accuracy: 0.905
Test  accuracy: 0.489

Classification report (test):
              precision    recall  f1-score   support

        high       0.52      0.61      0.56      6667
         low       0.52      0.60      0.56      6666
      medium       0.37      0.25      0.30      6667

    accuracy                           0.49     20000
   macro avg       0.47      0.49      0.48    