# Task 4 — Modeling & Interpretability
Goal:
1. Claim Severity Prediction (regression, on claims > 0)
2. Premium Prediction (regression)
3. Claim Occurrence Prediction (classification)
Run cells in order. Results and plots are saved to outputs/task4/.


In [26]:
# Cell 1: Setup (imports, directories, pip installs if needed)
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

OUT = Path("outputs/task4")
OUT.mkdir(parents=True, exist_ok=True)
(OUT / "figures").mkdir(exist_ok=True)
(OUT / "models").mkdir(exist_ok=True)
(OUT / "tables").mkdir(exist_ok=True)

# Install heavy packages if not present (uncomment the next line if needed)
# !pip install xgboost shap

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

import xgboost as xgb
import shap

pd.set_option("display.max_columns", 200)
sns.set(style="whitegrid")


## 1. Load data

In [27]:
# Cell 2: Load cleaned data
DATA_PATH = Path("../data/processed/cleaned_data.csv")
assert DATA_PATH.exists(), f"Put cleaned_data.csv at {DATA_PATH}"
df = pd.read_csv(DATA_PATH, parse_dates=["TransactionMonth"], low_memory=False)
print("Loaded:", DATA_PATH, "shape:", df.shape)
df.head(2)


Loaded: ..\data\processed\cleaned_data.csv shape: (1000098, 56)


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,MaritalStatus,Gender,Country,Province,PostalCode,MainCrestaZone,SubCrestaZone,ItemType,mmcode,VehicleType,RegistrationYear,make,Model,Cylinders,cubiccapacity,kilowatts,bodytype,NumberOfDoors,VehicleIntroDate,CustomValueEstimate,AlarmImmobiliser,TrackingDevice,CapitalOutstanding,NewVehicle,WrittenOff,Rebuilt,Converted,CrossBorder,NumberOfVehiclesInFleet,SumInsured,TermFrequency,CalculatedPremiumPerTerm,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,LossRatio,VehicleAge,Province_Encoded,VehicleType_Encoded
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,6/2002,119300.0,Yes,No,119300,More than 6 months,No,No,No,No,,-0.400557,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,-0.173593,0.001403,0.0,21,0,0
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,6/2002,119300.0,Yes,No,119300,More than 6 months,No,No,No,No,,-0.400557,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,-0.173593,0.001403,0.0,21,0,0


## 2. Feature engineering & target definitions
- ClaimOccurrence (binary)
- ClaimSeverity (TotalClaims where TotalClaims>0)
- Margin (already exists if created)
- VehicleAge already available; create simple features if missing


In [28]:
# Cell 3: KPIs & feature engineering
df = df.copy()

# Targets
df["ClaimOccurrence"] = (df["TotalClaims"] > 0).astype(int)
df["ClaimSeverity"] = df["TotalClaims"].where(df["TotalClaims"] > 0, np.nan)
df["Margin"] = df.get("Margin", df["TotalPremium"] - df["TotalClaims"])

# Example engineered features (adjust to your dataset)
if "RegistrationYear" in df.columns:
    df["VehicleAge"] = df["TransactionMonth"].dt.year - df["RegistrationYear"]
else:
    df["VehicleAge"] = df.get("VehicleAge", np.nan)

# Fill or mark missing VehicleAge
df["VehicleAge"] = df["VehicleAge"].fillna(df["VehicleAge"].median())

# Quick KPI checks
df[["ClaimOccurrence","ClaimSeverity","TotalClaims","TotalPremium","Margin","VehicleAge"]].describe().T.to_csv(OUT / "tables/kpi_task4_summary.csv")


## 3. Choose features (example)
- Numeric features: TotalPremium, SumInsured, CustomValueEstimate, VehicleAge, Kilowatts, Cubiccapacity (where present)
- Categorical features: Province, VehicleType, make, Gender, CoverType
Adjust lists below depending on column availability in your data.


In [29]:
# Cell 4: Feature lists (customize if necessary)
num_features = []
cat_features = []

candidates_num = ["TotalPremium","SumInsured","CustomValueEstimate","VehicleAge","Kilowatts","cubiccapacity","Kilowatts","CapitalOutstanding"]
candidates_cat = ["Province","VehicleType","make","Gender","CoverType","Product","StatutoryClass"]

for c in candidates_num:
    if c in df.columns:
        num_features.append(c)

for c in candidates_cat:
    if c in df.columns:
        cat_features.append(c)

print("Numeric features:", num_features)
print("Categorical features:", cat_features)


Numeric features: ['TotalPremium', 'SumInsured', 'CustomValueEstimate', 'VehicleAge', 'cubiccapacity', 'CapitalOutstanding']
Categorical features: ['Province', 'VehicleType', 'make', 'Gender', 'CoverType', 'Product', 'StatutoryClass']


In [30]:
def clean_numeric_commas(df):
    for col in df.columns:
        # If a column contains comma-formatted numbers, convert it
        if df[col].dtype == "object":
            # Check if column contains any comma numbers
            if df[col].astype(str).str.contains(",", regex=False).any():
                df[col] = (
                    df[col]
                    .astype(str)
                    .str.replace(" ", "", regex=False)
                    .str.replace(".", "", regex=False)    # remove thousands separator
                    .str.replace(",", ".", regex=False)   # convert decimal comma → dot
                )
                df[col] = pd.to_numeric(df[col], errors="coerce")
    return df

df = clean_numeric_commas(df)


In [31]:
df.dtypes

UnderwrittenCoverID                  int64
PolicyID                             int64
TransactionMonth            datetime64[ns]
IsVATRegistered                       bool
Citizenship                         object
LegalType                           object
Title                               object
Language                            object
Bank                                object
AccountType                         object
MaritalStatus                       object
Gender                              object
Country                             object
Province                            object
PostalCode                           int64
MainCrestaZone                     float64
SubCrestaZone                       object
ItemType                            object
mmcode                             float64
VehicleType                         object
RegistrationYear                     int64
make                                object
Model                               object
Cylinders  

## 4. Preprocessing pipeline
OneHotEncode categorical features (drop='rare' handled by min frequency), scale numeric features.
We'll build ColumnTransformer + pipelines for each modeling task.


In [33]:
# Cell 5: Preprocessor
# For OneHotEncoder: handle_unknown='ignore'
num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
], remainder="drop")


## 5. Model helper: train/evaluate & save


In [34]:
# Cell 6: Helpers
def evaluate_regression(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return {"rmse": rmse, "r2": r2}

def evaluate_classification(y_true, y_pred_proba, threshold=0.5):
    y_pred = (y_pred_proba >= threshold).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_proba)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}


## 6.  Model A — Claim Severity (Regression) 
Use only records where claims > 0.


In [35]:
# Cell 7: Prepare data for severity regression
df_sev = df[df["ClaimOccurrence"] == 1].copy()
target_sev = "ClaimSeverity"
X_sev = df_sev[num_features + cat_features].copy()
y_sev = df_sev[target_sev].astype(float)

# Train test split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sev, y_sev, test_size=0.2, random_state=42)
print("Severity dataset:", X_train_s.shape, X_test_s.shape)


Severity dataset: (800074, 13) (200019, 13)


In [37]:
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)   # manual RMSE (works in all sklearn versions)
    r2 = r2_score(y_true, y_pred)
    return {"rmse": rmse, "r2": r2}

In [38]:
# Cell 8: Build models pipelines
# Linear Regression
pipe_lr_sev = Pipeline(steps=[("pre", preprocessor), ("lr", LinearRegression())])
pipe_lr_sev.fit(X_train_s, y_train_s)
pred_lr_sev = pipe_lr_sev.predict(X_test_s)
metrics_lr_sev = evaluate_regression(y_test_s, pred_lr_sev)

# Random Forest
pipe_rf_sev = Pipeline(steps=[("pre", preprocessor), ("rf", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
pipe_rf_sev.fit(X_train_s, y_train_s)
pred_rf_sev = pipe_rf_sev.predict(X_test_s)
metrics_rf_sev = evaluate_regression(y_test_s, pred_rf_sev)

# XGBoost
pipe_xgb_sev = Pipeline(steps=[("pre", preprocessor), ("xgb", xgb.XGBRegressor(n_estimators=200, random_state=42, n_jobs= -1, verbosity=0))])
pipe_xgb_sev.fit(X_train_s, y_train_s)
pred_xgb_sev = pipe_xgb_sev.predict(X_test_s)
metrics_xgb_sev = evaluate_regression(y_test_s, pred_xgb_sev)

# Save metrics
sev_metrics = pd.DataFrame({
    "model":["LinearRegression","RandomForest","XGBoost"],
    "rmse":[metrics_lr_sev["rmse"], metrics_rf_sev["rmse"], metrics_xgb_sev["rmse"]],
    "r2":[metrics_lr_sev["r2"], metrics_rf_sev["r2"], metrics_xgb_sev["r2"]]
})
sev_metrics.to_csv(OUT / "tables/severity_model_metrics.csv", index=False)
sev_metrics


Unnamed: 0,model,rmse,r2
0,LinearRegression,1.734723e-18,-63.0
1,RandomForest,7.899063e-15,-1326999000.0
2,XGBoost,1.202947e-11,-3077601000000000.0


In [39]:
# Cell 9: Plot predicted vs actual for best model (choose by RMSE)
best_idx = sev_metrics["rmse"].idxmin()
best_model_name = sev_metrics.loc[best_idx, "model"]
print("Best severity model:", best_model_name)
best_pred = {"LinearRegression": pred_lr_sev, "RandomForest": pred_rf_sev, "XGBoost": pred_xgb_sev}[best_model_name]

plt.figure(figsize=(6,6))
plt.scatter(y_test_s, best_pred, alpha=0.4)
plt.plot([y_test_s.min(), y_test_s.max()], [y_test_s.min(), y_test_s.max()], color="red")
plt.xlabel("Actual ClaimSeverity")
plt.ylabel("Predicted ClaimSeverity")
plt.title(f"Actual vs Predicted - {best_model_name} (Severity)")
plt.tight_layout()
plt.savefig(OUT / f"figures/severity_actual_vs_pred_{best_model_name}.png", dpi=150)
plt.close()


Best severity model: LinearRegression


## 7. Model B — Premium Prediction (Regression)
Predict CalculatedPremiumPerTerm as a baseline premium model.


In [40]:
# Cell 10: Premium regression target check
target_prem = "CalculatedPremiumPerTerm" if "CalculatedPremiumPerTerm" in df.columns else "TotalPremium"
print("Using target for premium prediction:", target_prem)

df_prem = df.copy()
X_prem = df_prem[num_features + cat_features]
y_prem = df_prem[target_prem].astype(float)

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_prem, y_prem, test_size=0.2, random_state=42)
print("Premium dataset:", X_train_p.shape, X_test_p.shape)


Using target for premium prediction: CalculatedPremiumPerTerm
Premium dataset: (800078, 13) (200020, 13)


In [43]:
# Cell 11: Build premium models
pipe_lr_prem = Pipeline(steps=[("pre", preprocessor), ("lr", LinearRegression())])
pipe_lr_prem.fit(X_train_p, y_train_p)
pred_lr_prem = pipe_lr_prem.predict(X_test_p)
metrics_lr_prem = evaluate_regression(y_test_p, pred_lr_prem)

pipe_rf_prem = Pipeline(steps=[("pre", preprocessor), ("rf", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
pipe_rf_prem.fit(X_train_p, y_train_p)
pred_rf_prem = pipe_rf_prem.predict(X_test_p)
metrics_rf_prem = evaluate_regression(y_test_p, pred_rf_prem)

pipe_xgb_prem = Pipeline(steps=[("pre", preprocessor), ("xgb", xgb.XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, verbosity=0))])
pipe_xgb_prem.fit(X_train_p, y_train_p)
pred_xgb_prem = pipe_xgb_prem.predict(X_test_p)
metrics_xgb_prem = evaluate_regression(y_test_p, pred_xgb_prem)

prem_metrics = pd.DataFrame({
    "model":["LinearRegression","RandomForest","XGBoost"],
    "rmse":[metrics_lr_prem["rmse"], metrics_rf_prem["rmse"], metrics_xgb_prem["rmse"]],
    "r2":[metrics_lr_prem["r2"], metrics_rf_prem["r2"], metrics_xgb_prem["r2"]]
})
prem_metrics.to_csv(OUT / "tables/premium_model_metrics.csv", index=False)
prem_metrics


Unnamed: 0,model,rmse,r2
0,LinearRegression,196.590977,0.515864
1,RandomForest,15.989034,0.996798
2,XGBoost,24.643411,0.992393


In [44]:
# Cell 12: Plot premium actual vs predicted for best model
best_idx_p = prem_metrics["rmse"].idxmin()
best_model_name_p = prem_metrics.loc[best_idx_p, "model"]
best_pred_p = {"LinearRegression": pred_lr_prem, "RandomForest": pred_rf_prem, "XGBoost": pred_xgb_prem}[best_model_name_p]

plt.figure(figsize=(6,6))
plt.scatter(y_test_p, best_pred_p, alpha=0.4)
plt.plot([y_test_p.min(), y_test_p.max()], [y_test_p.min(), y_test_p.max()], color="red")
plt.xlabel("Actual Premium")
plt.ylabel("Predicted Premium")
plt.title(f"Actual vs Predicted - {best_model_name_p} (Premium)")
plt.tight_layout()
plt.savefig(OUT / f"figures/premium_actual_vs_pred_{best_model_name_p}.png", dpi=150)
plt.close()


## 8. Model C — Claim Occurrence (Classification)
Predict whether a claim occurs (binary classification).


In [45]:
# Cell 13: Prepare classification data
target_clf = "ClaimOccurrence"
X_clf = df[num_features + cat_features]
y_clf = df[target_clf].astype(int)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf)
print("Classification dataset:", X_train_c.shape, X_test_c.shape, "Positive rate (train):", y_train_c.mean())


Classification dataset: (800078, 13) (200020, 13) Positive rate (train): 0.9999950004874525


In [46]:
# Cell 14: Classification models
pipe_rf_clf = Pipeline(steps=[("pre", preprocessor), ("rf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))])
pipe_rf_clf.fit(X_train_c, y_train_c)
proba_rf = pipe_rf_clf.predict_proba(X_test_c)[:,1]
clf_rf_metrics = evaluate_classification(y_test_c, proba_rf)

pipe_xgb_clf = Pipeline(steps=[("pre", preprocessor), ("xgb", xgb.XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='logloss'))])
pipe_xgb_clf.fit(X_train_c, y_train_c)
proba_xgb = pipe_xgb_clf.predict_proba(X_test_c)[:,1]
clf_xgb_metrics = evaluate_classification(y_test_c, proba_xgb)

# Logistic baseline via LinearRegression on log-odds isn't included; you can add sklearn.linear_model.LogisticRegression similarly.
clf_metrics = pd.DataFrame({
    "model":["RandomForest","XGBoost"],
    "accuracy":[clf_rf_metrics["accuracy"], clf_xgb_metrics["accuracy"]],
    "precision":[clf_rf_metrics["precision"], clf_xgb_metrics["precision"]],
    "recall":[clf_rf_metrics["recall"], clf_xgb_metrics["recall"]],
    "f1":[clf_rf_metrics["f1"], clf_xgb_metrics["f1"]],
    "auc":[clf_rf_metrics["auc"], clf_xgb_metrics["auc"]],
})
clf_metrics.to_csv(OUT / "tables/classification_metrics.csv", index=False)
clf_metrics


Unnamed: 0,model,accuracy,precision,recall,f1,auc
0,RandomForest,0.999995,0.999995,1.0,0.999998,0.499883
1,XGBoost,0.999995,0.999995,1.0,0.999998,0.583577


In [47]:
# Cell 15: ROC curve for best classifier
best_clf_idx = clf_metrics["auc"].idxmax()
best_clf = clf_metrics.loc[best_clf_idx,"model"]
proba_best = {"RandomForest": proba_rf, "XGBoost": proba_xgb}[best_clf]
from sklearn.metrics import roc_curve
fpr, tpr, thr = roc_curve(y_test_c, proba_best)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"{best_clf} (AUC={clf_metrics.loc[best_clf_idx,'auc']:.3f})")
plt.plot([0,1],[0,1],"--", color="grey")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.tight_layout()
plt.savefig(OUT / f"figures/roc_{best_clf}.png", dpi=150)
plt.close()


## 9. Feature importance & SHAP (Interpretability)
Run SHAP on the best-performing model (choose the best regression model for severity and best classifier for claim occurrence).


In [48]:
# Cell 16: SHAP for Severity best model (if tree-based)
# We will attempt SHAP for RandomForest or XGBoost pipeline by extracting the trained estimator and the preprocessor.

def shap_for_pipeline(pipeline, X_sample, model_name, task="regression"):
    """Compute and save SHAP summary plot for a scikit-learn pipeline with ColumnTransformer + model."""
    # Extract preprocessor and model
    pre = pipeline.named_steps["pre"]
    model = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]  # last step
    # Transform X to numeric matrix
    X_trans = pre.transform(X_sample)
    # Build feature names from transformer: numeric + ohe categories
    num_cols = pre.named_transformers_["num"].named_steps["scaler"].get_feature_names_out(num_features) if hasattr(pre.named_transformers_["num"].named_steps["scaler"], "get_feature_names_out") else num_features
    # Build OHE feature names
    cat_ohe = pre.named_transformers_["cat"].named_steps["ohe"]
    try:
        ohe_cols = cat_ohe.get_feature_names_out(cat_features)
    except Exception:
        # fallback
        ohe_cols = [f"{c}_{i}" for c in cat_features for i in range(1)]
    feat_names = list(num_features) + list(ohe_cols)
    # SHAP explainer
    if isinstance(model, (RandomForestRegressor, RandomForestClassifier, xgb.XGBRegressor, xgb.XGBClassifier)):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_trans)
    else:
        explainer = shap.Explainer(model.predict, X_trans)
        shap_values = explainer(X_trans)
    # Summary plot
    plt.figure(figsize=(8,6))
    try:
        shap.summary_plot(shap_values, X_trans, feature_names=feat_names, show=False)
        plt.tight_layout()
        plt.savefig(OUT / f"figures/shap_summary_{model_name}.png", dpi=150, bbox_inches='tight')
        plt.close()
    except Exception as e:
        print("SHAP plotting error:", e)

# Run SHAP for best severity model if available and tree-based
best_sev_pipeline = {"RandomForest": pipe_rf_sev, "XGBoost": pipe_xgb_sev, "LinearRegression": pipe_lr_sev}[best_model_name]
if best_model_name in ["RandomForest","XGBoost"]:
    X_sample = X_train_s.sample(n=min(500, len(X_train_s)), random_state=42)
    try:
        shap_for_pipeline(best_sev_pipeline, X_sample, f"severity_{best_model_name}", task="regression")
        print("Saved SHAP summary for severity.")
    except Exception as e:
        print("SHAP severity failed:", e)
else:
    print("Best severity model not tree-based; skipping SHAP for severity.")


Best severity model not tree-based; skipping SHAP for severity.


In [49]:
# Cell 17: SHAP for classifier best model
best_clf_pipeline = {"RandomForest": pipe_rf_clf, "XGBoost": pipe_xgb_clf}[best_clf]
if best_clf in ["RandomForest","XGBoost"]:
    X_sample_c = X_train_c.sample(n=min(500, len(X_train_c)), random_state=42)
    try:
        shap_for_pipeline(best_clf_pipeline, X_sample_c, f"classifier_{best_clf}", task="classification")
        print("Saved SHAP summary for classifier.")
    except Exception as e:
        print("SHAP classifier failed:", e)
else:
    print("Best classifier not tree-based; skipping SHAP for classifier.")


SHAP classifier failed: could not convert string to float: '[9.99995E-1]'


## 10. Save models and wrap-up tables


In [50]:
# Cell 18: Save pipelines & metrics
joblib.dump(pipe_rf_sev, OUT / "models/pipe_rf_sev.joblib")
joblib.dump(pipe_xgb_sev, OUT / "models/pipe_xgb_sev.joblib")
joblib.dump(pipe_rf_clf, OUT / "models/pipe_rf_clf.joblib")
joblib.dump(pipe_xgb_clf, OUT / "models/pipe_xgb_clf.joblib")
joblib.dump(pipe_rf_prem, OUT / "models/pipe_rf_prem.joblib")
joblib.dump(pipe_xgb_prem, OUT / "models/pipe_xgb_prem.joblib")

sev_metrics.to_csv(OUT / "tables/severity_model_metrics.csv", index=False)
prem_metrics.to_csv(OUT / "tables/premium_model_metrics.csv", index=False)
clf_metrics.to_csv(OUT / "tables/classification_metrics.csv", index=False)

print("Models and metrics saved to outputs/task4/")


Models and metrics saved to outputs/task4/


In [None]:
# Cell 19: Git workflow (run in terminal or as a shell cell)
!git checkout -b task-4
!git add notebooks/task_4.ipynb
!git commit -m "task-4: modeling pipelines, evaluation, and SHAP analysis"
!git push -u origin task-4
