# Support Vector Machine — Model Training & Hyperparameter Optimization

This notebook trains and tunes a Support Vector Machine (SVC) for predicting obesity category using the training set.  
All preprocessing is embedded inside a scikit-learn pipeline to avoid data leakage.

**Dataset used:** `train_set.csv`  
**Target:** `NObeyesdad`  
**Task:** Multiclass Classification  
**Metrics:** Accuracy, F1-macro  
**CV Strategy:** StratifiedKFold (5 folds)


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings("ignore")

# reproducibility
RANDOM_STATE = 42


In [12]:
# Load data
train = pd.read_csv("../../data/train_set.csv")

train.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.63,60.0,Sometimes,yes,3.0,3.0,no,yes,2.0,yes,2.0,0.0,Always,Public_Transportation,Normal_Weight
1,20.924956,Female,1.752531,133.618706,Sometimes,yes,3.0,3.0,no,no,2.887659,yes,1.480919,0.779641,Sometimes,Public_Transportation,Obesity_Type_III
2,22.89974,Female,1.661715,82.595793,Sometimes,yes,1.203754,1.355354,no,no,2.765593,yes,0.128342,1.659476,Sometimes,Public_Transportation,Obesity_Type_I
3,21.837996,Female,1.588046,44.236067,no,no,3.0,1.69608,no,no,2.550307,no,1.098862,0.0,Frequently,Public_Transportation,Insufficient_Weight
4,25.994746,Male,1.811602,106.042142,Sometimes,yes,3.0,3.0,no,no,2.858171,yes,1.813318,0.680215,Sometimes,Public_Transportation,Obesity_Type_I


In [13]:
# Identify Features & Target
target = "NObeyesdad"

X = train.drop(columns=[target])
y = train[target]

# classify columns by dtype
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "bool"]).columns.tolist()

numeric_features, categorical_features


(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'],
 ['Gender',
  'CALC',
  'FAVC',
  'SCC',
  'SMOKE',
  'family_history_with_overweight',
  'CAEC',
  'MTRANS'])

In [14]:
# Preprocessing Pipeline
numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [15]:
# SVM Pipeline
svm_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("svm", SVC(probability=False, random_state=RANDOM_STATE))
])


## Hyperparameter Tuning

We tune:

- **C**: regularization strength  
- **kernel**: linear or RBF  
- **gamma**: kernel coefficient (for RBF only)

Evaluation metrics:
- Accuracy
- F1-macro (important for multiclass balance)


In [16]:
param_grid = {
    "svm__kernel": ["linear", "rbf"],
    "svm__C": [0.1, 1, 5, 10],
    "svm__gamma": ["scale", "auto"]
}

In [17]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=1
)

grid.fit(X, y)


Fitting 5 folds for each of 16 candidates, totalling 80 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svm__C': [0.1, 1, ...], 'svm__gamma': ['scale', 'auto'], 'svm__kernel': ['linear', 'rbf']}"
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,5
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [18]:
best_svm = grid.best_estimator_
best_params = grid.best_params_

print("Best Parameters:", best_params)

cv_results = pd.DataFrame(grid.cv_results_)[
    ["params", "mean_test_score", "std_test_score"]
].sort_values(by="mean_test_score", ascending=False)

cv_results.head()


Best Parameters: {'svm__C': 5, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}


Unnamed: 0,params,mean_test_score,std_test_score
8,"{'svm__C': 5, 'svm__gamma': 'scale', 'svm__ker...",0.962,0.013909
10,"{'svm__C': 5, 'svm__gamma': 'auto', 'svm__kern...",0.962,0.013909
12,"{'svm__C': 10, 'svm__gamma': 'scale', 'svm__ke...",0.957232,0.013999
14,"{'svm__C': 10, 'svm__gamma': 'auto', 'svm__ker...",0.957232,0.013999
15,"{'svm__C': 10, 'svm__gamma': 'auto', 'svm__ker...",0.949414,0.01025
