# Exercise 4 continued
## SVMs, Hyperparameters, and Cross Validation

In [65]:
import sys

import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split

Load the dataset

In [66]:
# Read the pivoted CSV and discard its first column before any modelling.
df = pd.read_csv("JO_pivoted.csv")
name = df.columns[0]  # label of the leading column
df = df.drop(columns=[name])
df

Unnamed: 0,region,year,barley,energy forest,fallow land,"field peas for cooking, fodder peas, vetches and field beans",green fodder,green peas,horticulture plants,ley for hay and forage plants,...,triticale,unspecified arable land,utilized ley for hay,utilized ley for hay and pasture,utilized pasture,white beans,winter barley,winter rape,winter turnip rape,winter wheat
0,0114 Upplands Väsby,1981,500.0,0.0,179.0,0.0,43.0,0.0,0.0,0.0,...,0.0,0.0,0.0,229.0,0.0,0.0,0.0,0.0,0.0,80.0
1,0114 Upplands Väsby,1985,586.0,0.0,30.0,11.0,63.0,0.0,0.0,0.0,...,0.0,0.0,0.0,201.0,0.0,0.0,0.0,0.0,0.0,40.0
2,0114 Upplands Väsby,1989,264.0,0.0,124.0,22.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,275.0,0.0,0.0,0.0,0.0,14.0,477.0
3,0114 Upplands Väsby,1990,213.0,0.0,57.0,38.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,213.0,0.0,0.0,0.0,0.0,2.0,520.0
4,0114 Upplands Väsby,1991,328.0,0.0,91.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,217.0,0.0,0.0,0.0,0.0,6.0,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4055,2584 Kiruna,1999,0.0,0.0,17.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,272.0,0.0,0.0,0.0,0.0,0.0,0.0
4056,2584 Kiruna,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.0,0.0,151.0,0.0,0.0,0.0,0.0,0.0,0.0
4057,2584 Kiruna,2002,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,90.0,0.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0
4058,2584 Kiruna,2003,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,...,0.0,69.0,0.0,143.0,0.0,0.0,0.0,0.0,0.0,0.0


### SVMs

In [67]:
import sklearn.svm as svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

Build SVM pipelines for two approaches.

Regression on target="barley"

In [68]:
# Regression with SVR: predict the "barley" column from the remaining columns.
# "region" and "total arable land" are removed from the features as well.
exclude = ["barley", "region", "total arable land"]
X_reg = df.drop(columns=exclude)
y_reg = df["barley"]

# Seed the 80/20 hold-out split so the scores below are reproducible across
# kernel restarts. train_test_split is called through its explicit import
# rather than `sk.model_selection`, which only resolves when some other import
# happened to load that submodule first.
X_train_reg, X_test, y_train_reg, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# One pipeline per kernel: min-max scaling of the features followed by an SVR.
svm_models_reg = {
    f"SVR with {kernel} kernel": Pipeline([
        ('scaler', MinMaxScaler()),
        ('svr', svm.SVR(C=1.0, kernel=kernel)),
    ])
    for kernel in ("linear", "poly", "rbf")
}

results_reg = {}
for name, model in svm_models_reg.items():
    # Longest transient status message. The final line is padded to this width
    # so that the "\r" overwrite leaves no trailing characters behind.
    inference_msg = f"Running inference for {name}..."
    sys.stdout.write(f"\rTraining {name}..."); sys.stdout.flush()
    model.fit(X_train_reg, y_train_reg)
    sys.stdout.write(f"\r{inference_msg}"); sys.stdout.flush()
    # Pipeline.score delegates to the final SVR step, whose score is R².
    score = model.score(X_test, y_test)
    results_reg[name] = score
    done_msg = f"[✓] {name} - R² Score: {score:.2f}"
    print(f"\r{done_msg.ljust(len(inference_msg))}")

# Summarise the hold-out scores as a two-column table (Model, R² Score).
results_df = (
    pd.DataFrame.from_dict(results_reg, orient="index", columns=["R² Score"])
    .reset_index()
    .rename(columns={"index": "Model"})
)
results_df

Training SVR with linear kernel...

[✓] SVR with linear kernel - R² Score: -0.09...
[✓] SVR with poly kernel - R² Score: 0.09l...
[✓] SVR with rbf kernel - R² Score: -0.11...


Unnamed: 0,Model,R² Score
0,SVR with linear kernel,-0.093847
1,SVR with poly kernel,0.089414
2,SVR with rbf kernel,-0.105294


Classification on target="region"

In [69]:
# Classification with SVC: predict the "region" label from the remaining
# columns ("total arable land" is removed from the features as well).
exclude = ["region", "total arable land"]
X_clf = df.drop(columns=exclude)
y_clf = df["region"]

# Seed the 80/20 hold-out split so the accuracies below are reproducible across
# kernel restarts. train_test_split is called through its explicit import
# rather than `sk.model_selection`, which only resolves when some other import
# happened to load that submodule first.
X_train_clf, X_test, y_train_clf, y_test = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# One pipeline per kernel: min-max scaling of the features followed by an SVC.
# The estimator step is named 'svc' (it was mislabelled 'svr') so that it
# matches the step name used by the classification grid search.
svm_models_clf = {
    f"SVC with {kernel} kernel": Pipeline([
        ('scaler', MinMaxScaler()),
        ('svc', svm.SVC(C=1.0, kernel=kernel)),
    ])
    for kernel in ("linear", "poly", "rbf")
}

results_clf = {}
for name, model in svm_models_clf.items():
    # Longest transient status message. The final line is padded to this width
    # so that the "\r" overwrite leaves no trailing characters behind.
    inference_msg = f"Running inference for {name}..."
    sys.stdout.write(f"\rTraining {name}..."); sys.stdout.flush()
    model.fit(X_train_clf, y_train_clf)
    sys.stdout.write(f"\r{inference_msg}"); sys.stdout.flush()
    # Pipeline.score delegates to the final SVC step, whose score is accuracy.
    score = model.score(X_test, y_test)
    results_clf[name] = score
    done_msg = f"[✓] {name} - Accuracy: {score:.2f}"
    print(f"\r{done_msg.ljust(len(inference_msg))}")

# Summarise the hold-out scores as a two-column table (Model, Accuracy).
results_df = (
    pd.DataFrame.from_dict(results_clf, orient="index", columns=["Accuracy"])
    .reset_index()
    .rename(columns={"index": "Model"})
)
results_df

[✓] SVC with linear kernel - Accuracy: 0.10l...
[✓] SVC with poly kernel - Accuracy: 0.18l...
[✓] SVC with rbf kernel - Accuracy: 0.14l...


Unnamed: 0,Model,Accuracy
0,SVC with linear kernel,0.098522
1,SVC with poly kernel,0.184729
2,SVC with rbf kernel,0.139163


### 10-fold Cross Validation

In [70]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

Perform 10-fold cross-validation for both the regression and the classification approach.

Regression on target="barley"

In [71]:
# 10-fold shuffled cross-validation of every SVR pipeline on the full
# regression data (X_reg, y_reg); the mean R² per model is collected.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
results_reg = {}

for name, model in svm_models_reg.items():
    # Transient status text; the result line below starts with "\r" to overwrite it.
    print(f"Cross-validating {name}...", end="", flush=True)
    scores = cross_val_score(model, X_reg, y_reg, cv=cv, scoring='r2', n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    results_reg[name] = mean_score
    print(f"\r[✓] {name} - Mean R² Score: {mean_score:.2f} ± {std_score:.2f}")

# Two-column summary table: Model, mean R² across the folds.
results_df = (
    pd.DataFrame.from_dict(results_reg, orient="index", columns=["R² Score"])
    .reset_index()
    .rename(columns={"index": "Model"})
)
results_df

Cross-validating SVR with linear kernel...

[✓] SVR with linear kernel - Mean R² Score: -0.08 ± 0.01
[✓] SVR with poly kernel - Mean R² Score: 0.08 ± 0.06
[✓] SVR with rbf kernel - Mean R² Score: -0.09 ± 0.01


Unnamed: 0,Model,R² Score
0,SVR with linear kernel,-0.079093
1,SVR with poly kernel,0.081631
2,SVR with rbf kernel,-0.086078


Classification on target="region"

In [72]:
# 10-fold shuffled cross-validation of every SVC pipeline on the full
# classification data (X_clf, y_clf); the mean accuracy per model is collected.
# NOTE(review): plain KFold does not stratify the folds by class label.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
results_clf = {}

for name, model in svm_models_clf.items():
    # Transient status text; the result line below starts with "\r" to overwrite it.
    print(f"Cross-validating {name}...", end="", flush=True)
    scores = cross_val_score(model, X_clf, y_clf, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    results_clf[name] = mean_score
    print(f"\r[✓] {name} - Mean Accuracy: {mean_score:.2f} ± {std_score:.2f}")

# Two-column summary table: Model, mean accuracy across the folds.
results_df = (
    pd.DataFrame.from_dict(results_clf, orient="index", columns=["Accuracy"])
    .reset_index()
    .rename(columns={"index": "Model"})
)
results_df

[✓] SVC with linear kernel - Mean Accuracy: 0.10 ± 0.01
[✓] SVC with poly kernel - Mean Accuracy: 0.19 ± 0.02
[✓] SVC with rbf kernel - Mean Accuracy: 0.16 ± 0.01


Unnamed: 0,Model,Accuracy
0,SVC with linear kernel,0.104926
1,SVC with poly kernel,0.193842
2,SVC with rbf kernel,0.155911


### Grid Search
Kernels:
- Radial basis function kernel (RBF)
- Polynomial kernel 
- Linear kernel

Regularization params C:
- C=1
- C=10
- C=100

In [73]:
from sklearn.model_selection import GridSearchCV

Regression on target="barley"

In [74]:
# Exhaustive search over kernel type and regularisation strength C for the
# 'svr' step of the pipeline (3 kernels x 3 values of C = 9 candidates).
param_grid = dict(
    svr__kernel=['linear', 'rbf', 'poly'],
    svr__C=[1.0, 10.0, 100.0],
)

# 5-fold shuffled CV with a fixed seed, shared by every candidate.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Min-max scaling followed by an SVR whose kernel and C are set by the grid.
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('svr', svm.SVR())])

grid = GridSearchCV(pipeline, param_grid, scoring='r2', cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_reg, y_reg)

# Show one row per candidate, best-ranked (by mean test R²) first.
all_results_df = pd.DataFrame(grid.cv_results_)
display_columns = ['param_svr__kernel', 'param_svr__C', 'mean_test_score', 'rank_test_score']
all_results_df.loc[:, display_columns].sort_values(by='rank_test_score')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Unnamed: 0,param_svr__kernel,param_svr__C,mean_test_score,rank_test_score
8,poly,100.0,0.674161,1
6,linear,100.0,0.640853,2
5,poly,10.0,0.471326,3
7,rbf,100.0,0.393702,4
3,linear,10.0,0.167299,5
4,rbf,10.0,0.093756,6
2,poly,1.0,0.07149,7
0,linear,1.0,-0.085131,8
1,rbf,1.0,-0.092416,9


Classification on target="region"

In [75]:
# Exhaustive search over kernel type and regularisation strength C for the
# 'svc' step of the pipeline (3 kernels x 3 values of C = 9 candidates).
param_grid = dict(
    svc__kernel=['linear', 'rbf', 'poly'],
    svc__C=[1.0, 10.0, 100.0],
)

# 5-fold shuffled CV with a fixed seed, shared by every candidate.
# NOTE(review): plain KFold does not stratify the folds by class label.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Min-max scaling followed by an SVC whose kernel and C are set by the grid.
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), ('svc', svm.SVC())])

grid = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_clf, y_clf)

# Show one row per candidate, best-ranked (by mean test accuracy) first.
all_results_df = pd.DataFrame(grid.cv_results_)
display_columns = ['param_svc__kernel', 'param_svc__C', 'mean_test_score', 'rank_test_score']
all_results_df.loc[:, display_columns].sort_values(by='rank_test_score')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Unnamed: 0,param_svc__kernel,param_svc__C,mean_test_score,rank_test_score
7,rbf,100.0,0.613054,1
6,linear,100.0,0.540394,2
8,poly,100.0,0.48399,3
4,rbf,10.0,0.369212,4
5,poly,10.0,0.337931,5
3,linear,10.0,0.283744,6
2,poly,1.0,0.188177,7
1,rbf,1.0,0.14532,8
0,linear,1.0,0.099507,9
