<p style="font-family: Arial; font-size:3em;color:black;"> Lab Exercise 8</p>

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
# Load scikit-learn's bundled iris dataset: three iris species
# (Setosa, Versicolour, Virginica) described by sepal and petal measurements.
iris = datasets.load_iris()
print("iris dataset", iris.data[:10])

# Features: keep only the first two columns (sepal length, sepal width).
X = iris.data[:, :2]
# Labels: integer class indices standing for 'setosa', 'versicolor', 'virginica'.
y = iris.target
print("iris target", y[:10])

iris dataset [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]
iris target [0 0 0 0 0 0 0 0 0 0]


In [14]:
# Develop a linear Support Vector Machine to classify the irises
# Use GridSearchCV to optimize the model's hyperparameters (C, gamma, kernel)
# Compare the accuracies of the above two models

In [15]:
# Load the bundled iris dataset.
iris = datasets.load_iris()

# Features are the first two columns of the data matrix (the two sepal
# measurements); labels are the integer class indices.
X, y = iris.data[:, :2], iris.target

# Quick look at what was loaded: shape, a few rows, a few labels, class names.
print("X shape:", X.shape)
print("X sample:\n", X[:5])
print("y sample:\n", y[:10])
print("target names:", iris.target_names)

X shape: (150, 2)
X sample:
 [[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [5.  3.6]]
y sample:
 [0 0 0 0 0 0 0 0 0 0]
target names: ['setosa' 'versicolor' 'virginica']


In [21]:
# Hold out 30% of the samples as a test set.
# `train_test_split` is already imported in the notebook's import cell, so it
# is not re-imported here (all imports live in one place at the top).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # keep the class proportions the same in both splits
)

# Sanity-check the split sizes and the per-class counts in each split.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train class counts:", np.bincount(y_train))
print("y_test class counts:", np.bincount(y_test))


X_train shape: (105, 2)
X_test shape: (45, 2)
y_train class counts: [35 35 35]
y_test class counts: [15 15 15]


In [22]:
# Baseline model: standardise the features, then a linear-kernel SVM with C=1.0.
baseline_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", SVC(C=1.0, kernel="linear")),
])

# Pipeline.fit returns the fitted pipeline itself, so training on the train
# split and predicting on the held-out split can be chained.
y_pred_base = baseline_model.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
cm_base = confusion_matrix(y_test, y_pred_base)
acc_base = accuracy_score(y_test, y_pred_base)

print("Baseline Linear SVM accuracy:", acc_base)
print("Confusion matrix:\n", cm_base)

Baseline Linear SVM accuracy: 0.6888888888888889
Confusion matrix:
 [[15  0  0]
 [ 0  9  6]
 [ 0  8  7]]


In [23]:
# Tuned model: the same scale + SVM pipeline, with the SVM hyperparameters
# chosen by a cross-validated grid search on the training split.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

# Hyperparameter grid to search.
# `gamma` is the kernel coefficient of the rbf / poly / sigmoid kernels and is
# ignored by kernel="linear", so crossing it with the linear kernel only refits
# the same model once per gamma value. The linear kernel therefore gets its own
# sub-grid over C alone.
# The linear sub-grid is listed last so the non-linear candidates keep their
# original relative order (GridSearchCV reports the first candidate among ties).
param_grid = [
    {
        "svm__kernel": ["rbf", "poly", "sigmoid"],
        "svm__C": [0.1, 1, 10, 100],
        "svm__gamma": ["scale", "auto", 0.01, 0.1, 1],
    },
    {
        "svm__kernel": ["linear"],
        "svm__C": [0.1, 1, 10, 100],
    },
]

# Cross-validation strategy (keeps class balance per fold); seeded so the
# folds are the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1
)

# Only the training split is used for model selection; the test split stays
# untouched until the final evaluation below.
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)

# Evaluate the selected pipeline on the held-out test split.
best_model = grid.best_estimator_
y_pred_tuned = best_model.predict(X_test)

acc_tuned = accuracy_score(y_test, y_pred_tuned)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)

print("\nTuned SVM test accuracy:", acc_tuned)
print("Tuned confusion matrix:\n", cm_tuned)

Best parameters: {'svm__C': 1, 'svm__gamma': 'scale', 'svm__kernel': 'sigmoid'}
Best CV accuracy: 0.8285714285714286

Tuned SVM test accuracy: 0.7777777777777778
Tuned confusion matrix:
 [[15  0  0]
 [ 0  8  7]
 [ 0  3 12]]


In [24]:
# Final comparison: test-set accuracy of each model first, then each model's
# confusion matrix. Each (label, value) pair is printed on its own line(s).
for heading, result in (
    ("Baseline Linear SVM accuracy:", acc_base),
    ("Tuned SVM accuracy:         ", acc_tuned),
    ("\nBaseline confusion matrix:\n", cm_base),
    ("\nTuned confusion matrix:\n", cm_tuned),
):
    print(heading, result)

Baseline Linear SVM accuracy: 0.6888888888888889
Tuned SVM accuracy:          0.7777777777777778

Baseline confusion matrix:
 [[15  0  0]
 [ 0  9  6]
 [ 0  8  7]]

Tuned confusion matrix:
 [[15  0  0]
 [ 0  8  7]
 [ 0  3 12]]


In [20]:
# Print the pipeline selected by the grid search (`best_model` is
# `grid.best_estimator_`).
# NOTE(review): this cell's saved execution count (20) is lower than the
# grid-search cell's (23), and the saved output shows SVC(C=1) without
# kernel='sigmoid', which disagrees with the best parameters printed by the
# grid search. The saved output looks stale; re-run the notebook top to
# bottom (Restart Kernel -> Run All) to refresh it.
print(best_model)


Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC(C=1))])
