In [2]:


import mglearn as mglearn

!pip install numpy scipy scikit-learn matplotlib pandas
from sklearn.decomposition import PCA
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC


You should consider upgrading via the '/Users/engineer/workspace/cse590-machine-learning/venv/bin/python -m pip install --upgrade pip' command.


In [3]:

# Features and labels are stored in separate CSV files; both are pulled in as NumPy arrays.
X = pd.read_csv("../dataset/extracted_features.csv").values
y = pd.read_csv("../dataset/labels.csv").values
# images = pd.read_csv("../dataset/raw_images.csv").values

# Shuffled, stratified 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)
# Only the training labels are flattened; y_test keeps the shape the split returned.
y_train = y_train.ravel()


In [4]:
def shape_params_linear_kernel(c_values):
    """Return the search grid for a linear kernel.

    The grid has a single entry, ``'C'``, mapped to the ``c_values`` object
    that was passed in (it is not copied).
    """
    return {'C': c_values}


def shape_params_polynomial_kernel(c_values, d_values):
    """Return the search grid for a polynomial kernel.

    Starts from the linear-kernel grid (the ``'C'`` entry) and appends a
    ``'degree'`` entry holding ``d_values``.
    """
    return {**shape_params_linear_kernel(c_values), 'degree': d_values}


def shape_params_radial_kernel(c_values, gamma_values):
    """Return the search grid for an RBF kernel.

    Starts from the linear-kernel grid (the ``'C'`` entry) and appends a
    ``'gamma'`` entry holding ``gamma_values``.
    """
    return {**shape_params_linear_kernel(c_values), 'gamma': gamma_values}


def run_grid_svm(kernel, hyper_parameters, X_data, y_data=None, cv=4):
    """Grid-search an ``SVC`` over ``hyper_parameters`` and print a summary.

    Parameters
    ----------
    kernel : str
        Kernel name forwarded to ``SVC(kernel=...)``.
    hyper_parameters : dict
        Parameter grid handed to ``GridSearchCV``; its keys are also the
        parameters reported by ``print_grid_search_results``.
    X_data : array-like
        Training features. Rows must line up with the labels being used.
    y_data : array-like, optional
        Training labels. When omitted, the notebook-level ``y_train`` is used,
        which keeps existing three-argument calls working unchanged.
    cv : optional
        Forwarded to ``GridSearchCV(cv=...)``; defaults to 4 as before.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Fall back to the notebook global only when the caller supplies no labels,
    # so the function can also be used with other feature/label pairs.
    labels = y_train if y_data is None else y_data

    classifier = SVC(kernel=kernel, random_state=0)
    grid_search = GridSearchCV(
        classifier,
        hyper_parameters,
        cv=cv,
        return_train_score=True,  # needed for the 'mean_train_score' report
    )
    grid_search.fit(X_data, labels)

    print_grid_search_results(grid_search, hyper_parameters)
    return grid_search


def print_grid_search_results(grid_search, parameters):
    """Print the winning value of each searched parameter and the mean scores.

    ``parameters`` is iterated for parameter names; each is looked up in
    ``grid_search.best_params_``. Afterwards the ``'mean_train_score'`` and
    ``'mean_test_score'`` entries of ``grid_search.cv_results_`` are printed
    as they are stored.
    """
    for name in parameters:
        best_value = grid_search.best_params_[name]
        print(f'Best {name}:', best_value)

    score_lines = (
        ("Train score: ", 'mean_train_score'),
        ("Test score: ", 'mean_test_score'),
    )
    for label, result_key in score_lines:
        print(label, grid_search.cv_results_[result_key])


def preprocess_data(X_scaled, n_components=2):
    """Project ``X_scaled`` onto its leading principal components.

    Parameters
    ----------
    X_scaled : array-like
        Data to decompose. The name suggests it has already been scaled, but
        nothing here checks or enforces that.
    n_components : optional
        Forwarded to ``PCA(n_components=...)``; defaults to 2 as before.

    Returns
    -------
    The result of ``PCA.fit_transform`` on ``X_scaled``.
    """
    pca = PCA(n_components=n_components)
    # fit_transform fits the model itself, so the separate fit() call that
    # used to precede it only repeated the decomposition.
    return pca.fit_transform(X_scaled)


def display_preprocessed_2d(pca_data, labels=None):
    """Scatter-plot the first two columns of ``pca_data``, grouped by class.

    Parameters
    ----------
    pca_data : 2-D array
        Only columns 0 and 1 are plotted.
    labels : array-like, optional
        Class label per row of ``pca_data``. When omitted, the notebook-level
        ``y_train`` is used, which keeps existing one-argument calls working.

    The plot is drawn on a new 32x32 figure through the pyplot state machine;
    nothing is returned.
    """
    if labels is None:
        labels = y_train

    plt.figure(figsize=(32, 32))
    mglearn.discrete_scatter(pca_data[:, 0], pca_data[:, 1], labels)
    # NOTE(review): the legend text assumes the classes are the ids 0-4 and
    # are drawn in that order - confirm against the label file.
    plt.legend(
        [
            '0: T-shirt/top',
            '1: Trouser',
            '2: Pullover',
            '3: Dress',
            '4: Coat',  # stray trailing comma removed from the label text
        ],
        loc="best"
    )
    plt.gca().set_aspect("equal")
    plt.xlabel("First principal component")
    plt.ylabel("Second principal component")


def scale(scaler, X_data):
    """Fit ``scaler`` on ``X_data`` and return that same data transformed.

    ``transform`` is called on whatever ``scaler.fit`` returns.
    """
    fitted = scaler.fit(X_data)
    return fitted.transform(X_data)


In [5]:
# Linear-kernel search over C, run directly on the training features from the split.
grid_results_linear_vanilla = run_grid_svm(
    kernel='linear',
    hyper_parameters=shape_params_linear_kernel(c_values=[1, 5, 10, 25]),
    X_data=X_train,
)


Best C: 1
Train score:  [0.99117346 0.99192464 0.99192464 0.99230029]
Test score:  [0.86198346 0.85860381 0.85747768 0.85747768]


In [10]:
# Accuracy of the linear-kernel search result on the training split, then on the held-out split.
print("train ", grid_results_linear_vanilla.score(X=X_train, y=y_train))
print("test ", grid_results_linear_vanilla.score(X=X_test, y=y_test))

train  0.984225352112676
test  0.8671171171171171


In [6]:
# Polynomial-kernel search over C and degree, run directly on the training features from the split.
grid_results_poly_vanilla = run_grid_svm(
    kernel='poly',
    hyper_parameters=shape_params_polynomial_kernel(
        c_values=[1, 5, 10, 25],
        d_values=[1, 2, 3],
    ),
    X_data=X_train,
)

Best C: 5
Best degree: 1
Train score:  [0.90572691 0.87793489 0.82723045 0.94441258 0.92131571 0.8878884
 0.9592485  0.93558745 0.90403928 0.97051625 0.9631922  0.91831144]
Test score:  [0.88450598 0.8439451  0.77070496 0.90028445 0.86704467 0.82591437
 0.89690989 0.86367519 0.82817044 0.89240157 0.86536056 0.82085189]


In [11]:
# Accuracy of the polynomial-kernel search result on the training split, then on the held-out split.
print("train ", grid_results_poly_vanilla.score(X=X_train, y=y_train))
print("test ", grid_results_poly_vanilla.score(X=X_test, y=y_test))

train  0.9414084507042253
test  0.9009009009009009


In [7]:
# RBF-kernel search over C and gamma, run directly on the training features from the split.
# The recorded run shows mean train scores near 0.99 against mean CV scores near 0.15
# for every grid point, i.e. none of these settings generalizes on this input.
grid_results_rbf_vanilla = run_grid_svm(
    kernel='rbf',
    hyper_parameters=shape_params_radial_kernel(
        c_values=[1, 25, 50],
        gamma_values=[.1, 1, 10],
    ),
    X_data=X_train,
)

Best C: 1
Best gamma: 1
Train score:  [0.99380292 0.99380292 0.99380292 0.99380292 0.99380292 0.99380292
 0.99380292 0.99380292 0.99380292]
Test score:  [0.14873127 0.17069072 0.14704335 0.14873127 0.14929433 0.14704335
 0.14873127 0.14929433 0.14704335]


In [12]:
# Accuracy of the RBF-kernel search result on the training split, then on the held-out split.
print("train ", grid_results_rbf_vanilla.score(X=X_train, y=y_train))
print("test ", grid_results_rbf_vanilla.score(X=X_test, y=y_test))

train  0.9915492957746479
test  0.1463963963963964
