In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score
import random

## Load data

In [2]:
x_train = np.load("x_train.npy")
y_train = np.load("y_train.npy")
x_test = np.load("x_test.npy")
y_test = np.load("y_test.npy")

In [3]:
# 550 data with 300 features
print(x_train.shape)

(550, 300)


In [4]:
# It's a binary classification problem 
print(np.unique(y_train))

[0 1]


In [5]:
# x_train_part = x_train[:86,:]
# print(x_train_part.shape)

# y_train_part = y_train[:86]
# print(y_train_part.shape)

## Question 1
K-fold data partition: Implement the K-fold cross-validation function. Your function should take K as an argument and return a list of lists (len(list) should equal to K), which contains K elements. Each element is a list contains two parts, the first part contains the index of all training folds, e.g. Fold 2 to Fold 5 in split 1. The second part contains the index of validation fold, e.g. Fold 1 in  split 1

In [6]:
def cross_validation(x_train, y_train, k=5):
    KFoldData = []
    num_of_data = x_train.shape[0]
    # Get all training data's indices and shuffle the ordering
    data_indices = np.arange(num_of_data)
    random.shuffle(data_indices)

    if num_of_data % k == 0:
        # Create k splits to become validation folds
        valFolds = np.split(data_indices, k, axis=0)
        for i in range(k):
            currentSplit = []
            val_fold = np.sort(valFolds[i])
            train_fold = np.setdiff1d(data_indices, val_fold)
            currentSplit.append(train_fold)
            currentSplit.append(val_fold)
            KFoldData.append(currentSplit)
    else:
        """ 
        When num_of_data is not divisible by K:
        The first (n_samples % n_splits) folds have size (n_samples // n_splits + 1),
        other folds have size (n_samples // n_splits), where n_samples is the number of samples.
        """
        num_of_firstValFolds = num_of_data % k
        num_of_otherValFolds = k - num_of_firstValFolds
        size_of_each_firstValFold = num_of_data // k + 1
        size_of_each_otherValFold = num_of_data // k
        data_of_firstValFolds = data_indices[:num_of_firstValFolds*size_of_each_firstValFold]
        data_of_otherValFolds = np.setdiff1d(data_indices, data_of_firstValFolds)

        # For first parts
        firstValFolds = np.split(data_of_firstValFolds, num_of_firstValFolds, axis=0)
        for i in range(num_of_firstValFolds):
            currentSplit = []
            val_fold = np.sort(firstValFolds[i])
            train_fold = np.setdiff1d(data_of_firstValFolds, val_fold)
            currentSplit.append(train_fold)
            currentSplit.append(val_fold)
            KFoldData.append(currentSplit)

        # For other parts
        otherValFolds = np.split(data_of_otherValFolds, num_of_otherValFolds, axis=0)
        for i in range(num_of_otherValFolds):
            currentSplit = []
            val_fold = np.sort(otherValFolds[i])
            train_fold = np.setdiff1d(data_of_otherValFolds, val_fold)
            currentSplit.append(train_fold)
            currentSplit.append(val_fold)
            KFoldData.append(currentSplit)

    return KFoldData

In [7]:
kfold_data = cross_validation(x_train, y_train, k=10)
assert len(kfold_data) == 10 # should contain 10 fold of data
assert len(kfold_data[0]) == 2 # each element should contain train fold and validation fold
assert kfold_data[0][1].shape[0] == 55 # The number of data in each validation fold should equal to training data divieded by K

In [8]:
# test_kfold_data = cross_validation(x_train_part, y_train_part, k=10)

# for i in range(10):
#     print(f'Split {i}')
#     print(test_kfold_data[i])

## example

In [9]:
from sklearn.model_selection import KFold

X = np.arange(20)
kf = KFold(n_splits=5, shuffle=True)
kfold_data= []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    print("Split: %s, Training index: %s, Validation index: %s" % (i+1, train_index, val_index))
    kfold_data.append([train_index, val_index])

Split: 1, Training index: [ 1  2  4  5  6  7  8 11 12 13 14 15 16 17 18 19], Validation index: [ 0  3  9 10]
Split: 2, Training index: [ 0  1  3  5  6  8  9 10 11 12 13 14 15 16 18 19], Validation index: [ 2  4  7 17]
Split: 3, Training index: [ 0  2  3  4  6  7  9 10 12 13 14 15 16 17 18 19], Validation index: [ 1  5  8 11]
Split: 4, Training index: [ 0  1  2  3  4  5  6  7  8  9 10 11 14 17 18 19], Validation index: [12 13 15 16]
Split: 5, Training index: [ 0  1  2  3  4  5  7  8  9 10 11 12 13 15 16 17], Validation index: [ 6 14 18 19]


In [10]:
assert len(kfold_data) == 5 # should contain 5 fold of data
assert len(kfold_data[0]) == 2 # each element should contains index of training fold and validation fold
assert kfold_data[0][1].shape[0] == 4 # The number of data in each validation fold should equal to training data divieded by K

## Question 2
Using sklearn.svm.SVC to train a classifier on the provided train set and conduct the grid search of “C”, “kernel” and “gamma” to find the best parameters by cross-validation.

In [15]:
clf = SVC(C=1.0, kernel='rbf', gamma=0.01)
clf.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.6927083333333334

In [29]:
## your code
C_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
Gamma_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
best_accuracy = 0.0
best_C, best_gamma = None, None

for C in C_list:
    for gamma in Gamma_list:
        clf = SVC(C=C, kernel='rbf', gamma=gamma)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        current_accuracy = accuracy_score(y_pred, y_test)
        print(f'(C:{C}, gamma:{gamma} -> accuracy:{current_accuracy})')

        if(best_accuracy < current_accuracy):
            best_accuracy =  current_accuracy
            best_C = C
            best_gamma = gamma

(C:0.001, gamma:0.001 -> accuracy:0.6927083333333334)
(C:0.001, gamma:0.01 -> accuracy:0.6927083333333334)
(C:0.001, gamma:0.1 -> accuracy:0.6927083333333334)
(C:0.001, gamma:1.0 -> accuracy:0.6927083333333334)
(C:0.001, gamma:10.0 -> accuracy:0.6927083333333334)
(C:0.001, gamma:100.0 -> accuracy:0.6927083333333334)
(C:0.001, gamma:1000.0 -> accuracy:0.6927083333333334)
(C:0.001, gamma:10000.0 -> accuracy:0.6927083333333334)
(C:0.01, gamma:0.001 -> accuracy:0.6927083333333334)
(C:0.01, gamma:0.01 -> accuracy:0.6927083333333334)
(C:0.01, gamma:0.1 -> accuracy:0.6927083333333334)
(C:0.01, gamma:1.0 -> accuracy:0.6927083333333334)
(C:0.01, gamma:10.0 -> accuracy:0.6927083333333334)
(C:0.01, gamma:100.0 -> accuracy:0.6927083333333334)
(C:0.01, gamma:1000.0 -> accuracy:0.6927083333333334)
(C:0.01, gamma:10000.0 -> accuracy:0.6927083333333334)
(C:0.1, gamma:0.001 -> accuracy:0.6927083333333334)
(C:0.1, gamma:0.01 -> accuracy:0.6927083333333334)
(C:0.1, gamma:0.1 -> accuracy:0.692708333333333

In [30]:
print(best_accuarcy)
print(best_C)
print(best_gamma)

0.8958333333333334
10.0
0.001


In [None]:
print(best_parameters)

## Question 3
Plot the grid search results of your SVM. The x, y represents the hyperparameters of “gamma” and “C”, respectively. And the color represents the average score of validation folds
You reults should be look like this reference image below ![image](https://miro.medium.com/max/1296/1*wGWTup9r4cVytB5MOnsjdQ.png) 

## Question 4
Train your SVM model by the best parameters you found from question 2 on the whole training set and evaluate the performance on the test set. **You accuracy should over 0.85**

In [None]:
y_pred = best_model.predict(x_test)
print("Accuracy score: ", accuracy_score(y_pred, y_test))

## Question 5
Compare the performance of the model you have implemented in HW1 with the SVM

### HW1

In [8]:
train_df = pd.read_csv("../HW1/train_data.csv")
x_train = train_df['x_train'].to_numpy().reshape(-1,1)
y_train = train_df['y_train'].to_numpy().reshape(-1,1)

test_df = pd.read_csv("../HW1/test_data.csv")
x_test = test_df['x_test'].to_numpy().reshape(-1,1)
y_test = test_df['y_test'].to_numpy().reshape(-1,1)

In [42]:
print("Square error of Linear regression: ")
print("Square error of SVM regresssion model: ")

Square error of Linear regression: 
Square error of SVM regresssion model: 
