# Setup and loading data

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import time

from matplotlib.colors import Normalize
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV


In [None]:
# Load the Iyer data set from disk
iyer_datapath = '/mnt/c/Users/isaac/Desktop/CSE447/final_project/DataMiningProject/data/6_cho_iyer/'
iyer_filename = 'iyer.txt'
iyer_data = np.genfromtxt(iyer_datapath + iyer_filename)

# Drop outlier rows: keep only rows whose column 1 (used as the label below)
# is greater than -1.
iyer_data = iyer_data[iyer_data[:, 1] > -1]

# Column 1 is the target; columns 2 onward are the features.
# Column 0 is not used (presumably a row id -- confirm against the data file).
y = iyer_data[:, 1]
X = iyer_data[:, 2:]

# Seeded 80/20 train/test split so the partition is reproducible
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=101,
)


## Fit a scaler, and scale the training data

In [None]:
# Standardize the features for SVM training. The scaler is fitted on Xtrain
# only, so that the very same transformation can later be applied to the
# testing data.
scaler = StandardScaler().fit(Xtrain)

# Replace the training matrix with its standardized version
Xtrain = scaler.transform(Xtrain)

## RBF kernel grid evaluation

In [None]:
# For {RBF} kernel, the relevant parameters are {C, gamma}

# Set parameter range for grid search space (13 log-spaced values each)
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)

# Use parameter ranges to create a parameter grid
param_grid = dict(gamma=gamma_range, C=C_range)

# Create a cross validation generator.
# The shuffle is seeded (same seed as the train/test split) so that the fold
# assignment -- and therefore the selected parameters -- is reproducible from
# run to run instead of changing every time the cell is executed.
cv = sklearn.model_selection.KFold(n_splits=10, shuffle=True, random_state=101)


# Run GridSearchCV: every (C, gamma) pair is scored by 10-fold CV accuracy
grid = GridSearchCV(svm.SVC(decision_function_shape='ovr'), param_grid=param_grid, cv=cv, scoring='accuracy')
results = grid.fit(Xtrain, ytrain)


# Identify best parameter set based on grid scoring method.
# best_score_ is the mean cross-validated accuracy of the best parameter
# combination, averaged over the folds produced by `cv`.
print("The best parameters are %s with a score of %0.2f"
      % (results.best_params_, results.best_score_))


In [None]:
# Draw heatmap of the validation accuracy as a function of the kernel parameters

# Utility function to move the midpoint of a colormap to be around
# the values of interest.

class MidpointNormalize(Normalize):
    """Colormap normalization with a movable midpoint.

    Values are mapped piecewise-linearly so that ``vmin`` -> 0,
    ``midpoint`` -> 0.5 and ``vmax`` -> 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        """Store ``midpoint`` and forward the remaining arguments to Normalize."""
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` interpolated onto [0, 1] as a masked array.

        The ``clip`` argument is accepted for signature compatibility but is
        not used by this implementation.
        """
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        interpolated = np.interp(value, anchors, targets)
        return np.ma.masked_array(interpolated)


# Arrange the mean validation accuracies as a (C, gamma) matrix.
# NOTE(review): this assumes cv_results_ lists the candidates with C varying
# slowest and gamma fastest -- confirm if the parameter grid ever changes.
scores = grid.cv_results_['mean_test_score'].reshape(
    (len(C_range), len(gamma_range))
)

plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(
    scores,
    interpolation='nearest',
    cmap=plt.cm.hot,
    norm=MidpointNormalize(vmin=0.2, midpoint=0.92),
)

# Parameter names
plt.ylabel('C')
plt.xlabel('gamma')
plt.colorbar()
# set_cmap is called after imshow, so the image drawn above ends up using the
# 'gray' colormap rather than the `hot` one passed to imshow.
plt.set_cmap('gray')

# One tick per grid value, labelled with the actual parameter value
plt.yticks(np.arange(len(C_range)), C_range)
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)

plt.title('Iyer RBF Training Validation Accuracy')
plt.savefig("iyer_rbf_gridsearch.png")
plt.show()

## Evaluation of the method

In [None]:
# Train the final RBF-kernel SVC on the (already scaled) training data.
# The C / gamma values are hard-coded here rather than read from the grid
# search; the original note recorded another candidate (C=1000000.0,
# gamma=1e-07), presumably from a different grid-search run.
# NOTE(review): consider using results.best_params_ so the two stay in sync.
rbf = svm.SVC(kernel='rbf', gamma=1e-06, C=10000000)
rbf.fit(Xtrain, ytrain)

# Rescale the testing data with the scaler fitted on the training data.
# Xtest is overwritten, so re-running this cell would scale it a second time.
Xtest = scaler.transform(Xtest)

# Use the predictor on the rescaled testing data
rbf_pred = rbf.predict(Xtest)

# Compute the accuracy and the weighted F1 score of the RBF model on the
# test set
rbf_f1 = f1_score(ytest, rbf_pred, average='weighted')
rbf_accuracy = accuracy_score(ytest, rbf_pred)

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute a ROC AUC score for multi-class label vectors.

    Both label vectors are binarized with a single LabelBinarizer fitted on
    ``y_test`` and the resulting indicator matrices are passed to
    ``sklearn.metrics.roc_auc_score`` with the requested ``average``.

    Method from https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659
    """
    binarizer = sklearn.preprocessing.LabelBinarizer().fit(y_test)
    true_indicators = binarizer.transform(y_test)
    pred_indicators = binarizer.transform(y_pred)
    return sklearn.metrics.roc_auc_score(
        true_indicators, pred_indicators, average=average
    )
# AUC computed from the predicted labels of the RBF model
auc_accuracy = multiclass_roc_auc_score(ytest, rbf_pred)

# Print the results for the method. Accuracy and F1 are shown as
# percentages; the AUC is printed as returned, without scaling.
for _label, _score in (
    ('Accuracy (RBF Kernel): ', rbf_accuracy * 100),
    ('F1 (RBF Kernel): ', rbf_f1 * 100),
    ('AUC Accuracy (RBF Kernel): ', auc_accuracy),
):
    print(_label, "%.2f" % _score)




In [None]:
# Show the dimensions of both splits. As bare expressions the first value was
# computed and then discarded; printing makes both shapes visible.
print(Xtrain.shape)
print(Xtest.shape)