# SVM Learning

In [1]:
%load preamble_directives.py

## Learning Pipeline

(**TODO**) -> Chart of the learning pipeline

## Load Dataset

In [2]:
from coherence import load_coherence_dataset

# Load the dataset object once and unpack its `data` / `target` attributes;
# later cells hand these to scikit-learn as the samples (X) and labels (y).
coherence_ds = load_coherence_dataset()
X, y = coherence_ds.data, coherence_ds.target

## Train-Test Split

Before we construct our model pipeline, we divide the dataset into a separate **training** dataset (75% of the data) and a separate **test** dataset (25% of the data):

In [3]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the samples for the final evaluation. The fixed seed makes
# the split reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1)

## K-fold Cross Validation and HyperParameter Tuning

### Tuning hyperparameters via Grid Search

In [4]:
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

#### Setting HyperParameters to tune

In [5]:
# Logarithmic sweep (1e-4 ... 1e3) shared by the C and gamma searches.
param_range = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# One sub-grid per kernel, so gamma, degree and coef0 are only searched
# together with the kernel they are listed under. Every key carries the
# `clf__` prefix because it addresses the pipeline step named 'clf'.

# Parameters for the linear kernel
linear_grid = dict(clf__C=param_range,
                   clf__kernel=['linear'])

# Parameters for the RBF kernel
rbf_grid = dict(clf__C=param_range,
                clf__gamma=param_range,
                clf__kernel=['rbf'])

# Parameters for the polynomial kernel
poly_grid = dict(clf__C=param_range,
                 clf__degree=[2, 3, 5, 8, 16],
                 clf__coef0=[0, 1],
                 clf__kernel=['poly'])

param_grid = [linear_grid, rbf_grid, poly_grid]

# A one-step `Pipeline` is not strictly required yet; the classifier is
# wrapped anyway so that additional (feature selection) steps can be
# slotted in later if they are considered for inclusion.
pipe_svc = Pipeline(steps=[
    ('clf', SVC(random_state=1)),
])

#### Show HyperParameters Grid

In [6]:
# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import ParameterGrid
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import ParameterGrid

# Expand the grid specification into the explicit list of candidate settings.
hyperp_grid = list(ParameterGrid(param_grid))

# Pretty Printing Grid: bucket every candidate under its kernel name...
kernel_combinations = dict()
for hp in hyperp_grid:
    kernel_combinations.setdefault(hp['clf__kernel'], []).append(hp)

# ...then print one section per kernel, closed by a separator line.
for kernel, candidates in kernel_combinations.items():
    print('Hyperparamters for Kernel: ', kernel.title())
    for hp in candidates:
        print('\t', hp)
    print('-'*80)

Hyperparamters for Kernel:  Poly
	 {'clf__C': 0.0001, 'clf__degree': 2, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 3, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 5, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 8, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 16, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 2, 'clf__coef0': 1, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 3, 'clf__coef0': 1, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 5, 'clf__coef0': 1, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 8, 'clf__coef0': 1, 'clf__kernel': 'poly'}
	 {'clf__C': 0.0001, 'clf__degree': 16, 'clf__coef0': 1, 'clf__kernel': 'poly'}
	 {'clf__C': 0.001, 'clf__degree': 2, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.001, 'clf__degree': 3, 'clf__coef0': 0, 'clf__kernel': 'poly'}
	 {'clf__C': 0.001,

#### Stratified K-Fold Cross Validation

In [7]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

# Exhaustive search over `param_grid`, scoring every candidate by accuracy
# with 10-fold cross-validation (an integer `cv` with a classifier makes
# scikit-learn use stratified folds). n_jobs=-1 uses all available cores.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10, n_jobs=-1)

In [8]:
# Run the grid search on the training split only. `fit` returns the search
# object itself, so there is no need to rebind `gs`.
gs.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
for label, value in (('Best Score: ', gs.best_score_),
                     ('Best Parameters: ', gs.best_params_)):
    print(label, value)
# Per-candidate scores are available too, but very verbose:
# print('Grid Scores: ', gs.grid_scores_)

Best Score:  0.832407407407
Best Parameters:  {'clf__kernel': 'rbf', 'clf__gamma': 1.0, 'clf__C': 1000.0}

In [9]:
# Predict with Best Estimator.
# `gs` was built with the default refit=True, so GridSearchCV has already
# retrained the winning pipeline on all of X_train. Calling fit again here
# would only repeat that work, so the estimator is used as-is.
clf = gs.best_estimator_
y_pred = clf.predict(X_test)

# Accuracy on the held-out 25%, which was never seen during the search.
print('Mean Accuracy on Test set: %.3f' % (clf.score(X_test, y_test)))

Mean Accuracy on Test set: 0.838


In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)


[[217  73]
 [ 44 387]]


In [12]:
# No visible cell imports numpy, so bring `np` into scope here instead of
# relying on whatever the preamble happens to load.
import numpy as np

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

# Nested cross-validation: the outer 10 folds each re-run the whole grid
# search (`gs`) on their training part and score the selected model on the
# held-out part. This estimates the accuracy of the full tuning procedure.
# NOTE(review): the recorded outer-fold scores (about 0.52-0.83, mean 0.650)
# are far below the 0.838 hold-out accuracy. An integer `cv` builds
# unshuffled folds while `train_test_split` shuffles, so the samples may be
# stored in a meaningful order. Confirm, and consider a shuffled
# StratifiedKFold for the outer loop.
scores = cross_val_score(gs, X, y, scoring='accuracy', cv=10, n_jobs=-1)
print('All CV Scores')
for i, score in enumerate(scores):
    print(i, '): ', score)

print('CV accuracy : %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


All CV Scores
0 ):  0.560553633218
1 ):  0.602076124567
2 ):  0.515570934256
3 ):  0.565972222222
4 ):  0.645833333333
5 ):  0.770833333333
6 ):  0.826388888889
7 ):  0.756944444444
8 ):  0.662020905923
9 ):  0.595818815331
CV accuracy : 0.650 +/- 0.098
