# SVM
    

In [2]:
# Use Pandas to import as a data frame
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')




In [3]:
from sklearn.cross_validation import ShuffleSplit

# Create a training set and a cross validation set using 50% of the data for cross validation

# dftrain, dfcross = train_test_split(train, test_size = 0.5)

# Set X and y
X = train.values
y = train['label'].values

num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n=num_instances,
                         n_iter=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(42000, n_iter=3, test_size=0.2, random_state=None)


In [4]:
# Check Data 

print "Checking for Null values or duplicate rows... ", 
if train.isnull().all().any() or train.duplicated().all(): 
    print('error in data')
else:
    print('None found')

Checking for Null values or duplicate rows...  None found


In [5]:
from sklearn.preprocessing import StandardScaler

# we want to normalize the features based upon the mean and standard deviation of each column. 
# However, we do not want to accidentally use the testing data to find out the mean and std (this would be snooping)
# to Make things easier, let's start by just using whatever was last stored in the variables:
##    X_train , y_train , X_test, y_test (they were set in a for loop above)

# scale attributes by the training set
scl_obj = StandardScaler()

# okay, so run through the cross validation loop and set the training and testing variable for one single iteration
for train_indices, test_indices in cv_object: 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
scl_obj.fit(X_train) # find scalings for each column that make this zero mean and unit std
# the line of code above only looks at training data to get mean and std and we can use it 
# to transform new feature data

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) # apply those means and std to the test set (without snooping at the test set values)
    
X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) 



# Below is a linear Support Vector Classifier


In [23]:
# lets investigate SVMs on the data and play with the parameters and kernels
from sklearn.svm import SVC
from sklearn import metrics as mt


# train the model just as before
svm_clf = SVC(C=0.5, kernel='linear', degree=3, gamma='auto') # get object
svm_clf.fit(X_train_scaled, y_train)  # train object

y_hat = svm_clf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)
print('accuracy:', acc )
print(conf)

('accuracy:', 0.97928571428571431)
[[847   0   4   0   0   0   0   0   0   0]
 [  1 951   8   0   0   0   0   0   0   0]
 [  6  13 823   1   8   0   0   0   0   0]
 [  3   2   7 845   3   0   0   0   0   0]
 [  0   4   4   1 754   7   0   2   0   0]
 [  1   0   0   3   5 739   8   1   0   0]
 [  0   0   0   0   5  15 769   1   0   0]
 [  0   0   0   0   4   0   1 917  10   2]
 [  0   0   0   0   0   2   9   3 772  16]
 [  0   0   0   0   2   0   2   2   8 809]]


In [None]:
# if using linear kernel, these make sense to look at (not otherwise, why?)

import seaborn as sns


print(svm_clf.coef_)
weights = pd.Series(svm_clf.coef_[0],index=train.columns)

sns = weights.plot(color = "purple")


In [24]:
# 
# Split Training Data in half to allow cross validation
# The method,train_test_split divides our data set into 
# training and test data through randomization
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=28)

In [None]:
import time
start = time.time()

import sklearn as sk
import numpy as np
#  default settings sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, 
#  C=1.0, multi_class='ovr', fit_intercept=True, 
#  intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)[source]
nsvc_clf = sk.svm.NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, 
      probability=False, tol=0.001, cache_size=200, class_weight=None, 
      verbose=False, max_iter=1000, decision_function_shape=None, random_state=None)

nsvc_clf.fit(X_train_scaled, y_train)  # train object

y_hat = svm_clf.predict(X_test_scaled) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)

print('')
print('accuracy:', acc )
print(conf)

In [None]:
# These are the number of support vectors
svm_clf.n_support_ 

In [None]:
# look at the support vectors
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_ )

### The plot above shows a high concentration of vectors in pixels 300-500

In [None]:
# Now let's do some different analysis with the SVM and look at the instances that were chosen as support vectors

# now lets look at the support for the vectors and see if we they are indicative of anything
# grabe the rows that were selected as support vectors (these are usually instances that are hard to classify)

# make a dataframe of the training data
df_tested_on = train.iloc[train_indices] # saved from above, the indices chosen for training
# now get the support vectors from the trained model
df_support = df_tested_on.loc[svm_clf.support_,:]

df_support['label'] = y[svm_clf.support_] # add back in the 'Survived' Column to the pandas dataframe
train['label'] = y # also add it back in for the original data
df_support.info()

In [None]:
# now lets see the statistics of these attributes
from pandas.tools.plotting import boxplot

# group the original data and the support vectors
df_grouped_support = df_support.groupby(['label'])
df_grouped = train.groupby(['label'])

# plot KDE of Different variables
vars_to_plot = []
for i in range(len(df_grouped)):
    if i > 300 and i < 500:
        vars_to_plot.append(df_grouped.columns[i])

for v in vars_to_plot:
    sns = plt.figure(figsize=(10,4))
    # plot support vector stats
    plt.subplot(1,2,1)
    ax = df_grouped_support[v].plot.kde() 
    plt.legend(['pixel300','pixel450'])
    plt.title(v+' (Instances chosen as Support Vectors)')
    
    # plot original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde() 
    plt.legend(['pixel300','pixel450'])
    plt.title(v+' (Original)')

