# SVM for classification

In [1]:
from sklearn.datasets import load_svmlight_file
import urllib

In [2]:
# Set to True to (re)download the ijcnn1 archive from the LIBSVM dataset site.
download = False

if download:
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on another library having already done so.
    import urllib.request

    target_page = 'http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/ijcnn1.bz2'
    # NOTE(review): the archive is written to the current directory, whereas the
    # loading cell reads '../datasets/ijcnn1.bz2' -- move the file (or align the
    # two paths) before loading.
    with urllib.request.urlopen(target_page) as response:
        with open('ijcnn1.bz2', 'wb') as W:
            W.write(response.read())
        

In [3]:
# Load the ijcnn1 data (svmlight format), then keep only the first
# `first_rows` samples of both the feature matrix and the label vector.
X_train, y_train = load_svmlight_file('../datasets/ijcnn1.bz2')
first_rows = 2500
X_train = X_train[:first_rows, :]
y_train = y_train[:first_rows]

In [6]:
# Display the first 10 rows of the training matrix; for a sparse matrix the
# repr shows its shape, dtype and number of stored elements, not the values.
X_train[:10]

<10x22 sparse matrix of type '<class 'numpy.float64'>'
	with 130 stored elements in Compressed Sparse Row format>

In [8]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# 5-fold cross-validated accuracy of an RBF-kernel SVC on (X_train, y_train).
hypothesis = SVC(kernel='rbf', random_state=101)
scores = cross_val_score(hypothesis, X_train, y_train, cv=5, scoring='accuracy')

# The backslash is written as "\\": a bare "\ " is an invalid escape sequence
# that newer Python versions warn about. The printed text is unchanged.
print("SVC with rbf kernel -> cross validation accuracy: mean = %0.3f \\ std = %0.3f"
      % (np.mean(scores), np.std(scores)))


SVC with rbf kernel -> cross validation accuracy: mean = 0.910 \ std = 0.001


In [9]:
import pickle
from sklearn.datasets import fetch_covtype

# Set to True to fetch the covertype dataset and cache it locally as a pickle.
download_pickle = False
if download_pickle:
    covertype_dataset = fetch_covtype(random_state=101, shuffle=True)
    # Use a context manager so the file is flushed and closed deterministically
    # (the handle was previously left open).
    # NOTE(review): the pickle is written to the current directory, whereas the
    # loading cell reads '../datasets/covertype_dataset.pickle'.
    with open("covertype_dataset.pickle", "wb") as pickle_file:
        pickle.dump(covertype_dataset, pickle_file)
    

In [10]:
# SECURITY: pickle.load can execute arbitrary code -- only load a pickle file
# that you created yourself (e.g. with the download cell).
# The context manager closes the file handle, which was previously leaked.
with open("../datasets/covertype_dataset.pickle", "rb") as pickle_file:
    covertype_dataset = pickle.load(pickle_file)

# Work on a sub-sample: the first `covertype_rows` rows only.
covertype_rows = 25000
covertype_X = covertype_dataset.data[:covertype_rows, :]
# Shift labels down by one (presumably the raw labels are 1-based, so this
# makes them start at 0 -- verify against the dataset description).
covertype_y = covertype_dataset.target[:covertype_rows] - 1

In [11]:
import numpy as np

# Class names, paired positionally with the label counts below
# (entry i is reported next to the count of label i in covertype_y).
covertypes = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine', 'Cottonwood/Willow',
              'Aspen', 'Douglas-fir', 'Krummholz']

print('original dataset:', covertype_dataset.data.shape)
print('sub-sample:', covertype_X.shape)
# minlength guarantees one count per class name: without it, bincount stops at
# the largest label present and zip() would silently drop any trailing class
# that happens to be missing from the sub-sample instead of reporting 0.
class_counts = np.bincount(covertype_y, minlength=len(covertypes))
print('target freq:', list(zip(covertypes, class_counts)))


original dataset: (581012, 54)
sub-sample: (25000, 54)
target freq: [('Spruce/Fir', 9107), ('Lodgepole Pine', 12122), ('Ponderosa Pine', 1583), ('Cottonwood/Willow', 120), ('Aspen', 412), ('Douglas-fir', 779), ('Krummholz', 877)]


In [15]:
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.svm import LinearSVC

# Linear SVM with class_weight='balanced' (the class frequencies of the
# sub-sample are strongly uneven), scored by accuracy over 3 stratified,
# shuffled folds with a fixed seed for the fold assignment.
hypothesis = LinearSVC(dual=False, class_weight='balanced')
cv_strata = StratifiedKFold(n_splits=3, shuffle=True, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y,
                         cv=cv_strata, scoring='accuracy')

# The backslash is written as "\\": a bare "\ " is an invalid escape sequence
# that newer Python versions warn about. The printed text is unchanged.
print("LinearSVC -> cross validation accuracy: mean = %0.3f \\ std = %0.3f"
      % (np.mean(scores), np.std(scores)))

LinearSVC -> cross validation accuracy: mean = 0.652 \ std = 0.030
