# SVM w praktyce

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.utils import shuffle


# fmnist - domyślnie jest 60k treningowych i 10k testowych
(x_train_fmnist, y_train_fmnist), (x_test_fmnist, y_test_fmnist) = tf.keras.datasets.fashion_mnist.load_data()

# polaczenie w 1 zbior
x_fmnist=np.concatenate((x_train_fmnist,x_test_fmnist))
y_fmnist=np.concatenate((y_train_fmnist,y_test_fmnist))


# tng
from sklearn.datasets import fetch_20newsgroups_vectorized
newsgroups_train = fetch_20newsgroups_vectorized(subset='all')

news_x = newsgroups_train.data
news_y = newsgroups_train.target

x, y = resample(news_x, news_y, n_samples=18000, replace=False, random_state=0)
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.4, random_state=0 )


# MISC FUNKCJE DO POTENCJALNEGO WYKORZYSTANIA

# x_train = x_train.reshape((x_train.shape[0],-1))
# x_test = x_test.reshape((x_test.shape[0],-1))
# x_train, y_train = resample(x_train, y_train, n_samples=60000, replace=False, random_state=0)
# x_test, y_test = resample(x_test, y_test, n_samples=10000, replace=False, random_state=0)

# #SVM
# tng_df=SVM_analyze(x_train,y_train,x_test,y_test)
# tng_df

# y_train = y_train.reshape((y_train.shape[0],))

# from sklearn.preprocessing import StandardScaler

# scaler_mnist = StandardScaler().fit(x_train)
# x_train = scaler_mnist.transform(x_train)
# x_test = scaler_mnist.transform(x_test)

In [2]:
# pełen zbiór SVC
clf = SVC(C=100, kernel='linear')
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('kernel=linear, C=%f, score=%f' %(100, score))
score

kernel=linear, C=100.000000, score=0.862917


0.8629166666666667

In [3]:
# gwoli splitowania w 3a:
split_sizes=range(1,10)

for split_size in split_sizes:
    split=split_size/10
#     print(split)
    train_test_split(x, y, test_size=split, random_state=0)
    # dalsza logika tutaj

sklearn.svm.SVC pozwala na dobieranie kernela oraz parametrów uczenia.

Możliwe kernele to: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ albo kernel stworzony przez siebie.

Najważniejsze parametry uczenia, których wartości można dobierać, to:
- C: parametr służący do regularyzacji, proporcjonalny do 1/lambda. Musi być liczbą dodatnią (default=1.0)
- degree: stopień wielomianu (przy użyciu kernela 'poly')
- gamma: współczynnik dla kerneli 'rbf', 'poly', 'sigmoid' ze zbioru {'scale', 'auto'} albo float (default='scale')
- coef0: wartość "r" dla kerneli 'poly' i 'sigmoid', float (default=0.0)

(szczegóły: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html )

In [None]:
def SVM_analyze(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    arrC=[0.01, 0.1, 1.0, 10.0, 100.0]

    kernelArr=[]
    supportArrC=[]
    arrDegree=[]
    arrScore=[]
    arrCoef0=[]
    
    for C in arrC:
        clf = SVC(C=C, kernel='linear')
        clf.fit(x_train,y_train)
        y_pred = clf.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        kernelArr.append("linear")
        supportArrC.append(C)
        arrDegree.append("")
        arrScore.append(score)
        arrCoef0.append("")        

    for C in arrC:
        for degree in [2,3,4]:
            clf = SVC(C=C, kernel='poly', degree=degree)
            clf.fit(x_train,y_train)
            y_pred = clf.predict(x_test)
            score = accuracy_score(y_test, y_pred)
            kernelArr.append("poly")
            supportArrC.append(C)
            arrDegree.append(degree)
            arrScore.append(score)
            arrCoef0.append("")

    for C in arrC:
        for coef0 in [0.0,0.5,1.0]:
            clf = SVC(C=C, kernel='sigmoid', coef0=coef0)
            clf.fit(x_train,y_train)
            y_pred = clf.predict(x_test)
            score = accuracy_score(y_test, y_pred)
            kernelArr.append("sigmoid")
            supportArrC.append(C)
            arrDegree.append("")
            arrScore.append(score)
            arrCoef0.append(coef0)

    dataframe=pd.DataFrame({"C":supportArrC,"degree":arrDegree,"coef0":arrCoef0,"score":arrScore,"kernel":kernelArr})
    
    return dataframe.style.apply(lambda x: ["background: red" if v == x.max() else "" for v in x], axis = 0, subset="score")

In [4]:
import random


def select_random_features(X, feats_num):
    arrX = X.toarray()
    X_random = []
    random_feats = random.choices(list(range(arrX.shape[1])), k=feats_num)
    X_random = arrX[:, random_feats]
#     for x in arrX:
#         row = [x[i] for i in random_feats]
#         X_random.append(row)
    return np.asarray(X_random)

In [6]:
# wszystkich cech jest 130k
x, y = resample(news_x, news_y, n_samples=500, replace=False, random_state=0)

## RANDOM
X_r = select_random_features(x, 13000)
x_train, x_test, y_train, y_test = train_test_split(X_r, y, test_size=0.4, random_state=0)
clf = SVC(C=100, kernel='linear')
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('Random accuracy', score)

## KBEST
from sklearn.feature_selection import SelectKBest, chi2
clf = SVC(C=100, kernel='linear')
X_kbest = SelectKBest(chi2, k=1300).fit_transform(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.4, random_state=0)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('Kbest accuracy', score)
    
## RFE    
# from sklearn.feature_selection import RFE
# clf = SVC(C=100, kernel='linear')
# rfe_selector = RFE(estimator=clf, n_features_to_select=10, step=100)
# X_rfe = rfe_selector.fit_transform(x, y)
# x_train, x_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.4, random_state=0)
# clf.fit(x_train,y_train)
# y_pred = clf.predict(x_test)
# score = accuracy_score(y_test, y_pred)
# print('RFE accuracy', score)


## SFS
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# clf = SVC(C=100, kernel='linear')
# sfsForward = SFS(clf, k_features=10, forward=True, n_jobs=-1)
# X_sfs = sfsForward.fit_transform(x, y)
# x_train, x_test, y_train, y_test = train_test_split(X_sfs, y, test_size=0.4, random_state=0)
# clf.fit(x_train,y_train)
# y_pred = clf.predict(x_test)
# score = accuracy_score(y_test, y_pred)
# print('SFS accuracy', score)
    

Random accuracy 0.235
Kbest accuracy 0.385


In [None]:
num_samples = [100, 300, 500, 1000, 2000]
num_feats = [100, 1300, 6500, 13000]

for i in num_samples:
    for j in num_feats:
        x, y = resample(news_x, news_y, n_samples=i, replace=False, random_state=0)
        X_r = select_random_features(x, j)
        x_train, x_test, y_train, y_test =  train_test_split(X_r, y, test_size=0.4, random_state=0)
        clf = SVC(C=100, kernel='linear')
        clf.fit(x_train,y_train)
        y_pred = clf.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        print('samples=%d, feats=%d, score=%f' %(i, j, score))
        

samples=100, feats=100, score=0.025000
samples=100, feats=1300, score=0.050000
samples=100, feats=6500, score=0.025000
samples=100, feats=13000, score=0.075000
samples=300, feats=100, score=0.033333
samples=300, feats=1300, score=0.033333
samples=300, feats=6500, score=0.141667
samples=300, feats=13000, score=0.191667
samples=500, feats=100, score=0.030000
samples=500, feats=1300, score=0.095000
samples=500, feats=6500, score=0.130000
samples=500, feats=13000, score=0.190000
samples=1000, feats=100, score=0.055000
samples=1000, feats=1300, score=0.080000
samples=1000, feats=6500, score=0.147500
samples=1000, feats=13000, score=0.237500
samples=2000, feats=100, score=0.057500
samples=2000, feats=1300, score=0.115000
samples=2000, feats=6500, score=0.273750
