# SVM w praktyce

In [None]:
#he found one simple trick to ignore warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import imgaug.augmenters as iaa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#set only 3 precision
pd.set_option('precision', 3)

import random
import tensorflow as tf
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample, shuffle

In [None]:
# fmnist - domyślnie jest 60k treningowych i 10k testowych
(x_train_fmnist, y_train_fmnist), (x_test_fmnist, y_test_fmnist) = tf.keras.datasets.fashion_mnist.load_data()

# polaczenie w 1 zbior
x_fmnist=np.concatenate((x_train_fmnist,x_test_fmnist))
y_fmnist=np.concatenate((y_train_fmnist,y_test_fmnist))


# tng
from sklearn.datasets import fetch_20newsgroups_vectorized
newsgroups_train = fetch_20newsgroups_vectorized(subset='all')

news_x = newsgroups_train.data
news_y = newsgroups_train.target

x, y = resample(news_x, news_y, n_samples=18000, replace=False, random_state=0)
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.4, random_state=0 )

In [None]:
def select_random_features(X, feats_num):
    random_feats = random.choices(list(range(X.shape[1])), k=feats_num)
    col_selector = ColumnSelector(cols=random_feats)
    return col_selector.transform(X)

In [None]:
num_samples = [100, 300, 500, 1000, 3000, 5000]
num_feats = [100, 1300, 6500, 13000, 26000, 39000, 52000]

samples=[]
features=[]
scores=[]

for i in num_samples:
    x, y = resample(news_x, news_y, n_samples=i, replace=False, random_state=0)
    for j in num_feats:
        X_r = select_random_features(x, j)
        x_train, x_test, y_train, y_test =  train_test_split(X_r, y, test_size=0.4, random_state=0)
        clf = SVC(C=100, kernel='linear')
        clf.fit(x_train,y_train)
        y_pred = clf.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        
        samples.append(i)
        features.append(j)
        scores.append(score)
        print('samples=%d, feats=%d, score=%f' %(i, j, score))

In [None]:
dataframe=pd.DataFrame({"samples":samples,"features":features,"accuracy":scores})
dataframe

In [None]:
pivot=dataframe.pivot(index='samples', columns='features', values='accuracy')
pivot

In [None]:
pivot.style.apply(lambda x: ["background: red" if v == x.max() else "" for v in x], axis = 1)

In [None]:
import seaborn.apionly as sns
ax = sns.heatmap(pivot, square=True)
plt.setp(ax.xaxis.get_majorticklabels(), rotation=90 )
plt.tight_layout()
plt.show()

# Zadanie 3a

In [None]:
def analyze_SVM(x, y, split, max_iter):
    x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=split, random_state=0)
    clf = SVC(C=100, kernel='linear', max_iter=max_iter)
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print('\t- accuracy = %f' %(score))
    return score

In [None]:
def create_clfs_and_pipe(x,y,max_iter=-1):
    clfs = []
    clfs_pipe = []

    for i in range(10):
        clf = SVC(C=100, kernel='linear', probability=True, max_iter=max_iter)
        name = 'svc' + str(i)
        # 10% cech
        feats_num = int(0.15 * x.shape[1])
        random_feats = random.choices(list(range(x.shape[1])), k=feats_num)
        
        col_sel = ColumnSelector(cols=random_feats)
        clf_pipe = Pipeline([('sel', col_sel), (name, clf)])
        p_name = 'pipe' + str(i)
        clfs_pipe.append((p_name, clf_pipe))
        clfs.append(clf)
        
    return clfs, clfs_pipe

In [None]:
def analyze_sub_SVM(x, y, split, max_iter):
    x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=split, random_state=0)
    
    clfs, clfs_pipe=create_clfs_and_pipe(x,y,max_iter)

    eclf = VotingClassifier(estimators=clfs_pipe, voting='soft')
    eclf = eclf.fit(x_train, y_train)
    y_pred = eclf.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print('\t- voting score = %f' %(score))
    return score

In [7]:
def compare_train_size(x, y, max_iter=-1):
    matrix=[]
    
    # pelny SVM
    # docelowo caly zbior, ale dla sprawdzenia poprawnosci i zobaczenia wynikow w realnym czasie wycinamy z niego 10% 
    samples_num = int(0.1*x.shape[0])
    x1, y1 = resample(x, y, n_samples=samples_num, replace=False, random_state=0)
    
    #zbior pod-SVMow
    x2, y2 = resample(x, y, n_samples=500, replace=False, random_state=0)
    split_sizes=range(1,10)
    
    for split_size in split_sizes:
        row=[]
        split=split_size/10
        print('\nTrain size =', 1-split)
        row.append(1-split)
        row.append(analyze_SVM(x1, y1, split, max_iter))
        row.append(analyze_sub_SVM(x2, y2, split, max_iter))
        matrix.append(row)
    
    return matrix

### TNG

In [None]:
matrix=compare_train_size(news_x, news_y)
dataframe=pd.DataFrame(matrix,columns=['Test_Size','full_accuracy','voting_accuracy'])
dataframe

### FMNIST

In [None]:
d1, d2, d3 = x_fmnist.shape
x_fmnist_reshaped = x_fmnist.reshape((d1, d2*d3))
matrix=compare_train_size(x_fmnist_reshaped, y_fmnist)
dataframe=pd.DataFrame(matrix,columns=['Test_Size','full_accuracy','voting_accuracy'])
dataframe

# Zadanie 3b

In [None]:
max_iterations=[10,100,1000,10000]

def compute_limit_iterations(x,y):
    dataframes=[]
    
    for max_iter in max_iterations:
        print('\n ## ITERATIONS =', max_iter,' ##\n')    
        matrix=compare_train_size(x, y, max_iter)
        df=pd.DataFrame(matrix,columns=['train_size','full_accuracy','voting_accuracy'])
        df['iters']=max_iter
        dataframes.append(df)

    return pd.concat(dataframes)

### TNG

In [None]:
big_tng_df=compute_limit_iterations(news_x, news_y)  
big_tng_df

In [None]:
tng_pivots=[]

In [None]:
tng_pivots.append(big_tng_df.pivot(index='train_size', columns='iters', values='full_accuracy'))
tng_pivots[0]

In [None]:
tng_pivots.append(big_tng_df.pivot(index='train_size', columns='iters', values='voting_accuracy'))
tng_pivots[1]

In [None]:
# import seaborn.apionly as sns
# ax1 = sns.heatmap(tng_pivot, square=True)
# plt.setp(ax1.xaxis.get_majorticklabels(), rotation=90 )
# plt.tight_layout()
# plt.show()

In [None]:
fig, axes = plt.subplots(1,2, sharey='row')

# tng_pivots[0]['type']='accuracy'
# tng_pivots[1]['type']='accuracy'

# pd.concat(tng_pivots).pivot_table(index=['type','name'], columns='percent_noise', values='accuracy_score')

axes[0].set_yticks(range(len(pivot.index)))
axes[0].set_yticklabels(pivot.index)
axes[0].set_ylabel("Train Size")

for pivot,ax in zip(tng_pivots,axes):
    im=ax.imshow(pivot, cmap="Greens")

    ax.set_xticks(range(len(pivot.columns)))
    ax.set_xticklabels(pivot.columns, rotation=90)
    ax.set_xlabel("Iterations")
    
    fig.colorbar(im, ax=ax)

### FMNIST

In [None]:
d1, d2, d3 = x_fmnist.shape
x_fmnist_reshaped = x_fmnist.reshape((d1, d2*d3))
compute_limit_iterations(x_fmnist_reshaped, y_fmnist)    

# Zadanie 4

In [8]:
def random_label(label, a, b):
    new_label = random.randint(a, b)
    while new_label == label:
        new_label = random.randint(a, b)
    return new_label
    

def noise_label(y, noise_size, labels_count):
    if noise_size == 0:
        return y
    
    y_noised = np.array(y)
    
    noise_num = int(noise_size*y_noised.shape[0])
    random_noise = random.choices(list(range(y_noised.shape[0])), k=noise_num)
    for i in random_noise:
        y_noised[i] = random_label(y_noised[i], 0, labels_count)
    return y_noised


def noise_fmnist(x, noise_size, noise_level=0.2):
    if noise_size == 0:
        return x
    
    x_noised = np.array(x)
#     noise_level = 0.2 ## mozna sprawdzic rozne poziomy zaszumienia
    
    noise_num = int(noise_size*x_noised.shape[0])
    random_noise = random.choices(list(range(x_noised.shape[0])), k=noise_num)
    for i in random_noise:
        image = noise_image(x_noised[i])
        x_noised[i] = image
#         image = x_noised[i].reshape((28, 28))
#         aug = iaa.AdditiveGaussianNoise(scale=(0, noise_level*255))
#         blurred = aug(images=[image])
#         noised_image = blurred[0]
#         x_noised[i] = noised_image.reshape((28*28))
    return x_noised

def noise_image(image):
    noised_image = np.array(image)
    for i in range(noised_image.shape[0]):
#         bit = noised_image[i]
#         if bit == 0:
#             bound = 250 
#         else:
#             bound = int(0.5*bit)
            
#         noise = random.randint(0, 100)
        noised_image[i] = noised_image[i] * 0.1
        
    return noised_image


def noise_tng(x, noise_size):
    if noise_size == 0:
        return x
    
    ## konwersja do array, zeby moc zmieniac komorki tablicy (sparse matrix do tego sie nie nada)
    x_noised = np.array(x.toarray())
    noise_num = int(noise_size*x_noised.shape[0])
    random_noise = random.choices(list(range(x_noised.shape[0])), k=noise_num)
    for i in random_noise:
        x_noised[i] = noise_vector(x_noised[i])
    return x_noised


def noise_vector(vector, noise_level=0.1):
    vector_noised = np.array(vector)
    
    ## zaszumiamy co 10 ceche tylko
    noise_num = int(noise_level*vector_noised.shape[0])
    random_noise = random.choices(list(range(vector_noised.shape[0])), k=noise_num)
    
    for i in random_noise:
        if vector_noised[i] == 0:
            bound = 0.1
        elif vector_noised[i] > 0.9:
            bound = 0
        else:
            bound = noise_level * vector_noised[i]
        
        noise = random.uniform(0, bound)
        vector_noised[i] = vector_noised[i] + noise
    
    return np.array(vector_noised)

def dataframe_noise_labels(percentage_noise, noised_classifiers,
                           x_train, y_train, x_test, y_test, class_num):
    print("### LABEL NOISE ###")
    matrix=[]
    for s in percentage_noise:
        print('\nnoise_level = ', s)
        y_noised = noise_label(y_train, s, class_num) ## tng 20 cech
        
        for (name, clf) in noised_classifiers:
            print("\t-", name)
            row=[name,s]

            clf.fit(x_train, y_noised)
            y_pred = clf.predict(x_test)
            score = accuracy_score(y_test, y_pred)
            print('\t    noise accuracy = %f' %(score))
            row.append(score)
            matrix.append(row)
            
    return pd.DataFrame(matrix, columns=['name','percent_noise','accuracy_score'])

### TNG

In [9]:
x, y = resample(news_x, news_y, n_samples=1000, replace=False, random_state=0)
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.3, random_state=0)

clfs, clfs_pipe= create_clfs_and_pipe(x,y)

# deklaracja klasyfikatorow
percentage_noise=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
noised_classifiers= [
    ('SVC', SVC(C=100, kernel='linear')), 
    ('Voting', VotingClassifier(estimators=clfs_pipe, voting='hard'))
]


In [10]:
## bledne etykiety
df_tng_noised_label=dataframe_noise_labels(percentage_noise, 
                                           noised_classifiers, 
                                           x_train=x_train,
                                           y_train=y_train, 
                                           x_test=x_test,
                                           y_test=y_test,
                                           class_num=19)
df_tng_noised_label

### LABEL NOISE ###

noise_level =  0.0
	- SVC
	    noise accuracy = 0.486667
	- Voting
	    noise accuracy = 0.416667

noise_level =  0.1
	- SVC
	    noise accuracy = 0.463333
	- Voting
	    noise accuracy = 0.410000

noise_level =  0.2
	- SVC
	    noise accuracy = 0.403333
	- Voting
	    noise accuracy = 0.376667

noise_level =  0.3
	- SVC
	    noise accuracy = 0.390000
	- Voting
	    noise accuracy = 0.356667

noise_level =  0.4
	- SVC
	    noise accuracy = 0.336667
	- Voting
	    noise accuracy = 0.336667

noise_level =  0.5
	- SVC
	    noise accuracy = 0.266667
	- Voting
	    noise accuracy = 0.323333

noise_level =  0.6
	- SVC
	    noise accuracy = 0.243333
	- Voting
	    noise accuracy = 0.280000

noise_level =  0.7
	- SVC
	    noise accuracy = 0.283333
	- Voting
	    noise accuracy = 0.223333


Unnamed: 0,name,percent_noise,accuracy_score
0,SVC,0.0,0.487
1,Voting,0.0,0.417
2,SVC,0.1,0.463
3,Voting,0.1,0.41
4,SVC,0.2,0.403
5,Voting,0.2,0.377
6,SVC,0.3,0.39
7,Voting,0.3,0.357
8,SVC,0.4,0.337
9,Voting,0.4,0.337


In [None]:
# ## bledne wektory, baaaardzo dlugo sie licza
print("### VECTOR NOISE ###")

for s in percentage_noise:
    print('\nnoise_level = ', s)
    x_noised = noise_tng(x_train, s)
    print('Noise applied')
    
    for (name, clf) in noised_classifiers:
        print("\t-", name)
        
        clf.fit(x_noised, y_train)
        print('Classifier fitted')
        y_pred = clf.predict(x_test.toarray()) ## konwersja sparse matrix do array
        score = accuracy_score(y_test, y_pred)
        print('\t    vector noise accuracy = %f' %(score))

### FMNIST

In [12]:
d1, d2, d3 = x_fmnist.shape
x_fmnist_reshaped = x_fmnist.reshape((d1, d2*d3))

x, y = resample(x_fmnist_reshaped, y_fmnist, n_samples=1000, replace=False, random_state=0)
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.3, random_state=0)

clfs, clfs_pipe= create_clfs_and_pipe(x,y)

# deklaracja klasyfiaktorow
percentage_noise=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
noised_classifiers= [
    ('SVC', SVC(C=100, kernel='linear')), 
    ('Voting', VotingClassifier(estimators=clfs_pipe, voting='soft'))
]

# for s in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
#     print('\nnoise_level = ', s)
    
#     for (name, clf) in [('SVC', svc), ('Voting', eclf)]:
#         print("\t-", name)
        
#         y_noised = noise_label(y_train, s, 9) # fmnist 10 cech
#         clf.fit(x_train, y_noised)
#         y_pred = clf.predict(x_test)
#         score = accuracy_score(y_test, y_pred)
#         print('\t\t+ label noise accuracy = %f' %(score))
        
#         x_noised = noise_fmnist(x_train, s) 
#         clf.fit(x_noised, y_train)
#         y_pred = clf.predict(x_test)
#         score = accuracy_score(y_test, y_pred)
#         print('\t\t+ vector noise accuracy = %f' %(score))

In [14]:
## bledne etykiety
df_fmnist_noised_label=dataframe_noise_labels(percentage_noise, 
                                           noised_classifiers, 
                                           x_train=x_train,
                                           y_train=y_train, 
                                           x_test=x_test,
                                           y_test=y_test,
                                           class_num=9)
df_fmnist_noised_label

### LABEL NOISE ###

noise_level =  0.0
	- SVC
	    noise accuracy = 0.806667
	- Voting
	    noise accuracy = 0.763333

noise_level =  0.1
	- SVC
	    noise accuracy = 0.766667
	- Voting
	    noise accuracy = 0.730000

noise_level =  0.2
	- SVC
	    noise accuracy = 0.680000
	- Voting
	    noise accuracy = 0.713333

noise_level =  0.3
	- SVC
	    noise accuracy = 0.670000
	- Voting
	    noise accuracy = 0.700000

noise_level =  0.4
	- SVC
	    noise accuracy = 0.630000
	- Voting
	    noise accuracy = 0.640000

noise_level =  0.5
	- SVC
	    noise accuracy = 0.526667
	- Voting
	    noise accuracy = 0.606667

noise_level =  0.6
	- SVC
	    noise accuracy = 0.443333
	- Voting
	    noise accuracy = 0.530000

noise_level =  0.7
	- SVC
	    noise accuracy = 0.446667
	- Voting
	    noise accuracy = 0.500000

noise_level =  0.8
	- SVC
	    noise accuracy = 0.430000
	- Voting
	    noise accuracy = 0.403333


Unnamed: 0,name,percent_noise,accuracy_score
0,SVC,0.0,0.807
1,Voting,0.0,0.763
2,SVC,0.1,0.767
3,Voting,0.1,0.73
4,SVC,0.2,0.68
5,Voting,0.2,0.713
6,SVC,0.3,0.67
7,Voting,0.3,0.7
8,SVC,0.4,0.63
9,Voting,0.4,0.64


In [None]:
## bledne wektory
print("### VECTOR NOISE ###")

fmnist_noised_vector=[]

for s in percentage_noise:
    print('\npercent noise = ', s)
    x_noised = noise_fmnist(x_train, s) 
    for (name, clf) in noised_classifiers:
        print("\t-", name)
        row=[name,s]
        
        clf.fit(x_noised, y_train)
        y_pred = clf.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        print('\t    vector noise accuracy = %f' %(score))
        row.append(score)
        fmnist_noised_vector.append(row)        

### VECTOR NOISE ###

percent noise =  0.0
	- SVC
	    vector noise accuracy = 0.806667
	- Voting
	    vector noise accuracy = 0.766667

percent noise =  0.1
	- SVC
	    vector noise accuracy = 0.700000
	- Voting
	    vector noise accuracy = 0.753333

percent noise =  0.2
	- SVC
	    vector noise accuracy = 0.723333
	- Voting
	    vector noise accuracy = 0.763333

percent noise =  0.3
	- SVC
	    vector noise accuracy = 0.666667
	- Voting
	    vector noise accuracy = 0.700000

percent noise =  0.4
	- SVC
	    vector noise accuracy = 0.686667
	- Voting
	    vector noise accuracy = 0.730000

percent noise =  0.5
	- SVC
	    vector noise accuracy = 0.660000
	- Voting
	    vector noise accuracy = 0.713333

percent noise =  0.6
	- SVC
	    vector noise accuracy = 0.670000
	- Voting
	    vector noise accuracy = 0.716667

percent noise =  0.7
	- SVC
	    vector noise accuracy = 0.626667
	- Voting


In [None]:
df_fmnist_noised_vector=pd.DataFrame(fmnist_noised_vector, columns=['name','percent_noise','accuracy_score'])
df_fmnist_noised_vector

In [None]:
df_tng_noised_label
df_fmnist_noised_label
df_fmnist_noised_vector

In [None]:
df_fmnist_noised_label2=df_fmnist_noised_label.copy()
df_fmnist_noised_label2['type']='label'
df_fmnist_noised_vector2=df_fmnist_noised_vector.copy()
df_fmnist_noised_vector2['type']='vector'
pd.concat([
    df_fmnist_noised_label2,
    df_fmnist_noised_vector2
]).pivot_table(index=['type','name'], columns='percent_noise', values='accuracy_score')

### ŚMIETNIK

In [None]:
def select_random_features(X, feats_num):
    arrX = X.toarray()
    X_random = []
    random_feats = random.choices(list(range(arrX.shape[1])), k=feats_num)
    X_random = arrX[:, random_feats]
    result = select_features(X, random_feats)
    return result, random_feats


def select_features(X, feats):
    arrX = X.toarray()
    X_random = []
    X_random = arrX[:, feats]
    return np.asarray(X_random)

In [None]:
# wszystkich cech jest 130k
x, y = resample(news_x, news_y, n_samples=500, replace=False, random_state=0)

## RANDOM
X_r, x_feats = select_random_features(x, 13000)
x_train, x_test, y_train, y_test = train_test_split(X_r, y, test_size=0.4, random_state=0)
clf = SVC(C=100, kernel='linear')
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('Random accuracy', score)

## KBEST
from sklearn.feature_selection import SelectKBest, chi2
clf = SVC(C=100, kernel='linear')
X_kbest = SelectKBest(chi2, k=1300).fit_transform(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.4, random_state=0)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
score = accuracy_score(y_test, y_pred)
print('Kbest accuracy', score)
    
## RFE    
# from sklearn.feature_selection import RFE
# clf = SVC(C=100, kernel='linear')
# rfe_selector = RFE(estimator=clf, n_features_to_select=10, step=100)
# X_rfe = rfe_selector.fit_transform(x, y)
# x_train, x_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.4, random_state=0)
# clf.fit(x_train,y_train)
# y_pred = clf.predict(x_test)
# score = accuracy_score(y_test, y_pred)
# print('RFE accuracy', score)


## SFS
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# clf = SVC(C=100, kernel='linear')
# sfsForward = SFS(clf, k_features=10, forward=True, n_jobs=-1)
# X_sfs = sfsForward.fit_transform(x, y)
# x_train, x_test, y_train, y_test = train_test_split(X_sfs, y, test_size=0.4, random_state=0)
# clf.fit(x_train,y_train)
# y_pred = clf.predict(x_test)
# score = accuracy_score(y_test, y_pred)
# print('SFS accuracy', score)
    

In [None]:
## zespół podklasyfikatorw SVM
## wszystkie podklasyfikatory uczą sie na tym samym zbiorze danych
## jedynie każdy podklasyfikator losuje swój zbiór cech i wycina z danych, tylko interesujące go cechy
## wyniki póki co są dosyć satysfakcjonujące 
## ... ale aby sprawdzić to dla większego zbiory danych potrzeba troche czasu lub lepszej maszyny

clfs = []
fts = []

x, y = resample(news_x, news_y, n_samples=500, replace=False)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

print("TRAINING")
for i in range(10):
    print('#', end='')
    x_train_r, feats = select_random_features(x_train, 13000)
    clf = SVC(C=100, kernel='linear')
    clf = clf.fit(x_train_r, y_train)
    clfs.append(clf)
    fts.append(feats)
    
    
    
x, y = resample(news_x, news_y, n_samples=500, replace=False)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

print("\n\nTEST")
for i in range(10):
    clf = clfs[i]
    feats = fts[i]
    x_test_r = select_features(x_test, feats)
    y_pred = clf.predict(x_test_r)
    score = accuracy_score(y_test, y_pred)
    print('SVM-' + str(i), 'accuracy =', score)
    


In [None]:
def SVM_analyze(x_train,y_train,x_test,y_test):
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    arrC=[0.01, 0.1, 1.0, 10.0, 100.0]

    kernelArr=[]
    supportArrC=[]
    arrDegree=[]
    arrScore=[]
    arrCoef0=[]
    
    for C in arrC:
        clf = SVC(C=C, kernel='linear')
        clf.fit(x_train,y_train)
        y_pred = clf.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        kernelArr.append("linear")
        supportArrC.append(C)
        arrDegree.append("")
        arrScore.append(score)
        arrCoef0.append("")        

    for C in arrC:
        for degree in [2,3,4]:
            clf = SVC(C=C, kernel='poly', degree=degree)
            clf.fit(x_train,y_train)
            y_pred = clf.predict(x_test)
            score = accuracy_score(y_test, y_pred)
            kernelArr.append("poly")
            supportArrC.append(C)
            arrDegree.append(degree)
            arrScore.append(score)
            arrCoef0.append("")

    for C in arrC:
        for coef0 in [0.0,0.5,1.0]:
            clf = SVC(C=C, kernel='sigmoid', coef0=coef0)
            clf.fit(x_train,y_train)
            y_pred = clf.predict(x_test)
            score = accuracy_score(y_test, y_pred)
            kernelArr.append("sigmoid")
            supportArrC.append(C)
            arrDegree.append("")
            arrScore.append(score)
            arrCoef0.append(coef0)

    dataframe=pd.DataFrame({"C":supportArrC,"degree":arrDegree,"coef0":arrCoef0,"score":arrScore,"kernel":kernelArr})
    
    return dataframe.style.apply(lambda x: ["background: red" if v == x.max() else "" for v in x], axis = 0, subset="score")

sklearn.svm.SVC pozwala na dobieranie kernela oraz parametrów uczenia.

Możliwe kernele to: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ albo kernel stworzony przez siebie.

Najważniejsze parametry uczenia, których wartości można dobierać, to:
- C: parametr służący do regularyzacji, proporcjonalny do 1/lambda. Musi być liczbą dodatnią (default=1.0)
- degree: stopień wielomianu (przy użyciu kernela 'poly')
- gamma: współczynnik dla kerneli 'rbf', 'poly', 'sigmoid' ze zbioru {'scale', 'auto'} albo float (default='scale')
- coef0: wartość "r" dla kerneli 'poly' i 'sigmoid', float (default=0.0)

(szczegóły: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html )

In [None]:
# MISC FUNKCJE DO POTENCJALNEGO WYKORZYSTANIA

# x_train = x_train.reshape((x_train.shape[0],-1))
# x_test = x_test.reshape((x_test.shape[0],-1))
# x_train, y_train = resample(x_train, y_train, n_samples=60000, replace=False, random_state=0)
# x_test, y_test = resample(x_test, y_test, n_samples=10000, replace=False, random_state=0)

# #SVM
# tng_df=SVM_analyze(x_train,y_train,x_test,y_test)
# tng_df

# y_train = y_train.reshape((y_train.shape[0],))

# from sklearn.preprocessing import StandardScaler

# scaler_mnist = StandardScaler().fit(x_train)
# x_train = scaler_mnist.transform(x_train)
# x_test = scaler_mnist.transform(x_test)