In [1]:
%matplotlib inline

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer

# SVM Demos

In [13]:
income_data = pd.read_csv("../datasets/adult/adult.data", header=None)
income_data.columns = [
    "age",
    "workclass",
    "final_weight",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_class",
]

income_target = income_data.income_class
income_target = income_target.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_attributes = pd.get_dummies(income_attributes, drop_first=True)
scaler = MinMaxScaler()
income_attributes = scaler.fit_transform(income_attributes)

income_attributes_train, income_attributes_test, \
income_target_train, income_target_test, \
= train_test_split(income_attributes, income_target, train_size=0.8)

for x in [income_attributes_train, income_attributes_test, income_target_train, income_target_test]:
    print(x.shape)

(26048, 100)
(6513, 100)
(26048,)
(6513,)


In [17]:
linear_svc = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=0.0001,
    C= 1e6,
    max_iter = 10000,
)

In [29]:
linear_svc.fit(income_attributes_train, income_target_train)



тегловните коефициенти на класификатора (модела) можем да ползваме за интерпретация кои са важните feature-и. Това се нарича **feature importances**.

In [31]:
np.sort(linear_svc.coef_)

array([[-3.38700023e+00, -7.87053396e-01, -7.39325023e-01,
        -7.19332974e-01, -7.06867061e-01, -6.33006200e-01,
        -5.18366822e-01, -4.96677483e-01, -4.46403281e-01,
        -3.87557499e-01, -3.85846609e-01, -3.74114109e-01,
        -3.53517635e-01, -2.94391497e-01, -2.87371306e-01,
        -2.78244476e-01, -2.42672234e-01, -2.21913384e-01,
        -2.11896531e-01, -2.03136488e-01, -1.92391380e-01,
        -1.82090138e-01, -1.81849893e-01, -1.78755071e-01,
        -1.73329804e-01, -1.69053969e-01, -1.44728338e-01,
        -1.32430277e-01, -1.11897161e-01, -1.07032285e-01,
        -1.04572095e-01, -6.45400593e-02, -4.52668306e-02,
        -1.68335807e-02, -1.32061708e-02, -1.12196940e-02,
        -9.76046053e-03, -9.42193376e-03, -7.77154252e-03,
         0.00000000e+00,  1.77095234e-02,  2.21172510e-02,
         3.59332928e-02,  5.21040975e-02,  5.49636456e-02,
         7.35498620e-02,  7.67747555e-02,  7.76144862e-02,
         8.01692060e-02,  8.11102371e-02,  8.84476486e-0

In [58]:
# Seems wrong?! TODO
print("Predicted\tActual")
for i in range(8):
    predicted_class = np.sign(np.sum(income_attributes_test[i] * linear_svc.coef_))
    actual_class = income_target_test.iloc[i]
    print(predicted_class,"\t\t", actual_class)

Predicted	Actual
1.0 		 <=50K
1.0 		 >50K
1.0 		 <=50K
-1.0 		 <=50K
1.0 		 >50K
1.0 		 <=50K
1.0 		 >50K
1.0 		 <=50K


In [69]:
param_grid = {
    "C": [0.01, 1, 10, 100, ],
    "loss": ["hinge", "squared_hinge"],
}

linear_grid_search = GridSearchCV(
    estimator=LinearSVC(max_iter = 1000),
    param_grid=param_grid,
    scoring = make_scorer(f1_score, pos_label=">50K"),
)

In [70]:
linear_grid_search.fit(income_attributes_train, income_target_train)



In [68]:
linear_grid_search.best_params_

{'C': 1, 'loss': 'squared_hinge'}

In [71]:
linear_grid_search.cv_results_

{'mean_fit_time': array([0.22379608, 0.15300069, 0.22130098, 0.61760221, 0.75397968,
        2.71148052, 4.22423034, 4.10031514]),
 'std_fit_time': array([0.07367959, 0.03849641, 0.03897995, 0.05273338, 0.11896818,
        0.47096226, 1.16206395, 0.31778346]),
 'mean_score_time': array([0.09879832, 0.09900146, 0.07619872, 0.06079612, 0.07720094,
        0.06627936, 0.07640023, 0.08499804]),
 'std_score_time': array([0.01742902, 0.06444135, 0.0210284 , 0.01847655, 0.03199794,
        0.01937166, 0.02108056, 0.01693184]),
 'param_C': masked_array(data=[0.01, 0.01, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge', 'squared_hinge', 'hinge', 'squared_hinge',
                    'hinge', 'squared_hinge', 'hinge', 'squared_hinge'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtyp

In [77]:
test_predictions = linear_grid_search.best_estimator_.predict(income_attributes_test)

In [74]:
f1_score(income_target_test, predictions, pos_label=">50K")

0.6639118457300276

Това не е много силен модел. Вероятно основната причина е че ползваме линеен алгоритъм.

In [78]:
train_predictions = linear_grid_search.best_estimator_.predict(income_attributes_train)

In [81]:
print("Training set")
print(classification_report(income_target_train, train_predictions))

print("Testing set")
print(classification_report(income_target_test, test_predictions))

Training set
              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91     19832
        >50K       0.74      0.59      0.66      6216

    accuracy                           0.85     26048
   macro avg       0.81      0.77      0.78     26048
weighted avg       0.85      0.85      0.85     26048

Testing set
              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.90      4888
        >50K       0.75      0.59      0.66      1625

    accuracy                           0.85      6513
   macro avg       0.81      0.76      0.78      6513
weighted avg       0.84      0.85      0.84      6513



Моделът предсказва еднакво добре и двата сета следоватално е стабилен. Няма голяма нестабилност, няма high variance.

In [83]:
svc = SVC(kernel='poly', degree=2, max_iter=1000)

In [84]:
svc.fit(income_attributes_train, income_target_train)



In [87]:
# Нямаме коеф
# svc.coef_

In [88]:
svc.decision_function(income_attributes_train[:10])

array([-0.19647866,  0.47366197,  0.00297235, -0.21024704,  0.12134607,
       -0.24626395,  0.00511959,  0.28019573, -0.12862762,  0.09808085])

Проверка на първите 10 предсказания.

In [96]:
first_10_predictions = svc.predict(income_attributes_train[:10])
income_target_test[:10] == first_10_predictions

26015     True
18062     True
31943    False
19123     True
3438      True
29122     True
8758      True
9170     False
23857    False
31263    False
Name: income_class, dtype: bool

In [97]:
svc_train_predictions = svc.predict(income_attributes_train)
svc_test_predictions = svc.predict(income_attributes_test)

In [100]:
print("Classification reports for SVC with polynomial kernel\n")
print("1. Training set")
print(classification_report(income_target_train, svc_train_predictions))

print("2. Testing set")
print(classification_report(income_target_test, svc_test_predictions))

Classification reports for SVC with polynomial kernel

1. Training set
              precision    recall  f1-score   support

       <=50K       0.93      0.56      0.70     19832
        >50K       0.38      0.87      0.53      6216

    accuracy                           0.63     26048
   macro avg       0.66      0.71      0.61     26048
weighted avg       0.80      0.63      0.66     26048

2. Testing set
              precision    recall  f1-score   support

       <=50K       0.93      0.55      0.69      4888
        >50K       0.39      0.88      0.54      1625

    accuracy                           0.63      6513
   macro avg       0.66      0.71      0.62      6513
weighted avg       0.80      0.63      0.65      6513



## SVC with rbf kernel and different gamma-s

In [118]:
svc_gamma1 = SVC(kernel='rbf', gamma=0.1, C = 10000, max_iter=1000)
svc_gamma100 = SVC(kernel='rbf', gamma=10, C = 10000,max_iter=1000)

In [119]:
svc_gamma1.fit(income_attributes_train, income_target_train)
svc_gamma100.fit(income_attributes_train, income_target_train)



In [120]:
svc_gamma1_test_predictions = svc_gamma1.predict(income_attributes_test)
svc_gamma100_test_predictions = svc_gamma100.predict(income_attributes_test)

In [122]:
print("Classification reports for SVC with rbf kernel and different gamma-s\n")
print("1. Gamma = 0.1")
print(classification_report(income_target_test, svc_gamma1_test_predictions))

print("2. Gamma = 10")
print(classification_report(income_target_test, svc_gamma100_test_predictions))

Classification reports for SVC with rbf kernel and different gamma-s

1. Gamma = 0.1
              precision    recall  f1-score   support

       <=50K       0.83      0.80      0.81      4888
        >50K       0.45      0.50      0.47      1625

    accuracy                           0.72      6513
   macro avg       0.64      0.65      0.64      6513
weighted avg       0.73      0.72      0.73      6513

2. Gamma = 10
              precision    recall  f1-score   support

       <=50K       0.75      0.99      0.86      4888
        >50K       0.40      0.01      0.02      1625

    accuracy                           0.75      6513
   macro avg       0.57      0.50      0.44      6513
weighted avg       0.66      0.75      0.65      6513



Gamma = 0.1 e значително по-добра.