# Support Vector Machine
クラス間を分離する**超平面(hyperplane)**を用いて分類器を定義する。

* **超平面**……`N`次元空間を2つの領域に分ける`N-1`次元の平坦な部分空間。2次元における直線、3次元における平面を一般化したもの。

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn import model_selection

In [2]:
# Path of the income data file read by the loading cell; being a relative path,
# it is resolved against the current working directory when opened.
input_file = 'const/income_data.txt'

In [3]:
# State shared with the file-loading cell.
Xy = []                          # kept records, one list of string fields per row
count_class1 = count_class2 = 0  # rows kept so far for the '<=50K' / '>50K' class
max_datapoints = 25000           # upper bound on the rows kept for each class

入力ファイルの例
```
39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K

```

In [4]:
# Load the income records, keeping at most `max_datapoints` rows per class.
#
# The accumulators are re-initialised here so that re-running this cell starts
# from a clean state instead of appending duplicate rows (or failing because
# `Xy` has already been converted to an ndarray by a later cell).
Xy = []
count_class1 = 0
count_class2 = 0

with open(input_file, mode='r', encoding='utf-8') as f:
    # Iterate over the file object directly so lines are streamed instead of
    # being loaded into memory all at once by readlines().
    for line in f:
        # Stop as soon as both classes have reached their quota.
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break

        # Skip any line containing '?' (presumably the data set's marker for a
        # missing value -- confirm against the data description).
        if '?' in line:
            continue

        # Remove only the line terminator. Slicing with [:-1] would also chop
        # the last character of the label when the final line has no trailing
        # newline, and that record would then be silently dropped.
        data = line.rstrip('\n').split(', ')

        # The last field is the class label; the two labels are mutually
        # exclusive, so at most one branch can apply to a given row.
        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            Xy.append(data)
            count_class1 += 1
        elif data[-1] == '>50K' and count_class2 < max_datapoints:
            Xy.append(data)
            count_class2 += 1

In [5]:
# Convert the list of rows into a NumPy array so that whole columns can be
# addressed with Xy[:, i] in the encoding step.
Xy = np.array(Xy)

In [6]:
# Build a numeric version of Xy one column at a time.
# The value in the first row decides how a column is treated: an all-digit
# string marks the column as numeric and it is copied over unchanged; any
# other column is replaced by the integer codes of a LabelEncoder fitted on it.
# The fitted encoders are collected in column order so they can be reused later.
label_encoder = []
Xy_encoded = np.empty(Xy.shape)
for col, first_value in enumerate(Xy[0]):
    if not first_value.isdigit():
        col_encoder = preprocessing.LabelEncoder()
        Xy_encoded[:, col] = col_encoder.fit_transform(Xy[:, col])
        label_encoder.append(col_encoder)
        continue
    # Numeric column: store the values directly in the float array.
    Xy_encoded[:, col] = Xy[:, col]

In [7]:
# Separate the encoded table into the feature matrix (every column except the
# last) and the target vector (the last column, i.e. the encoded class label),
# both cast to integers.
X, y = Xy_encoded[..., :-1].astype(int), Xy_encoded[..., -1].astype(int)

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [9]:
# Train a linear SVM on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so creation and training are chained.
classifier = LinearSVC(random_state=0).fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)



In [10]:
# Estimate performance with 3-fold cross-validation over the full data set,
# scored with the weighted F1 measure, and report the mean across folds.
f1 = model_selection.cross_val_score(
    classifier,
    X,
    y,
    scoring='f1_weighted',
    cv=3,
)
print('F1 score: ', round(f1.mean(), 4))



F1 score:  0.7082


In [11]:
# Two unlabeled sample records to classify. Each row carries 14 fields laid out
# like a line of the input file with the trailing income label left out.
input_data = np.array([
    [
        '37', 'Private', '215646', 'HS-grad', '9', 'Never-married', 'Handlers-cleaners',
        'Not-in-family', 'White', 'Male', '0', '0', '40', 'United-States',
    ],
    [
        '55', 'Private', '287927', 'Doctorate', '16', 'Married-civ-spouse', 'Exec-managerial',
        'Husband', 'White', 'Female', '15000', '0', '40', 'United-States',
    ],
])

In [12]:
# Encode the sample rows the same way the training table was encoded:
# all-digit columns are copied as numbers, every other column is mapped through
# the encoder that was fitted for it. `label_encoder` holds one encoder per
# non-numeric column in column order, so a running index picks the matching one.
ipt_data_encoded = np.zeros(input_data.shape)
encoder_idx = 0
for col, sample_value in enumerate(input_data[0]):
    if not sample_value.isdigit():
        ipt_data_encoded[:, col] = label_encoder[encoder_idx].transform(input_data[:, col])
        encoder_idx += 1
        continue
    ipt_data_encoded[:, col] = input_data[:, col]

In [13]:
# Classify the encoded sample rows, then turn the numeric predictions back into
# the original label strings using the last encoder (the one fitted on the
# label column).
predict_class = classifier.predict(ipt_data_encoded)
predicted_labels = label_encoder[-1].inverse_transform(predict_class)
print(predicted_labels)

['<=50K' '>50K']
