In [1]:
from sklearn import datasets

In [2]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [3]:
# Load the digits dataset; each sample is also available as an 8x8 image via digits.images.
digits = datasets.load_digits()

In [4]:
# Show the feature matrix: one row per sample.
# print() with parentheses and a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(digits.data)

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]


In [5]:
# Show the target label of every sample.
# print() with parentheses and a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(digits.target)

[0 1 2 ..., 8 9 8]


In [9]:
# Display the first sample in its 8x8 image form.
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

In [7]:
from sklearn import svm

In [8]:
# Support-vector classifier with hand-picked hyperparameters
# (the kernel is left at its default).
clf = svm.SVC(C=100.0, gamma=0.001)

我们使用 Python 的 `[:-1]` 切片语法选择训练集，产生的新数组包含数据集中除最后一个以外的所有图像。

In [10]:
# Train on every sample except the last one, which is held out for prediction.
clf.fit(X=digits.data[:-1], y=digits.target[:-1])

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

现在可以利用训练好的分类器，预测数据集中未参与训练的最后一个图像。

In [11]:
# Predict the label of the held-out last sample; the [-1:] slice keeps the input 2-D.
clf.predict(digits.data[-1:])

array([8])

In [12]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package and fall back to the bundled copy on
# older installations that do not have it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [15]:
# Persist the fitted classifier to disk; the cell displays the written file name(s).
joblib.dump(clf, 'filename.pkl')

['filename.pkl']

In [16]:
# Restore the classifier from the file written by joblib.dump above.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
clf2 = joblib.load('filename.pkl')

In [18]:
# Sanity-check the reloaded model on the first sample (sliced so the input stays 2-D).
clf2.predict(digits.data[:1])

array([0])

In [19]:
# Ground-truth label of the first sample, for comparison with the reloaded model's prediction.
digits.target[0]

0

### 再次拟合以及更新参数

In [20]:
import numpy as np
from sklearn.svm import SVC

In [25]:
# Seeded generator so the synthetic data below is reproducible.
rng = np.random.RandomState(0)
# 100 training points with 10 uniform-random features each.
X = rng.random_sample((100, 10))
# Binary labels: one fair Bernoulli trial per training point.
y = rng.binomial(n=1, p=0.5, size=100)
# 5 unseen points with the same 10 features.
X_test = rng.random_sample((5, 10))

In [26]:
# Start from a default SVC, switch the kernel through set_params(), then fit.
# fit() returns the estimator itself, so the cell still displays its repr.
clf = SVC()
clf.set_params(kernel='linear')
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [27]:
# Predict labels for the test points with the linear-kernel model.
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [28]:
# Change the kernel on the already-constructed estimator and refit on the same data.
clf.set_params(kernel='rbf')
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
# Predict again: the same estimator now uses the RBF kernel after refitting.
clf.predict(X_test)

array([0, 0, 0, 1, 0])

### 多分类 vs. 多标签拟合

In [30]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

In [31]:
# Five two-feature sample points and their class labels (classes 0, 1 and 2).
X = [
    [1, 2],
    [2, 4],
    [4, 5],
    [3, 2],
    [3, 1],
]
y = [0, 0, 1, 1, 2]

In [32]:
# One-vs-rest wrapper around an SVC, fitted on a 1-D array of multiclass labels.
classif = OneVsRestClassifier(estimator=SVC(random_state=0))
classif.fit(X, y)
# Predict on the training points themselves; this is the value the cell displays.
classif.predict(X)

array([0, 0, 1, 1, 2])

In [33]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multilabel case: each sample carries a list of labels, which
# MultiLabelBinarizer converts before the classifier is refitted.
y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]
y = MultiLabelBinarizer().fit_transform(y)
classif.fit(X, y)
# The prediction is a 2-D 0/1 array with one column per label.
classif.predict(X)

array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])

## 2.2 用于科学数据处理的统计学习教程

A simple example shipped with scikit-learn: the iris dataset

In [34]:
from sklearn import datasets

In [35]:
# Load iris and keep its feature matrix; the last expression displays the shape.
iris = datasets.load_iris()
data = iris['data']
data.shape

(150L, 4L)

In [37]:
# Load the digits dataset for this section.
digits = datasets.load_digits()

In [38]:
# digits.images is a 3-D array: (n_samples, 8, 8).
digits.images.shape

(1797L, 8L, 8L)

In [40]:
import matplotlib.pyplot as plt
# Render the last digit image with a reversed grayscale colormap.
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r)

<matplotlib.image.AxesImage at 0x1100af60>

In [41]:
# Flatten each image into one feature row: (n_samples, 8, 8) -> (n_samples, 64).
data = digits.images.reshape(len(digits.images), -1)

In [43]:
# After flattening, every 8x8 image is a 64-element row.
data.shape

(1797L, 64L)