# Loading an example dataset

In [1]:
# Load the iris and digits datasets through sklearn.datasets.
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()

In [2]:
# Print the digits sample array; NumPy abbreviates the large array with "...".
print(digits.data)

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]


In [3]:
# Target values stored alongside the digits samples.
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [4]:
# First entry of digits.images; the recorded output shows an 8x8 array.
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

# Learning and Predicting

In [5]:
from sklearn import svm

# Support vector classifier with explicitly chosen C and gamma.
clf = svm.SVC(C=100, gamma=0.001)

In [6]:
# Train on every sample except the last one, which is kept aside for the
# prediction in the next cell.
clf.fit(digits.data[:-1],digits.target[:-1])

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# Predict the label of the last sample, which was excluded from training.
clf.predict(digits.data[-1:])

array([8])

# Model persistence

通过使用Python内置的持久化模块pickle，可以保存scikit-learn中的模型

In [2]:
from sklearn import svm
from sklearn import datasets

In [3]:
# Create an SVC with default hyper-parameters.
clf = svm.SVC()

In [4]:
# Load iris and bind its samples and targets to X and y.
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [5]:
# Fit the classifier on the full iris data.
clf.fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
import pickle

In [7]:
# Serialize the fitted classifier to an in-memory pickle string.
s = pickle.dumps(clf)

In [8]:
# Rebuild a classifier object from the pickled data.
# NOTE: only unpickle data that comes from a trusted source.
clf2 = pickle.loads(s)

In [10]:
# Predict with the restored classifier on the first sample
# (the slice X[0:1] keeps a leading sample dimension).
clf2.predict(X[0:1])

array([0])

In [11]:
# Stored target of the first sample, for comparison with the prediction.
y[0]

0

在scikit-learn的特定场景下，使用joblib代替pickle（joblib.dump 和 joblib.load）可能更合适：它对大数据更高效，但只能序列化（pickle）到磁盘，而不能序列化为字符串：

In [12]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package; fall back to the bundled copy
# only on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [13]:
# Persist the fitted classifier to disk; the recorded output lists the
# file name that was written.
joblib.dump(clf,'filename.pkl')

['filename.pkl']

In [14]:
# Load the classifier back from the file written by joblib.dump.
# NOTE: only load files that come from a trusted source.
clf3 = joblib.load('filename.pkl')

In [15]:
# Predict with the classifier reloaded from disk on the first sample.
clf3.predict(X[0:1])

array([0])

# Conventions

scikit-learn估计器遵循一定的规则，使它们的行为更可预测

## Type casting

除非另有说明，否则输入将转换为float64

In [16]:
import numpy as np

In [17]:
from sklearn import random_projection

In [18]:
# Seeded generator so the random draws below are reproducible.
rng = np.random.RandomState(seed=0)

In [20]:
# Draw a 10 x 2000 array of uniform random values from the seeded generator.
x = rng.rand(10,2000)

In [27]:
# float32 copy of the random array, used to demonstrate type casting.
X = x.astype(np.float32)

In [28]:
# Confirm that X holds float32 values.
X.dtype

dtype('float32')

In [29]:
# Gaussian random-projection transformer with default settings.
transform = random_projection.GaussianRandomProjection()

In [30]:
# Fit on the float32 array `X`, not the float64 source `x`: the dtype check
# that follows is only meaningful when the input is not already float64.
# NOTE(review): newer scikit-learn releases may keep float32 here instead of
# casting to float64 -- confirm against the installed version.
X_new = transform.fit_transform(X)

In [31]:
# Inspect the dtype of the transformed array.
X_new.dtype

dtype('float64')

回归目标会转变成float64，分类器目标不变:

In [33]:
from sklearn import datasets
from sklearn import svm

# Rebuild the classifier and fit it on iris with the integer targets.
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = svm.SVC()
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [34]:
# Predictions for the first three samples, shown as a plain list;
# the recorded output contains integers.
list(clf.predict(X[0:3]))

[0, 0, 0]

In [37]:
# Refit using string class names: indexing target_names with the integer
# targets yields one name per sample.
clf.fit(iris.data, iris.target_names[iris.target]) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
# Same three samples after refitting; the recorded output now contains strings.
clf.predict(X[0:3])

array(['setosa', 'setosa', 'setosa'],
      dtype='<U10')

在这里，第一个predict()返回一个整数数组，因为iris.target （一个整数数组）被用于fit。第二个predict()返回一个字符串数组，因为iris.target_names用于拟合。

## Refitting and updating parameters

估计器的超参数可以在估计器构建之后通过set_params()方法（例如sklearn.pipeline.Pipeline.set_params）更新。多次调用fit()会覆盖之前任何一次fit()所学到的内容

In [39]:
import numpy as np
from sklearn.svm import SVC

In [40]:
# Seeded generator so the synthetic data below is reproducible.
rng = np.random.RandomState(seed=0)

In [41]:
# 100 x 10 array of uniform random values; passed to fit() as inputs below.
X = rng.rand(100,10)

In [43]:
# 100 binary labels drawn from a binomial distribution with n=1, p=0.5.
y = rng.binomial(1,0.5,100)

In [45]:
# 5 x 10 array of uniform random values; passed to predict() below.
X_test = rng.rand(5,10)

In [46]:
# SVC with default hyper-parameters; the kernel is changed via set_params below.
clf = SVC()

In [47]:
# Switch to the linear kernel and fit in one chained call.
clf.set_params(kernel='linear').fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
# Predictions from the linear-kernel fit.
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [49]:
# Switch to the RBF kernel and refit the same estimator object on the same data.
clf.set_params(kernel='rbf').fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
# Predictions after the RBF refit; the recorded output differs from the
# linear-kernel predictions.
clf.predict(X_test)

array([0, 0, 0, 1, 0])

# Multiclass vs. multilabel fitting

In [51]:
from sklearn.svm import SVC

In [52]:
from sklearn.multiclass import OneVsRestClassifier

In [53]:
from sklearn.preprocessing import LabelBinarizer

In [61]:
# Five two-feature samples and their class labels (0, 1 or 2).
X = [
    [1, 2],
    [2, 4],
    [4, 5],
    [3, 2],
    [3, 1],
]
y = [0, 0, 1, 1, 2]

In [62]:
# Wrap a seeded SVC in a one-vs-rest meta-classifier.
clf = OneVsRestClassifier(estimator=SVC(random_state=0))

In [63]:
# Fit on the 1-D list of class labels and predict on the training samples.
clf.fit(X,y).predict(X)

array([0, 0, 1, 1, 2])

In [64]:
# Convert the class labels into a 2-D binary indicator array.
# NOTE(review): this rebinds `y`, so re-running the cell feeds the already
# binarized array back into LabelBinarizer; a separate name would be safer.
y = LabelBinarizer().fit_transform(y)

In [65]:
# Display the binarized labels: one row per sample, one column per class.
y

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [66]:
# Refit on the 2-D indicator labels; predict() now returns an indicator array.
# In the recorded output the last two rows are all zeros, i.e. no class was
# assigned to those samples.
clf.fit(X,y).predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])