拟合和预测：估算器基础

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
clf = RandomForestClassifier(random_state=0)

In [24]:
x = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]
clf.fit(x, y)

In [20]:
clf.predict(x)

array([0, 1])

In [21]:
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

In [22]:
clf.predict([[14, 15, 16], [14, 15, 16]])

array([1, 1])

转换器和预处理器

In [1]:
from sklearn.preprocessing import StandardScaler

In [2]:
x = [[0, 15], [1, -10]]
StandardScaler().fit(x).transform(x)

array([[-1.,  1.],
       [ 1., -1.]])

管道：连接预处理器和估算器

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

In [14]:
# 加载鸢尾花数据集并切分训练&测试数据
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [16]:
pipe.fit(X_train, y_train)

In [18]:
accuracy_score(pipe.predict(X_test), y_test)

0.9736842105263158

模型评估

In [19]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [31]:
# 模拟线性回归数据集
X, y = make_regression(n_samples=1000, random_state=0)

In [21]:
lr = LinearRegression()

In [22]:
# 默认迭代次数(cv)5次
results = cross_validate(lr, X, y)

In [23]:
results['test_score']

array([1., 1., 1., 1., 1.])

自动参数搜索

In [35]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [48]:
# 加州住房数据, 用于回归任务
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [45]:
param_distributions = {'n_estimators': randint(1, 5), 'max_depth': randint(5, 10)}

In [49]:
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)

In [52]:
search.fit(X_train, y_train)

In [53]:
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [57]:
search.score(X_train, y_train)

0.8220507914419701