# Getting Started

This notebook works through the examples from the official scikit-learn *Getting Started* guide: https://scikit-learn.org/stable/getting_started.html

In [1]:
import sklearn

# Print the system, Python-dependency and build information for this
# environment so the results below can be tied to specific library versions.
sklearn.show_versions()


System:
    python: 3.8.6 | packaged by conda-forge | (default, Nov 27 2020, 19:17:44)  [Clang 11.0.0 ]
executable: /opt/anaconda3/envs/py38/bin/python
   machine: macOS-10.15.7-x86_64-i386-64bit

Python dependencies:
          pip: 21.1.1
   setuptools: 49.6.0.post20210108
      sklearn: 0.24.2
        numpy: 1.20.1
        scipy: 1.6.3
       Cython: 0.29.23
       pandas: 1.2.4
   matplotlib: 3.4.2
       joblib: 1.0.1
threadpoolctl: 2.1.0

Built with OpenMP: True


In [4]:
from sklearn.ensemble import RandomForestClassifier

# Toy training set: two samples with three features each, one label per sample.
X = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]

# random_state=0 is fixed so that re-running the cell builds the same forest.
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(random_state=0)

In [6]:
# A cell only echoes its LAST expression, so a bare `clf.predict(X)` on the
# first line would be computed and then silently thrown away. Show it
# explicitly with display() (provided by the IPython/Jupyter kernel).
display(clf.predict(X))  # predicted classes for the training data

# Predicted classes for new samples; echoed as the cell's output.
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

In [7]:
from sklearn.preprocessing import StandardScaler

X = [[0, 15], [1, -10]]

# Fit the scaler on X first, then apply the fitted scaler to the same data.
scaler = StandardScaler().fit(X)
scaler.transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
# Load the iris data as (features, target) arrays and hold out a test split;
# random_state=0 keeps the split the same on every run.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Chain the scaler and the classifier into a single estimator and fit it on
# the training split only.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [10]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the pipeline's predictions second. Accuracy is
# symmetric, so the value is unchanged, but the swapped order would give wrong
# results if copied to an asymmetric metric such as precision or recall.
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

In [11]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [12]:
# Plain linear model to be evaluated in the next cell.
lr = LinearRegression()

# Synthetic regression problem with 1000 samples; the seed is fixed so the
# generated data is the same on every run.
X, y = make_regression(n_samples=1000, random_state=0)

In [13]:
# Cross-validate the linear model with the default settings, then show the
# per-fold test scores stored under "test_score".
result = cross_validate(estimator=lr, X=X, y=y)
result["test_score"]

array([1., 1., 1., 1., 1.])

In [14]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [15]:
# California housing data as (features, target) arrays.
X, y = fetch_california_housing(return_X_y=True)

# Hold out a test split; random_state=0 keeps it the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [16]:
# Search space sampled by the randomized search.
# scipy's randint(low, high) excludes `high`, so n_estimators is drawn from
# 1..4 and max_depth from 5..9.
param_distributions = {
    "n_estimators": randint(1, 5),
    "max_depth": randint(5, 10),
}

In [18]:
# Randomized hyper-parameter search over a random-forest regressor:
# 5 sampled candidates, with both the forest and the sampling seeded.
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    param_distributions,
    n_iter=5,
    random_state=0,
)

In [19]:
# Run the randomized search on the training split only; the fitted search
# object is echoed as the cell's output.
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbfbe1f30d0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fbfbe21be50>},
                   random_state=0)

In [20]:
# Parameter combination the search selected as best.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [21]:
# Score the fitted search object on the held-out test split.
search.score(X_test, y_test)

0.735363411343253