In [None]:
import sklearn

# Report which scikit-learn release this notebook runs against.
# print() already puts a space between its arguments, so the original
# print('Version: ', ...) produced a doubled space; an f-string avoids that.
print(f'Version: {sklearn.__version__}')

In [None]:
from sklearn import set_config
# Switch scikit-learn's estimator display to "diagram" mode; per the set_config
# documentation this renders estimators as HTML diagrams in notebook output.
set_config(display="diagram")

## Easy start - Training and predictions

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Toy training set: two observations, three features each
X = [
    [1, 2, 3],
    [11, 12, 13],
]

# One class label per observation (two possible classes)
y = [0, 1]

# The classifier; random_state is fixed so repeated runs give the same forest
model = RandomForestClassifier(random_state=42)

# Training (left as the last expression so the fitted estimator is displayed)
model.fit(X, y)

In [None]:
# Predict the class of each observation in X, i.e. the same data the model
# was just trained on.
model.predict(X)

In [None]:
# Predict classes for two observations that were not part of the training data
model.predict([
    [14, 15, 16],
    [4, 5, 6],
])

In [None]:
# Same two unseen observations, but show the per-class probabilities
# instead of only the predicted class
model.predict_proba([
    [14, 15, 16],
    [4, 5, 6],
])

![Choosing the right estimator](ml_map.svg)

[https://scikit-learn.org/stable/machine_learning_map.html](https://scikit-learn.org/stable/machine_learning_map.html)

| Supervised learning               | Unsupervised learning             |
| --------------------------------- | --------------------------------- |
| modules/linear_model | modules/mixture |
| modules/lda_qda | modules/manifold |
| modules/kernel_ridge | modules/clustering |
| modules/svm | modules/biclustering |
| modules/sgd | modules/decomposition |
| modules/neighbors | modules/covariance |
| modules/gaussian_process | modules/outlier_detection |
| modules/cross_decomposition | modules/density |
| modules/naive_bayes | modules/neural_networks_unsupervised |
| modules/tree | |
| modules/ensemble | |
| modules/multiclass | |
| modules/feature_selection | |
| modules/semi_supervised | |
| modules/isotonic | |
| modules/calibration | |
| modules/neural_networks_supervised | |

[https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

## Another example

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the iris dataset and unpack it directly into the features X and the
# class labels y (this rebinds the X and y used in the earlier toy example).
X, y = load_iris(return_X_y=True)

In [None]:
# Inspect the data before proceeding.
# Each print() lists the same property for X and then y, one per line.

# Types of the two variables
print(type(X), type(y), sep='\n')

# Lengths of the two ndarrays
print(len(X), len(y), sep='\n')

# First 5 elements of each ndarray
print(X[:5], y[:5], sep='\n')

<img src="iris_dataset.png" alt="Iris dataset" width="600"/>

In [None]:
# Build the classifier and fit it on the full iris data in one chained call.
# max_iter is raised to 1000 (presumably to give the solver room to converge).
clf = LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X, y)

In [None]:
# Predicted classes for the first five observations
clf.predict(X[:5])

In [None]:
# Per-class probabilities for the first five observations
clf.predict_proba(X[:5])

In [None]:
# Score the classifier on the same X, y it was fitted on, so this is a
# training score rather than an estimate on held-out data.
clf.score(X, y)

## Second gear - Transformers and pre-processors

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# A flat, one-dimensional array holding the values 1..6
x = np.arange(1, 7)
print(x)  # Show the effect of the Numpy reshape, x is a row

# reshape(-1, 1): one column, with the number of rows inferred from the data
X = np.arange(1, 7).reshape(-1, 1)
print(X)  # Show the effect of the Numpy reshape, X is a column

# Create a pre-processor: StandardScaler standardizes features by removing
# the mean and scaling to unit variance.
scaler = StandardScaler()

# Step 1: learn the scaling statistics from X
scaler.fit(X)

# Step 2: apply them (last expression, so the scaled array is displayed)
scaler.transform(X)

In [None]:
# Chain it if that suits you better: fit() is followed directly by
# transform() on a fresh StandardScaler. Same result as the two-step version.
StandardScaler().fit(X).transform(X)

In [None]:
# All paths lead to the same place: fit_transform() does the fit and the
# transform in a single call on the same data.
StandardScaler().fit_transform(X)

In [None]:
# A second dataset, written out directly as a single column
XX = np.array([[1.1], [2.2], [3.3], [4.4], [5.5], [6.6]])

# Reuse the values calculated when `scaler` was fitted on X to transform
# another dataset; no re-fitting happens here.
scaler.transform(XX)

In [None]:
import pandas as pd

# Small example frame, written row by row: one tuple per city entry
# (note that Oslo appears twice, with different slogans and ratings).
X = pd.DataFrame(
    [
        ('Oslo', "Unanimiter et constanter", 5, 4),
        ('Bergen', "Byen mellom de syv fjell", 3, 5),
        ('Tromsø', "Nordens Paris", 4, 4),
        ('Oslo', "Tenk om", 5, 3),
    ],
    columns=['city', 'slogan', 'tripadvisor_rating', 'hotels_rating'],
)

In [None]:
# Display the DataFrame built in the previous cell.
X

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
# Apply a different transformer to each column of the frame.
column_trans = ColumnTransformer(
    transformers=[
        # 'city' is selected with a list of column names
        ('encode_cities', OneHotEncoder(dtype='int'), ['city']),
        # 'slogan' is selected with a bare string; presumably because
        # CountVectorizer expects a 1-D sequence of texts (see ColumnTransformer docs)
        ('vectorize_slogan', CountVectorizer(), 'slogan'),
    ],
    # the two rating columns are not listed above, so they fall under `remainder`
    remainder='drop',
    verbose_feature_names_out=False,
)

In [None]:
# Fit the column transformer on the example DataFrame.
column_trans.fit(X)

In [None]:
# Names of the output features produced by the fitted transformer.
column_trans.get_feature_names_out()

In [None]:
# Transform the frame and convert the result to a dense array for display.
# NOTE(review): .toarray() assumes transform() returned a sparse matrix;
# ColumnTransformer chooses sparse vs. dense output via its sparse_threshold
# setting - confirm this still holds if the example data change.
column_trans.transform(X).toarray()

## Speeding up - Pipelines

In [None]:
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Steps, in the order they are applied:
#   1. StandardScaler     - Pre-processor / Transformer / fit() + transform()
#   2. MinMaxScaler       - Pre-processor / Transformer / fit() + transform()
#   3. LogisticRegression - Estimator / Classifier / fit() + predict()
pipe = make_pipeline(StandardScaler(), MinMaxScaler(), LogisticRegression())

In [None]:
# Same as before: the iris features and labels.
X, y = load_iris(return_X_y=True)

# Divide the data into a training part and a test part; the fixed
# random_state makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Each print() lists one property for X_train, X_test, y_train and y_test,
# in that order, one per line.

# Types of the variables
print(type(X_train), type(X_test), type(y_train), type(y_test), sep='\n')

# Lengths of the ndarrays
print(len(X_train), len(X_test), len(y_train), len(y_test), sep='\n')

# First 5 elements of each ndarray
print(X_train[:5], X_test[:5], y_train[:5], y_test[:5], sep='\n')

In [None]:
# Fit the whole pipeline on the training split only; the test split is kept
# aside for the evaluation in the next cell.
pipe.fit(X_train, y_train)

In [None]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): the
# ground-truth labels come first, the pipeline's predictions second.
accuracy_score(y_test, pipe.predict(X_test))

## Get ready for landing - Evaluation

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# Estimator
lr = LinearRegression()

# Generate some data, 100 observations; the fixed random_state keeps the
# generated data identical between runs.
X, y = make_regression(n_samples=100, random_state=42)

In [None]:
# Each print() lists one property for X and then y, one per line.

# Types of the variables
print(type(X), type(y), sep='\n')

# Lengths of the ndarrays
print(len(X), len(y), sep='\n')

# First element of each ndarray
print(X[:1], y[:1], sep='\n')

In [None]:
# Cross-validate the linear regression on (X, y). No `cv` or `scoring`
# argument is passed, so scikit-learn's defaults for both apply.
result = cross_validate(lr, X, y)

In [None]:
# The 'test_score' entry of the cross-validation result
# (presumably one score per cross-validation split - see cross_validate docs).
result['test_score']

## Land ahoy - Automatic parameter search

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# California housing data, described at
# https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
X, y = fetch_california_housing(return_X_y=True)

# Train-test-split as previously seen.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


# Distributions the search samples hyper-parameter values from.
# scipy's randint(low, high) excludes `high`, so n_estimators is drawn
# from 1..4 and max_depth from 5..9.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

# Randomized hyper-parameter search around a random-forest regressor.
# Both the forest and the search get a fixed random_state.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    n_iter=5,  # Test 5 random combinations of the params.
    param_distributions=param_distributions,  # The parameters of interest.
    random_state=42,
)

In [None]:
# Each print() lists one property for X_train, X_test, y_train and y_test,
# in that order, one per line.

# Types of the variables
print(type(X_train), type(X_test), type(y_train), type(y_test), sep='\n')

# Lengths of the ndarrays
print(len(X_train), len(X_test), len(y_train), len(y_test), sep='\n')

# First 5 elements of each ndarray
print(X_train[:5], X_test[:5], y_train[:5], y_test[:5], sep='\n')

In [None]:
# Run the randomized search on the training split only.
search.fit(X_train, y_train)

In [None]:
# The parameter combination the search selected as best.
search.best_params_

In [None]:
# Summarize the search: keep the tried parameters with their mean test score
# and rank, ordered by rank.
results = (
    pd.DataFrame(search.cv_results_)
    .loc[:, ['params', 'mean_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
results

In [None]:
# Score the fitted search object on the held-out test split.
search.score(X_test, y_test) # Check score on our test data.

In [None]:
# Score the selected estimator directly on the same test split.
# NOTE(review): expected to match search.score(X_test, y_test), assuming the
# search refits the best estimator (no `refit` argument was passed) - confirm.
search.best_estimator_.score(X_test, y_test) # Another way to check score on our test data.