# Scikit-Learn (sklearn)

## Key class: datasets

In [48]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
print(X.shape, y.shape)

(150, 4) (150,)


In [49]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [50]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Key class: models

In [61]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import  LinearRegression, LogisticRegression

In [62]:
# Create the model
model = KNeighborsClassifier(n_neighbors=5)

# Fit the model
model.fit(X,y)

# Get model predictions
y_pred = model.predict(X)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Key class: evaluation

### evaluation metrics

In [53]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, log_loss)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [54]:
y_pred = [0,2,1,3,1]
y_true = [0,1,1,3,2]

In [55]:
accuracy_score(y_true, y_pred)

0.6

In [56]:
mean_squared_error(y_true, y_pred)

0.4

## Key class: experiments


### data split

In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=2)
print(f"X.shape = {X.shape}")
print(f"X_test.shape = {X_test.shape}")
print(f"X_train.shape = {X_train.shape}")


X.shape = (150, 4)
X_test.shape = (30, 4)
X_train.shape = (120, 4)


### cross validation

In [58]:
from sklearn.model_selection import cross_validate
clf = DecisionTreeClassifier(max_depth=2)
scores = cross_validate(clf, X_train, y_train,
                        scoring='accuracy', cv=10,
                        return_train_score=True)

## Example with random forest

In [45]:
from sklearn.model_selection import GridSearchCV

X_tr, X_ts, y_tr, y_ts = train_test_split(X, y)
clf = RandomForestClassifier()
parameters = {'n_estimators': [100,150,200], 'criterion': ['gini', 'entropy']}
gridsearch = GridSearchCV(clf, parameters, scoring='accuracy', cv=5)
gridsearch.fit(X_tr, y_tr)
print(f"gridsearch.best_params_ = {gridsearch.best_params_}")

best_clf = gridsearch.best_estimator_
best_clf

gridsearch.best_params_ = {'criterion': 'gini', 'n_estimators': 150}


RandomForestClassifier(n_estimators=150)

In [46]:
y_pred = best_clf.predict(X_ts)
test_acc = accuracy_score(y_ts, y_pred)
print(f"test_acc = {test_acc}")

test_acc = 0.8947368421052632


In [47]:
final_model = RandomForestClassifier(**gridsearch.best_params_)
final_model.fit(X_tr,y_tr)

RandomForestClassifier(n_estimators=150)