# SCIKIT-LEARN 29/10 REGRESSION

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# `plt` must refer to the pyplot submodule: the top-level matplotlib package
# does not expose the plotting functions (plt.plot, plt.subplots, ...).
import matplotlib.pyplot as plt

**REGRESSION**

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset directly as a (features, target) pair.
X, y = load_iris(return_X_y=True)

# Keep 80% of the samples for training and hold out the rest for testing.
# The split is shuffled, stratified on y, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [7]:
from sklearn.linear_model import LinearRegression

# Create the regressor and train it on the training partition only
# (fit returns the estimator itself, so the two steps can be chained).
reg = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test samples.
y_test_pred = reg.predict(X_test)

In [8]:
# Inspect the predictions for the test samples; the regressor returns
# continuous values rather than integer class labels (see the output below).
y_test_pred

array([-0.02180204,  1.59467891,  0.8918912 ,  0.88638718, -0.03070495,
        1.39307286, -0.13427151, -0.05136299,  1.74514762,  1.38990627,
        1.78696349,  1.86558563,  2.00128801,  1.28144856, -0.05544091,
       -0.07293937, -0.22764325,  1.01167134,  1.30634878,  1.58268446,
       -0.1086172 ,  2.05746663,  1.17240103,  1.51725244,  1.97624657,
        1.49467551,  1.18627016, -0.03681482,  1.68655651, -0.09730179])

**evaluation metrics for regression**

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the true targets:
# coefficient of determination, mean absolute error and mean squared error.
r2 = r2_score(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)

**evaluation with cross_val_score()**

In [11]:
from sklearn.model_selection import KFold, cross_val_score

# With an integer `cv`, a regressor is evaluated on plain, unshuffled KFold.
# The iris samples are ordered by class, so contiguous folds would contain
# (almost) a single target value and their R^2 would be degenerate.
# Shuffle the folds, with a fixed seed so the scores are repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
reg = LinearRegression()
r2 = cross_val_score(reg, X, y, cv=cv, scoring='r2')  # one R^2 value per fold

**POLYNOMIAL REGRESSION**

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Build the polynomial feature expansion with maximum degree 5.
poly = PolynomialFeatures(degree=5)

# The transformer has to be fitted before it can transform the data;
# fit_transform performs both steps in a single call.
X_poly = poly.fit_transform(X)

**PIPELINE**

In [15]:
from sklearn.pipeline import make_pipeline

# Chain the degree-5 polynomial expansion and the linear model into a single
# estimator, so one fit/predict call runs both steps in sequence.
reg = make_pipeline(
    PolynomialFeatures(degree=5),
    LinearRegression(),
)
reg.fit(X_train, y_train)

# Predictions of the whole pipeline on the held-out samples.
y_test_pred = reg.predict(X_test)

**RIDGE AND LASSO**

In [16]:
from sklearn.linear_model import Ridge
# Ridge regressor; it is only instantiated here, not fitted.
reg = Ridge(alpha=0.5)
# alpha is the coefficient applied to the penalty term

In [17]:
from sklearn.linear_model import Lasso
# Lasso regressor with alpha=0.5; it is only instantiated here, not fitted.
reg = Lasso(alpha=0.5)

**HYPERPARAMETER SELECTION**

In [22]:
from sklearn.model_selection import ParameterGrid

In [23]:
# Search space for the grid: each key names a hyperparameter and each value
# holds the candidate settings to try for it.
params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [*range(1, 100, 20)],  # 1, 21, 41, 61, 81
    "min_impurity_decrease": np.linspace(0.1, 1, num=10),  # 0.1, 0.2, ..., 1.0
}

In [24]:
# Expand the search space into the explicit list of parameter combinations
# (one dict per configuration).
list(ParameterGrid(params))

[{'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.1)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.2)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.30000000000000004)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.4)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.5)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.6)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.7000000000000001)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.8)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(0.9)},
 {'criterion': 'gini',
  'max_depth': 1,
  'min_impurity_decrease': np.float64(1.0)},
 {'criterion': 'gini',
  'max_depth': 21,
  'min_impurity_decrease': np.float64(0.1)},
 {'criterion': 'gini',

In [25]:
# Show the search space with the candidate values fully expanded.
params

{'criterion': ['gini', 'entropy', 'log_loss'],
 'max_depth': [1, 21, 41, 61, 81],
 'min_impurity_decrease': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}

In [30]:
# ParameterGrid builds every combination of the candidate values:
# 3 criteria * 5 depths * 10 thresholds = 150 configurations.
len(ParameterGrid(params))

150

In [37]:
# Synthetic classification data: 1000 samples with 10 uniform features in
# [0, 1) and integer labels in 0..9. A seeded generator keeps the data (and
# every result derived from it) identical after Restart & Run All.
rng = np.random.default_rng(42)
X = rng.random((1000, 10))
y = rng.integers(0, 10, 1000)

In [56]:
# Hold out 20% of the samples as the final test set. The fixed seed makes the
# partition repeatable, matching the seeded split used for the iris data.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [57]:
# Sanity check: 80% of the 1000 samples remain for training + validation.
X_train_valid.shape

(800, 10)

In [58]:
# Sanity check: the remaining 20% form the test set.
X_test.shape

(200, 10)

In [59]:
# Split the non-test data 75/25 into training and validation sets.
# The fixed seed makes the partition repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, train_size=0.75, random_state=42)

In [60]:
# Sanity check: 75% of the 800 train+validation samples are used for training.
X_train.shape

(600, 10)

In [61]:
# Sanity check on the validation split just created (the test split was
# already inspected right after the first split).
X_valid.shape

(200, 10)

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Materialise the grid once so that position i in `accuracies` can be mapped
# back to the configuration that produced it (configs[i]).
configs = list(ParameterGrid(params))
accuracies = []
for config in configs:
    # Each config is a dict of constructor keyword arguments (criterion,
    # max_depth, min_impurity_decrease), so it is unpacked directly.
    # A fixed random_state makes the fitted trees repeatable across runs.
    clf = DecisionTreeClassifier(**config, random_state=42)
    clf.fit(X_train, y_train)
    # Model selection uses the validation split only; the test split is not
    # touched inside this loop.
    acc = accuracy_score(y_valid, clf.predict(X_valid))
    accuracies.append(acc)

In [63]:
# Position, in grid order, of the highest validation accuracy.
# NOTE(review): np.argmax reports the first maximum, so index 0 may simply
# mean that several configurations tied; confirm by inspecting `accuracies`.
np.argmax(accuracies)

np.int64(0)