In [71]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor, BaggingClassifier 

from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

Set random seed for reproducibility

In [72]:
np.random.seed(42)

REGRESSION EXAMPLE

Generate synthetic regression dataset with noise

In [73]:
X_reg, y_reg = make_regression(
    n_samples=300,          # Number of samples
    n_features=1,           # Number of features
    noise=20,               # Standard deviation of noise
    random_state=42
)

In [74]:
print(X_reg.ndim)
print(X_reg.shape)
print(X_reg.dtype)
print(X_reg)



2
(300, 1)
float64
[[-0.62269952]
 [-0.25256815]
 [-1.72491783]
 [-0.68002472]
 [-0.3853136 ]
 [ 0.34115197]
 [-0.39210815]
 [ 0.82206016]
 [ 0.37569802]
 [ 0.08704707]
 [ 0.24196227]
 [ 0.25988279]
 [-0.56228753]
 [ 1.05712223]
 [ 1.76545424]
 [-0.3011037 ]
 [ 0.77463405]
 [ 0.30154734]
 [-0.11564828]
 [ 0.97554513]
 [ 0.09176078]
 [-1.76304016]
 [ 0.96337613]
 [ 0.67959775]
 [ 0.64768854]
 [-1.43014138]
 [ 0.8496021 ]
 [-1.1913035 ]
 [ 1.15859558]
 [ 1.47789404]
 [-0.47917424]
 [-0.5297602 ]
 [-0.20812225]
 [-0.32766215]
 [-0.65160035]
 [ 1.0035329 ]
 [ 0.89959988]
 [-0.24538812]
 [ 1.0889506 ]
 [-0.44651495]
 [ 2.19045563]
 [-0.38508228]
 [-1.24778318]
 [-0.88385744]
 [ 0.93128012]
 [-0.01349722]
 [ 0.62962884]
 [ 1.8861859 ]
 [-0.48423407]
 [ 0.81351722]
 [-0.82068232]
 [ 0.50498728]
 [ 2.31465857]
 [-1.32818605]
 [ 0.51503527]
 [-0.42064532]
 [ 0.46210347]
 [-1.98756891]
 [-0.32206152]
 [ 1.46564877]
 [ 2.13303337]
 [-1.23086432]
 [ 0.78182287]
 [-2.02514259]
 [ 0.51504769]
 [ 0.2

In [75]:
print(y_reg.ndim)
print(y_reg.shape)
print(y_reg.dtype)
print(y_reg)


1
(300,)
float64
[-9.52158969e+00 -2.06515555e+01 -5.34331423e+01 -1.80295681e+01
 -1.88746330e+01  1.12113142e+01 -5.87101936e+00  1.06544549e+01
  1.04196421e+01  1.38819785e+01 -3.63958929e+00  4.84314455e+01
  1.22104192e+00  2.88505918e+01  5.95527948e+01  2.29084804e+01
  2.90629269e+01  3.12278882e+00 -4.40032001e+01  2.86007569e+01
  1.74272315e+01 -3.43065656e+01  7.46533724e+01  1.30326781e+01
 -1.74863762e+01 -5.72058305e+01  1.50689653e+01 -1.65717920e+01
  2.17308042e+01  4.02094992e+01  3.20008558e+01 -9.42836611e+00
 -1.28203017e+01 -1.03509045e+01 -1.30225292e+01  4.78292797e+01
  4.46488330e+01  1.01072486e+01  1.06712371e+01  3.48933227e+01
  1.83611051e+01 -9.08319151e+00 -7.97983978e+01  2.98680330e+00
  3.54188481e+01  1.18961584e+01  3.79297691e+01  5.80436372e+01
 -2.00382154e+01  2.04857719e+01 -4.77071591e+01 -2.24394511e-01
  1.06147784e+02 -1.43723517e+01  6.88757070e+00 -1.15381618e+00
  1.87717408e+01 -2.65231424e+01 -3.79071634e+01  5.49546041e+00
  5.0411

In [76]:
outlier_indices = np.random.choice(len(X_reg), size=20, replace=False)
outlier_indices

array([203, 266, 152,   9, 233, 226, 196, 109,   5, 175, 237,  57, 218,
        45, 182, 221, 289, 211, 148, 165], dtype=int32)

In [77]:
print(y_reg[outlier_indices])

[ -0.60653302   0.36310294  46.6194103   13.88197847  34.71121334
 -30.59536162  38.27938534 -16.6382647   11.21131418 -45.66737021
  -2.80451533 -26.52314242   7.03812635  11.89615843   0.73969302
 -41.36199282  26.00298727 -33.95410432 -21.61701299  -5.54945755]


Add some outliers to make the problem harder

In [None]:
y_reg[outlier_indices] += np.random.normal(0, 50, size=20)
print(y_reg[outlier_indices])

[ 24.22917463  -6.55011211  79.0038372   90.0334713   23.0035446
 -42.30220947 117.24002611  21.73347175 -12.26240512 -18.53936803
 -25.97539997 -49.8096301   19.13623993 -83.7678538  -85.50619861
 -69.47636928 -24.63856875 -18.24173769 -67.01821676 -76.16464262]


In [79]:
np.random.normal()

1.465648768921554