## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide the [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [1]:
import numpy as np

In [9]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with an optional out-of-bag (OOB) score.

    Every bag is a bootstrap sample (drawn with replacement, same size as the
    training set) on which a fresh instance of the given model class is fitted.
    The `sklearn` API is intentionally not preserved.
    '''

    def __init__(self, num_bags, oob=False):
        '''
        num_bags: number of bootstrap samples, i.e. number of fitted models.
        oob: if True, `fit` keeps the training set so that `OOB_score` can be computed.
        '''
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store them in the self.indices_list list.
        Each bag holds `len(data)` indices sampled uniformly with replacement.
        '''
        data_length = len(data)
        self.indices_list = [
            np.random.randint(data_length, size=data_length)
            for _ in range(self.num_bags)
        ]

    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        # Index arrays are used for bootstrapping, so plain sequences are converted first.
        data = np.asarray(data)
        target = np.asarray(target)
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        self.models_list = []
        for bag_indices in self.indices_list:
            model = model_constructor()
            # Do not rely on `fit` returning the estimator: store the instance itself.
            model.fit(data[bag_indices], target[bag_indices])
            self.models_list.append(model)
        if self.oob:
            # The training set is kept only when the OOB score has been requested.
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset
        '''
        per_model_predictions = [model.predict(data) for model in self.models_list]
        return np.mean(per_model_predictions, axis=0)

    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase.
        The result is stored in self.list_of_predictions_lists as a 1-D object array of lists.

        Raises RuntimeError if the training set was not stored
        (the regressor was created with oob=False or `fit` has not been called).
        '''
        if getattr(self, 'data', None) is None:
            raise RuntimeError(
                'OOB predictions are unavailable: create the regressor with oob=True and call fit first.'
            )
        num_objects = len(self.data)
        list_of_predictions_lists = [[] for _ in range(num_objects)]
        for bag_indices, model in zip(self.indices_list, self.models_list):
            # One boolean mask per bag replaces a linear membership scan per (object, bag) pair.
            oob_mask = np.ones(num_objects, dtype=bool)
            oob_mask[bag_indices] = False
            oob_indices = np.flatnonzero(oob_mask)
            if oob_indices.size == 0:
                continue
            # One feature row per object, then a single batched predict call for the whole bag.
            oob_rows = self.data[oob_indices].reshape(len(oob_indices), -1)
            for object_index, prediction in zip(oob_indices, model.predict(oob_rows)):
                list_of_predictions_lists[object_index].append(prediction)

        # Fill element by element: np.array(..., dtype=object) would silently build a 2-D
        # array whenever all inner lists happen to have the same length.
        predictions_array = np.empty(num_objects, dtype=object)
        for object_index, predictions in enumerate(list_of_predictions_lists):
            predictions_array[object_index] = predictions
        self.list_of_predictions_lists = predictions_array

    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, return None instead of prediction
        '''
        self._get_oob_predictions_from_every_model()
        self.oob_predictions = [
            np.mean(predictions) if len(predictions) > 0 else None
            for predictions in self.list_of_predictions_lists
        ]

    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction.
        Returns NaN when no object has an out-of-bag prediction.
        '''
        self._get_averaged_oob_predictions()

        has_prediction = np.array(
            [prediction is not None for prediction in self.oob_predictions], dtype=bool
        )
        if not has_prediction.any():
            # Nothing to average over; avoid the empty-mean RuntimeWarning.
            return np.nan
        valid_predictions = np.array(
            [prediction for prediction in self.oob_predictions if prediction is not None]
        )
        valid_targets = self.target[has_prediction]

        return np.mean((valid_predictions - valid_targets)**2)

### Local tests:

In [3]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Simple tests:

In [10]:
# Simple tests: the target is an exact linear function of the features,
# so both the training error and the OOB error must be (almost) zero.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    train_mse = np.mean((predictions - y)**2)
    assert train_mse < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    print(oob_score)
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'

    # Average share of bags in which an object was left out, compared with 1/e.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  2%|▏         | 2/100 [00:00<00:13,  7.24it/s]

3.3784825427089575e-32
2.330170302933673e-31


  4%|▍         | 4/100 [00:00<00:14,  6.60it/s]

5.028538053679104e-32
7.925968429539873e-32


  6%|▌         | 6/100 [00:00<00:14,  6.55it/s]

1.288938547644707e-31
2.920724151192151e-32


  8%|▊         | 8/100 [00:01<00:15,  6.04it/s]

6.749359429952203e-32
4.453098568121115e-32


 10%|█         | 10/100 [00:01<00:14,  6.37it/s]

5.666990216541466e-32
3.1022590671146694e-32


 12%|█▏        | 12/100 [00:01<00:13,  6.73it/s]

8.220847388492858e-32
6.699857970211369e-32


 14%|█▍        | 14/100 [00:02<00:13,  6.47it/s]

4.564807304222051e-32
1.0870719809512095e-31


 16%|█▌        | 16/100 [00:02<00:12,  6.93it/s]

5.683326924199245e-32
3.7301991327299856e-32


 18%|█▊        | 18/100 [00:02<00:11,  7.20it/s]

4.7659183431042584e-32
6.230328285900534e-32


 20%|██        | 20/100 [00:02<00:11,  6.82it/s]

6.562476878934234e-32
2.64606879350734e-31


 22%|██▏       | 22/100 [00:03<00:11,  7.03it/s]

2.3516011942815166e-32
3.367311216381259e-32


 24%|██▍       | 24/100 [00:03<00:10,  7.01it/s]

3.112353588963317e-32
3.478453305605056e-32


 26%|██▌       | 26/100 [00:03<00:11,  6.53it/s]

1.2730812268589058e-31
3.6610819314532425e-32


 28%|██▊       | 28/100 [00:04<00:10,  6.82it/s]

9.557168484837746e-32
6.941623393302789e-32


 30%|███       | 30/100 [00:04<00:10,  6.89it/s]

5.649439838352975e-32
8.951485559525595e-32


 32%|███▏      | 32/100 [00:04<00:10,  6.49it/s]

6.791663544676385e-32
8.229966596056476e-32


 34%|███▍      | 34/100 [00:05<00:09,  6.76it/s]

6.432958764064421e-32
1.39260588536714e-31


 36%|███▌      | 36/100 [00:05<00:09,  6.70it/s]

4.091952693453254e-32
5.517917777176825e-32


 38%|███▊      | 38/100 [00:05<00:09,  6.52it/s]

5.837157263950009e-32
4.0130478202418443e-32


 40%|████      | 40/100 [00:05<00:09,  6.64it/s]

3.066153480062384e-32
1.7008760225615956e-31


 42%|████▏     | 42/100 [00:06<00:08,  6.47it/s]

1.2487945558496657e-31
1.4395326165052413e-31


 44%|████▍     | 44/100 [00:06<00:08,  6.89it/s]

3.2394399584616685e-31
2.8987612147420435e-32


 46%|████▌     | 46/100 [00:06<00:07,  7.10it/s]

4.561485132199975e-32
9.43618037823994e-32


 48%|████▊     | 48/100 [00:07<00:07,  6.75it/s]

4.5071793807229365e-32
9.630407116179265e-32


 50%|█████     | 50/100 [00:07<00:07,  6.89it/s]

3.097024691250505e-32
4.41134817065076e-32


 52%|█████▏    | 52/100 [00:07<00:07,  6.40it/s]

5.122119042019332e-32
5.743304663535259e-32


 54%|█████▍    | 54/100 [00:08<00:06,  6.63it/s]

4.114512031361319e-32
2.5719476272246747e-32


 56%|█████▌    | 56/100 [00:08<00:06,  6.59it/s]

8.023832190110026e-32
2.1311593489366675e-31


 58%|█████▊    | 58/100 [00:08<00:06,  6.46it/s]

8.098465711082659e-32
6.403908920147733e-32


 60%|██████    | 60/100 [00:08<00:05,  6.86it/s]

3.864342451402911e-32
5.056437947745992e-32


 62%|██████▏   | 62/100 [00:09<00:05,  6.64it/s]

5.506682923049282e-32
1.3804477098693659e-31


 64%|██████▍   | 64/100 [00:09<00:05,  7.01it/s]

2.2624796307377958e-32
4.371152448560652e-32


 66%|██████▌   | 66/100 [00:09<00:04,  7.14it/s]

7.150107037369467e-32
3.896278664583732e-32


 68%|██████▊   | 68/100 [00:10<00:04,  6.96it/s]

4.9580235706702e-32
3.2665487335939234e-32


 70%|███████   | 70/100 [00:10<00:04,  7.20it/s]

3.8713377708372167e-32
5.016547949061516e-32


 72%|███████▏  | 72/100 [00:10<00:04,  6.84it/s]

3.63330219703315e-32
1.9479674074772845e-31


 74%|███████▍  | 74/100 [00:10<00:03,  7.11it/s]

3.7139331368027105e-32
4.188531554387453e-32


 76%|███████▌  | 76/100 [00:11<00:03,  7.20it/s]

1.27824301040485e-31
2.939987798775703e-32


 78%|███████▊  | 78/100 [00:11<00:03,  6.89it/s]

4.461497544965218e-32
1.1344754291853108e-31


 80%|████████  | 80/100 [00:11<00:02,  6.97it/s]

7.926367857231052e-32
1.3123841909090402e-31


 82%|████████▏ | 82/100 [00:12<00:02,  6.25it/s]

5.786144735176905e-32
6.042032012475286e-32


 84%|████████▍ | 84/100 [00:12<00:02,  6.55it/s]

5.186220678791735e-32
9.304799345993894e-32


 86%|████████▌ | 86/100 [00:12<00:02,  6.74it/s]

4.51011529799355e-31
1.1454706346750863e-31


 88%|████████▊ | 88/100 [00:13<00:01,  6.40it/s]

1.9743098616146748e-31
6.519345811985987e-32


 90%|█████████ | 90/100 [00:13<00:01,  6.57it/s]

5.015105303664314e-32
3.2902801866622686e-32


 92%|█████████▏| 92/100 [00:13<00:01,  6.29it/s]

3.560091289568503e-32
1.2040756702103698e-31


 94%|█████████▍| 94/100 [00:13<00:00,  6.64it/s]

9.120904085715028e-32
8.395673852371244e-32


 96%|█████████▌| 96/100 [00:14<00:00,  7.01it/s]

4.2847387679825206e-32
5.99521429975538e-32


 98%|█████████▊| 98/100 [00:14<00:00,  6.87it/s]

2.653519183970563e-32
1.626375188881506e-31


100%|██████████| 100/100 [00:14<00:00,  6.75it/s]

4.6102575994877353e-32
7.578200387133849e-32
Simple tests done!





#### Medium tests

In [11]:
# Medium tests: random targets with 150 features and only 200 objects,
# so the OOB error is asserted to exceed the training error.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean((predictions - y)**2)
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'

    # Average share of bags in which an object was left out, compared with 1/e.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

100%|██████████| 10/10 [00:00<00:00, 13.89it/s]

Medium tests done!





#### Complex tests:

In [12]:
# Complex tests: with 100 bags the observed share of out-of-bag objects
# must match 1/e within a much tighter tolerance (1e-2).
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()

    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

100%|██████████| 10/10 [00:13<00:00,  1.36s/it]

Complex tests done!





In [13]:
# Deviation of the observed out-of-bag share (for the last fitted regressor) from 1/e.
np.mean(
    [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
) / bagging_regressor.num_bags - 1 / np.exp(1)

np.float64(-0.0002894411714423062)

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!