## Home assignment 05: Bagging and OOB score

Please, fill the lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please, notice, that `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as model class with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [1]:
import numpy as np

In [39]:
"""
import numpy as np

class SimplifiedBaggingRegressor:
    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob
        
    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store in self.indices_list list
        '''
        self.indices_list = []
        data_length = len(data)
        for bag in range(self.num_bags):
            self.indices_list.append(np.random.randint(low=0, 
                                                       high=data_length,
                                                       size=data_length))


    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.
        
        example:
        
        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        self.models_list = []
        for bag in range(self.num_bags):
            model = model_constructor()
            data_bag, target_bag = data[self.indices_list[bag]], target[self.indices_list[bag]] # Your Code Here
            self.models_list.append(model.fit(data_bag, target_bag)) # store fitted models here
        if self.oob:
            self.data = data
            self.target = target
        
    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset
        '''
        '''
        predictions = np.zeros(len(data))
        for model in self.models_list:
            predictions += model.predict(data)
            #preds.append(model.predict(data))
        '''
        return [model.predict(data) for model in self.models_list]
        

    
    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase
        '''
        list_of_predictions_lists = [[] for _ in range(len(self.data))]
        list_of_predictions_lists = [[model.predict(self.data[i].reshape(1, -1)).item() for model in np.array(self.models_list)[np.array([i not in val for val in self.indices_list])]] for i in range(len(self.data))]
        '''
        for i in range(len(self.data)):
            sample = self.data[i].reshape(1,-1)
            predictions = []
            for bag in range(self.num_bags):
                if i not in self.indices_list[bag]:
                    predictions.append(self.models_list[bag].predict(sample))
            list_of_predictions_lists[i] = predictions
        '''
        self.list_of_predictions_lists = np.array(list_of_predictions_lists, dtype=object)
        
    
    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, return None instead of prediction
        '''
        self._get_oob_predictions_from_every_model()

        self.oob_predictions = [np.mean(el) if len(el) > 0 else None for el in self.list_of_predictions_lists]
        
        
    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction
        '''
        self._get_averaged_oob_predictions()

        return np.mean([(x[0] - x[1])**2 for x in zip(self.oob_predictions, self.target) if x[0] is not None])

"""
import numpy as np

class SimplifiedBaggingRegressor:
    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob
        
    def _generate_splits(self, data: np.ndarray):
        '''Generate indices for every bag and store in self.indices_list list'''
        
        self.indices_list = []
        data_length = len(data)
        
        for bag in range(self.num_bags):
            self.indices_list.append(np.random.randint(low=0, 
                                                       high=data_length,
                                                       size=data_length))


    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.
        
        example:
        
        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        
        self.data = None
        self.target = None
        self._generate_splits(data)
        
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        
        self.models_list = []
        
        for bag in range(self.num_bags):
            data_bag, target_bag = data[self.indices_list[bag]], target[self.indices_list[bag]] # Your Code Here
            self.models_list.append(model_constructor().fit(data_bag, target_bag)) # store fitted models here
        if self.oob is True:
            self.data = data
            self.target = target
        
    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset
        '''
        result_predictions = []
        for model in self.models_list:
            result_predictions.append(model.predict(data))
        return result_predictions

        
    
    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase
        '''
        list_of_predictions_lists = [[] for _ in range(len(self.data))]


        
        for i in range(len(self.data)):
            
            predictions = []
            
            for bag in range(self.num_bags):
            
                if i not in self.indices_list[bag]:
            
                    predictions.append(float(self.models_list[bag].predict(self.data[i].reshape(1, -1))))
            
            list_of_predictions_lists[i] = predictions
        
        self.list_of_predictions_lists = np.array(list_of_predictions_lists, dtype=object)
        
    
    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, return None instead of prediction
        '''
        self._get_oob_predictions_from_every_model()
        self.oob_predictions = []
        
        for elem in self.list_of_predictions_lists:
        
            if len(elem) > 0:
                self.oob_predictions.append(np.mean(elem))
        
            else:
                self.oob_predictions.append(None)
        
        
    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction
        '''
        self._get_averaged_oob_predictions()
        result = []
        for elem in range(len(self.oob_predictions)):
            if self.oob_predictions[elem] is not None:
                result.append((self.oob_predictions[elem] - self.target[elem])**2)
        return np.mean(result)


### Local tests:

In [3]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Simple tests:

In [41]:
import warnings
warnings.filterwarnings("ignore")

for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    assert np.mean((predictions - y)**2) < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'
    
print('Simple tests done!')

  1%|          | 1/100 [00:00<00:42,  2.36it/s]

1.457712463115084e-31


  2%|▏         | 2/100 [00:00<00:43,  2.23it/s]

1.6431237158961804e-31


  3%|▎         | 3/100 [00:01<00:49,  1.95it/s]

1.0357016875890091e-31


  4%|▍         | 4/100 [00:01<00:47,  2.01it/s]

1.1097099441787926e-31


  5%|▌         | 5/100 [00:02<00:43,  2.18it/s]

2.379590089757003e-31


  6%|▌         | 6/100 [00:02<00:37,  2.50it/s]

9.222116748966438e-32


  7%|▋         | 7/100 [00:02<00:35,  2.60it/s]

1.3696217013377505e-31


  8%|▊         | 8/100 [00:03<00:36,  2.54it/s]

2.5691521029905363e-31


  9%|▉         | 9/100 [00:03<00:35,  2.59it/s]

1.2417238186486267e-31


 10%|█         | 10/100 [00:04<00:34,  2.58it/s]

1.4075995584515687e-31


 11%|█         | 11/100 [00:04<00:35,  2.50it/s]

1.210234415361037e-31


 12%|█▏        | 12/100 [00:04<00:33,  2.62it/s]

1.2135490217695755e-31


 13%|█▎        | 13/100 [00:05<00:34,  2.54it/s]

1.239735993434364e-31


 14%|█▍        | 14/100 [00:05<00:34,  2.52it/s]

1.3861317328895427e-31


 15%|█▌        | 15/100 [00:06<00:34,  2.49it/s]

1.049383374652699e-31


 16%|█▌        | 16/100 [00:06<00:34,  2.40it/s]

1.344361560137816e-31


 17%|█▋        | 17/100 [00:07<00:36,  2.29it/s]

1.3993407054481922e-31


 18%|█▊        | 18/100 [00:07<00:37,  2.20it/s]

8.462090712153746e-32


 19%|█▉        | 19/100 [00:07<00:35,  2.28it/s]

1.144325263575633e-31


 20%|██        | 20/100 [00:08<00:35,  2.28it/s]

1.5230130508062438e-31


 21%|██        | 21/100 [00:08<00:34,  2.30it/s]

1.3129823324796643e-31


 22%|██▏       | 22/100 [00:09<00:33,  2.33it/s]

3.091581043058287e-31


 23%|██▎       | 23/100 [00:09<00:32,  2.40it/s]

1.447542297204471e-31


 24%|██▍       | 24/100 [00:10<00:30,  2.49it/s]

9.434337605182719e-32


 25%|██▌       | 25/100 [00:10<00:30,  2.46it/s]

2.522143937536093e-31


 26%|██▌       | 26/100 [00:10<00:30,  2.46it/s]

1.5842340920532103e-31


 27%|██▋       | 27/100 [00:11<00:28,  2.53it/s]

7.615425100766365e-32


 28%|██▊       | 28/100 [00:11<00:28,  2.57it/s]

1.1975320203663884e-31


 29%|██▉       | 29/100 [00:11<00:27,  2.59it/s]

8.19297808078349e-32


 30%|███       | 30/100 [00:12<00:27,  2.56it/s]

7.965775762359086e-32


 31%|███       | 31/100 [00:12<00:27,  2.54it/s]

9.47460767197173e-32


 32%|███▏      | 32/100 [00:13<00:26,  2.55it/s]

2.6648094439592223e-31


 33%|███▎      | 33/100 [00:13<00:26,  2.55it/s]

1.072868647276357e-31


 34%|███▍      | 34/100 [00:13<00:25,  2.57it/s]

1.358496367589451e-31


 35%|███▌      | 35/100 [00:14<00:26,  2.42it/s]

8.819518847585221e-32


 36%|███▌      | 36/100 [00:14<00:27,  2.34it/s]

1.5751349109669467e-31


 37%|███▋      | 37/100 [00:15<00:30,  2.05it/s]

1.1232443858201728e-31


 38%|███▊      | 38/100 [00:15<00:28,  2.17it/s]

2.7674463101979014e-31


 39%|███▉      | 39/100 [00:16<00:26,  2.31it/s]

7.398694026331939e-32


 40%|████      | 40/100 [00:16<00:24,  2.45it/s]

1.23360374246211e-31


 41%|████      | 41/100 [00:16<00:22,  2.64it/s]

1.177988433613118e-31


 42%|████▏     | 42/100 [00:17<00:20,  2.82it/s]

1.52359451847133e-31


 43%|████▎     | 43/100 [00:17<00:20,  2.83it/s]

1.4058631579124815e-31


 44%|████▍     | 44/100 [00:17<00:19,  2.86it/s]

1.3446482890094514e-31


 45%|████▌     | 45/100 [00:18<00:18,  2.91it/s]

1.11172446782658e-31


 46%|████▌     | 46/100 [00:18<00:18,  2.94it/s]

6.28439349379926e-32


 47%|████▋     | 47/100 [00:18<00:19,  2.79it/s]

3.0996836802103825e-31


 48%|████▊     | 48/100 [00:19<00:18,  2.88it/s]

2.048383019375656e-31


 49%|████▉     | 49/100 [00:19<00:17,  2.98it/s]

1.3561849442998119e-31


 50%|█████     | 50/100 [00:19<00:17,  2.88it/s]

1.3280069508145645e-31


 51%|█████     | 51/100 [00:20<00:16,  2.89it/s]

1.3542116319434415e-31


 52%|█████▏    | 52/100 [00:20<00:17,  2.81it/s]

1.158550257260194e-31


 53%|█████▎    | 53/100 [00:21<00:16,  2.79it/s]

2.9683344064398936e-31


 54%|█████▍    | 54/100 [00:21<00:16,  2.79it/s]

1.2978857858428481e-31


 55%|█████▌    | 55/100 [00:21<00:16,  2.72it/s]

1.1575653800054855e-31


 56%|█████▌    | 56/100 [00:22<00:16,  2.72it/s]

1.4238953265544995e-31


 57%|█████▋    | 57/100 [00:22<00:15,  2.79it/s]

2.542413086686005e-31


 58%|█████▊    | 58/100 [00:22<00:15,  2.79it/s]

1.3788111474794252e-31


 59%|█████▉    | 59/100 [00:23<00:14,  2.84it/s]

9.913808712941018e-32


 60%|██████    | 60/100 [00:23<00:13,  2.89it/s]

6.905954356134547e-32


 61%|██████    | 61/100 [00:23<00:13,  2.89it/s]

9.752655314876535e-32


 62%|██████▏   | 62/100 [00:24<00:13,  2.85it/s]

1.1136016305298821e-31


 63%|██████▎   | 63/100 [00:24<00:13,  2.66it/s]

1.728396555493038e-31


 64%|██████▍   | 64/100 [00:25<00:13,  2.66it/s]

1.6144046416018607e-31


 65%|██████▌   | 65/100 [00:25<00:12,  2.71it/s]

9.896402002828506e-32


 66%|██████▌   | 66/100 [00:25<00:12,  2.69it/s]

9.958748519449901e-32


 67%|██████▋   | 67/100 [00:26<00:11,  2.77it/s]

2.0042916035595877e-31


 68%|██████▊   | 68/100 [00:26<00:11,  2.71it/s]

1.013072066140923e-31


 69%|██████▉   | 69/100 [00:26<00:11,  2.67it/s]

1.0200694973279868e-31


 70%|███████   | 70/100 [00:27<00:11,  2.55it/s]

1.271738302526746e-31


 71%|███████   | 71/100 [00:27<00:10,  2.68it/s]

2.6934505005929926e-31


 72%|███████▏  | 72/100 [00:27<00:10,  2.80it/s]

9.85889678659609e-32


 73%|███████▎  | 73/100 [00:28<00:09,  2.82it/s]

9.509195071692883e-32


 74%|███████▍  | 74/100 [00:28<00:09,  2.78it/s]

1.2513748985677876e-31


 75%|███████▌  | 75/100 [00:29<00:09,  2.52it/s]

1.099256092794958e-31


 76%|███████▌  | 76/100 [00:29<00:09,  2.66it/s]

1.4594043478065244e-31


 77%|███████▋  | 77/100 [00:29<00:08,  2.73it/s]

8.844610481329654e-32


 78%|███████▊  | 78/100 [00:30<00:07,  2.82it/s]

1.5399568497459335e-31


 79%|███████▉  | 79/100 [00:30<00:07,  2.94it/s]

2.5139935855563505e-31


 80%|████████  | 80/100 [00:30<00:06,  3.02it/s]

9.704965253374668e-32


 81%|████████  | 81/100 [00:31<00:06,  3.01it/s]

1.7978110471005674e-31


 82%|████████▏ | 82/100 [00:31<00:06,  2.97it/s]

1.0952810184380497e-31


 83%|████████▎ | 83/100 [00:31<00:06,  2.83it/s]

1.6460083411066168e-31


 84%|████████▍ | 84/100 [00:32<00:05,  2.86it/s]

1.161555905080893e-31


 85%|████████▌ | 85/100 [00:32<00:05,  2.81it/s]

1.3565432164887328e-31


 86%|████████▌ | 86/100 [00:32<00:04,  2.90it/s]

1.1309888353790578e-31


 87%|████████▋ | 87/100 [00:33<00:04,  2.89it/s]

8.66704001007589e-32


 88%|████████▊ | 88/100 [00:33<00:04,  2.82it/s]

2.7985788585677903e-31


 89%|████████▉ | 89/100 [00:33<00:03,  2.87it/s]

1.1634963298060315e-31


 90%|█████████ | 90/100 [00:34<00:03,  2.91it/s]

8.220203346412103e-32


 91%|█████████ | 91/100 [00:34<00:03,  2.99it/s]

1.1434122819047204e-31


 92%|█████████▏| 92/100 [00:34<00:02,  2.94it/s]

1.5397149816504559e-31


 93%|█████████▎| 93/100 [00:35<00:02,  2.94it/s]

9.057293994987565e-32


 94%|█████████▍| 94/100 [00:35<00:02,  2.88it/s]

1.1853131727345665e-31


 95%|█████████▌| 95/100 [00:36<00:01,  2.88it/s]

1.0412735762175122e-31


 96%|█████████▌| 96/100 [00:36<00:01,  2.81it/s]

1.4144533677109654e-31


 97%|█████████▋| 97/100 [00:36<00:01,  2.84it/s]

1.383908334437467e-31


 98%|█████████▊| 98/100 [00:37<00:00,  2.81it/s]

1.4506631247882511e-31


 99%|█████████▉| 99/100 [00:37<00:00,  2.84it/s]

1.1628470004146424e-31


100%|██████████| 100/100 [00:37<00:00,  2.65it/s]

8.65184616151438e-32
Simple tests done!





#### Medium tests

In [46]:
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean((predictions - y)**2)
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'
    
print('Medium tests done!')

100%|██████████| 10/10 [00:02<00:00,  4.66it/s]

Medium tests done!





#### Complex tests:

In [43]:
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'
    
print('Complex tests done!')

100%|██████████| 10/10 [00:21<00:00,  2.18s/it]

Complex tests done!





In [34]:
np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)

-0.00013444117144228995

Great job! Please, save `SimplifiedBaggingRegressor` to  `bagging.py` and submit your solution to the grading system!