# Ensemble Learning Assignment
- Implement various methods of ensemble learning in Scratch.
1. Blending
## Problem 1: Scratch implementation of blending
- Give at least three examples of implementing blending from scratch and getting better accuracy than a single model. Better accuracy means smaller mean squared error (MSE) on the validation data.

In [112]:
# importing dataset
import pandas as pd
import numpy as np
house_data = pd.read_csv('train.csv')
house_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [113]:
# In this assignment we will use sales price as objective variable and grlivarea and yearbuilt as explanatory variable
X = house_data[['GrLivArea','YearBuilt']] # select GrLivArea and YearBuilt as explanatory features
y = house_data['SalePrice'] # select SalePrice as objective variable
null_count = X.isnull().sum() # check dataset for null values
null_count

GrLivArea    0
YearBuilt    0
dtype: int64

In [114]:
X = X.values # convert to ndarray
y = y.values # convert to ndarray

In [115]:
# Divide the data into 80% training data and 20% validation data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.2)

In [116]:
# standardize/normalize the values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train ) #assign standardized variables to X_train_scaled
print(X_train.mean(axis=0)) 
X_test = scaler.transform(X_test)#assign standardized variables to X_test_scaled
print(X_test.mean(axis=0)) 

[ 1.95809883e-16 -3.46716567e-15]
[-0.05060307  0.06069477]


In [117]:
print(X_train.std(axis=0))
print(X_test.std(axis=0))

[1. 1.]
[0.88576695 0.9664846 ]


In [118]:
# 1st example
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
lr = LinearRegression()
rf = RandomForestRegressor(random_state = 42)
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
blend_pred = 0.5*lr_pred + 0.5*rf_pred # calculate blend using average option
#calculate mse for each model and blended model
lr_mse = mean_squared_error(y_test, lr_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
blend_mse = mean_squared_error(y_test, blend_pred)

print("Linear Regression MSE:", lr_mse)
print("Random Forest MSE:", rf_mse)
print("Blended MSE:", blend_mse)

Linear Regression MSE: 1907504023.479166
Random Forest MSE: 1679955268.6499934
Blended MSE: 1565497454.835617


In [122]:
# 2nd example
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state=42)
knn.fit(X_train, y_train)
dt.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
blend_pred = 0.5*knn_pred + 0.5*dt_pred
knn_mse = mean_squared_error(y_test, knn_pred)
dt_mse = mean_squared_error(y_test, dt_pred)
blend_mse = mean_squared_error(y_test, blend_pred)
print("K-Nearest Neighbors MSE:", knn_mse)
print("Decision Tree MSE:", dt_mse)
print("Blended MSE:", blend_mse)


K-Nearest Neighbors MSE: 1630278415.659452
Decision Tree MSE: 2605845818.5871387
Blended MSE: 1791167508.8567393


In [120]:
# 3rd example
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
nn = MLPRegressor(random_state=42, max_iter=1000)
svm = SVR(kernel='rbf')
nn.fit(X_train, y_train)
svm.fit(X_train, y_train)
nn_pred = nn.predict(X_test)
svm_pred = svm.predict(X_test)
blend_pred = 0.5*nn_pred + 0.5*svm_pred
nn_mse = mean_squared_error(y_test, nn_pred)
svm_mse = mean_squared_error(y_test, svm_pred)
blend_mse = mean_squared_error(y_test, blend_pred)

print("Neural Network MSE:", nn_mse)
print("Support Vector Machine MSE:", svm_mse)
print("Blended MSE:", blend_mse)


Neural Network MSE: 36710188886.42445
Support Vector Machine MSE: 6554411131.862905
Blended MSE: 15540676371.111324




- From above results blending gives smaller MSE than single model.

## Problem 2: Scratch Implementation of bagging
- Please provide at least one example that implements bagging from scratch and improves accuracy over a single model.

In [124]:
# building a single model decision tree regressor
from sklearn.tree import DecisionTreeRegressor 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)
dtr_mse = mean_squared_error(y_test, dtr_pred)
print("Decision Tree MSE:", dtr_mse)

Decision Tree MSE: 2605845818.5871387


In [125]:
# building a bagging model class
class BaggingRegressor:
    def __init__(self, n_subsets = 10):
        self.n_subsets = n_subsets
        self.models = []
    def fit(self, X, y):
        for i in range(self.n_subsets):
            # new bootsrap sample for training data
            subset = np.random.choice(X.shape[0], size=X.shape[0], replace = True)
            X_sample = X[subset]
            y_sample = y[subset]
            #create a decisiontreeregressor and train it on the bootstrap sample
            tree = DecisionTreeRegressor()
            tree.fit(X_sample, y_sample)
            #add the trained decision tree regressor to the list of models
            self.models.append(tree)
    def predict(self, X):
        #make predictions using each model and return the average
        predictions = np.array([model.predict(X) for model in self.models])
        return np.mean(predictions, axis=0)
            


In [126]:
# Create a bagging regressor with 10 subsets of data
bagging = BaggingRegressor(n_subsets=10)
# Train the bagging regressor on the training data
bagging.fit(X_train, y_train)
# predict the data
y_pred = bagging.predict(X_test)
# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print("Bagging model MSE:", mse)

Bagging model MSE: 1732668374.9130228


In [127]:
print("Decision Tree MSE:", dtr_mse)
print("Bagging model MSE:", mse)

Decision Tree MSE: 2605845818.5871387
Bagging model MSE: 1732668374.9130228


- As you can see from the above results bagging model has less MSE than single model hence its more accurate

## Problem 3: Scratch implementation of Stacking
- Show at least one example that implements stacking from scratch and improves accuracy over a single model .

In [128]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

models = [
    DecisionTreeRegressor(max_depth=3),
    RandomForestRegressor(n_estimators=10, random_state=42)
]

# Create an empty matrix for the meta-features
meta_X_train = np.zeros((X_train.shape[0], len(models)))

# Train the stage 0 models and create the meta-features
for i, model in enumerate(models):
    model.fit(X_train, y_train)
    meta_X_train[:, i] = model.predict(X_train)

# Train the stage 1 model on the meta-features
meta_model = LinearRegression()
meta_model.fit(meta_X_train, y_train)

# Create an empty matrix for the test set meta-features
meta_X_test = np.zeros((X_test.shape[0], len(models)))

# Create the test set meta-features
for i, model in enumerate(models):
    meta_X_test[:, i] = model.predict(X_test)

# Use the stage 1 model to make predictions on the test set meta-features
y_pred = meta_model.predict(meta_X_test)

# Evaluate the performance of the stacked model
from sklearn.metrics import mean_squared_error
print('Stacked model MSE: ', mean_squared_error(y_test, y_pred, squared=True))


Stacked model MSE:  1852207205.3103755


In [129]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


class StackingRegressor:
    def __init__(self, base_models, meta_model, n_folds):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [clone(model) for model in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        blend_train = np.zeros((X.shape[0], len(self.base_models_)))

        for i, model in enumerate(self.base_models_):
            for train_index, val_index in kf.split(X):
                X_train, y_train = X[train_index], y[train_index]
                X_val, y_val = X[val_index], y[val_index]

                model.fit(X_train, y_train)
                y_val_pred = model.predict(X_val)
                blend_train[val_index, i] = y_val_pred

        self.meta_model_.fit(blend_train, y)
        return self

    def predict(self, X):
        blend_test = np.zeros((X.shape[0], len(self.base_models_)))
        
        for i, model in enumerate(self.base_models_):
            blend_test[:, i] = model.predict(X)

        return self.meta_model_.predict(blend_test)


In [130]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# define base models and meta-model
base_models = [
    RandomForestRegressor(n_estimators=100, random_state=42),
    LinearRegression()
]
meta_model = LinearRegression()

# instantiate and fit the stacking regressor
stacking_reg = StackingRegressor(base_models=base_models, meta_model=meta_model, n_folds=2)
stacking_reg.fit(X_train, y_train)

y_test_pred = stacking_reg.predict(X_test)


In [132]:
print('Stacked model MSE: ', mean_squared_error(y_test, y_test_pred))

Stacked model MSE:  1530048732.2503896
