## [Problem 1] Implementing blending from scratch

In [21]:
import random

import numpy as np
import pandas as pd
from numpy import hstack
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [116]:
class BlendingRegressor:
    """Blending ensemble: four base regressors feeding three alternative blenders.

    The base models (linear regression, k-nearest neighbours, decision tree,
    support-vector regression) are trained on one part of the data. Their
    predictions on a separate hold-out part become the input features of
    three blenders, which are all fitted so that they can be compared:

    * ``predict1`` uses ``blender1`` (``LinearRegression``)
    * ``predict2`` uses ``blender2`` (``VotingRegressor`` built from the same
      estimator list as the base models)
    * ``predict3`` uses ``blender3`` (``KNeighborsRegressor``)
    """

    def __init__(self):
        # (name, estimator) pairs; their order fixes the column order of the
        # meta-feature matrix handed to the blenders.
        self.models = [
            ('lr', LinearRegression()),
            ('knn', KNeighborsRegressor()),
            ('cart', DecisionTreeRegressor()),
            ('svm', SVR()),
        ]
        self.blender1 = LinearRegression()
        # NOTE(review): blender2 receives the very same estimator list as the
        # base models; this relies on VotingRegressor fitting its own copies
        # rather than refitting these instances -- confirm against the
        # scikit-learn documentation.
        self.blender2 = VotingRegressor(self.models)
        self.blender3 = KNeighborsRegressor()

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """Fit the base models, then fit every blender on hold-out predictions.

        Parameters
        ----------
        X_train, y_train : array-like
            Training features (2-D) and targets. DataFrames and NumPy arrays
            are both accepted.
        X_val, y_val : array-like, optional
            Explicit hold-out set. When both are given, the base models are
            trained on all of ``X_train`` and the blenders on the base-model
            predictions for ``X_val``. When both are omitted, ``X_train`` is
            cut in half in row order: the first half trains the base models,
            the second half trains the blenders. Do not pass data that is
            later used to score the ensemble, or the score will be optimistic.

        Raises
        ------
        ValueError
            If only one of ``X_val`` / ``y_val`` is given, or if the training
            set is too small to be halved.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        if (X_val is None) != (y_val is None):
            raise ValueError("X_val and y_val must be provided together")

        if X_val is None:
            split = X_train.shape[0] // 2
            if split == 0:
                raise ValueError("need at least 2 training rows to carve out a hold-out set")
            X_base, y_base = X_train[:split], y_train[:split]
            X_hold, y_hold = X_train[split:], y_train[split:]
        else:
            X_base, y_base = X_train, y_train
            X_hold, y_hold = np.asarray(X_val), np.asarray(y_val)

        for _, model in self.models:
            model.fit(X_base, y_base)

        # The blenders only ever see base-model predictions, never raw features.
        meta_X = self._meta_features(X_hold)
        for blender in (self.blender1, self.blender2, self.blender3):
            blender.fit(meta_X, y_hold)

    def _meta_features(self, X):
        """Return the (n_samples, n_base_models) matrix of base-model predictions."""
        # Base models are fitted on plain arrays, so predict on plain arrays too.
        X = np.asarray(X)
        columns = []
        for _, model in self.models:
            yhat = np.asarray(model.predict(X))
            # Some estimators return (n,), others (n, 1); normalise to a column.
            columns.append(yhat.reshape(len(yhat), 1))
        return hstack(columns)

    def _predict_with(self, blender, X):
        """Run ``X`` through the base models and then through ``blender``."""
        return blender.predict(self._meta_features(X))

    def predict1(self, X):
        """Predict with the ``LinearRegression`` blender (``blender1``)."""
        return self._predict_with(self.blender1, X)

    def predict2(self, X):
        """Predict with the ``VotingRegressor`` blender (``blender2``)."""
        return self._predict_with(self.blender2, X)

    def predict3(self, X):
        """Predict with the ``KNeighborsRegressor`` blender (``blender3``)."""
        return self._predict_with(self.blender3, X)
    
    


In [15]:
# Load the full training table; the path is relative to the notebook's working directory.
data = pd.read_csv("train.csv")

In [55]:
len(data)

1460

In [16]:
# Keep only the two predictors (GrLivArea, YearBuilt) and the target (SalePrice).
data_select = data[["GrLivArea","YearBuilt","SalePrice"]]

In [17]:
data_select.head()

Unnamed: 0,GrLivArea,YearBuilt,SalePrice
0,1710,2003,208500
1,1262,1976,181500
2,1786,2001,223500
3,1717,1915,140000
4,2198,2000,250000


In [181]:
# 80/20 train/validation split with a fixed seed so the scores below are comparable.
# The target is selected with double brackets, so y_train / y_val are single-column
# DataFrames rather than Series.
X_train, X_val, y_train, y_val = train_test_split(data_select[["GrLivArea","YearBuilt"]], data_select[["SalePrice"]], test_size=0.2, random_state=42)

In [89]:
# X_train.values[0:int(X_train.shape[0]/2)]

In [117]:
# Blending ensemble: four base regressors plus three candidate blenders.
model = BlendingRegressor()

In [118]:
# Only the training split is passed; fit() halves it itself
# (first half -> base models, second half -> blenders).
model.fit(X_train,y_train)

  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [135]:
base = LinearRegression()

In [126]:
base.fit(X_train,y_train)

LinearRegression()

In [127]:
base_result = base.predict(X_val)

In [129]:
# Baseline to beat: validation MAE of a single LinearRegression.
score = mean_absolute_error(y_val, base_result)
print('Base Linear MAE: %.3f' % score)

Base Linear MAE: 32358.771


In [119]:
# Validation predictions from the LinearRegression blender (blender1).
blender1_result = model.predict1(X_val)

In [130]:
# Validation MAE of the LinearRegression blender, to compare with the baseline.
score = mean_absolute_error(y_val, blender1_result)
print('Blending1 MAE: %.3f' % score)

Blending1 MAE: 30759.674


In [121]:
blender2_result = model.predict2(X_val)

In [131]:
score = mean_absolute_error(y_val, blender2_result)
print('Blending2 MAE: %.3f' % score)

Blending2 MAE: 31244.386


In [123]:
blender3_result = model.predict3(X_val)

In [132]:
score = mean_absolute_error(y_val, blender3_result)
print('Blending3 MAE: %.3f' % score)

Blending3 MAE: 30363.034


## [Problem 2] Implementing bagging from scratch

In [241]:
import random
class BaggingRegressor:
    """Bagging ensemble of ``LinearRegression`` models.

    Every model is trained on its own bootstrap sample (rows drawn with
    replacement) and the ensemble prediction is the plain average of the
    individual predictions.

    Parameters
    ----------
    number_model : int, default 200
        Number of models in the ensemble.
    sample_size : float, default 0.5
        Size of each bootstrap sample as a fraction of the training rows.
    random_state : int or None, default None
        Seed for a private random generator, making the bootstrap samples
        reproducible. With ``None`` the module-level ``random`` generator is
        used, which can still be seeded externally with ``random.seed``.

    Raises
    ------
    ValueError
        If ``number_model`` is smaller than 1 or ``sample_size`` is not positive.
    """

    def __init__(self, number_model=200, sample_size=0.5, random_state=None):
        if number_model < 1:
            raise ValueError("number_model must be at least 1")
        if sample_size <= 0:
            raise ValueError("sample_size must be positive")
        self.models = [LinearRegression() for _ in range(number_model)]
        self.sample_size = sample_size
        self.random_state = random_state
        self._rng = random if random_state is None else random.Random(random_state)

    def fit(self, X, y):
        """Fit every model on its own bootstrap sample of ``(X, y)``.

        Raises ``ValueError`` when ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of rows")
        for model in self.models:
            sample_X, sample_y = self.subsample(X, y)
            model.fit(sample_X, sample_y)

    def predict(self, X):
        """Return the average of all models' predictions for ``X``."""
        # Models are fitted on plain arrays, so predict on plain arrays too.
        X = np.asarray(X)
        total = 0
        for model in self.models:
            total = total + model.predict(X)
        return total / len(self.models)

    def subsample(self, X, y):
        """Draw a bootstrap sample (with replacement) of matching rows of X and y.

        The sample holds ``round(len(X) * sample_size)`` rows. Returns the
        pair ``(sample_X, sample_y)`` as NumPy arrays.
        """
        n_rows = len(X)
        n_sample = round(n_rows * self.sample_size)
        # Fall back to the module-level generator for instances that were
        # created without running __init__.
        rng = getattr(self, "_rng", random)
        picks = np.asarray([rng.randrange(n_rows) for _ in range(n_sample)], dtype=int)
        return np.asarray(X)[picks], np.asarray(y)[picks]
    
    


In [242]:
# Defaults: 200 linear models, each trained on a bootstrap sample half the size of the training set.
model = BaggingRegressor()

In [243]:
# Each of the models draws its own random subsample of the training rows.
model.fit(X_train,y_train)

In [244]:
base = LinearRegression()

In [245]:
base.fit(X_train,y_train)

LinearRegression()

In [246]:
base_result = base.predict(X_val)

In [247]:
score = mean_absolute_error(y_val, base_result)
print('Base Linear MAE: %.3f' % score)

Base Linear MAE: 32358.771


In [248]:
# Ensemble prediction = average of the individual models' predictions.
bagging_result = model.predict(X_val)

In [249]:
score = mean_absolute_error(y_val, bagging_result)
print('Bagging MAE: %.3f' % score)

Bagging MAE: 32308.624


## [Problem 3] Implementing stacking from scratch

In [405]:
import random
from sklearn.model_selection import KFold 
class StackingRegressor:
    """Multi-stage stacking ensemble built on K-fold out-of-fold predictions.

    Every stage owns one set of base models (linear regression, k-NN,
    decision tree, SVR) per fold. During ``fit`` each stage appends the
    out-of-fold predictions of its models as new feature columns, so later
    stages see the original features plus all earlier predictions. A final
    ``LinearRegression`` is fitted on the fully widened feature matrix.

    Parameters
    ----------
    folds : int
        Number of K-fold splits (passed to ``KFold(n_splits=...)``).
    stages : int
        Number of stacking stages applied one after another.
    """

    def __init__(self, folds, stages):
        self.n_folds = folds
        self.stages = stages
        self.kf = KFold(n_splits=self.n_folds)
        # models[str(stage)][str(fold)] -> list of (name, estimator) pairs.
        self.models = {
            str(stage): {
                str(fold): self._new_base_models() for fold in range(self.n_folds)
            }
            for stage in range(self.stages)
        }
        self.estimator = LinearRegression()

    @staticmethod
    def _new_base_models():
        """Create a fresh, unfitted set of the four base models."""
        return [
            ('lr', LinearRegression()),
            ('knn', KNeighborsRegressor()),
            ('cart', DecisionTreeRegressor()),
            ('svm', SVR()),
        ]

    @staticmethod
    def _stack_predictions(models, features):
        """Predict with every model and return one prediction column per model."""
        columns = [np.asarray(model.predict(features)).reshape(-1, 1) for _, model in models]
        return np.concatenate(columns, axis=1)

    def fit(self, X, y):
        """Run all stacking stages on ``(X, y)`` and fit the final estimator.

        ``y`` may be 1-D or a single column; it must hold exactly one target
        value per row of ``X``, otherwise ``ValueError`` is raised.
        """
        X = np.asarray(X)
        # Force a column so the target can ride along as the last dataset column.
        y = np.asarray(y).reshape(-1, 1)
        if len(X) != len(y):
            raise ValueError("y must hold exactly one target value per row of X")

        dataset = np.concatenate([X, y], axis=1)
        for stage in range(self.stages):
            dataset = self.training_algorithm(dataset, stage)

        self.estimator.fit(dataset[:, :-1], dataset[:, -1])

    def predict(self, X):
        """Widen ``X`` stage by stage, then predict with the final estimator."""
        dataset = np.asarray(X)
        for stage in range(self.stages):
            dataset = self.predict_algorithm(dataset, stage)
        return self.estimator.predict(dataset)

    def training_algorithm(self, dataset, stage):
        """Fit one stage and return the dataset widened by its out-of-fold predictions.

        ``dataset`` holds the features in all but the last column and the
        target in the last column. The result has the layout
        ``[features | out-of-fold predictions | target]``.
        """
        features, target = dataset[:, :-1], dataset[:, -1]
        stage_models = self.models[str(stage)]
        oof = None
        for fold, (train_index, test_index) in enumerate(self.kf.split(dataset)):
            fold_models = stage_models[str(fold)]
            for _, model in fold_models:
                model.fit(features[train_index], target[train_index])
            fold_pred = self._stack_predictions(fold_models, features[test_index])
            if oof is None:
                # NaN marks rows that no fold has predicted yet.
                oof = np.full((len(dataset), fold_pred.shape[1]), np.nan)
            oof[test_index] = fold_pred

        return np.concatenate([features, oof, target.reshape(-1, 1)], axis=1)

    def predict_algorithm(self, dataset, stage):
        """Append one stage's predictions, averaged over the per-fold model sets."""
        stage_models = self.models[str(stage)]
        total = None
        for fold in range(self.n_folds):
            fold_pred = self._stack_predictions(stage_models[str(fold)], dataset)
            total = fold_pred if total is None else total + fold_pred

        return np.concatenate([dataset, total / self.n_folds], axis=1)
                                          
                                        
                                          
          


In [406]:
# Positional arguments are (folds, stages): 3-fold splits, 10 stacking stages.
model = StackingRegressor(3,10)

In [407]:
# fit() joins X and y column-wise, so y_train must be a single-column 2-D object here.
model.fit(X_train,y_train)

In [408]:
# Validation features are widened stage by stage before the final estimator predicts.
stacking_results = model.predict(X_val)

In [409]:
base = LinearRegression()

In [410]:
base.fit(X_train,y_train)

LinearRegression()

In [411]:
base_result = base.predict(X_val)

In [412]:
score = mean_absolute_error(y_val, base_result)
print('Base Linear MAE: %.3f' % score)

Base Linear MAE: 32358.771


In [414]:
# Validation MAE of the stacking ensemble, to compare with the linear baseline above.
score = mean_absolute_error(y_val, stacking_results)
print('Stacking MAE: %.3f' % score)

Stacking MAE: 29112.708
