### Putting all 4 dragon balls together and summoning the divine dragon: blending the four base models

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.kernel_ridge import KernelRidge

import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR

# We will write customized regressor classes which inherit from the following sklearn base classes.
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

import pickle

from dictionaries import *;
from RegressorEncapsulation import *;

pd.set_option('display.max_columns', 500)

#### Preparation

In [2]:
# Read the housing table; the first CSV column is used as the index.
housing_coords = pd.read_csv('../data/ames_housing_price_data_v6.csv', index_col=0)
# Discard the "SalePrice.1" column when the file has one (presumably a duplicated
# target left over from an earlier export — confirm); a no-op otherwise.
housing_coords = housing_coords.drop(columns="SalePrice.1", errors="ignore")

# Regression target: the sale price exactly as stored in the file.
y = housing_coords["SalePrice"]

# "Front end" frame passed to the models: the same table with the index moved
# into an ordinary column.
# NOTE(review): front_end still contains the SalePrice column; presumably
# front_to_back removes it before fitting — verify.
front_end = housing_coords.reset_index()

In [32]:
# Loading files and steal the parameters.
cbl = CatBoostRegressor(subsample= 0.85, depth= 2, random_seed= 0, learning_rate= 0.04, iterations= 4000, verbose=False);
lm = Lasso(alpha=1e-06, copy_X=True, fit_intercept= True, max_iter= 1000, normalize= True, positive= False, precompute= False, selection= 'cyclic', tol= 0.001);
svrg = SVR(C= 6000, epsilon = 0.1, gamma = 6e-5, max_iter=-1, shrinking=True);

# svrl = SVR(C= 100, cache_size=200, coef0 = 0.0, epsilon = 0.1, kernel = "linear", max_iter=-1, shrinking=True);
svrl = KernelRidge(alpha=0.005, coef0 = 0.0, kernel = "linear");

In [None]:
# with open('../Matt/linearmodel.pickle', mode = 'rb') as file:
#     lm = pickle.load(file);

# with open('../Matt/SVR_model.pickle', mode = 'rb') as file:
#     svrl = pickle.load(file);

# with open('SVR_model_g.pickle', mode = 'rb') as file:
#     svrg = pickle.load(file);


In [23]:
# Linear-kernel SVR variant of `svrl`.
# NOTE(review): on a top-to-bottom run this assignment replaces the KernelRidge
# `svrl` defined in the earlier cell, whereas the recorded ensemble output
# further down shows KernelRidge — the saved results depend on execution order.
svrl = SVR(
    C=100,
    cache_size=200,
    coef0=0.0,
    epsilon=0.1,
    kernel="linear",
    max_iter=-1,
    shrinking=True,
)
# svrl = KernelRidge(alpha=0.005, coef0 = 0.0, kernel = "linear");

In [24]:
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class EncapsulatedModel(BaseEstimator, RegressorMixin, TransformerMixin):
    """Bundle one regressor with the preprocessing it needs.

    The wrapper accepts the raw "front end" data, converts it with the project
    helpers ``front_to_back`` / ``predictor_processing`` (star-imported; their
    exact transformations are not visible in this notebook), and delegates the
    actual learning to ``instance``.

    Parameters
    ----------
    method : str, default "cat"
        Tag forwarded to the preprocessing helpers; the notebook uses
        "cat", "lm", "svrl" and "svrg".
    instance : estimator or None, default None
        Regressor to wrap. ``None`` creates a fresh ``CatBoostRegressor()``
        for this wrapper.
    """

    def __init__(self, method="cat", instance=None):
        self.method = method
        # A `CatBoostRegressor()` default in the signature would be built once
        # at class-definition time and shared (and re-fitted) by every wrapper
        # created with defaults; build it per wrapper instead.
        self.instance = CatBoostRegressor() if instance is None else instance
        # Scalers handed to the helpers for the features and the target.
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        # Set to True by fit(); not read anywhere inside this class.
        self.fitted = False

    def fit(self, X, y):
        """Preprocess ``X`` and ``y`` for ``self.method`` and fit the wrapped regressor.

        Returns ``self`` so calls can be chained, as scikit-learn expects.
        """
        # NOTE(review): the trailing True presumably tells front_to_back to fit
        # x_scaler on this data — confirm against the helper.
        back_end = front_to_back(X, self.method, self.x_scaler, True)
        y_proc = predictor_processing(y, self.method, self.y_scaler)
        self.instance.fit(back_end, y_proc)
        self.fitted = True
        return self

    def predict(self, X):
        """Return predictions for front-end data ``X`` via ``predict_from_front``.

        Both scalers are passed along, presumably so the helper can map the
        model output back to the original price scale — confirm.
        """
        return predict_from_front(X, self.method, self.instance, self.x_scaler, self.y_scaler)


In [69]:
# Scratch check of the list arithmetic used for AveragingModels' default weight:
# a leading 1 followed by zeros.
[1] + [0]*3

[1, 0, 0, 0]

In [56]:
# Two toy "prediction" vectors, placed side by side as the columns of a 3x2 matrix.
a = np.arange(1, 4)
b = a + 1
c = np.stack((a, b), axis=1)

In [68]:
# Scratch check: multiplying the column-stacked matrix by a weight vector gives
# one weighted sum per row — the same matmul form AveragingModels.predict uses.
# (The bare `c` is not displayed; only the last expression of a cell is.)
c
np.matmul(c, np.array([1,3]))

array([ 7, 11, 15])

In [90]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Weighted linear blend of the predictions of several regressors.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors; ``fit`` trains clones of them and leaves the
        originals untouched.
    weight : sequence of float or None, default None
        One coefficient per model, applied to that model's predictions.
        ``None`` (or an empty sequence) selects ``[1, 0, ..., 0]``, i.e. only
        the first model contributes. Weights are used as given and are not
        normalised.
    """

    def __init__(self, models, weight=None):
        self.models = models
        # Explicit None/empty test: `not weight` raises ValueError for numpy
        # arrays with more than one element.
        if weight is None or len(weight) == 0:
            self.weight = [1] + [0] * (len(models) - 1)
        else:
            self.weight = weight

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If the number of weights differs from the number of models.
        """
        # Fail before the expensive training instead of inside predict().
        if len(self.weight) != len(self.models):
            raise ValueError(
                "weight has %d entries but %d models were given"
                % (len(self.weight), len(self.models))
            )
        # Echo the active weights; in the grid-search cell this shows the
        # weights used for the final refit.
        print(self.weight)
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the fitted models' predictions for ``X``."""
        # One column per model, then a matrix-vector product with the weights.
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.matmul(predictions, np.array(self.weight))

In [25]:
# Wrap the `svrl` regressor on its own, using the "svrl" preprocessing method.
x = EncapsulatedModel("svrl", svrl);

In [26]:
x.fit(front_end, y)

EncapsulatedModel(instance=SVR(C=100, kernel='linear'), method='svrl')

In [27]:
y_pred = x.predict(front_end)

In [28]:
# Scored on the same rows the model was fitted on, so this is an in-sample
# figure (the score inherited from RegressorMixin is R^2).
x.score(front_end, y) # Kernel ridge: 0.9391

0.9364478042572866

In [29]:
X_train, X_test, y_train, y_test = train_test_split(front_end, y, test_size=0.3, random_state=1);

In [30]:
# Refit on the training split only, then score on the held-out 30% split.
x.fit(X_train, y_train);
x.score(X_test, y_test) # Kernel ridge: 0.9319

0.9322521550355579

In [73]:
# Equal-weight blend of the four wrapped base learners, fitted on the full data set.
averaged_models = AveragingModels(
    models=(
        EncapsulatedModel("cat", cbl),
        EncapsulatedModel("lm", lm),
        EncapsulatedModel("svrl", svrl),
        EncapsulatedModel("svrg", svrg),
    ),
    weight=[0.25] * 4,
)
averaged_models.fit(front_end, y)

[0.25, 0.25, 0.25, 0.25]


AveragingModels(models=(EncapsulatedModel(instance=<catboost.core.CatBoostRegressor object at 0x16e691490>),
                        EncapsulatedModel(instance=Lasso(alpha=1e-06,
                                                         normalize=True,
                                                         tol=0.001),
                                          method='lm'),
                        EncapsulatedModel(instance=KernelRidge(alpha=0.005,
                                                               coef0=0.0),
                                          method='svrl'),
                        EncapsulatedModel(instance=SVR(C=6000, gamma=6e-05),
                                          method='svrg')),
                weight=[0.25, 0.25, 0.25, 0.25])

In [74]:
# In-sample score: evaluated on the same data the ensemble was just fitted on.
averaged_models.score(front_end, y)

0.9619382830879044

In [75]:
X_train, X_test, y_train, y_test = train_test_split(front_end, y, test_size=0.3, random_state=9);

In [76]:
# Time a refit of the ensemble on the training split, then time scoring it on
# the held-out split.
%time averaged_models.fit(X_train, y_train);
%time averaged_models.score(X_test, y_test) # 0.9432789901860964

CPU times: user 9.4 s, sys: 3.24 s, total: 12.6 s
Wall time: 3.45 s
CPU times: user 1.82 s, sys: 187 ms, total: 2.01 s
Wall time: 356 ms


0.9432789901860964

### Applying GridSearchCV to find the best blending weights

In [101]:
from itertools import combinations

# Grid resolution: every weight is a multiple of 1/BINS.
BINS = 20

# Stars-and-bars enumeration. Choosing 3 "bar" positions among BINS + 3 slots
# splits the remaining BINS "stars" into four non-negative counts; dividing by
# BINS gives every 4-component weight vector on the grid that sums to 1.
# That is C(BINS + 3, 3) vectors, i.e. 1771 for BINS = 20.
temp = list(range(BINS + 3))
weight_list = [
    [
        first / BINS,
        (second - first - 1) / BINS,
        (third - second - 1) / BINS,
        (2 + BINS - third) / BINS,
    ]
    for first, second, third in combinations(temp, 3)
]


[0.0, 0.0, 0.0, 1.0]
[0.0, 1.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.5, 0.5]
[0.0, 0.0, 1.0, 0.0]
[0.0, 0.25, 0.0, 0.75]
[0.0, 0.25, 0.5, 0.25]
[0.0, 0.5, 0.0, 0.5]
[0.0, 0.5, 0.25, 0.25]
[0.0, 0.75, 0.0, 0.25]
[0.0, 1.0, 0.0, 0.0]
[0.25, 0.0, 0.0, 0.75]
[0.25, 0.0, 0.5, 0.25]
[0.25, 0.25, 0.0, 0.5]
[0.25, 0.25, 0.25, 0.25]
[0.25, 0.25, 0.5, 0.0]
[0.25, 0.5, 0.25, 0.0]
[0.5, 0.0, 0.0, 0.5]
[0.5, 0.0, 0.25, 0.25]
[0.5, 0.25, 0.0, 0.25]
[0.5, 0.25, 0.25, 0.0]
[0.75, 0.0, 0.0, 0.25]
[0.75, 0.25, 0.0, 0.0]
[0.0, 0.0, 1.0, 0.0]
[1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.25, 0.75]
[0.0, 0.0, 0.5, 0.5]
[0.0, 0.0, 0.75, 0.25]
[0.0, 0.0, 1.0, 0.0]
[0.0, 0.25, 0.25, 0.5]
[0.0, 0.25, 0.75, 0.0]
[0.0, 0.5, 0.0, 0.5]
[0.0, 0.5, 0.5, 0.0]
[0.0, 0.75, 0.0, 0.25]
[0.0, 1.0, 0.0, 0.0]
[0.25, 0.0, 0.25, 0.5]
[0.25, 0.0, 0.5, 0.25]
[0.25, 0.25, 0.0, 0.5]
[0.25, 0.25, 0.25, 0.25]
[0.25, 0.5, 0.0, 0.25]
[0.25, 0.75, 0.0, 0.0]
[0.5, 0.0, 0.0, 0.5]
[0.5, 0.0, 0.5, 0.0]
[0.5, 0.25, 0.0, 0.2

In [98]:
kfold = KFold(n_splits=5, shuffle = True);
params = {'weight': weight_list};
wavg = AveragingModels(models = (EncapsulatedModel("cat", cbl), EncapsulatedModel("lm", lm),\
                                           EncapsulatedModel("svrl", svrl), EncapsulatedModel("svrg", svrg)));
wavg_tuner = GridSearchCV(wavg, params, cv=kfold, return_train_score = True, n_jobs = -1)
%time wavg_tuner.fit(front_end, y)


[0.25, 0.0, 0.0, 0.75]
CPU times: user 15 s, sys: 5.25 s, total: 20.2 s
Wall time: 4min 39s


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             estimator=AveragingModels(models=(EncapsulatedModel(instance=<catboost.core.CatBoostRegressor object at 0x16e691490>),
                                               EncapsulatedModel(instance=Lasso(alpha=1e-06,
                                                                                normalize=True,
                                                                                tol=0.001),
                                                                 method='lm'),
                                               EncapsulatedModel(instance=KernelRidge(alpha=0.005,
                                                                                      coef0=0.0),
                                                                 method='svrl'),
                                               Encapsula...
                                    [0.0, 0.75, 0.25, 0.0],
                                    [0.0, 

In [99]:
wavg_tuner.cv_results_

{'mean_fit_time': array([ 8.38133984,  8.789922  ,  9.3198369 ,  9.57622104,  9.70060158,
         9.87131095, 10.01708851, 10.13733077, 10.22488747, 10.31730628,
        10.57238808, 10.23775058, 10.24682584, 10.35411496, 10.17739801,
        10.28980875, 10.26927161, 10.35892138, 10.33082938, 10.67098694,
        10.84643402, 10.68781552, 10.77028394, 10.77516437, 10.36079597,
        10.59023819, 10.61912374, 10.37661037, 10.51943655, 10.57576876,
        10.45006042, 10.75243893, 10.47809024, 10.52670865,  8.42346668]),
 'std_fit_time': array([0.12802281, 0.55608544, 0.11845347, 0.23128803, 0.16617674,
        0.15157663, 0.29348415, 0.19045585, 0.1831248 , 0.16849196,
        0.16182793, 0.14555943, 0.11703887, 0.14306474, 0.06280567,
        0.08011212, 0.33425152, 0.19497848, 0.22478557, 0.50392973,
        0.45493117, 0.27703711, 0.20070245, 0.13436062, 0.42149756,
        0.22267414, 0.20863985, 0.22314149, 0.17936156, 0.32185996,
        0.13729191, 0.14306306, 0.28474413, 0.

In [100]:
wavg_tuner.best_score_

0.952512149593192