In [2]:
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

## Carga de datos

In [2]:
boston = load_boston()
df = pd.DataFrame(data = boston["data"], columns=boston["feature_names"])
df["target"] = boston["target"]
tgt = "target"
ls_features = [x for x in df.columns if x not in [tgt]]

## Preparación de datos

In [3]:
X = df[ls_features]
y = df[[tgt]]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

In [5]:
mm_x = MinMaxScaler()
mm_y = MinMaxScaler()

In [6]:
Xs = mm_x.fit_transform(X_train) 
ys = mm_y.fit_transform(y_train)

## Modelado

### Ridge Regression

In [7]:
param_grid = {
    "alpha": [x/1000 for x in range(1000)] + [x for x in range(100)],
    "solver": ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [8]:
model = Ridge()
model.fit(Xs, ys)
clf = GridSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))
print("Best estimator: " + str(clf.best_estimator_))

Fitting 4 folds for each of 7700 candidates, totalling 30800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 2696 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 10760 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 21128 tasks      | elapsed:   13.6s


Best score: 0.68302823719473
Best estimator: Ridge(alpha=0.573, solver='sag')


[Parallel(n_jobs=-1)]: Done 30713 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 30800 out of 30800 | elapsed:   18.4s finished


### Kernel Ridge Regression

In [9]:
param_grid = {"alpha": [x/100 for x in range(100)],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid', "chi2", "laplacian", "exponential"], 
              "degree": [1, 2, 3],
              "gamma": [x/10 for x in range(10)]}

In [10]:
n_hyper = np.product([x for x in map(len, param_grid.values())])

In [11]:
n_hyper

21000

In [3]:
model = KernelRidge()

In [13]:
model.fit(Xs, ys)

KernelRidge()

In [14]:
clf = RandomizedSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5, n_iter=n_hyper*.25)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 5250 candidates, totalling 21000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 3568 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 6160 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 9328 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 13072 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 17392 tasks      | elapsed:   36.4s


Best score: 0.8683511346010809


[Parallel(n_jobs=-1)]: Done 21000 out of 21000 | elapsed:   42.3s finished


In [None]:
clf.best_score_

In [None]:
clf.best_estimator_

### Pruebas del modelo ganador

In [15]:
y_test


Unnamed: 0,target
447,12.6
122,20.5
176,23.2
290,28.5
49,19.4
...,...
28,18.4
259,30.1
380,10.4
90,22.6


In [16]:
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
447,9.92485,0.0,18.10,0.0,0.740,6.251,96.6,2.1980,24.0,666.0,20.2,388.52,16.44
122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93
176,0.07022,0.0,4.05,0.0,0.510,6.020,47.2,3.5549,5.0,296.0,16.6,393.23,10.11
290,0.03502,80.0,4.95,0.0,0.411,6.861,27.9,5.1167,4.0,245.0,19.2,396.90,3.33
49,0.21977,0.0,6.91,0.0,0.448,5.602,62.0,6.0877,3.0,233.0,17.9,396.90,16.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,0.77299,0.0,8.14,0.0,0.538,6.495,94.4,4.4547,4.0,307.0,21.0,387.94,12.80
259,0.65665,20.0,3.97,0.0,0.647,6.842,100.0,2.0107,5.0,264.0,13.0,391.93,6.90
380,88.97620,0.0,18.10,0.0,0.671,6.968,91.9,1.4165,24.0,666.0,20.2,396.90,17.21
90,0.04684,0.0,3.41,0.0,0.489,6.417,66.1,3.0923,2.0,270.0,17.8,392.18,8.81


In [17]:
mm_x

MinMaxScaler()

In [18]:
X_test_sc=mm_x.transform(X_test)

In [19]:
X_test_sc

array([[1.34895002e-01, 0.00000000e+00, 6.42962963e-01, ...,
        8.08510638e-01, 9.78869333e-01, 4.05905077e-01],
       [1.17873816e-03, 0.00000000e+00, 9.22592593e-01, ...,
        6.91489362e-01, 9.52569469e-01, 4.47019868e-01],
       [8.69059286e-04, 0.00000000e+00, 1.22592593e-01, ...,
        4.25531915e-01, 9.90745877e-01, 2.31236203e-01],
       ...,
       [1.21001722e+00, 0.00000000e+00, 6.42962963e-01, ...,
        8.08510638e-01, 1.00000000e+00, 4.27152318e-01],
       [5.51084230e-04, 0.00000000e+00, 9.88888889e-02, ...,
        5.53191489e-01, 9.88098240e-01, 1.95364238e-01],
       [3.77193491e-02, 0.00000000e+00, 6.97777778e-01, ...,
        2.23404255e-01, 1.00000000e+00, 7.60485651e-01]])

In [20]:
kernel_model=clf.best_estimator_

In [21]:
X_test_sc=pd.DataFrame(X_test_sc)

In [22]:
X_test_sc[(X_test_sc<0)].isnull().sum()

0     152
1     152
2     151
3     152
4     152
5     152
6     152
7     152
8     152
9     152
10    152
11    152
12    152
dtype: int64

In [23]:
X_test.iloc[114]

CRIM         3.67367
ZN           0.00000
INDUS       18.10000
CHAS         0.00000
NOX          0.58300
RM           6.31200
AGE         51.90000
DIS          3.99170
RAD         24.00000
TAX        666.00000
PTRATIO     20.20000
B          388.62000
LSTAT       10.58000
Name: 485, dtype: float64

In [24]:
X_test_sc[X_test_sc[2]<0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
129,0.000102,0.8,-0.01037,0.0,0.076132,0.826595,0.299691,0.410916,0.130435,0.129771,0.191489,0.993267,0.034216


In [25]:
mm_x.data_max_

array([ 73.5341, 100.    ,  27.74  ,   1.    ,   0.871 ,   8.78  ,
       100.    ,  12.1265,  24.    , 711.    ,  22.    , 396.9   ,
        37.97  ])

In [75]:
mm_x.data_min_

array([6.3200e-03, 0.0000e+00, 7.4000e-01, 0.0000e+00, 3.8500e-01,
       3.5610e+00, 2.9000e+00, 1.1296e+00, 1.0000e+00, 1.8700e+02,
       1.2600e+01, 3.2000e-01, 1.7300e+00])

In [77]:
X_test.iloc[114,:]

CRIM         0.01381
ZN          80.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.42200
RM           7.87500
AGE         32.00000
DIS          5.64840
RAD          4.00000
TAX        255.00000
PTRATIO     14.40000
B          394.23000
LSTAT        2.97000
Name: 195, dtype: float64

In [63]:
(0.46000-7.4000e-01)/(27.74-7.4000e-01)

-0.01037037037037037

In [65]:
X_test_sc[X_test_sc[2]<0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
114,8.4e-05,0.842105,-0.01037,0.0,0.076132,0.826595,0.299691,0.410916,0.130435,0.129771,0.191489,0.993267,0.034216


In [67]:
X_test_sc_new=X_test_sc.applymap(lambda x: 0  if  x<0 else x)

In [68]:
kernel_model.predict(X_test_sc_new)

array([[ 0.23277031],
       [ 0.46000865],
       [ 0.53440272],
       [ 0.34426779],
       [ 0.35745767],
       [ 0.32977949],
       [ 0.47007187],
       [ 0.2602691 ],
       [ 0.03132582],
       [ 0.40997879],
       [ 0.35713275],
       [ 0.42206537],
       [ 0.61597457],
       [ 0.85979423],
       [ 0.24637931],
       [ 0.38468355],
       [ 0.35317902],
       [ 0.4243818 ],
       [ 0.21825358],
       [ 0.26498912],
       [ 0.33022241],
       [ 0.47146644],
       [ 0.07645978],
       [ 0.16067001],
       [ 0.25218106],
       [ 0.64056293],
       [ 0.30634328],
       [ 0.78575021],
       [ 0.44736699],
       [ 0.07333458],
       [ 0.30516175],
       [ 0.35843343],
       [ 0.63396949],
       [ 0.93512627],
       [ 0.58635774],
       [ 0.33443799],
       [ 0.64435179],
       [ 0.3469881 ],
       [ 0.27354879],
       [ 0.1403382 ],
       [ 0.3896295 ],
       [ 0.46387396],
       [ 0.57797312],
       [ 0.39964002],
       [ 0.26520848],
       [ 0

In [69]:
mm_y.inverse_transform(kernel_model.predict(X_test_sc_new))

array([[15.93500191],
       [26.02438393],
       [29.3274808 ],
       [20.88548997],
       [21.47112044],
       [20.24220935],
       [26.47119089],
       [17.15594814],
       [ 6.99086624],
       [23.8030581 ],
       [21.45669423],
       [24.33970258],
       [32.94927089],
       [43.77486383],
       [16.53924142],
       [22.67994951],
       [21.28114867],
       [24.44255192],
       [15.29045889],
       [17.36551674],
       [20.26187505],
       [26.53310997],
       [ 8.99481431],
       [12.73374863],
       [16.79683911],
       [34.04099395],
       [19.20164168],
       [40.48730923],
       [25.46309435],
       [ 8.8560554 ],
       [19.14918165],
       [21.51444439],
       [33.74824544],
       [47.11960652],
       [31.63428368],
       [20.44904673],
       [34.20921932],
       [21.00627182],
       [17.74556636],
       [11.83101608],
       [22.89955002],
       [26.19600402],
       [31.2620066 ],
       [23.34401684],
       [17.37525638],
       [11

In [70]:
r2_score(y_pred=mm_y.inverse_transform(kernel_model.predict(X_test_sc_new)), y_true=y_test)

0.9056857618329777

In [72]:
clf.best_estimator_.score((mm_x.transform(X_train)), mm_y.transform(y_train))

0.9722787061278378