In [103]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import sklearn
import mglearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib notebook

In [104]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

X, y = mglearn.datasets.make_wave(n_samples=100)
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

In [105]:
tree_reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
linear_reg = LinearRegression().fit(X, y)


In [106]:
X.shape, y.shape

((100, 1), (100,))

In [107]:
fig, ax = plt.subplots()
ax.plot(line, tree_reg.predict(line), label="decision tree")
ax.plot(line, linear_reg.predict(line), label="linear regression")
ax.plot(X[:, 0], y, 'o', c='k')
ax.set_ylabel("Regression output")
ax.set_xlabel("Input feature")
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fb45a2b3f60>

In [108]:
bins = np.linspace(-3, 3, 11)
bins

array([-3. , -2.4, -1.8, -1.2, -0.6,  0. ,  0.6,  1.2,  1.8,  2.4,  3. ])

In [109]:
which_bin = np.digitize(X, bins=bins)

In [110]:
which_bin

array([[ 4],
       [10],
       [ 8],
       [ 6],
       [ 2],
       [ 2],
       [ 1],
       [ 9],
       [ 7],
       [ 8],
       [ 1],
       [10],
       [ 9],
       [ 3],
       [ 2],
       [ 2],
       [ 4],
       [ 6],
       [ 5],
       [ 3],
       [ 7],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 8],
       [ 2],
       [ 6],
       [ 6],
       [ 1],
       [ 7],
       [ 2],
       [ 1],
       [10],
       [10],
       [ 9],
       [ 4],
       [ 1],
       [ 7],
       [ 5],
       [ 2],
       [ 5],
       [ 1],
       [10],
       [ 3],
       [ 7],
       [ 4],
       [ 6],
       [ 6],
       [ 2],
       [10],
       [ 8],
       [10],
       [ 9],
       [ 6],
       [10],
       [ 1],
       [ 2],
       [ 1],
       [ 4],
       [ 4],
       [ 3],
       [ 9],
       [ 4],
       [ 3],
       [ 6],
       [ 2],
       [ 9],
       [ 1],
       [10],
       [ 8],
       [ 2],
       [ 1],
       [ 9],
       [ 8],
       [ 8],
       [ 8],

In [111]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
encoder.fit(which_bin)
X_binned = encoder.transform(which_bin)

In [112]:
X_binned

array([[ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.

In [113]:
line_binned = encoder.transform(np.digitize(line, bins=bins))

In [114]:
fig, ax = plt.subplots()
reg = LinearRegression().fit(X_binned, y)
ax.plot(line, reg.predict(line_binned), label="Linear regression binned")
reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
ax.plot(line, reg.predict(line_binned), label="Decision tree binned")
ax.plot(X[:, 0], y, "o", c="k")
ax.vlines(bins, -3, 3, linewidth=1, alpha=.2)
ax.legend()
ax.set_xlabel("Input feature")
ax.set_ylabel("Regression output")


<IPython.core.display.Javascript object>

Text(0,0.5,'Regression output')

In [115]:
mglearn.discrete_scatter(X, y)

[<matplotlib.lines.Line2D at 0x7fb46a82c1d0>]

In [116]:
X,y


(array([[-0.75275929],
        [ 2.70428584],
        [ 1.39196365],
        [ 0.59195091],
        [-2.06388816],
        [-2.06403288],
        [-2.65149833],
        [ 2.19705687],
        [ 0.60669007],
        [ 1.24843547],
        [-2.87649303],
        [ 2.81945911],
        [ 1.99465584],
        [-1.72596534],
        [-1.9090502 ],
        [-1.89957294],
        [-1.17454654],
        [ 0.14853859],
        [-0.40832989],
        [-1.25262516],
        [ 0.67111737],
        [-2.16303684],
        [-1.24713211],
        [-0.80182894],
        [-0.26358009],
        [ 1.71105577],
        [-1.80195731],
        [ 0.08540663],
        [ 0.55448741],
        [-2.72129752],
        [ 0.64526911],
        [-1.97685526],
        [-2.60969044],
        [ 2.69331322],
        [ 2.7937922 ],
        [ 1.85038409],
        [-1.17231738],
        [-2.41396732],
        [ 1.10539816],
        [-0.35908504],
        [-2.26777059],
        [-0.02893854],
        [-2.79366887],
        [ 2

In [117]:
X_combined = np.hstack([X, X_binned])
X_combined.shape

(100, 11)

In [118]:
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line, line_binned])
fig, ax = plt.subplots()
ax.plot(line, reg.predict(line_combined))
#ax.plot(X[:, 0], y)
ax.plot(X[:, 0], y, "o", c="k")


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb43883cf60>]

In [119]:
X_combined = np.hstack([X_binned, X * X_binned])
X_combined[0:10]

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.        , -0.        , -0.        , -0.75275929, -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  2.70428584],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.39196365,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  

In [120]:
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line_binned, line * line_binned])
fig, ax = plt.subplots()
ax.plot(line, reg.predict(line_combined))
#ax.plot(X[:, 0], y)
ax.plot(X[:, 0], y, "o", c="k")


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb46b103550>]