In [1]:
# evaluate model performance with outliers removed using isolation forest
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing CSV directly from its public GitHub location.
# header=None tells pandas the first line is data, not column names, so the
# columns are labelled with integers.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(filepath_or_buffer=url, header=None)

In [5]:
# Echo the loaded frame as the cell output to eyeball its columns and values.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [3]:
# Retrieve the underlying NumPy array from the DataFrame.
# to_numpy() is the accessor pandas recommends in place of the older `.values`
# attribute; for this numeric frame it yields the same 2-D array.
data = df.to_numpy()

In [4]:
# Echo the extracted array to confirm the conversion from the DataFrame.
data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 3.9690e+02, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 3.9283e+02, 4.0300e+00,
        3.4700e+01],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

In [6]:
# Separate inputs from the output: every column except the last is a model
# input, and the last column is the value to predict.
X = data[:, :-1]
y = data[:, -1]

In [8]:
# Echo the target vector (the last column of `data`).
y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [9]:
# Hold out 33% of the rows as a test set; random_state pins the shuffle so the
# same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [10]:
# summarize the shape of the training dataset
# (inputs first, then targets; the leading dimension of both is the number of
# training rows and must agree)
print(X_train.shape, y_train.shape)

(339, 13) (339,)


In [11]:
# identify outliers in the training dataset
# contamination=0.1 asks the forest to flag roughly 10% of the training rows.
# random_state is pinned because IsolationForest is randomized: without a seed
# a different set of rows is flagged on every run, so the filtered training
# set and the final MAE are not reproducible.
iso = IsolationForest(contamination=0.1, random_state=1)
# fit_predict returns one label per row: -1 for an outlier, 1 for an inlier.
yhat = iso.fit_predict(X_train)

In [14]:
# Sanity check: there should be exactly one outlier label per training row.
print(yhat.shape)

(339,)


In [15]:
# select all rows that are not outliers
# (yhat holds the IsolationForest labels, where -1 marks an outlier, so the
# mask is True for every row that should be kept)
mask = yhat != -1

In [16]:
# Echo the boolean keep-mask; False entries are the rows flagged as outliers.
mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [17]:
# Keep only the rows the mask marks as inliers, in both inputs and targets.
# NOTE: this rebinds X_train / y_train, so the cell is not idempotent. `mask`
# was built for the unfiltered split, so re-running this cell without first
# re-running the train/test split applies it to the already-filtered arrays.
X_train = X_train[mask]
y_train = y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(305, 13) (305,)


In [18]:
# Echo the training targets that remain after the outlier rows were dropped.
y_train

array([13.1, 16.2, 24.8, 20.2, 22.5, 14.8, 28.7, 20.1, 23.4, 32. , 19.1,
       50. , 20.9, 21.7, 22. , 17.2, 12.3, 21.4, 20.5, 35.2, 19.6, 22. ,
       21.7, 14.1, 21.1, 15. , 11.9, 20. , 41.3, 18.7, 50. , 18.4, 28.1,
       16.1, 17.2, 28.6, 23.6, 20.4, 19.6, 18.8, 22.6, 17.7, 30.5, 18.2,
       20.6, 24.4, 17.3, 13.3, 22.8, 20.5, 21.2, 18.8, 18.9, 18.2, 23.1,
       32.7, 24. , 10.2, 19.5, 33.1, 13.4, 15.2, 24.8, 24.3,  9.5, 24.2,
       18.5, 44. , 24.7, 21.5, 21.8, 23.8, 32.4, 24.4, 17.6, 29.8,  9.6,
       16.7, 32. , 16.1,  8.3, 26.6, 14.3, 28.4, 32.2, 17.1, 29.4, 10.4,
       31.5, 27.5, 46.7, 27.5, 17.2, 23.4, 31.6, 13.8, 22. , 24.8, 24.3,
       25.2, 21.2, 20.6, 18.7,  5.6, 19.3, 19.8, 22.3, 20.3, 12. , 23.9,
       16.5, 13.2, 33.2, 10.5,  7.5, 27.5, 18.4, 23.2, 23. , 25. ,  7.2,
       14.4, 13.1, 18.9, 25. , 16.1, 29. , 23.1, 19.3, 33.1, 24.6, 23. ,
       15.2, 27.1, 19.6, 24.5, 20.3, 34.9, 17.1, 15.6, 26.4, 22.6, 15.6,
       21.2, 22.4, 13.5, 11.7, 17.1, 31.7, 28.7, 24

In [20]:
# Echo the training inputs that remain after the outlier rows were dropped.
X_train

array([[2.36482e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.96900e+02, 2.36900e+01],
       [2.59150e-01, 0.00000e+00, 2.18900e+01, ..., 2.12000e+01,
        3.92110e+02, 1.71900e+01],
       [3.65900e-02, 2.50000e+01, 4.86000e+00, ..., 1.90000e+01,
        3.96900e+02, 6.72000e+00],
       ...,
       [5.87205e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.96900e+02, 1.93700e+01],
       [3.30450e-01, 0.00000e+00, 6.20000e+00, ..., 1.74000e+01,
        3.76750e+02, 1.08800e+01],
       [8.01400e-02, 0.00000e+00, 5.96000e+00, ..., 1.92000e+01,
        3.96900e+02, 8.77000e+00]])

In [21]:
# fit the model
# An ordinary least-squares linear regression is trained on the filtered
# training rows only; the test set is left untouched.
model = LinearRegression()
# fit() is the cell's last expression, so its return value (the fitted
# estimator) is what the notebook echoes below.
model.fit(X_train, y_train)

LinearRegression()

In [22]:
# predict targets for the held-out test inputs (scoring happens in the next cell)
# NOTE: this rebinds `yhat`, which earlier held the IsolationForest outlier
# labels. Re-running the mask cell after this one would build the mask from
# regression predictions instead of from outlier labels.
yhat = model.predict(X_test)

In [23]:
# Score the predictions: mean absolute error between the held-out targets and
# the model's predictions, reported to three decimal places.
mae = mean_absolute_error(y_true=y_test, y_pred=yhat)
print('MAE: %.3f' % mae)

MAE: 3.285
