In [100]:
# evaluate model performance with outliers removed using isolation forest

from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

In [101]:
# Load the dataset from a local CSV whose first row holds the column names.
# (An earlier version of this cell read the header-less housing dataset from
# https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv.)
DATA_PATH = 'input-data.csv'
df = read_csv(DATA_PATH)

In [102]:
# display the loaded DataFrame to eyeball column names and value ranges
df

Unnamed: 0,Blaine,Co-Tfe%,Co-FeO%,Co-SiO2%,Co-CaO%,Co-MgO%,Co-AL2O3%,Co-P%,Co-MnO%,Co-TiO2%
0,1823.333333,67.460000,22.820000,3.029500,0.638000,0.426000,0.497000,0.091000,0.0310,0.222000
1,1837.333333,67.536667,22.656667,2.968333,0.673600,0.442567,0.492200,0.100900,0.0285,0.223267
2,1712.333333,67.650000,22.663333,2.838333,0.604900,0.392067,0.449533,0.088167,0.0272,0.270367
3,1794.333333,67.653333,22.506667,2.899667,0.622367,0.430867,0.485933,0.094567,0.0293,0.245033
4,1807.666667,67.490000,23.446667,3.013000,0.642667,0.401000,0.443333,0.094333,0.0310,0.213000
...,...,...,...,...,...,...,...,...,...,...
165,1737.333333,67.553333,23.476667,2.949000,0.659333,0.393667,0.559000,0.108000,0.0250,0.239333
166,1715.000000,67.486667,23.556667,3.085333,0.655667,0.448333,0.607000,0.104667,0.0270,0.214667
167,1707.000000,67.666667,22.693333,3.148000,0.632000,0.387000,0.604000,0.102000,0.0270,0.243000
168,1730.000000,67.930000,22.986667,2.900500,0.590500,0.381000,0.531500,0.096500,0.0255,0.242500


In [103]:
# convert the frame to a plain NumPy array (rows x columns, column labels dropped)
data = df.to_numpy()

In [104]:
# display the NumPy array extracted from the DataFrame
data

array([[1.82333333e+03, 6.74600000e+01, 2.28200000e+01, ...,
        9.10000000e-02, 3.10000000e-02, 2.22000000e-01],
       [1.83733333e+03, 6.75366667e+01, 2.26566667e+01, ...,
        1.00900000e-01, 2.85000000e-02, 2.23266667e-01],
       [1.71233333e+03, 6.76500000e+01, 2.26633333e+01, ...,
        8.81666670e-02, 2.72000000e-02, 2.70366667e-01],
       ...,
       [1.70700000e+03, 6.76666667e+01, 2.26933333e+01, ...,
        1.02000000e-01, 2.70000000e-02, 2.43000000e-01],
       [1.73000000e+03, 6.79300000e+01, 2.29866667e+01, ...,
        9.65000000e-02, 2.55000000e-02, 2.42500000e-01],
       [1.76300000e+03, 6.77350000e+01, 2.26100000e+01, ...,
        1.04000000e-01, 2.50000000e-02, 2.34000000e-01]])

In [105]:
# report (rows, columns) of the array before any rows are removed
shape_before = data.shape
print(shape_before)

(170, 10)


In [106]:
# Flag outliers with an isolation forest. fit_predict labels every row
# -1 (outlier) or +1 (inlier); contamination=0.1 sets the expected share
# of outliers to 10% of the rows.
# random_state pins the forest's random sub-sampling so the set of flagged
# rows, and every number computed downstream, is reproducible on re-run.
# NOTE(review): the forest is fit on the full array -- every row, and the
# last column that later becomes the target `y` -- before the train/test
# split. The held-out rows are therefore filtered as well, so the reported
# MAE is measured on an already-cleaned test set. Consider splitting first
# and fitting the forest on the training features only.
iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(data)

In [107]:
# one outlier label per input row is expected
labels_shape = yhat.shape
print(labels_shape)

(170,)


In [108]:
# boolean mask: True for every row whose label is anything other than -1 (outlier)
is_outlier = yhat == -1
mask = ~is_outlier

In [110]:
# the mask must have one entry per row of the array it will index
mask_shape = mask.shape
print(mask_shape)

(170,)


In [111]:
# Keep only the rows marked as inliers.
# Index the untouched frame values rather than `data` itself: filtering
# `data` with its own previous value shrinks it on every run, so re-running
# the cell would apply the full-length mask to the already-filtered array
# and raise an IndexError. Reading from `df` keeps this cell idempotent.
data = df.values[mask, :]

In [112]:
# report (rows, columns) of the array after the outlier rows were dropped
shape_after = data.shape
print(shape_after)

(153, 10)


In [113]:
# separate predictors from the target: the last column is the value to
# predict, every column before it is an input feature
X = data[:, :-1]
y = data[:, -1]

In [114]:
# display the target vector (the last column of `data`)
y

array([0.222     , 0.22326667, 0.27036667, 0.24503333, 0.213     ,
       0.209     , 0.29735   , 0.2595    , 0.209     , 0.219     ,
       0.27785   , 0.295     , 0.25      , 0.274     , 0.3147    ,
       0.2525    , 0.207     , 0.2365    , 0.3429    , 0.2535    ,
       0.2255    , 0.261     , 0.288     , 0.25333333, 0.2355    ,
       0.232     , 0.3655    , 0.2565    , 0.3065    , 0.29      ,
       0.2635    , 0.328     , 0.30945   , 0.342     , 0.2735    ,
       0.249     , 0.32205   , 0.352     , 0.318     , 0.2775    ,
       0.2988    , 0.343     , 0.285     , 0.2315    , 0.2834    ,
       0.213     , 0.2255    , 0.3347    , 0.305     , 0.35335   ,
       0.3315    , 0.208     , 0.2029    , 0.212     , 0.214     ,
       0.223     , 0.2159    , 0.222     , 0.22      , 0.22795   ,
       0.2225    , 0.214     , 0.245     , 0.209     , 0.215     ,
       0.27      , 0.2085    , 0.2115    , 0.21615   , 0.2805    ,
       0.219     , 0.215     , 0.25295   , 0.214     , 0.204  

In [115]:
# hold out 33% of the rows for testing; the fixed seed keeps the partition
# identical across runs
splits = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)
X_train, X_test, y_train, y_test = splits

In [116]:
# show how many rows/features ended up in the training partition
train_X_shape = X_train.shape
train_y_shape = y_train.shape
print(train_X_shape, train_y_shape)

(102, 9) (102,)


In [117]:
# train an ordinary least-squares regressor on the training partition;
# fit() is left as the last expression so the cell echoes the estimator
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [118]:
# predict the target for the held-out rows
# NOTE: this rebinds `yhat`, which earlier held the isolation-forest
# outlier labels; from here on it holds the regression predictions
yhat = model.predict(X=X_test)

In [119]:
# score the held-out predictions with mean absolute error and report it
# to three decimal places
mae = mean_absolute_error(y_true=y_test, y_pred=yhat)
print('MAE: %.3f' % (mae,))

MAE: 0.020
