# Scikit-learn: Isolation Forest, Minimum Covariance Determinant, Local Outlier Factor, and One-Class SVM
## Import libraries 

In [17]:
# imports: CSV loading, train/test splitting, regression model, and error metric
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error


The housing-price dataset has 506 examples, each described by 13 input features of the house; the last column is the target value to predict.

Each fitted outlier-detection model predicts which examples in the training dataset are outliers. Those examples are removed from the training dataset, then a linear regression model is fit on the remaining examples and evaluated on the entire test dataset.

In [18]:
# Read the raw CSV; header=None makes pandas treat the first line as data.
csv_file = 'housing.csv'
df = read_csv(csv_file, header=None)

# Every column except the last is an input feature; the last column is the target.
data = df.values
X = data[:, :-1]
y = data[:, -1]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Report the shapes of the full, training, and test arrays on one line.
print(*(arr.shape for arr in (X, y, X_train, X_test, y_train, y_test)))


(506, 13) (506,) (339, 13) (167, 13) (339,) (167,)


In [19]:
def runAD(yhat, X_train, y_train, X_test, y_test):
    """Drop flagged training rows, fit a linear regression, and print the test MAE.

    Parameters
    ----------
    yhat : array of per-row labels for ``X_train``; rows labelled -1 are
        treated as outliers and discarded.
    X_train, y_train : training inputs and targets, filtered with the same
        boolean keep-mask derived from ``yhat``.
    X_test, y_test : held-out inputs and targets, used unfiltered.

    Prints the shapes of the filtered training arrays followed by the mean
    absolute error on the test set. Returns nothing.
    """
    # Boolean keep-mask: True for every row that was not labelled -1.
    inliers = yhat != -1
    X_kept = X_train[inliers, :]
    y_kept = y_train[inliers]

    # Show how many training rows survived the filtering.
    print(X_kept.shape, y_kept.shape)

    # Fit on the retained rows only, then score on the untouched test set.
    regressor = LinearRegression()
    regressor.fit(X_kept, y_kept)
    predictions = regressor.predict(X_test)
    print('MAE: %.3f' % mean_absolute_error(y_test, predictions))

## Isolation Forest

In [20]:
from sklearn.ensemble import IsolationForest

# Isolation Forest builds randomized trees, so a fixed random_state is needed
# for the flagged rows (and the resulting MAE) to be the same on every run.
# contamination=0.1 asks the detector to flag roughly 10% of the training rows.
iso = IsolationForest(contamination=0.1, random_state=1)

# fit_predict labels each training row; runAD drops the rows labelled -1.
yhat = iso.fit_predict(X_train)

# Refit the regression without the flagged rows and print the test MAE.
runAD(yhat, X_train, y_train, X_test, y_test)


(305, 13) (305,)
MAE: 3.215


## Minimum Covariance Determinant

In [21]:
from sklearn.covariance import EllipticEnvelope

# The covariance fit behind EllipticEnvelope is randomized, so a fixed
# random_state is needed for the flagged rows to be the same on every run.
# contamination=0.01 asks the detector to flag roughly 1% of the training rows.
ee = EllipticEnvelope(contamination=0.01, random_state=1)

# fit_predict labels each training row; runAD drops the rows labelled -1.
yhat = ee.fit_predict(X_train)

# Refit the regression without the flagged rows and print the test MAE.
runAD(yhat, X_train, y_train, X_test, y_test)


(335, 13) (335,)
MAE: 3.388


## Local Outlier Factor

In [22]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor with the library's default settings.
lof = LocalOutlierFactor()

# Label every training row; runAD discards the rows labelled -1.
yhat = lof.fit_predict(X_train)

# Refit the regression without the flagged rows and print the test MAE.
runAD(
    yhat=yhat,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


(305, 13) (305,)
MAE: 3.356


## One-Class SVM

In [23]:
from sklearn.svm import OneClassSVM

# Use a dedicated name for the One-Class SVM instead of reusing `ee`, which
# already refers to the EllipticEnvelope detector from an earlier cell.
# nu=0.01 is the detector's nu setting, chosen here to flag only a small share of rows.
ocsvm = OneClassSVM(nu=0.01)

# fit_predict labels each training row; runAD drops the rows labelled -1.
yhat = ocsvm.fit_predict(X_train)

# Refit the regression without the flagged rows and print the test MAE.
runAD(yhat, X_train, y_train, X_test, y_test)



(336, 13) (336,)
MAE: 3.431
