### Test Dataset

In [1]:
#Generate gaussian data
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std

# Fix the global NumPy seed so the sample below is reproducible
seed(1)

# Generate 10,000 univariate Gaussian observations with mean 50 and
# standard deviation 5 by scaling and shifting standard-normal draws
n_obs, mu, sigma = 10000, 50, 5
data = mu + sigma * randn(n_obs)

# Report the sample mean and standard deviation
summary = (mean(data), std(data))
print('mean=%.3f  stdv = %.3f' % summary)

mean=50.049  stdv = 4.994


### Standard Deviation Method

In [3]:
## Summary statistics of the sample
data_mean = mean(data)
data_std = std(data)

## An observation is an outlier when it lies more than three standard
## deviations away from the mean
threshold = data_std * 3
lower = data_mean - threshold
upper = data_mean + threshold

# Flag the observations outside [lower, upper].
# `data` must be a NumPy array for these element-wise comparisons.
is_outlier = (data < lower) | (data > upper)
outliers = list(data[is_outlier])

print("Identified outliers: %d" % len(outliers))

# Keep only the observations inside [lower, upper] (bounds inclusive)
is_inlier = (data >= lower) & (data <= upper)
outliers_removed = list(data[is_inlier])

print("Non-outlier observations : %d" % len(outliers_removed))

Identified outliers: 29
Non-outlier observations : 9971


### Interquartile Range Method

In [7]:
from numpy import percentile
## Identify the 25th and 75th percentiles of the sample
quart_25 = percentile(data, 25)
quart_75 = percentile(data, 75)

# Interquartile range: the spread of the middle half of the data
IQR = quart_75 - quart_25

### 1.5 * IQR is the usual cut-off; use 3 * IQR to flag only extreme values
threshold = IQR * 1.5

print('quart_75 : %.3f' % quart_75)
print('quart_25 : %.3f' % quart_25)
print('threshold :%.3f' % threshold)

# Lower and upper fences, measured outward from the quartiles
lower = quart_25 - threshold
upper = quart_75 + threshold

# Observations outside the fences are outliers.
# `data` must be a NumPy array for these element-wise comparisons.
is_outlier = (data < lower) | (data > upper)
outliers = list(data[is_outlier])

print('Outliers number :%d' % len(outliers))

# Observations inside the fences (bounds inclusive) are kept.
# NOTE(review): despite its label, the print below reports how many
# observations remain after filtering, not how many were removed.
is_inlier = (data >= lower) & (data <= upper)
outliers_removed = list(data[is_inlier])

print('Outliers removed :%d' % len(outliers_removed))

quart_75 : 53.359
quart_25 : 46.685
threshold :10.011
Outliers number :81
Outliers removed :9919


### Automatic Outlier Detection

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import LocalOutlierFactor

# Load the header-less CSV; the last column is the target and every other
# column is an input feature
df = pd.read_csv('house.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]

print(X.shape,y.shape)

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

print(X_train.shape,X_test.shape)

## Detect outliers on the training split only, so the test split is untouched
lof = LocalOutlierFactor()
lof_labels = lof.fit_predict(X_train)

# Rows labelled -1 are outliers; keep every other row
mask = lof_labels != -1
X_train = X_train[mask]
y_train = y_train[mask]

print("after removing outliers",X_train.shape,y_train.shape)

# Fit a linear regression on the filtered training data
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score the model on the held-out rows using mean absolute error
yhat = lr.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print("MAE : %.3f" % mae)

(506, 13) (506,)
(339, 13) (167, 13)
after removing outliers (305, 13) (305,)
MAE : 3.356
