In [2]:
import pandas as pd 
import numpy as np 
from scipy.stats import chi2
from matplotlib import patches 
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used below; consider narrowing it.
warnings.filterwarnings("ignore")
# Use fully-qualified option names: abbreviated patterns such as 'max_rows'
# also match 'styler.render.max_rows' on pandas >= 1.4 and raise OptionError.
pd.set_option('display.max_colwidth', 1000)  # Show up to 1000 characters within each cell
pd.set_option('display.max_rows', 20)  # Show up to 20 dataframe rows
pd.set_option('display.max_columns', 1000)  # Show up to 1000 columns

In [3]:
# Read the air-quality CSV from the working directory (comma-separated
# fields, '.' as the decimal mark) and preview its first rows.
csv_path = './airquality.csv'
df = pd.read_csv(csv_path, decimal='.', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,1,41.0,190.0,7.4,67,5,1
1,2,36.0,118.0,8.0,72,5,2
2,3,12.0,149.0,12.6,74,5,3
3,4,18.0,313.0,11.5,62,5,4
4,5,,,14.3,56,5,5


In [4]:
# Keep only the two variables under study, discard rows where either one is
# missing, and convert the result to a 2-D NumPy array.
# Note: `df` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run without first re-running the cell that loads the CSV.
df = df[['Ozone', 'Temp']].dropna().to_numpy()

In [7]:
# Location and scatter of the data: the per-column mean, the covariance
# matrix (rows are observations, columns are variables) and its inverse.
centerpoint = df.mean(axis=0)

covariance = np.cov(df, rowvar=False)
covariance_pm1 = np.linalg.inv(covariance)

In [19]:
# Squared Mahalanobis distance of every observation from the center point:
#     d_i^2 = (x_i - mu)^T  S^-1  (x_i - mu)
# Evaluated for all rows at once instead of looping over them one by one:
# each row of `deviations @ covariance_pm1` is multiplied element-wise with
# the matching deviation and summed, which is the quadratic form above.
deviations = df - centerpoint
distances = np.sum((deviations @ covariance_pm1) * deviations, axis=1)



In [20]:
# Outlier threshold: the 95th percentile of a chi-square distribution whose
# degrees of freedom equal the number of columns in the data.
n_variables = df.shape[1]
cutoff = chi2.ppf(0.95, n_variables)

In [21]:
# Positions of the observations whose distance exceeds the cutoff
# (a tuple of index arrays, one per dimension of `distances`).
outlierIndexes = np.nonzero(distances > cutoff)

In [23]:
# Report the rows whose distance exceeds the cutoff.
is_outlier = distances > cutoff
print('--- Observations found as outlier -----')
print(df[is_outlier])

--- Observations found as outlier -----
[[115.  79.]
 [135.  84.]
 [122.  89.]
 [168.  81.]]
