## Numpy Statistical Functions - Outlier IQR Method

In [1]:
import numpy as np

In [2]:
def compute_quartiles(arr,axis=None):
    """Return the first and third quartiles using the median-of-halves method.

    The values are sorted along ``axis`` (the array is flattened when ``axis``
    is None) and split into a lower and an upper half of equal length. When
    the number of values is odd, the middle element (the median) is left out
    of both halves. Q1 is the median of the lower half and Q3 is the median of
    the upper half.

    The half size, both halves and both quartiles are printed as a trace of
    the computation.

    Parameters
    ----------
    arr : array_like
        Input values; the caller's object is not modified (a sorted copy is used).
    axis : int or None, optional
        Axis along which the quartiles are computed. None works on the
        flattened data.

    Returns
    -------
    tuple
        ``(q1, q3)`` as returned by ``np.median`` for each half.
    """
    ordered = np.sort(arr, axis=axis)
    count = np.size(ordered, axis)

    # Length of each half; for an odd count this leaves the median out.
    half_len = count // 2
    print('Size of half:', half_len)

    lower_half = ordered.take(indices=range(half_len), axis=axis)
    print('Lower Half\n', lower_half)

    # The upper half is the last `half_len` values, so for an odd count the
    # start index automatically skips the median element.
    upper_start = count - half_len
    upper_half = ordered.take(indices=range(upper_start, count), axis=axis)
    print('Upper Half\n', upper_half)

    # Q1 / Q3 are the medians of the lower / upper halves respectively.
    q1, q3 = (np.median(half, axis=axis) for half in (lower_half, upper_half))
    print('q1:', q1)
    print('q3:', q3)

    return q1, q3

#this function is modified to return q1 and q3 also along with iqr
def compute_iqr(arr,axis=None):
    """Return ``(q1, q3, iqr)`` for ``arr`` along ``axis``.

    The quartiles come from ``compute_quartiles``; the interquartile range is
    their difference ``q3 - q1``.
    """
    lower_quartile, upper_quartile = compute_quartiles(arr, axis)
    spread = upper_quartile - lower_quartile
    return lower_quartile, upper_quartile, spread

<u>IQR Method</u> 

Lower Bound: Q1 - N * IQR <br>
Upper Bound: Q3 + N * IQR <br>
Outlier: x < Lower Bound (or) x > Upper Bound <br>
N = 1.5 for mild outliers (inner fence) <br>
N = 3.0 for extreme outliers (outer fence) <br>

In [3]:
def outlier_by_iqr(arr,N=1.5):
    """Select the outliers of ``arr`` using the IQR fence method.

    A value is an outlier when it lies below ``Q1 - N * IQR`` or above
    ``Q3 + N * IQR``. The quartiles are computed over all values of ``arr``
    (no axis is passed on). The IQR and both bounds are printed as a trace.

    Parameters
    ----------
    arr : array_like
        Input values. Lists and tuples are accepted and converted to an
        ndarray; an ndarray is used as-is.
    N : float, optional
        Fence multiplier: 1.5 (default) for mild outliers, 3.0 for extreme
        outliers.

    Returns
    -------
    numpy.ndarray
        The outlying values, in the order they appear in the input.
    """
    # The element-wise comparisons below need an ndarray; a plain list would
    # raise TypeError when compared with a float bound.
    values = np.asarray(arr)

    q1,q3,iqr = compute_iqr(values)
    print('******** Outlier Check ************')

    #compute lower bound and upper bound of IQR Method
    lower_bound = q1 - N * iqr
    upper_bound = q3 + N * iqr

    print('IQR:',iqr)
    print('Lower Bound:',lower_bound)
    print('Upper Bound:',upper_bound)

    #select outliers using numpy boolean indexing with "or" condition
    outliers = values[(values < lower_bound) | (values > upper_bound)]
    return outliers

In [4]:
#sample data: 15 values with one unusually small (25) and one unusually large (225) entry
data = np.array([25,47,49,54,57,59,61,63,64,67,71,72,73,79, 225])

In [5]:
#outlier detection using the inner fence (i.e. N=1.5, mild outliers)
outliers = outlier_by_iqr(data,N=1.5)
print('\nOUTLIERS:',outliers)

Size of half: 7
Lower Half
 [25 47 49 54 57 59 61]
Upper Half
 [ 64  67  71  72  73  79 225]
q1: 54.0
q3: 72.0
******** Outlier Check ************
IQR: 18.0
Lower Bound: 27.0
Upper Bound: 99.0

OUTLIERS: [ 25 225]


In [6]:
#outlier detection using the outer fence (i.e. N=3.0, extreme outliers)
outliers = outlier_by_iqr(data,N=3.0)
print('\nOUTLIERS:',outliers)

Size of half: 7
Lower Half
 [25 47 49 54 57 59 61]
Upper Half
 [ 64  67  71  72  73  79 225]
q1: 54.0
q3: 72.0
******** Outlier Check ************
IQR: 18.0
Lower Bound: 0.0
Upper Bound: 126.0

OUTLIERS: [225]


Example 2 (Impact of data size)

In [7]:
#create an array of 100 elements, all equal to 5
data2 = np.full(100,5)
data2

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])

In [8]:
#modify array to have some other elements also
#the last four positions get isolated values (15, 67, -32, 150)
data2[96] = 15
data2[97] = 67
data2[98]= -32
data2[99] = 150

#three slices are overwritten with repeated values (7, 8 and 10)
data2[35:45] =7
data2[57:77] =8
data2[80:85] =10

#display the modified array
data2

array([  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
         5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
         5,   5,   5,   5,   5,   5,   5,   5,   5,   7,   7,   7,   7,
         7,   7,   7,   7,   7,   7,   5,   5,   5,   5,   5,   5,   5,
         5,   5,   5,   5,   5,   8,   8,   8,   8,   8,   8,   8,   8,
         8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   5,
         5,   5,  10,  10,  10,  10,  10,   5,   5,   5,   5,   5,   5,
         5,   5,   5,   5,   5,  15,  67, -32, 150])

In [9]:
#outlier detection using the inner fence (i.e. N=1.5, mild outliers)
outliers = outlier_by_iqr(data2,N=1.5)
print('\nOUTLIERS:',outliers)

Size of half: 50
Lower Half
 [-32   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5
   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5
   5   5   5   5   5   5   5   5   5   5   5   5   5   5]
Upper Half
 [  5   5   5   5   5   5   5   5   5   5   5   5   7   7   7   7   7   7
   7   7   7   7   8   8   8   8   8   8   8   8   8   8   8   8   8   8
   8   8   8   8   8   8  10  10  10  10  10  15  67 150]
q1: 5.0
q3: 8.0
******** Outlier Check ************
IQR: 3.0
Lower Bound: 0.5
Upper Bound: 12.5

OUTLIERS: [ 15  67 -32 150]


In [10]:
#outlier detection using the outer fence (i.e. N=3.0, extreme outliers)
outliers = outlier_by_iqr(data2,N=3.0)
print('\nOUTLIERS:',outliers)

Size of half: 50
Lower Half
 [-32   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5
   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5   5
   5   5   5   5   5   5   5   5   5   5   5   5   5   5]
Upper Half
 [  5   5   5   5   5   5   5   5   5   5   5   5   7   7   7   7   7   7
   7   7   7   7   8   8   8   8   8   8   8   8   8   8   8   8   8   8
   8   8   8   8   8   8  10  10  10  10  10  15  67 150]
q1: 5.0
q3: 8.0
******** Outlier Check ************
IQR: 3.0
Lower Bound: -4.0
Upper Bound: 17.0

OUTLIERS: [ 67 -32 150]


Now let us keep only a few data points (i.e., only a few repetitions of each value) from the input.

In [11]:
#reduce the data from 100 values to just 11 by keeping at most 2 copies of each repeated value
data3  = np.array([-32,5,5,7,7,8,8,10,15,67,150])

In [12]:
#outlier detection using the inner fence (i.e. N=1.5, mild outliers)
outliers = outlier_by_iqr(data3,N=1.5)
print('\nOUTLIERS:',outliers)

Size of half: 5
Lower Half
 [-32   5   5   7   7]
Upper Half
 [  8  10  15  67 150]
q1: 5.0
q3: 15.0
******** Outlier Check ************
IQR: 10.0
Lower Bound: -10.0
Upper Bound: 30.0

OUTLIERS: [-32  67 150]


In [13]:
#outlier detection using the outer fence (i.e. N=3.0, extreme outliers)
outliers = outlier_by_iqr(data3,N=3.0)
print('\nOUTLIERS:',outliers)

Size of half: 5
Lower Half
 [-32   5   5   7   7]
Upper Half
 [  8  10  15  67 150]
q1: 5.0
q3: 15.0
******** Outlier Check ************
IQR: 10.0
Lower Bound: -25.0
Upper Bound: 45.0

OUTLIERS: [-32  67 150]


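The IQR method still detects the extreme outliers (-32, 67 and 150) even with just a few repetitions of similar data, which suggests that it can work even with a small data size. Note, however, that with the smaller sample the IQR widens from 3 to 10, so the mild outlier 15 is no longer flagged at N = 1.5.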