# Outlier Detection and Removal

## Outlier Detection and Removal using IQR

In [None]:
import pandas as pd 
import numpy as np 

In [None]:
# Load the pulse dataset from the shared data directory into a DataFrame.
df = pd.read_table(filepath_or_buffer="../data/pulse.txt")

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Quartiles of Height: the 25th, 50th (median) and 75th percentiles.
Q1, Q2, Q3 = df.Height.quantile(q=[0.25, 0.50, 0.75])
print("Q1 25 percentile of the given data is: ", Q1)
print("Q2 50 percentile of the given data is: ", Q2)
print("Q3 75 percentile of the given data is: ", Q3)

In [None]:
# Range of Height: distance between the largest and the smallest observation.
r = df['Height'].max() - df['Height'].min()
print(r)

In [None]:
# Interquartile range: distance between the third and the first quartile.
IQR = Q3 - Q1
print(IQR)

In [None]:
# Outlier fences: values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are
# treated as outliers in the cells below.
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
lower, upper

In [None]:
# (rows, columns) of the full dataset before any filtering.
len(df.index), len(df.columns)

In [None]:
# Detect and remove outliers: keep only the rows whose Height lies inside
# the closed interval [lower, upper]. A value sitting exactly on a fence is
# not an outlier, so both bounds are inclusive (Series.between includes both
# end points by default); the previous strict comparisons dropped such rows.
df_new = df[df['Height'].between(lower, upper)]
df_new

In [None]:
# Shapes before and after the IQR filter, to see how many rows were dropped.
tuple(frame.shape for frame in (df, df_new))

## Outlier Detection and Removal using Percentile

In [None]:
import pandas as pd 

In [None]:
# Reload the raw pulse dataset so this section starts from unfiltered data.
df = pd.read_table(filepath_or_buffer="../data/pulse.txt")

In [None]:
# Preview the first five rows of the reloaded data.
df.head(n=5)

In [None]:
# Upper threshold: the 95th percentile of Height.
max_thresold = df.Height.quantile(q=0.95)
max_thresold

In [None]:
# Rows flagged as high outliers: Height strictly above the upper threshold.
df.loc[df.Height > max_thresold]

In [None]:
# Lower threshold: the 5th percentile of Height.
min_thresold = df.Height.quantile(q=0.05)
min_thresold

In [None]:
# Rows flagged as low outliers: Height strictly below the lower threshold.
df.loc[df.Height < min_thresold]

In [None]:
# Remove outliers: keep every row that the detection cells did NOT flag,
# i.e. min_thresold <= Height <= max_thresold. The bounds are inclusive so
# that rows whose Height equals a percentile threshold (not flagged by the
# strict > / < detection) are kept rather than silently dropped.
df_clean = df[df['Height'].between(min_thresold, max_thresold)]
df_clean

In [None]:
# Shapes of the original and the Height-filtered dataset.
tuple(frame.shape for frame in (df, df_clean))

In [None]:
# Lower and upper thresholds for Weight: its 5th and 95th percentiles.
min_thresold, max_thresold = df['Weight'].quantile(q=[0.05, 0.95])
min_thresold, max_thresold

In [None]:
# Rows flagged as high outliers: Weight strictly above the upper threshold.
df.loc[df['Weight'] > max_thresold]

In [None]:
# Rows flagged as low outliers: Weight strictly below the lower threshold.
df.loc[df['Weight'] < min_thresold]

In [None]:
# Remove Weight outliers: keep min_thresold <= Weight <= max_thresold.
# Inclusive bounds make this the exact complement of the two detection
# cells (strict > / <), so rows equal to a threshold are no longer dropped.
df_clean = df[df['Weight'].between(min_thresold, max_thresold)]
df_clean

In [None]:
# Shapes of the original and the Weight-filtered dataset.
tuple(frame.shape for frame in (df, df_clean))

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Reload the raw pulse dataset and preview its first five rows.
df = pd.read_table(filepath_or_buffer='../data/pulse.txt')
df.head(n=5)

In [None]:
# Distribution of Height: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement recommended by seaborn. The trailing semicolon suppresses
# the text repr of the return value.
sns.histplot(df['Height'], kde=True, stat='density').set(title='Distribution of Height');

In [None]:
# Mean of Height (centre of the 3-sigma band computed below).
df['Height'].mean()

In [None]:
# Standard deviation of Height (pandas' default estimator).
df['Height'].std()

In [None]:
# Upper limit: three standard deviations above the mean of Height.
upper_limit = df['Height'].mean() + 3 * df['Height'].std()
upper_limit

In [None]:
# Lower limit: three standard deviations below the mean of Height.
lower_limit = df['Height'].mean() - 3 * df['Height'].std()
lower_limit

In [None]:
# Detection: rows whose Height falls strictly outside the 3-sigma band.
df.loc[(df.Height > upper_limit) | (df.Height < lower_limit)]

In [None]:
# Removal: keep the rows the detection cell did not flag, i.e.
# lower_limit <= Height <= upper_limit. Inclusive bounds make this the exact
# complement of the strict > / < detection, so the number of removed rows
# always equals the number of detected outliers.
df_new = df[df['Height'].between(lower_limit, upper_limit)]
df_new

In [None]:
# Shapes before and after the 3-sigma filter.
tuple(frame.shape for frame in (df, df_new))

In [None]:
# Number of rows removed by the 3-sigma filter.
len(df) - len(df_new)

## Outlier Detection and Removal using Z-Score

$$
Z = \frac{x - \mu}{\sigma}
$$

In [None]:
# Standardise Height: Z = (x - mean) / std, stored as a new 'ZScore' column.
df['ZScore'] = (df['Height'] - df['Height'].mean()) / df['Height'].std()
df.head(n=5)

In [None]:
# Detect outliers: rows whose Z-score is below -3 or above 3.
# Each comparison must be parenthesised: `|` binds tighter than `<` / `>`,
# so without the parentheses the expression was evaluated as
# ((ZScore < -3) | ZScore) > 3 instead of the union of the two conditions.
df[(df['ZScore'] < -3) | (df['ZScore'] > 3)]

In [None]:
# Remove outliers: keep rows with -3 <= Z-score <= 3, the exact complement
# of the detection cell. The previous expression joined the two conditions
# with `|` (which would keep every row) and lacked parentheses around the
# second comparison, so operator precedence produced an unintended mask.
df_new = df[df['ZScore'].between(-3, 3)]
df_new

In [None]:
# Number of rows removed by the Z-score filter.
len(df) - len(df_new)