Removing Outliers

In [1]:
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
df = pd.read_json('../../datasets/data.json')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  166 non-null    float64
 1   Pulse     165 non-null    float64
 2   Maxpulse  165 non-null    float64
 3   Calories  162 non-null    float64
dtypes: float64(4)
memory usage: 6.6 KB


In [4]:
df.shape

(169, 4)

In [5]:
df.isnull().sum().sum()

np.int64(18)

In [6]:
df.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,166.0,165.0,165.0,162.0
mean,64.277108,107.448485,133.945455,375.917901
std,42.484454,14.365612,16.556722,268.013841
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,124.0,250.775
50%,60.0,105.0,131.0,317.85
75%,60.0,111.0,141.0,386.7
max,300.0,159.0,184.0,1860.4


In [7]:
# Use z-score when data follows a normal distribution (bell curve)
z_score_duration = np.abs(stats.zscore(df["Duration"]))
z_score_duration

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [8]:
df_clean_duration = df[z_score_duration < 3]

In [9]:
df_clean_duration

Unnamed: 0,Duration,Pulse,Maxpulse,Calories


In [10]:
z_score_pulse = np.abs(stats.zscore(df["Pulse"]))
z_score_pulse

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [11]:
df_clean_pulse = df[z_score_pulse < 3]

In [12]:
df_clean_pulse

Unnamed: 0,Duration,Pulse,Maxpulse,Calories


In [13]:
# Use when data is skewed (not normally distributed)
Q1 = df["Calories"].quantile(0.25)  # First quartile
Q3 = df["Calories"].quantile(0.75)  # Third quartile
IQR = Q3 - Q1  # Interquartile range

In [14]:
IQR

np.float64(135.92500000000007)

In [15]:
df_clean_calories = df[(df["Calories"] >= (Q1 - 1.5 * IQR)) & (df["Calories"] <= (Q3 + 1.5 * IQR))]

In [16]:
df_clean_calories

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60.0,110.0,130.0,409.1
1,60.0,117.0,145.0,479.0
2,60.0,103.0,135.0,340.0
3,45.0,109.0,175.0,282.4
4,45.0,117.0,148.0,406.0
...,...,...,...,...
164,60.0,105.0,140.0,290.8
165,60.0,110.0,145.0,300.4
166,60.0,115.0,145.0,310.2
167,75.0,120.0,150.0,320.4


In [17]:
df_clean_calories.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,142.0,141.0,141.0,145.0
mean,52.464789,108.056738,133.659574,300.965517
std,17.712704,14.874606,16.787253,100.037622
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,122.0,246.0
50%,60.0,106.0,131.0,300.1
75%,60.0,112.0,141.0,361.9
max,120.0,159.0,182.0,563.2


In [18]:
df_clean_calories.info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  142 non-null    float64
 1   Pulse     141 non-null    float64
 2   Maxpulse  141 non-null    float64
 3   Calories  145 non-null    float64
dtypes: float64(4)
memory usage: 5.7 KB


In [19]:
# Use when you don't want to lose data but want to cap extreme values.
lower = df["Calories"].quantile(0.05)  # 5th percentile
upper = df["Calories"].quantile(0.95)  # 95th percentile
df["Calories"] = df["Calories"].clip(lower, upper)

In [24]:
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60.0,110.0,130.0,409.1
1,60.0,117.0,145.0,479.0
2,60.0,103.0,135.0,340.0
3,45.0,109.0,175.0,282.4
4,45.0,117.0,148.0,406.0
...,...,...,...,...
164,60.0,105.0,140.0,290.8
165,60.0,110.0,145.0,300.4
166,60.0,115.0,145.0,310.2
167,75.0,120.0,150.0,320.4


In [25]:
lower, upper

(np.float64(124.01), np.float64(872.3799999999998))