In [25]:
import pandas as pd
import numpy as np
# `from pandas import datetime` was deprecated in pandas 1.x and removed in
# pandas 2.0. The standard-library class is the very same object, so every
# use of the name `datetime` keeps working.
from datetime import datetime
from matplotlib import pyplot as plt

"""
Load AirQualityUCI Data
"""

# Timestamp layout of the first CSV column, e.g. '2004-03-10 18:00:00'.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

def parser(x):
    """Parse one timestamp string in DATETIME_FORMAT into a datetime.

    Kept so that any code calling `parser` directly still works; the load
    below no longer needs it.
    """
    return datetime.strptime(x, DATETIME_FORMAT)

input_file = './data/AirQualityUCI_refined.csv'

# Read the file with the first column as the index, then convert that index
# to datetimes in one vectorised call. The `date_parser=` argument of
# `read_csv` is deprecated since pandas 2.0; converting after the read works
# on both old and new pandas versions.
df = pd.read_csv(input_file, index_col=[0])
df.index = pd.to_datetime(df.index, format=DATETIME_FORMAT)

df.head()

  from pandas import datetime


Unnamed: 0_level_0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,C6H6(GT)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-03-10 18:00:00,2.6,1360.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,48.9,0.7578,11.9
2004-03-10 19:00:00,2.0,1292.0,955.0,103.0,1174.0,92.0,1559.0,972.0,47.7,0.7255,9.4
2004-03-10 20:00:00,2.2,1402.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,54.0,0.7502,9.0
2004-03-10 21:00:00,2.2,1376.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,60.0,0.7867,9.2
2004-03-10 22:00:00,1.6,1272.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,59.6,0.7888,6.5


In [26]:
# Visualization setup
# The magics and calls below depend on their order, so they are left as written.
# A bare `%matplotlib` activates a GUI backend (this run reported QtAgg), so
# figures open in separate windows rather than inline in the notebook.
%matplotlib
from matplotlib import pyplot as plt
# Import seaborn and immediately apply its default plot theme.
import seaborn; seaborn.set()  # set plot styles
# NOTE(review): this option configures the *inline* backend; presumably it has
# no effect while a GUI backend is active — confirm which backend is intended.
%config InlineBackend.figure_format = 'svg'
# Default size for every new figure: 10 wide by 5 tall.
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # turn on interactive mode so figures update without plt.show()

Using matplotlib backend: QtAgg


<matplotlib.pyplot._IonContext at 0x23fb681c7f0>

In [27]:
# Line plot of the raw 'CO(GT)' column against the datetime index
df['CO(GT)'].plot.line()

<AxesSubplot:xlabel='Datetime'>

In [28]:
# Fill the gaps in 'CO(GT)' by linear interpolation; `co` is an independent copy
co = df['CO(GT)'].interpolate(method='linear').copy()

In [29]:
# Overlay the raw series on top of the interpolated one (higher zorder is
# drawn last), so only the filled-in gaps show the interpolation colour.
for series, label, zorder in (
    (df['CO(GT)'], 'original', 2),
    (co, 'linear interpolation', 1),
):
    plt.plot(series, label=label, zorder=zorder)
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x23fb2696910>

In [30]:
# Detecting outliers using Boxplot


In [31]:
# Pairwise Pearson correlation between all columns of the data frame
corr_matrix = df.corr(method='pearson')

In [32]:
# Relative humidity (RH), chosen as the least correlated variable;
# gaps are filled by linear interpolation on a copy of the column
rh = df['RH'].copy().interpolate(method='linear')

In [33]:
# Scatter plot of CO (x axis) against RH (y axis)
# NOTE(review): the title mentions a boxplot, but this cell draws a scatter
# plot — confirm the intended title.
plt.scatter(x=co, y=rh, s=12, c='black')
plt.xlabel('CO(GT)')
plt.ylabel('RH')
plt.title("Detecting outliers using Boxplot")

Text(0.5, 1.0, 'Detecting outliers using Boxplot')

In [34]:
# NMHC (non-methane hydrocarbon) sensor column, chosen as the most correlated
# variable; gaps are filled by linear interpolation on a copy of the column
nmhc = df['PT08.S2(NMHC)'].copy().interpolate(method='linear')

In [35]:
# Scatter plot of CO (x axis) against NMHC (y axis)
plt.scatter(x=co, y=nmhc, s=12, c='black')
plt.xlabel('CO(GT)')
plt.ylabel("NMHC")

Text(89.25, 0.5, 'NMHC')

In [36]:
"""
IQR-based Outlier Detection
"""

# Quartiles of the interpolated CO series: Q1, Q2 (median) and Q3
q1, median, q3 = (co.quantile(p) for p in (0.25, 0.5, 0.75))
print(q1, median, q3)

1.1 1.8 2.9


In [37]:
# Interquartile range and the fences placed 1.5 * IQR beyond Q3 and Q1
iqr = q3 - q1
upper_fence, lower_fence = q3 + 1.5 * iqr, q1 - 1.5 * iqr
print(iqr, upper_fence, lower_fence)

1.7999999999999998 5.6 -1.5999999999999996


In [38]:
# Keep only the observations that fall outside the two fences
outliers = co.loc[co.gt(upper_fence) | co.lt(lower_fence)]
print(outliers)


Datetime
2004-03-11 19:00:00    6.9
2004-03-11 20:00:00    6.1
2004-03-12 20:00:00    6.6
2004-03-14 20:00:00    5.9
2004-03-15 09:00:00    8.1
                      ... 
2005-03-23 19:00:00    6.2
2005-03-23 20:00:00    7.2
2005-03-24 19:00:00    5.9
2005-03-24 20:00:00    7.5
2005-03-25 19:00:00    5.7
Name: CO(GT), Length: 224, dtype: float64


In [39]:
# Boolean mask over `co`: True where the timestamp belongs to an outlier.
# The mask must have one entry per row of `co`; slicing it (the former
# `[:50]`) makes `co[~mask]` and `co_refined[mask]` fail on a length mismatch.
mask = co.index.isin(outliers.index)


In [45]:
co.loc[~mask]  # `~` inverts the mask: rows that are not outliers

In [51]:
# Draw the normal points (`~mask` selects the non-outliers) and the outliers
# as two marker-only series
for series, label, color, marker in (
    (co[~mask], 'normal', 'blue', 'o'),
    (outliers, 'outliers', 'red', 'x'),
):
    plt.plot(series, label=label, color=color,
             marker=marker, markersize=3, linestyle='None')
plt.legend(loc='best')

In [None]:
# Blank out the outliers on a copy, so that `co` itself stays untouched
co_refined = co.copy()
co_refined.loc[mask] = np.nan

In [47]:
# Rebuild the blanked-out outliers by linear interpolation
# (modifies `co_refined` in place)
co_refined.interpolate(method='linear', inplace=True)

In [48]:
pip install seaborn


Note: you may need to restart the kernel to use updated packages.


In [49]:
"""
Detecting Outliers with Z-Scores
"""

# Plot the distribution of the interpolated 'CO(GT)' values
import seaborn as sns
sns.displot(data=co)

<seaborn.axisgrid.FacetGrid at 0x23fb4427af0>

In [None]:
# Mean and standard deviation of the CO series, used for the Z-scores below.
# np.std uses ddof=0, i.e. the population standard deviation.
mean = np.mean(co)
std = np.std(co)
print(mean, std)

In [None]:
# Calculate the Z-score of each data point and collect the outlying values
outliers = []
thres = 3  # a point is an outlier when |z| exceeds this many standard deviations

for i in co:
    z_score = (i - mean) / std
    if np.abs(z_score) > thres:
        print(z_score)
        outliers.append(i)  # store the offending value itself

In [None]:
# Simplified (vectorised) version of filtering outliers.
# z = (value - mean) / std for every point. `co.mean()` and `co.std(ddof=0)`
# give the same numbers as np.mean(co) and np.std(co). The absolute value is
# taken of the Z-score first and only then compared with the threshold.
z_scores = (co - co.mean()) / co.std(ddof=0)
outliers = co.loc[z_scores.abs() > 3]

In [None]:
# Mask for outliers: True where the timestamp belongs to one of the current
# `outliers`. Rebuilding it here keeps a mask from an earlier detection step
# from being reused by the comparison that follows.
mask = co.index.isin(outliers.index)

In [None]:
# Comparison of distributions before/after outlier removal.
# `sns.distplot` is deprecated; `histplot` with a density-normalised
# histogram plus a KDE curve is its documented replacement.
ax = sns.histplot(co, stat='density', kde=True, label='original')
sns.histplot(co[~mask], stat='density', kde=True,
             label='outliers removed', ax=ax)
ax.set_xlabel('CO(GT)')  # replaces distplot's `axlabel` argument
plt.legend(loc='best')

# [exer] Adjust thres

In [None]:
# Flooring and Capping: clamp values below the 10th percentile up to it and
# values above the 90th percentile down to it.
# The quantile levels are 0.1 and 0.9. Written with a comma, `quantile(0, 1)`
# passes 0 as the level and 1 as a second argument, so it does not ask for
# the 10th percentile.
floor = co.quantile(0.1)
cap = co.quantile(0.9)
# Note: this overwrites values of `co` in place.
co.loc[co < floor] = floor
co.loc[co > cap] = cap


In [None]:
# Visualize the result
