# Outlier Detection & Treatments

### Loading Libraries

In [20]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Warnings
import warnings

# OS
import os
import sys
import pickleshare
import missingno as msno
from itertools import cycle

# PyArrow
import pyarrow as pa

# Path & Notebook Optimizer
from pathlib import Path
from tqdm.auto import tqdm

# Scikit-Learn
from sklearn.metrics import mean_absolute_error

# IPython
from IPython.display import display, HTML

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
tqdm.pandas()

np.random.seed()

pio.templates.default = "plotly_white"

sys.path.append('/Users/joaquinromero/Desktop/MTSF')

In [4]:
warnings.filterwarnings("ignore", category=UserWarning)

warnings.filterwarnings("ignore", category=FutureWarning)

warnings.filterwarnings("ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'")

In [5]:
os.makedirs("imgs/chapter_03", exist_ok=True)

In [6]:
preprocessed = Path.home() / "Desktop" / "data" / "london_smart_meters" / "preprocessed"

In [7]:
assert preprocessed.is_dir(), "You have to run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02 before running this notebook"

In [8]:
from itertools import cycle

def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", font_size=15, title_font_size=20):
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
        
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=fig.layout.title.text if fig.layout.title.text else "",
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=title_font_size)
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(
                text=ylabel,
                font=dict(size=font_size)
            ),
            tickfont=dict(size=font_size)
        ),
        xaxis=dict(
            title=dict(
                text=xlabel,
                font=dict(size=font_size)
            ),
            tickfont=dict(size=font_size)
        )
    )
    
    return fig

### Reading Blocks 0-7

In [9]:
from src.utils.data_utils import compact_to_expanded

In [10]:
try:
    block_df = pd.read_parquet(preprocessed/"london_smart_meters_merged_block_0-7.parquet")
    display(block_df.head())
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """))

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[186, 186, 188, 188, 190, 190, 203, 203, 206, ...","[8.78, 8.78, 8.27, 8.27, 7.87, 7.87, 7.89, 7.8...","[6.28, 6.28, 6.21, 6.21, 6.22, 6.22, 6.76, 6.7...","[1007.7, 1007.7, 1007.36, 1007.36, 1006.73, 10...","[7.55, 7.55, 7.34, 7.34, 6.75, 6.75, 6.89, 6.8...","[2.28, 2.28, 1.81, 1.81, 1.95, 1.95, 1.83, 1.8...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.84, 0.84, 0.87, 0.87, 0.89, 0.89, 0.93, 0.9...","[Clear, Clear, Clear, Clear, Partly Cloudy, Pa..."
1,MAC000246,2012-01-01,30min,"[0.509, 0.317, 0.253, 0.249, 0.93, 0.607, 0.10...",37872,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[229, 229, 238, 238, 229, 229, 231, 231, 227, ...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[10.97, 10.97, 11.02, 11.02, 11.04, 11.04, 10....","[1008.1, 1008.1, 1007.88, 1007.88, 1007.95, 10...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[5.9, 5.9, 6.06, 6.06, 5.31, 5.31, 4.68, 4.68,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, clo...","[0.93, 0.93, 0.9, 0.9, 0.91, 0.91, 0.93, 0.93,...","[Mostly Cloudy, Mostly Cloudy, Overcast, Overc..."
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[78, 78, 73, 73, 81, 81, 80, 80, 75, 75, 71, 7...","[8.76, 8.76, 8.54, 8.54, 8.09, 8.09, 7.34, 7.3...","[7.25, 7.25, 7.12, 7.12, 7.17, 7.17, 6.68, 6.6...","[1027.41, 1027.41, 1026.91, 1026.91, 1026.54, ...","[7.59, 7.59, 7.43, 7.43, 7.24, 7.24, 7.34, 7.3...","[2.18, 2.18, 2.07, 2.07, 1.72, 1.72, 1.34, 1.3...","[rain, rain, rain, rain, rain, rain, rain, rai...","[fog, fog, fog, fog, fog, fog, fog, fog, fog, ...","[0.9, 0.9, 0.91, 0.91, 0.94, 0.94, 0.96, 0.96,...","[Foggy, Foggy, Foggy, Foggy, Foggy, Foggy, Fog..."
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680,ToU,ACORN-,ACORN-,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[215, 215, 207, 207, 215, 215, 216, 216, 126, ...","[11.46, 11.46, 11.38, 11.38, 11.38, 11.38, 10....","[10.23, 10.23, 10.17, 10.17, 10.24, 10.24, 10....","[1007.39, 1007.39, 1007.21, 1007.21, 1007.06, ...","[11.46, 11.46, 11.38, 11.38, 11.38, 11.38, 10....","[2.35, 2.35, 2.15, 2.15, 1.84, 1.84, 1.22, 1.2...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.92, 0.92, 0.92, 0.92, 0.93, 0.93, 0.95, 0.9...","[Partly Cloudy, Partly Cloudy, Mostly Cloudy, ..."
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[236, 236, 240, 240, 242, 242, 244, 244, 248, ...","[14.06, 14.06, 13.12, 13.12, 12.53, 12.53, 12....","[10.82, 10.82, 10.29, 10.29, 9.86, 9.86, 9.83,...","[1011.09, 1011.09, 1010.82, 1010.82, 1010.65, ...","[14.06, 14.06, 13.12, 13.12, 12.53, 12.53, 12....","[3.86, 3.86, 3.81, 3.81, 4.27, 4.27, 4.12, 4.1...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.81, 0.81, 0.83, 0.83, 0.84, 0.84, 0.86, 0.8...","[Clear, Clear, Clear, Clear, Clear, Clear, Par..."


In [11]:
#Converting to Expanded Form
exp_block_df = compact_to_expanded(block_df[block_df.file=="block_7"], timeseries_col = 'energy_consumption',
static_cols = ["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
time_varying_cols = ['holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
       'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
       'humidity', 'summary'],
ts_identifier = "LCLid")

exp_block_df.head()

  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-01-01 00:00:00,MAC000050,0.175,30min,37872,Std,ACORN-D,Affluent,block_7,NO_HOLIDAY,...,229,12.12,10.97,1008.1,12.12,5.9,rain,partly-cloudy-night,0.93,Mostly Cloudy
1,2012-01-01 00:30:00,MAC000050,0.212,30min,37872,Std,ACORN-D,Affluent,block_7,NO_HOLIDAY,...,229,12.12,10.97,1008.1,12.12,5.9,rain,partly-cloudy-night,0.93,Mostly Cloudy
2,2012-01-01 01:00:00,MAC000050,0.313,30min,37872,Std,ACORN-D,Affluent,block_7,NO_HOLIDAY,...,238,12.59,11.02,1007.88,12.59,6.06,rain,cloudy,0.9,Overcast
3,2012-01-01 01:30:00,MAC000050,0.302,30min,37872,Std,ACORN-D,Affluent,block_7,NO_HOLIDAY,...,238,12.59,11.02,1007.88,12.59,6.06,rain,cloudy,0.9,Overcast
4,2012-01-01 02:00:00,MAC000050,0.257,30min,37872,Std,ACORN-D,Affluent,block_7,NO_HOLIDAY,...,229,12.45,11.04,1007.95,12.45,5.31,rain,partly-cloudy-night,0.91,Mostly Cloudy


In [12]:
# Taking a single time series from the block
ts_df = exp_block_df[exp_block_df.LCLid=="MAC000193"].set_index("timestamp")

#### Filling in Missing Values

In [13]:
from src.imputation.interpolation import SeasonalInterpolation
ts = SeasonalInterpolation(seasonal_period=48*7).fit_transform(ts_df.energy_consumption.values.reshape(-1,1)).squeeze()

### Outlier Detection

In [22]:
from itertools import cycle

def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", font_size=15, title_font_size=20):
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
        
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=fig.layout.title.text if fig.layout.title.text else "",
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=title_font_size)
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(
                text=ylabel,
                font=dict(size=font_size)
            ),
            tickfont=dict(size=font_size)
        ),
        xaxis=dict(
            title=dict(
                text=xlabel,
                font=dict(size=font_size)
            ),
            tickfont=dict(size=font_size)
        )
    )
    
    return fig

In [15]:
from src.decomposition.seasonal import MultiSeasonalDecomposition, STL

from src.outliers.outlier_detection import detect_outlier_iqr,detect_outlier_sd, generalized_esd, seasonal_esd, detect_outlier_isolation_forest

In [16]:
res_df = pd.DataFrame(columns=["# of Outliers", "% of Outliers"])

### Standard Deviation

In [17]:
def detect_outlier_sd(ts, sd_multiple=2):
    mean = ts.mean()
    std = ts.std()
    higher_bound = mean + sd_multiple*std
    lower_bound = mean - sd_multiple*std
    outlier_mask = (ts>higher_bound) | (ts<lower_bound)
    return outlier_mask

In [18]:
#Detecting Outliers with 3 SD window
outlier_mask = detect_outlier_sd(ts, sd_multiple=3)
res_df.loc["3SD", "# of Outliers"] = outlier_mask.sum()
res_df.loc["3SD", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 802 | % of Outliers: 2.12%


In [28]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="Standard Deviation")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_std.png")
# fig.show()

In [24]:
stl = MultiSeasonalDecomposition(seasonal_model="fourier",seasonality_periods=["day_of_year", "day_of_week", "hour"], model = "additive", n_fourier_terms=10)
res = stl.fit(pd.Series(ts, index=ts_df.index))

In [25]:
#Detecting Outliers with 2 SD window
outlier_mask = detect_outlier_sd(res.resid, 3)
res_df.loc["2SD on Residuals", "# of Outliers"] = outlier_mask.sum()
res_df.loc["2SD on Residuals", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 735 | % of Outliers: 1.94%


In [27]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal Standard Deviation")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_std_seasonal.png")
# fig.show()

### IQR

In [29]:
def detect_outlier_iqr(ts, iqr_multiple=2):
    q1, q2, q3 = np.quantile(ts, 0.25), np.quantile(ts, 0.5), np.quantile(ts, 0.75)
    iqr = q3-q1
    higher_bound = q3 + iqr_multiple*iqr
    lower_bound = q1 - iqr_multiple*iqr
    outlier_mask = (ts>higher_bound) | (ts<lower_bound)
    return outlier_mask

In [30]:
#Detecting Outliers with 4 IQR window
outlier_mask = detect_outlier_iqr(ts, 4)
res_df.loc["4IQR", "# of Outliers"] = outlier_mask.sum()
res_df.loc["4IQR", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 747 | % of Outliers: 1.97%


In [32]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="IQR")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_iqr.png")
# fig.show()

In [33]:
#Detecting Outliers with 4 IQR window on deseasonalized data
outlier_mask = detect_outlier_iqr(res.resid, 4)
res_df.loc["4SD on Residuals", "# of Outliers"] = outlier_mask.sum()
res_df.loc["4SD on Residuals", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 502 | % of Outliers: 1.33%


In [35]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal IQR")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_iqr_seasonal.png")
# fig.show()

### Isolation Forest

In [38]:
# class sklearn.ensemble.IsolationForest(*,n_estimators=100,max_samples='auto',contamination='auto',max_features=1.0, bootstrap=False,n_jobs=None,random_state=None,verbose=0,warm_start=False)

In [39]:
outlier_mask = detect_outlier_isolation_forest(ts, outlier_fraction=0.01)
res_df.loc["Isolation Forest", "# of Outliers"] = outlier_mask.sum()
res_df.loc["Isolation Forest", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 364 | % of Outliers: 0.96%


In [41]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="Isolation Forest")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_isolation_forest.png")
# fig.show()

In [42]:
outlier_mask = detect_outlier_isolation_forest(res.resid.values, outlier_fraction=0.01)
res_df.loc["Isolation Forest on Residuals", "# of Outliers"] = outlier_mask.sum()
res_df.loc["Isolation Forest on Residuals", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 375 | % of Outliers: 0.99%


In [43]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal Isolation Forest")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_isolation_forest_seasonal.png")
# fig.show()

### Extreme Studentized Deviate (ESD) and Seasonal Extreme Studentized Deviate (S-ESD)

In [44]:
outlier_mask = generalized_esd(ts, max_anomalies=800, alpha=0.05, hybrid=False)
res_df.loc["ESD", "# of Outliers"] = outlier_mask.sum()
res_df.loc["ESD", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 420 | % of Outliers: 1.11%


In [46]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="ESD")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_esd.png")
# fig.show()

In [47]:
stl = MultiSeasonalDecomposition(seasonal_model="fourier",seasonality_periods=["day_of_year", "day_of_week", "hour"], model = "additive", n_fourier_terms=10)
outlier_mask = seasonal_esd(pd.Series(ts, index=ts_df.index), stl, max_anomalies=800, alpha=0.05, hybrid=False)
res_df.loc["S-ESD", "# of Outliers"] = outlier_mask.sum()
res_df.loc["S-ESD", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

# of Outliers: 431 | % of Outliers: 1.14%


In [48]:
# fig = plot_outliers(ts_df.index, ts, outlier_mask, method="S-ESD")
# format_plot(fig, xlabel="Time", ylabel="Energy Consumption")
# fig.write_image("imgs/chapter_3/outliers_s-esd.png")
# fig.show()

### Summary

In [49]:
res_df.style.format({"% of Outliers": "{:.2f}%"})

Unnamed: 0,# of Outliers,% of Outliers
3SD,802,2.12%
2SD on Residuals,735,1.94%
4IQR,747,1.97%
4SD on Residuals,502,1.33%
Isolation Forest,364,0.96%
Isolation Forest on Residuals,375,0.99%
ESD,420,1.11%
S-ESD,431,1.14%
