# Outlier Detection & Treatments

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Warnings
import warnings

# OS
import os
import sys
import pickleshare
import missingno as msno
from itertools import cycle

# PyArrow
import pyarrow as pa

# Path & Notebook Optimizer
from pathlib import Path
from tqdm.auto import tqdm

# Scikit-Learn
from sklearn.metrics import mean_absolute_error

# IPython
from IPython.display import display, HTML

In [2]:
# Load IPython's autoreload extension and use mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Seed NumPy's global RNG with a fixed value. Calling seed() with no argument
# re-seeds from OS entropy, which makes stochastic steps differ between runs.
np.random.seed(42)

pio.templates.default = "plotly_white"

# Directory that contains the project's `src` package imported by later cells.
# Set the MTSF_ROOT environment variable to override the default location.
project_root = os.environ.get("MTSF_ROOT", '/Users/joaquinromero/Desktop/MTSF')
# Avoid stacking duplicate entries when this cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
warnings.filterwarnings("ignore", category=UserWarning)

warnings.filterwarnings("ignore", category=FutureWarning)

warnings.filterwarnings("ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'")

In [5]:
# Output directory for the exported figures. It must match the path used by the
# fig.write_image(...) calls in this notebook, which all write to "imgs/chapter_3/".
os.makedirs("imgs/chapter_3", exist_ok=True)

In [6]:
# Location of the preprocessed London Smart Meters data under the user's home directory.
preprocessed = Path.home().joinpath("Desktop", "data", "london_smart_meters", "preprocessed")

In [7]:
# Fail fast with an actionable message when the preprocessed data directory is missing.
assert preprocessed.is_dir(), "You have to run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02 before running this notebook"

In [8]:
from itertools import cycle

def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", font_size=15, title_font_size=20):
    """Apply the notebook's common layout to a figure and return it.

    Parameters
    ----------
    fig : figure object
        Must provide ``for_each_trace``, ``update_layout`` and ``layout.title.text``.
        It is modified in place.
    legends : iterable of str, optional
        Names assigned to the traces in order; the names are cycled if there
        are more traces than names. Skipped when empty or ``None``.
    xlabel, ylabel : str
        Axis titles.
    font_size : int
        Font size for the legend, axis titles and tick labels.
    title_font_size : int
        Font size for the figure title.

    Returns
    -------
    The same ``fig`` object that was passed in.
    """
    if legends:
        label_iter = cycle(legends)

        def _rename(trace):
            # Each trace takes the next name from the (endless) cycle.
            return trace.update(name=next(label_iter))

        fig.for_each_trace(_rename)

    def _axis(label):
        # Both axes share the same structure; only the title text differs.
        return {
            "title": {"text": label, "font": {"size": font_size}},
            "tickfont": {"size": font_size},
        }

    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title={
            # Keep an existing title if there is one, otherwise use an empty string.
            "text": fig.layout.title.text or "",
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": title_font_size},
        },
        legend_title=None,
        legend={
            "font": {"size": font_size},
            "orientation": "h",
            "yanchor": "bottom",
            "y": 0.98,
            "xanchor": "right",
            "x": 1,
        },
        yaxis=_axis(ylabel),
        xaxis=_axis(xlabel),
    )

    return fig

### Reading Blocks 0-7

In [None]:
from src.utils.data_utils import compact_to_expanded

In [None]:
# Merged parquet file for blocks 0-7, produced by the preprocessing notebook.
merged_blocks_path = preprocessed / "london_smart_meters_merged_block_0-7.parquet"

try:
    block_df = pd.read_parquet(merged_blocks_path)
    display(block_df.head())
except FileNotFoundError:
    # Show a warning banner instead of a traceback when the file is missing.
    missing_file_banner = HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """)
    display(missing_file_banner)

In [None]:
# Convert to expanded form, using only the rows whose `file` column is "block_7".
exp_block_df = compact_to_expanded(
    block_df[block_df["file"] == "block_7"],
    timeseries_col="energy_consumption",
    static_cols=[
        "frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file",
    ],
    time_varying_cols=[
        "holidays", "visibility", "windBearing", "temperature", "dewPoint",
        "pressure", "apparentTemperature", "windSpeed", "precipType", "icon",
        "humidity", "summary",
    ],
    ts_identifier="LCLid",
)

exp_block_df.head()

In [None]:
# Pick one time series (a single LCLid) out of the block and index it by timestamp.
is_target_meter = exp_block_df["LCLid"] == "MAC000193"
ts_df = exp_block_df[is_target_meter].set_index("timestamp")

#### Filling in Missing Values

In [None]:
from src.imputation.interpolation import SeasonalInterpolation
# Fill the gaps in the consumption series; the imputer is fed a single-column 2-D
# array and the result is squeezed back to 1-D.
# NOTE(review): seasonal_period=48*7 presumably means 48 readings/day x 7 days — confirm.
ts = (
    SeasonalInterpolation(seasonal_period=48*7)
    .fit_transform(ts_df.energy_consumption.values.reshape(-1,1))
    .squeeze()
)

### Outlier Detection

In [None]:
def plot_outliers(x, ts, outlier_mask, method, font_size=15):
    """Plot a series as a line and overlay the flagged points as star markers.

    Parameters
    ----------
    x : indexable
        X-axis values; must support boolean-mask indexing (``x[outlier_mask]``).
    ts : indexable
        Series values, same length as ``x``; must support boolean-mask indexing.
    outlier_mask : boolean mask
        True where a point is flagged as an outlier. Its ``.sum()`` gives the
        count shown in the title.
    method : str
        Name of the detection method, shown in the figure title.
    font_size : int
        Font size for the legend, axis titles and tick labels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    fig = go.Figure()

    # Full series as a line.
    fig.add_trace(go.Scatter(
        x=x,
        y=ts,
        mode="lines",
        name="Original"
    ))
    # Only the flagged points, drawn on top as markers.
    fig.add_trace(go.Scatter(
        x=x[outlier_mask],
        y=ts[outlier_mask],
        mode='markers',
        marker_symbol="star",
        marker_size=5,
        name="Outliers"
    ))
    fig.update_layout(
        title_text=f"Outliers using {method}: # of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%",
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        # The axis title font is set through `title.font`: the legacy `titlefont`
        # attribute is deprecated and rejected by recent plotly releases.
        yaxis=dict(
            title=dict(font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        )
    )
    return fig

In [None]:
from src.decomposition.seasonal import MultiSeasonalDecomposition, STL

from src.outliers.outlier_detection import detect_outlier_iqr,detect_outlier_sd, generalized_esd, seasonal_esd, detect_outlier_isolation_forest

In [None]:
# Empty summary table; the detection cells below add one row per method via res_df.loc[...].
res_df = pd.DataFrame(columns=["# of Outliers", "% of Outliers"])

### Standard Deviation

In [None]:
def detect_outlier_sd(ts, sd_multiple=2):
    """Flag values lying more than ``sd_multiple`` standard deviations from the mean.

    Parameters
    ----------
    ts : array-like with ``.mean()`` / ``.std()`` (e.g. numpy array or pandas Series)
        Values to screen.
    sd_multiple : number
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    Boolean mask of the same shape as ``ts``; True marks an outlier.

    Note: this definition rebinds the name also imported from
    ``src.outliers.outlier_detection`` earlier in the notebook.
    """
    center = ts.mean()
    spread = sd_multiple * ts.std()
    # A point is an outlier when it falls strictly outside [center - spread, center + spread].
    return (ts < center - spread) | (ts > center + spread)

In [None]:
# Detecting outliers on the raw series with a 3 SD window
outlier_mask = detect_outlier_sd(ts, sd_multiple=3)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["3SD", "# of Outliers"] = n_outliers
res_df.loc["3SD", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="Standard Deviation"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_std.png")
fig.show()

In [None]:
# Decompose the series; the residual component (res.resid) is what the
# "on Residuals" detection cells below operate on.
stl = MultiSeasonalDecomposition(
    seasonal_model="fourier",
    seasonality_periods=["day_of_year", "day_of_week", "hour"],
    model="additive",
    n_fourier_terms=10,
)
res = stl.fit(pd.Series(ts, index=ts_df.index))

In [None]:
# Detecting outliers on the decomposition residuals with a 3 SD window.
# The row label matches the multiple that is actually passed (3), so the
# summary table is not mislabelled.
outlier_mask = detect_outlier_sd(res.resid, 3)
res_df.loc["3SD on Residuals", "# of Outliers"] = outlier_mask.sum()
res_df.loc["3SD on Residuals", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal Standard Deviation"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_std_seasonal.png")
fig.show()

### IQR

In [None]:
def detect_outlier_iqr(ts, iqr_multiple=2):
    """Flag values lying more than ``iqr_multiple`` IQRs outside the quartiles.

    Parameters
    ----------
    ts : array-like (e.g. numpy array or pandas Series)
        Values to screen.
    iqr_multiple : number
        How many interquartile ranges beyond Q1/Q3 a value may lie before it
        is flagged.

    Returns
    -------
    Boolean mask of the same shape as ``ts``; True marks an outlier.

    Note: this definition rebinds the name also imported from
    ``src.outliers.outlier_detection`` earlier in the notebook.
    """
    # Both quartiles come from one call; the median is not needed for the bounds.
    q1, q3 = np.quantile(ts, [0.25, 0.75])
    iqr = q3 - q1
    higher_bound = q3 + iqr_multiple*iqr
    lower_bound = q1 - iqr_multiple*iqr
    outlier_mask = (ts > higher_bound) | (ts < lower_bound)
    return outlier_mask

In [None]:
# Detecting outliers on the raw series with a 4 IQR window
outlier_mask = detect_outlier_iqr(ts, 4)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["4IQR", "# of Outliers"] = n_outliers
res_df.loc["4IQR", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="IQR"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_iqr.png")
fig.show()

In [None]:
# Detecting outliers with a 4 IQR window on the deseasonalized data (residuals).
# The row is labelled "4IQR" because this is the IQR method, not a standard-deviation one.
outlier_mask = detect_outlier_iqr(res.resid, 4)
res_df.loc["4IQR on Residuals", "# of Outliers"] = outlier_mask.sum()
res_df.loc["4IQR on Residuals", "% of Outliers"] = outlier_mask.sum()/len(ts)*100
print(f"# of Outliers: {outlier_mask.sum()} | % of Outliers: {outlier_mask.sum()/len(ts)*100:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal IQR"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_iqr_seasonal.png")
fig.show()

### Isolation Forest

In [None]:
# Reference only: constructor signature of scikit-learn's IsolationForest.
# Kept as a comment because the documentation-style `class pkg.Name(...)` line
# is not valid Python and raises a SyntaxError when run in a code cell.
#
# class sklearn.ensemble.IsolationForest(*,
#                                        n_estimators=100,
#                                        max_samples='auto',
#                                        contamination='auto',
#                                        max_features=1.0,
#                                        bootstrap=False,
#                                        n_jobs=None,
#                                        random_state=None,
#                                        verbose=0,
#                                        warm_start=False)

In [None]:
# Isolation Forest on the raw series
outlier_mask = detect_outlier_isolation_forest(ts, outlier_fraction=0.01)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["Isolation Forest", "# of Outliers"] = n_outliers
res_df.loc["Isolation Forest", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="Isolation Forest"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_isolation_forest.png")
fig.show()

In [None]:
# Isolation Forest on the decomposition residuals
outlier_mask = detect_outlier_isolation_forest(res.resid.values, outlier_fraction=0.01)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["Isolation Forest on Residuals", "# of Outliers"] = n_outliers
res_df.loc["Isolation Forest on Residuals", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="Seasonal Isolation Forest"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_isolation_forest_seasonal.png")
fig.show()

### Extreme Studentized Deviate (ESD) and Seasonal Extreme Studentized Deviate (S-ESD)

In [None]:
# Generalized ESD on the raw series
outlier_mask = generalized_esd(ts, max_anomalies=800, alpha=0.05, hybrid=False)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["ESD", "# of Outliers"] = n_outliers
res_df.loc["ESD", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="ESD"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_esd.png")
fig.show()

In [None]:
# Seasonal ESD: the decomposition model is handed to seasonal_esd together with the series.
stl = MultiSeasonalDecomposition(
    seasonal_model="fourier",
    seasonality_periods=["day_of_year", "day_of_week", "hour"],
    model="additive",
    n_fourier_terms=10,
)
outlier_mask = seasonal_esd(
    pd.Series(ts, index=ts_df.index),
    stl,
    max_anomalies=800,
    alpha=0.05,
    hybrid=False,
)
n_outliers = outlier_mask.sum()
pct_outliers = n_outliers/len(ts)*100
res_df.loc["S-ESD", "# of Outliers"] = n_outliers
res_df.loc["S-ESD", "% of Outliers"] = pct_outliers
print(f"# of Outliers: {n_outliers} | % of Outliers: {pct_outliers:.2f}%")

In [None]:
# format_plot returns the figure it was given, so build and style in one expression.
fig = format_plot(
    plot_outliers(ts_df.index, ts, outlier_mask, method="S-ESD"),
    xlabel="Time",
    ylabel="Energy Consumption",
)
fig.write_image("imgs/chapter_3/outliers_s-esd.png")
fig.show()

### Summary

In [None]:
# Display the summary table, rendering the percentage column with two decimals and a trailing "%".
res_df.style.format({"% of Outliers": "{:.2f}%"})