## Objectives

This notebook walks through detecting outliers in multiple time series (one per account) using STL time series decomposition.
- Input: a CSV file that contains at least three columns: Account (account id), Month (calendar month), Value (numerical column to detect outliers from). Any other columns are dropped after loading.

## Load packages

In [1]:
library(tidyverse)
library(anomalize)
library(ggQC)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  2.0.1     ✔ dplyr   0.7.8
✔ tidyr   0.8.2     ✔ stringr 1.3.1
✔ readr   1.3.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## Read in dataset

In [2]:
# Load the CSV holding the prorated and imputed KWH_Consumption data for the
# NYCHA accounts, keeping only the three columns the analysis uses.
tb_all <- read_csv("../output/NYCHA_TS.csv") %>%
  select(Account, Month, Value)

“Missing column names filled in: 'X1' [1]”Parsed with column specification:
cols(
  X1 = col_double(),
  Account = col_character(),
  Month = col_date(format = ""),
  Value = col_double()
)


### Get the list of accounts in the input file

In [3]:
# One row per account: number of observations, number of missing values and
# the share of observations that are missing.
accounts <- tb_all %>%
  group_by(Account) %>%
  summarise(counts = n(), na_counts = sum(is.na(Value))) %>%
  mutate(na_perc = na_counts / counts)

### Fill the missing values in the "Value" column with 0

In [4]:
# Replace every missing Value with 0; all other values are left unchanged.
tb_all <- tb_all %>%
  mutate(Value = replace(Value, is.na(Value), 0))

### Define a function to calculate the deviation of a residual from the limit values

In [5]:
#' Distance of a residual outside its control limits
#'
#' Vectorized over all three arguments (standard R recycling applies), so it
#' can be called on whole columns as well as element-wise through `mapply()`.
#'
#' @param residual Numeric vector of residuals.
#' @param upper Numeric vector (or scalar) of upper limits.
#' @param lower Numeric vector (or scalar) of lower limits.
#' @return For each element: `lower - residual` when the residual is below the
#'   lower limit, `residual - upper` when it is above the upper limit, and `NA`
#'   when it lies within the limits. A missing residual or limit yields `NA`
#'   instead of raising an error.
cal_dev <- function(residual, upper, lower) {
  # The lower limit is tested first, mirroring the original branch order.
  ifelse(
    residual < lower,
    lower - residual,
    ifelse(residual > upper, residual - upper, NA_real_)
  )
}

In [None]:
# Decompose every account's series with STL and flag outliers in the residuals
# with three detectors: IQR with alpha = 0.05 (called "3X" in this notebook),
# IQR with alpha = 0.025 (called "6X") and XmR control limits. One combined
# data frame per account is stored in results_stl, in the same order as
# accounts$Account.

# Column order shared by the decomposed series and the final result.
decomp_cols <- c("Account", "Calendar_Month", "Missing_Value", "Total",
                 "Trend", "Seasonal", "Residual")

# Flag IQR outliers in the Residual column and return the columns Account,
# Calendar_Month, Residual, Lower, Upper and a logical Anomaly.
detect_iqr <- function(decomposed, alpha) {
  flagged <- decomposed %>%
    anomalize(Residual, method = "iqr", alpha = alpha)
  flagged <- flagged[, c("Account", "Calendar_Month", "Residual",
                         "Residual_l1", "Residual_l2", "anomaly")]
  flagged <- rename(flagged, Lower = Residual_l1, Upper = Residual_l2,
                    Anomaly = anomaly)
  mutate(flagged, Anomaly = Anomaly == "Yes")
}

# Add Dev (distance outside the limits, NA when inside) and Rank (1 = largest
# deviation, NA for points inside the limits). Rows come back sorted by
# descending Dev.
rank_deviations <- function(limits) {
  limits$Dev <- mapply(cal_dev, limits$Residual, limits$Upper, limits$Lower)
  limits <- arrange(limits, desc(Dev))
  limits$Rank <- seq_len(nrow(limits))
  limits$Rank[is.na(limits$Dev)] <- NA
  limits
}

# Preallocate one slot per account so results_stl[[i]] lines up with
# accounts$Account[[i]].
results_stl <- vector("list", nrow(accounts))

for (i in seq_len(nrow(accounts))) {
  account_id <- accounts$Account[[i]]
  account_ts <- select(filter(tb_all, Account == account_id), "Month", "Value")

  # apply STL decomposition
  ts_anomalized <- account_ts %>%
    time_decompose(Value, merge = TRUE, method = "stl")

  # rename and reorder the columns of the resulting dataframe
  ts_anomalized$Account <- account_id
  # Missing values were filled with 0 before this step, so a zero marks a
  # missing reading. NOTE(review): a genuine zero reading is flagged as well.
  ts_anomalized$Missing_Value <- ts_anomalized$Value == 0
  ts_anomalized <- rename(ts_anomalized, Calendar_Month = Month, Total = Value,
                          Trend = trend, Seasonal = season,
                          Residual = remainder)
  ts_anomalized <- ts_anomalized[, decomp_cols]

  # Detect outliers in the residuals with the two IQR settings, then add the
  # deviation from the limit and the rank of each outlier.
  iqr_3X <- rank_deviations(detect_iqr(ts_anomalized, alpha = 0.05))
  iqr_6X <- rank_deviations(detect_iqr(ts_anomalized, alpha = 0.025))

  # XmR control limits on the residuals; the limit columns returned by
  # QC_Lines are attached to every observation by cbind().
  ctrl_limits <- QC_Lines(data = ts_anomalized$Residual, method = "XmR")
  ctrl_limits <- ctrl_limits[, c("xBar_one_LCL", "xBar_one_UCL")]
  ctrl_limits <- rename(ctrl_limits, Lower = xBar_one_LCL, Upper = xBar_one_UCL)
  xmr_mean <- cbind(ts_anomalized[, c("Account", "Calendar_Month", "Residual")],
                    ctrl_limits)
  xmr_mean <- rank_deviations(xmr_mean)
  # XmR has no anomaly flag of its own: a point is an outlier exactly when it
  # lies outside the limits, i.e. when Dev is not NA.
  xmr_mean <- mutate(xmr_mean, Anomaly = !is.na(Dev))

  # rename the columns so the three methods can sit side by side
  iqr_3X <- rename(iqr_3X, Lower_3X = Lower, Upper_3X = Upper,
                   Anomaly_3X = Anomaly, Dev_3X = Dev, Rank_3X = Rank)
  iqr_6X <- rename(iqr_6X, Lower_6X = Lower, Upper_6X = Upper,
                   Anomaly_6X = Anomaly, Dev_6X = Dev, Rank_6X = Rank)
  xmr_mean <- rename(xmr_mean, Lower_xmr = Lower, Upper_xmr = Upper,
                     Anomaly_xmr = Anomaly, Dev_xmr = Dev, Rank_xmr = Rank)

  # Combine the results of 3 methods. Residual is dropped by name from each
  # method table because the decomposed series already carries it.
  result <- ts_anomalized[decomp_cols] %>%
    inner_join(select(iqr_3X, -Residual), by = c("Account", "Calendar_Month")) %>%
    inner_join(select(iqr_6X, -Residual), by = c("Account", "Calendar_Month")) %>%
    inner_join(select(xmr_mean, -Residual), by = c("Account", "Calendar_Month"))

  # considered as an outlier if
  #     1) the 6X IQR method flags it as an outlier, OR
  #     2) the 6X IQR method does not flag it but both other methods do
  result <- mutate(result,
                   Anomaly_Voted = Anomaly_6X | (Anomaly_3X & Anomaly_xmr))

  # weighted rank of the outlier: the 6X rank when available, otherwise the
  # mean of the 3X and XmR ranks
  results_stl[[i]] <- mutate(
    result,
    Rank_Weighted = ifelse(is.na(Rank_6X), (Rank_3X + Rank_xmr) / 2, Rank_6X)
  )
}



## Check the outliers

### Find the index of a given account

In [None]:
# Pick the account to inspect and look up its position in accounts$Account,
# which is also its position in results_stl.
account <- "1.0 - BLD 03_7177432"
i <- match(account, accounts$Account)
# match() returns NA for an unknown account; fail here with a clear message
# rather than in a later cell when results_stl[[i]] is indexed.
if (is.na(i)) {
  stop("Account not found in accounts$Account: ", account, call. = FALSE)
}

In [None]:
# List the voted outliers of the selected account with the decomposition
# components and every method's flag, best (lowest) weighted rank first.
results_stl[[i]] %>%
  filter(Anomaly_Voted) %>%
  select(
    Account, Calendar_Month, Missing_Value, Total, Trend, Seasonal, Residual,
    Anomaly_3X, Anomaly_6X, Anomaly_xmr, Anomaly_Voted, Rank_Weighted
  ) %>%
  arrange(Rank_Weighted)

In [None]:
# Compact view of the same voted outliers: month, total and weighted rank.
results_stl[[i]] %>%
  filter(Anomaly_Voted) %>%
  select(Account, Calendar_Month, Total, Anomaly_Voted, Rank_Weighted) %>%
  arrange(Rank_Weighted)

## Visualize the outliers

### XmR charts

In [None]:
# Individuals (X) chart of the selected account's residuals with XmR control
# lines and automatic labels.
x_Plot <- ggplot(results_stl[[i]], aes(x = Calendar_Month, y = Residual)) +
  geom_point() +
  geom_line() +
  stat_QC(method = "XmR", auto.label = TRUE, label.digits = 2,
          show.1n2.sigma = FALSE)

# Moving-range (mR) chart of the same residuals.
mR_Plot <- ggplot(results_stl[[i]], aes(x = Calendar_Month, y = Residual)) +
  stat_mR() +
  stat_QC_labels(method = "mR")

# grid.arrange() is provided by gridExtra, which this notebook never attaches,
# so it is called through its namespace to stack the two charts.
gridExtra::grid.arrange(x_Plot, mR_Plot, nrow = 2)

### IQR 3X

#### plotting configuration for the flagged anomalies

In [None]:
# Plotting helper for an outlier report: points by rank coloured by outlier
# status, dashed red lines for the upper and lower limits, and each point
# labelled with its index.
ggsetup <- function(data) {
  status_colors <- c("No" = "#2c3e50", "Yes" = "#e31a1c")

  scatter <- ggplot(data, aes(rank, value, color = outlier)) +
    geom_point()

  with_limits <- scatter +
    geom_line(aes(y = limit_upper), color = "red", linetype = 2) +
    geom_line(aes(y = limit_lower), color = "red", linetype = 2)

  with_limits +
    geom_text(aes(label = index), vjust = -1.25) +
    theme_bw() +
    scale_color_manual(values = status_colors) +
    expand_limits(y = 13) +
    theme(legend.position = "bottom")
}

In [None]:
# Outlier report for the selected account's residuals at alpha = 0.05,
# drawn with the shared plotting helper.
outliers <- iqr(results_stl[[i]]$Residual, alpha = 0.05, verbose = TRUE)$outlier_report
ggsetup(outliers) +
  ggtitle("Interquartile Range Method (3X IQR) - Top outlers sorted by rank")

### IQR 6X

In [None]:
# Outlier report for the selected account's residuals at alpha = 0.025,
# drawn with the shared plotting helper.
outliers <- iqr(results_stl[[i]]$Residual, alpha = 0.025, verbose = TRUE)$outlier_report
ggsetup(outliers) +
  ggtitle("Interquartile Range Method (6X IQR) - Top outlers sorted by rank")

## Summarize the outliers detected in all accounts

In [None]:
# Per-account tallies taken from each result table: all points, voted
# anomalies, missing values, and the overlaps between the two flags.
points <- lapply(results_stl, nrow)
anoamly_counts <- lapply(
  results_stl,
  function(res) nrow(filter(res, Anomaly_Voted))
)
na_counts <- lapply(
  results_stl,
  function(res) nrow(filter(res, Missing_Value))
)
anomaly_non_na_counts <- lapply(
  results_stl,
  function(res) nrow(filter(res, Anomaly_Voted & !Missing_Value))
)
anomaly_na_counts <- lapply(
  results_stl,
  function(res) nrow(filter(res, Anomaly_Voted & Missing_Value))
)
na_non_anomaly_counts <- lapply(
  results_stl,
  function(res) nrow(filter(res, !Anomaly_Voted & Missing_Value))
)

# One summary row per account, in the order of accounts$Account.
summary_stl <- tibble(
  account = accounts$Account,
  points = unlist(points),
  na = unlist(na_counts),
  anomalies = unlist(anoamly_counts),
  anomalies_non_na = unlist(anomaly_non_na_counts),
  anomalies_na = unlist(anomaly_na_counts),
  na_non_anomaly = unlist(na_non_anomaly_counts)
)

In [None]:
# Derived ratios per account: share of missing points, share of non-missing
# points voted as anomalies, and share of missing points voted as anomalies.
summary_stl <- summary_stl %>%
  mutate(
    na_perc = na / points,
    anom_perc = anomalies_non_na / points,
    na_recall = anomalies_na / na
  )
summary(summary_stl$anom_perc)

In [None]:
# Preview the first six rows of the per-account summary.
head(summary_stl, n = 6L)

In [None]:
# Histogram of the per-account share of detected anomalies, in percent.
hist(
  summary_stl$anom_perc * 100,
  main = "Frequency count of % of anomalies, Decomposition",
  xlab = "% of anomalies detected",
  ylab = "Absolute frequency",
  # breaks = 100,
  col = "lightblue",
  freq = TRUE,  # counts on the y axis; freq = FALSE would plot densities
  xaxt = "n"    # suppress the default x axis so custom ticks can be drawn
)
# Draw x-axis ticks at every whole percent from 0 to 50.
axis(side = 1, at = seq(0, 50, by = 1))