# Revert data to less aggregated data <br> (e.g. daily-aggregated data to hourly-aggregated data)

### Author: Lina Siegrist

### This notebook uses functions that let you apply any boolean mask to a dataframe with a different time aggregation.

In [1]:
import pandas as pd
import numpy as np
import copy
import datetime
from datetime import datetime

In [2]:
# Put '../src/' on the module search path (at position 1) so that modules
# located there can be imported — presumably this is where the `scripts`
# package imported in the next cell lives; TODO confirm.
from sys import path as syspath
syspath.insert(1, '../src/')

In [3]:
from scripts import apply_filters as af, filter_times as ft

## Define paths to data files

In [9]:
# Base directory of the processed park1 data; every input path below is
# derived from it, so changing it retargets all inputs consistently.
root_path = "../data/processed/park1/"

# data from pipeline: combined boolean dataframe
park1_combined_bool = root_path + "masks/df_bool_BDfilt_dayfilt.csv"

# data from "Preprocessing.ipynb": unfiltered EPI
park1_EPI_unfiltered_filepath = root_path + "preprocessing/df_EPI.csv"

# data from "Detect_timewindow.ipynb": clusters of strings
# (built from root_path like the other inputs instead of repeating the
# directory as a separate literal; the resulting string is unchanged)
park1_clusters_filepath = root_path + "park1_string_clusters_filtered.csv"

## Import files: combination of two boolean dataframes (big-drop filter & bad-day filter)

In [10]:
# Load the clusters of strings from the comma-separated cluster file.
df_clusters = pd.read_csv(park1_clusters_filepath, sep=',')

In [11]:
# Load the unfiltered EPI table and index it by its parsed 'datetime' column.
EPI_unfiltered = pd.read_csv(park1_EPI_unfiltered_filepath)
EPI_unfiltered['datetime'] = pd.to_datetime(EPI_unfiltered['datetime'], format='%Y-%m-%d %H')
EPI_unfiltered = EPI_unfiltered.set_index('datetime', drop=True)

# Basic preprocessing of the unfiltered data: remove bad strings manually.
# 1) every string listed (non-null) in the 'bottom' column of the cluster table
bottom = df_clusters['bottom'].dropna().tolist()
EPI_bottom = EPI_unfiltered[bottom]
# 2) every column whose name starts with "ST 2.7", plus three individually named strings
#    (the prefix match is evaluated on the columns as they are before any drop)
manual_drops = [col for col in EPI_unfiltered.columns.to_list() if col.startswith("ST 2.7")]
manual_drops += ["ST 2.5.4", "ST 4.4.1", "ST 4.5.2"]
EPI_unfiltered = EPI_unfiltered.drop(columns=EPI_bottom.columns).drop(columns=manual_drops)

In [12]:
# Load the boolean dataframe (big-drop filter & bad-day filter), parse its
# 'datetime' column and use it as the index.
df_bool_BDfilt_dayfilt = (
    pd.read_csv(park1_combined_bool)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H'))
    .set_index('datetime', drop=True)
)

## Recover the filtered data (freq: hour) from the boolean dataframe <font color='orange'>(without timemask)</font>

#### LINA's comment: <font color='red'>Here mean() is changed to median</font> (19 Apr 2021)

In [13]:
# Work on a deep copy so the unfiltered EPI frame itself is left untouched.
df = EPI_unfiltered.copy(deep=True)

# unfiltered data (frequency: hour): hourly median,
# with the last two rows (2020-07-01) removed
df_hour = df.resample('H').median().iloc[:-2]

In [59]:
# LINA's memo: the call below takes some time to finish. It is kept
# commented out for reference; the following cell uses
# ft.good_day_bool_optimized instead.
#
# # get boolean dataframe with original time period (hour)
# df_bool_h = ft.good_day_bool(df_hour, df_bool_BDfilt_dayfilt)

In [65]:
# Boolean dataframe at the original time period (hour), built from the
# hourly data and the boolean dataframe of the big-drop & bad-day filters.
df_bool_h = ft.good_day_bool_optimized(
    df_hour,
    df_bool_BDfilt_dayfilt,
)

In [67]:
# Substitute NaN for every hourly value flagged True in df_bool_h (bad days).
df_BDfilt_dayfilt_h = df_hour.mask(cond=df_bool_h)

In [236]:
# Write the filtered hourly data (without timemask) to a CSV file; the
# filename is relative, so it lands in the current working directory.
df_BDfilt_dayfilt_h.to_csv(
    path_or_buf="df_park1_allfilters_hourly_without_timemask_updated.csv"
)

## Recover the filtered data (freq: hour) from the boolean dataframe (freq: day) <font color='orange'>(with timemask)</font>

#### LINA's comment: <font color='red'>Here mean() is changed to median</font> (19 Apr 2021)

In [69]:
# unfiltered data (frequency: hour) + timemask
# NOTE(review): 15 and 19 are presumably the hour bounds of the time mask —
# confirm against ft.timemask, and how they relate to the 16-18 window below.
df_tm = ft.timemask(df, 15, 19)
df_tm_hour = df_tm.resample('H').median()

# Keep only the hourly rows whose timestamp hour is 16, 17 or 18.
# The comparison is done once on the whole index (vectorized) instead of
# looping over every timestamp in Python and looking the labels up again.
hour_of_day = df_tm_hour.index.hour
in_window = (hour_of_day >= 16) & (hour_of_day <= 18)

# `time` is still exposed as a list of the selected timestamps, as before,
# in case other cells rely on it.
time = df_tm_hour.index[in_window].tolist()
df_tm_hour = df_tm_hour.loc[in_window]

In [108]:
# LINA's memo: the call below takes some time to finish. It is kept
# commented out for reference; the following cell uses
# ft.good_day_bool_optimized instead.
#
# # get boolean dataframe with original time period (hour)
# df_bool_tm_h = ft.good_day_bool(df_tm_hour, df_bool_BDfilt_dayfilt)

In [70]:
# Boolean dataframe at the original time period (hour), built from the
# time-masked hourly data and the boolean dataframe of the big-drop &
# bad-day filters.
df_bool_tm_h = ft.good_day_bool_optimized(
    df_tm_hour,
    df_bool_BDfilt_dayfilt,
)

In [71]:
# Substitute NaN for every time-masked hourly value flagged True in
# df_bool_tm_h (bad days).
df_BDfilt_dayfilt_tm_h = df_tm_hour.mask(cond=df_bool_tm_h)

In [237]:
# Write the filtered, time-masked hourly data to a CSV file; the filename
# is relative, so it lands in the current working directory.
df_BDfilt_dayfilt_tm_h.to_csv(
    path_or_buf="df_park1_allfilters_hourly_timemasked_updated.csv"
)