# Across several processed data files, find the room with the worst overnight water loss

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

from src.utility import make_max_water_by_temp_dataframe
from src.utility import set_common_mpl_styles
from src.utility import add_actual_water_content

# Load data

In [None]:
def load_data(filename):
    """Announce and unpickle one processed data file.

    Prints the path being read, then returns whatever object
    ``pd.read_pickle`` reconstructs from it.

    NOTE: unpickling executes code embedded in the file, so only point
    this at pickles produced by a trusted source.
    """
    print(f"Loading from {filename}")
    loaded = pd.read_pickle(filename)
    return loaded

In [None]:
# Collect every processed pickle. glob makes no promise about the order of
# its results, so sort them to keep the file (and later column) order
# reproducible between runs and machines.
filenames = sorted(glob.glob("../data/processed/*.pickle"))
assert len(filenames) > 0, "Expecting to see processed files - have we run 'process_data.py'?"
# Last expression of the cell: shows the list as notebook output.
filenames

In [None]:
def _overnight_water_loss(df_30min_water):
    """Return the negated overnight spread of ``max_water_gm3`` per day.

    For each day, the rows labelled from midnight to 06:00 (label slicing
    with ``.loc`` includes both endpoints) are selected and the spread
    ``max - min`` of the ``max_water_gm3`` column is computed. The spread
    is stored negated, and is NaN when a window holds no rows. One line per
    night is printed as a progress trace.

    Parameters:
        df_30min_water: DataFrame with a ``max_water_gm3`` column whose
            index elements provide ``.normalize()`` (timestamps).

    Returns:
        pandas.Series of negated spreads, indexed by each night's midnight
        timestamp.
    """
    # First timestamp floored to midnight; last timestamp floored to
    # midnight and moved back one day to step over the trailing partial day.
    start_date = df_30min_water.index[0].normalize()
    end_date = df_30min_water.index[-1].normalize() - pd.Timedelta("1d")
    days_diff = (end_date - start_date).days

    overnight_water_loss = []
    indices = []
    # NOTE(review): range(days_diff) stops the day before end_date, so the
    # night that starts at end_date itself is never evaluated - confirm
    # that this is intended.
    for day_offset in range(days_diff):
        this_start_date = start_date + pd.Timedelta(days=day_offset)
        this_end_date = this_start_date + pd.Timedelta("6h")
        overnight = df_30min_water.loc[this_start_date:this_end_date]
        # NOTE(review): the column name suggests a temperature-derived
        # maximum rather than a measured water content - confirm this is
        # the column the loss should be computed from.
        water_diff = overnight["max_water_gm3"].max() - overnight["max_water_gm3"].min()
        print(this_start_date, this_end_date, water_diff)
        indices.append(this_start_date)
        # Negated so the spread reads as a loss (values are <= 0).
        overnight_water_loss.append(-water_diff)
    return pd.Series(overnight_water_loss, index=indices)


# One overnight-loss series per processed file, in the same order as
# `filenames`. The loaded frame is used as-is; presumably process_data.py
# has already added the water-content columns - verify if that changes.
series_by_filename = []
for filename in filenames:
    df_30min_water = load_data(filename)
    ser_loss = _overnight_water_loss(df_30min_water)
    series_by_filename.append(ser_loss)

# Calculate median loss per room

In [None]:
# Lay the per-file series side by side: one column per entry of
# `series_by_filename`, rows aligned on the night's timestamp.
df_joined = pd.concat(series_by_filename, axis="columns")
# Column label = the file path up to its first underscore or space.
room_labels = [path.replace("_", " ").split(" ")[0] for path in filenames]
df_joined.columns = room_labels
# Notebook output: median of every column, sorted ascending.
df_joined.median().sort_values()

In [None]:
# Median of each column of df_joined, sorted ascending, as a one-column frame.
df_losses = df_joined.median().sort_values().to_frame()
df_losses.columns = ['median_water_loss']


def split_name(name):
    """Return the final path component of ``name``.

    ``os.path.basename`` is used instead of splitting on '/' so that the
    directory part is also removed when the path carries the platform's
    native separator (glob emits backslashes on Windows). On POSIX the
    result is the same as ``name.split('/')[-1]``.
    """
    import os  # local import: keeps this cell runnable on its own

    return os.path.basename(name)


# Strip the directory prefix so the bar labels show only the room part.
df_losses.index = [split_name(n) for n in df_losses.index.values]
fig, ax = plt.subplots(constrained_layout=True)
df_losses.plot(kind='bar', ax=ax)
set_common_mpl_styles(ax, title="Worst water loss by room", ylabel='Overnight median water loss $g/m^3$ (more is worse)')