# STUDENT - what is going on below?

You need to read this Notebook and assume that a colleague has asked you for a code review. What is potentially most damaging to their ongoing productivity and correctness? 

* Are there examples of unnecessary repetition?
* Are there bad naming standards which make the code look "non-Pythonic"?
* Can you interpret their method and results clearly?

----

# Analysis of household air-moisture levels

As part of an ongoing analysis to derive metadata from a timeseries we're using some household data at the 1 minute level to display temperature & humidity levels in our house in 2 rooms.

We then back-calculate the moisture level (grams per cubic metre), i.e. the actual amount of water in a cubic metre of air near the sensor, to get an understanding of where the most or least water is in the house and which household events affect these levels.

We can use similar processes to derive signals for work-related problems on our own timeseries data. This analysis combines both work and play and will be used as an internal proof-of-concept to get sign-off from the team for running a similar project on internal business data.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandera as pa


from src.utility import make_max_water_by_temp_dataframe
from src.utility import set_common_mpl_styles

# Sensor export files, one constant per export (the room names come from the
# file names themselves).
FILENAME_KITCHEN = "Kitchen_export_202112311652.csv"
FILENAME_HALL = "First floor hall_export_202112311700.csv"
FILENAME_BACK_ROOM = "Dog room_export_202112311653.csv"
FILENAME_LIVING_ROOM = "Living Room_export_202112311656.csv"
FILENAME_ROOF_OFFICE = "Roof Office_export_202112311626.csv"

# Room selector: leave exactly one `filename = ...` line uncommented. The cells
# below read only `filename`, so switching room means editing this block and
# re-running the notebook.
# filename = FILENAME_KITCHEN
filename = FILENAME_HALL
# filename = FILENAME_BACK_ROOM
# filename = FILENAME_LIVING_ROOM
# filename = FILENAME_ROOF_OFFICE


def load_data(filename):
    """Load one sensor export CSV into a timestamp-indexed DataFrame.

    The supplier's headers are awkward (non-breaking spaces, a leading space),
    so they are verified verbatim before being replaced with short names. This
    makes a silent change of file layout fail loudly instead of producing
    mislabelled data.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by ``timestamp`` (datetime) with the columns ``t_c``
        and ``rh``, renamed from the temperature and relative-humidity columns.

    Raises:
        ValueError: If the file does not contain exactly the expected columns
            in the expected order.
    """
    # NOTE(review): parse_dates=True is kept from the original call; the
    # timestamp column is converted explicitly further down. Confirm whether
    # the flag is still needed.
    df = pd.read_csv(filename, parse_dates=True)

    # check for the badness from data supplier
    # (these headers are hard to read - that's the supplier's fault!)
    expected_cols = [
        "Timestamp\xa0for\xa0sample\xa0frequency\xa0every\xa01 min\xa0min",
        " Temperature_Celsius",
        "Relative_Humidity",
    ]
    # Check the column count first: looking columns up by position on a short
    # file would otherwise surface as an IndexError rather than a ValueError.
    if len(df.columns) != len(expected_cols):
        raise ValueError(
            f"Expected {len(expected_cols)} columns but found "
            f"{len(df.columns)}: {list(df.columns)}"
        )
    for idx, (actual_col_name, expected_col_name) in enumerate(
        zip(df.columns, expected_cols)
    ):
        if actual_col_name != expected_col_name:
            raise ValueError(
                f"Column {idx} expected to say {expected_col_name} but instead has {actual_col_name}"
            )

    # and then we can rename the columns to something sane
    df.columns = ["timestamp", "t_c", "rh"]

    # Convert in place; no reset_index round trip is needed because the frame
    # still carries the default index produced by read_csv.
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    df = df.set_index("timestamp")
    print(f"Date range {df.index.min()} - {df.index.max()}")
    return df

In [2]:
def add_actual_water_content(df, df_moisture):
    """Attach maximum and estimated water content to each reading.

    Each row of ``df`` is matched on its ``t_c`` value against the index of
    ``df_moisture`` using ``pd.merge_asof`` with its default matching
    direction. The estimated content is then the matched ``max_water_gm3``
    scaled by the relative humidity percentage.

    Args:
        df: Readings with ``t_c`` and ``rh`` columns.
        df_moisture: Look-up table whose index is comparable with ``t_c`` and
            which provides a ``max_water_gm3`` column.

    Returns:
        A new DataFrame, sorted by index, with the look-up columns and an
        added ``est_water_gm3`` column.
    """
    # merge_asof needs the left frame ordered by the merge key.
    readings_by_temperature = df.sort_values("t_c")
    matched = pd.merge_asof(
        readings_by_temperature,
        df_moisture,
        left_on="t_c",
        right_index=True,
    )

    # Put the rows back into index order after the temperature-ordered merge.
    df_water = matched.sort_index()

    # Multiply first, then divide, so the arithmetic order stays unchanged.
    water_times_rh = df_water["max_water_gm3"] * df_water["rh"]
    df_water["est_water_gm3"] = water_times_rh / 100
    return df_water

# Load data

In [6]:
def sanity_check_data(df):
    """Validate timestamp and relative-humidity ranges with a Pandera schema.

    This is the simpler variant: the index is reset so that ``timestamp`` can
    be described as an ordinary column, which keeps the schema easy to read
    for a first discussion.

    Args:
        df: DataFrame whose index is named ``timestamp`` and which has an
            ``rh`` column.

    Returns:
        The validated DataFrame, with ``timestamp`` as a regular column.
        Callers that ignore the return value behave exactly as before.

    Raises:
        Whatever Pandera raises when validation fails. The schema is run with
        ``lazy=True``; see the Pandera documentation for the exception type.
    """
    df = df.reset_index()

    # Strict bounds: timestamps must fall strictly inside this window and
    # relative humidity strictly between 0 and 100.
    min_date = "2021-01-01"
    max_date = "2022-12-12"
    schema = pa.DataFrameSchema(
        {
            "timestamp": pa.Column(
                "datetime64[ns]", checks=[pa.Check.gt(min_date), pa.Check.lt(max_date)]
            ),
            "rh": pa.Column(float, checks=[pa.Check.gt(0), pa.Check.lt(100)]),
        },
        ordered=True,
    )
    # Hand the validated frame back instead of binding it to an unused local.
    return schema(df, lazy=True)


def sanity_check_data_with_index(df):
    """Validate the datetime index and relative-humidity range with Pandera.

    Unlike the reset-index variant, the frame is validated as-is, so the
    timestamp constraint is expressed through a ``pa.Index`` entry rather than
    a column entry.

    Args:
        df: DataFrame with a datetime index and an ``rh`` column.

    Returns:
        The validated DataFrame. Callers that ignore the return value behave
        exactly as before.

    Raises:
        Whatever Pandera raises when validation fails. The schema is run with
        ``lazy=True``; see the Pandera documentation for the exception type.
    """
    # Strict bounds: index values must fall strictly inside this window and
    # relative humidity strictly between 0 and 100.
    min_date = "2021-01-01"
    max_date = "2022-12-12"
    schema = pa.DataFrameSchema(
        {
            "rh": pa.Column(float, checks=[pa.Check.gt(0), pa.Check.lt(100)]),
        },
        index=pa.Index(
            "datetime64[ns]", checks=[pa.Check.gt(min_date), pa.Check.lt(max_date)]
        ),
        ordered=True,
    )
    # Hand the validated frame back instead of binding it to an unused local.
    return schema(df, lazy=True)

In [8]:
df = load_data(filename)

# now let's choose a day
day1 = "2021-12-24"
day2 = "2021-12-25"

# Analysis window: 20:00 on day1 through 10:00 on day2.
start_time = f"{day1} 20:00"
end_time = f"{day2} 10:00"

print(f"Working on {day1} - {day2}")
# Explicit raises instead of `assert`: assertions are removed when Python runs
# with -O, which would let an out-of-range window through silently. The
# negated comparisons keep the original semantics, including for an empty
# frame whose min/max are NaT.
if not pd.to_datetime(start_time) > df.index.min():
    raise ValueError(
        f"Must be in date range: start_time {start_time} is not after "
        f"the first sample {df.index.min()}"
    )
if not pd.to_datetime(end_time) < df.index.max():
    raise ValueError(
        f"Must be in date range: end_time {end_time} is not before "
        f"the last sample {df.index.max()}"
    )

# generate our 30min mean summaries of the underlying data
# and use the Pandera schema checker to start a check on our data
df_30min = df.resample("30min").mean()
sanity_check_data(df_30min)
sanity_check_data_with_index(df_30min)
df_30min.head()

Date range 2021-12-22 17:00:00 - 2021-12-31 17:00:00
Working on 2021-12-24 - 2021-12-25


Unnamed: 0_level_0,t_c,rh
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-22 17:00:00,16.0,54.66
2021-12-22 17:30:00,16.0,54.58
2021-12-22 18:00:00,16.0,54.064286
2021-12-22 18:30:00,16.04,53.743333
2021-12-22 19:00:00,16.1,53.45


In [None]:
# Build the look-up table from the project helper (presumably maximum water
# content by temperature - confirm in src.utility), then attach the
# max/estimated water columns to every 30-minute row.
df_moisture = make_max_water_by_temp_dataframe()
df_30min_water = add_actual_water_content(df_30min, df_moisture)
df_30min_water.head()

In [None]:
print(f"Querying between {start_time} and {end_time}")
# Strict inequalities: rows stamped exactly at start_time or end_time are
# excluded. `timestamp` here refers to the index name set in load_data.
dfx = df_30min_water.query("timestamp > @start_time and timestamp < @end_time")

In [None]:
def make_estimated_water_plot(dfx, filename, time_range):
    """Plot temperature, relative humidity and estimated water content.

    Creates one new figure with three stacked panels, each plotted against the
    index of ``dfx``. Nothing is returned; the figure is left for the active
    matplotlib backend to display.

    Args:
        dfx: DataFrame with ``t_c``, ``rh`` and ``est_water_gm3`` columns.
        filename: Name of the source file, shown on the second title line.
        time_range: Text describing the plotted period, shown in the title.
    """
    _, (ax_temperature, ax_humidity, ax_water) = plt.subplots(
        constrained_layout=True, figsize=(12, 8), nrows=3
    )

    # rename() already returns a new frame, so the caller's data is untouched
    # without an extra copy().
    dfx = dfx.rename(columns={"t_c": "temperature_C"})

    # Panel 1: temperature. Only this panel carries the title.
    dfx["temperature_C"].plot(ylabel="Temperature C", ax=ax_temperature, marker="o")
    title = f"Estimated moisture in house air on {time_range}\n{filename}"
    set_common_mpl_styles(
        ax_temperature,
        grid_axis="both",
        title=title,
        ymin=0,
        legend=False,
        xlabel="",
    )
    # Fixed limits are applied after the shared styling so they take effect.
    ax_temperature.set_ylim((10, 22))  # sensible internal temperature range

    # Panel 2: relative humidity (axis label spelling corrected).
    dfx["rh"].plot(
        ylabel="Relative Humidity (%)",
        c="orange",
        marker="o",
        ax=ax_humidity,
        xlabel="",
    )
    set_common_mpl_styles(ax_humidity, grid_axis="both", legend=False)
    ax_humidity.set_ylim((45, 85))  # sensible internal RH range

    # Panel 3: estimated water content. The label is passed directly rather
    # than first passing a copy-pasted temperature label and overriding it.
    dfx["est_water_gm3"].plot(
        ylabel="Estimated water content (grams per $m^3$)",
        c="green",
        marker="s",
        ax=ax_water,
    )
    set_common_mpl_styles(ax_water, grid_axis="both", legend=False)
    ax_water.set_ylim((3, 12))  # sensible internal moisture range


# Draw the three-panel figure for the selected window; the third argument is
# only used as text in the plot title.
make_estimated_water_plot(dfx, filename, f"{start_time} - {end_time}")

# Report temperature and water loss overnight

In [None]:
# Overnight window on the chosen days.
# NOTE(review): presumably these are the times the heating switches off and
# back on; they are hard-coded here - confirm against the real schedule.
heating_off = f"{day1} 23:00"
heating_on = f"{day2} 06:30"
# Inclusive bounds here, unlike the strict bounds used to build dfx.
df_overnight = dfx.query("timestamp >= @heating_off and timestamp <= @heating_on")
df_overnight

In [None]:
# NOTE(review): both "diff" values are min - max over the overnight window, so
# they are always <= 0 and describe the spread within the window, not the
# value at heating_on minus the value at heating_off. Confirm this is the
# intended meaning of "change ... overnight".
t_c_max = df_overnight["t_c"].max()
t_c_min = df_overnight["t_c"].min()
t_c_diff = t_c_min - t_c_max

water_max = df_overnight["est_water_gm3"].max()
water_min = df_overnight["est_water_gm3"].min()
water_diff = water_min - water_max

# Spelling of the report line corrected ("Investigating").
print(f"Investigating {heating_off} - {heating_on} on {filename}")
print(
    f"Temperature change {t_c_diff:0.2f} C and water change {water_diff:0.2f} g/m^3 overnight"
)