# Sample output

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import MO, WeekdayLocator
from myst_nb import glue

from myst_nb import glue

In [2]:
# Report configuration: which year, categories, sites and date window to show.
year = ["2022"]
categories = ["Bioindicator", "Coliform", "Other"]
locations = ["SVT", "VNX", "MRD"]
date_range = ["2022-07-01", "2022-07-16"]

# Same window as date_range, as datetime.date objects for the matplotlib calls.
date_range_dt = tuple(dt.date.fromisoformat(day) for day in date_range)

# Colours per label, then marker and colour per site (same order as `locations`).
label_colors = dict(Bioindicator="dodgerblue", Coliform="magenta")
location_markers = dict(zip(locations, ("o", "D", "X")))
location_colors = dict(zip(locations, ("black", "green", "goldenrod")))

# Name of the value column that is charted and summarised.
y = "per/100ml"

# Raw inputs: survey results and rain measurements.
new_sd = pd.read_csv("data/end/survey_data_2020_2023.csv")
new_rd = pd.read_csv("data/end/rain_data_2022.csv")

# Month-day labels for the date axis.
myFmt = mdates.DateFormatter("%m-%d")

def report_data(data, locations: list = None, categories: list = None, years: list = None):
    """Select the survey rows for a report and add the derived columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``year``, ``location``, ``label``, ``date``,
        ``count`` and ``coef``.
    locations, categories, years : list, optional
        Values to keep in the ``location``, ``label`` and ``year`` columns.
        ``None`` means "do not filter on this column". Years are compared as
        strings, so ``2022`` and ``"2022"`` select the same rows.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows with ``date`` parsed to
        datetime, ``per/100ml`` set to ``count * coef`` and ``fail`` set to
        ``per/100ml > 100``. The frame passed in is left untouched.
    """
    # Start from "keep everything" and narrow with each filter that was given.
    keep = np.ones(len(data), dtype=bool)
    if years is not None:
        wanted_years = [str(value) for value in years]
        keep &= data["year"].astype("str").isin(wanted_years).to_numpy()
    if locations is not None:
        keep &= data["location"].isin(locations).to_numpy()
    if categories is not None:
        keep &= data["label"].isin(categories).to_numpy()

    # Copy before adding columns so the caller's frame is not modified.
    selected = data[keep].copy()
    selected["date"] = pd.to_datetime(selected["date"])
    selected["per/100ml"] = selected["count"] * selected["coef"]
    selected["fail"] = selected["per/100ml"] > 100

    return selected

def format_df_for_display_mean_std(data_frame, category: str = None):
    """Return the mean/std rows of one label with upper and lower bounds added.

    Keeps the columns ``location``, ``date``, ``label``, ``mean`` and ``std``
    for the rows whose ``label`` equals ``category`` and adds:

    * ``std +``: mean plus std
    * ``std -``: mean minus std, replaced by 0 wherever it is not >= 0
    """
    wanted = ["location", "date", "label", "mean", "std"]
    is_category = data_frame["label"] == category
    out = data_frame.loc[is_category, wanted].copy()

    lower = out["mean"] - out["std"]
    out["std +"] = out["std"] + out["mean"]
    # Anything that fails the ">= 0" check (negatives and NaN) becomes 0.
    out["std -"] = lower.mask(~(lower >= 0), 0)

    return out

def scatter_plot_with_std(data: pd.DataFrame=None, label: str = None, x: str = None, y_one: str = None, y_two: str = None, y_three: str = None, ax: matplotlib.axes.Axes = None, color: str = None, marker: str = None):
    """Draw one series with its lower and upper bound on ``ax``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named by ``x``, ``y_one``, ``y_two`` and ``y_three``.
    label : str
        Legend label for the central (``y_two``) points.
    x : str
        Column used for the x position of every element.
    y_one, y_two, y_three : str
        Columns for the lower bound, the central value and the upper bound.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    color : str
        Colour shared by the connector lines and all three point sets.
    marker : str
        Marker for the central points.

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax``. Three labelled artists are added per call, in the
        order: central points, "mean + std", "mean - std".
    """
    # Faint dash-dot connector from the lower to the upper bound at each x.
    ax.vlines(x=data[x], ymin=data[y_one], ymax=data[y_three], color=color, alpha=.2, linestyle= "-.")
    sns.scatterplot(data = data, x=x, y=y_two, color=color, marker=marker, label=label, ax=ax)
    # Integer markers are matplotlib caret codes: 7 is caret-down, 6 is caret-up.
    sns.scatterplot(data = data, x=x, y=y_three, color=color, label="mean + std", marker=7, ax=ax)
    sns.scatterplot(data = data, x=x, y=y_one, color=color,label="mean - std", marker=6, ax=ax)
    
    return ax
def major_and_minor_ticks(ax):
    """Configure the x axis of ``ax`` for dates and return ``ax``.

    Minor ticks are placed every day and major ticks every Monday. Major
    ticks are labelled with the module-level ``myFmt`` formatter, the labels
    are rotated 45 degrees and the major tick marks are drawn longer and
    thicker in black.
    """
    x_axis = ax.xaxis
    x_axis.set_minor_locator(mdates.DayLocator(interval=1))
    x_axis.set_major_locator(WeekdayLocator(byweekday=MO, interval=1))
    x_axis.set_major_formatter(myFmt)

    ax.tick_params(axis="x", labelrotation=45)
    ax.tick_params(axis="x", which="major", length=8, width=2, color="black")

    return ax

def rain_data_format(data, start, end):
    """Return the rows of ``data`` whose date lies in ``[start, end]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column that ``pandas.to_datetime`` can parse.
    start, end
        Inclusive bounds compared against the parsed dates.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected rows and ``date`` as datetime. The
        frame passed in is not modified.
    """
    # Parse into a separate Series so the caller's column is left as it was.
    dates = pd.to_datetime(data["date"])
    in_window = (dates >= start) & (dates <= end)

    return data[in_window].assign(date=dates[in_window])

# Survey rows for the report, then the rain records spanning the same dates.
rep_data = report_data(new_sd, years=year, locations=locations, categories=categories)
rain_data = rain_data_format(
    new_rd,
    start=rep_data["date"].min(),
    end=rep_data["date"].max(),
)

In [3]:
# Figure one: mean value per sample day and label.
# `groups` lists the columns the mean is taken over.
groups = ["date", "label"]

# Glue key, chart title and the project line shared by every chart title.
figure_name = "mean_plate_count_no_rain"
title = "Mean plate count by site, sample day and category"
project = f"Hackuarium do-it-together water quality: {year[0]}"




def scatterplot_date_label(data, y, groups, title, project, date_range, palette, year, figure_name):
    """Chart the mean of ``y`` per group, save it and glue it for the report.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``label`` column, the ``groups`` columns (including ``date``
        and ``label``, which the chart reads) and the ``y`` column.
    y : str
        Name of the value column to average.
    groups : list
        Columns the mean is grouped by.
    title, project : str
        Combined as ``"{project}\\n{title}"`` for the chart title.
    date_range : sequence
        Two dates; the span between them is shaded and both ends get a
        vertical line.
    palette : dict
        Label -> colour. Only labels that appear as keys are charted. If a
        non-dict palette is passed, the module-level ``label_colors`` keys
        are used for that selection instead.
    year : list
        ``year[0]`` is used in the output file name.
    figure_name : str
        Key the figure is glued under.
    """
    wanted_labels = palette.keys() if isinstance(palette, dict) else label_colors.keys()
    chart_data = data[data.label.isin(wanted_labels)].groupby(groups, as_index=False)[y].mean()

    fig, ax = plt.subplots()

    ax.axvspan(date_range[0], date_range[1], color="black", alpha=0.2, label="Event")
    # Draw on this figure's axes explicitly rather than on the current axes.
    sns.scatterplot(data=chart_data, x="date", y=y, hue="label", palette=palette, ax=ax)

    ax = major_and_minor_ticks(ax)
    # Mark both ends of the same window that was shaded above.
    ax.axvline(x=date_range[0], ymin=0, ymax=1)
    ax.axvline(x=date_range[1], ymin=0, ymax=1)
    ax.set_ylabel("Average colonies per 100 ml", labelpad=20)
    ax.set_xlabel("")

    ax.set_title(f"{project}\n{title}", loc="left")
    ax.legend().set_title('')

    file_name = f"resources/charts/figure_one_{year[0]}.jpg"

    # Operate on `fig` directly so another open figure cannot be hit by mistake.
    fig.tight_layout()
    fig.savefig(file_name)

    glue(figure_name, fig, display=False)

    plt.close(fig)

scatterplot_date_label(
    data=rep_data,
    y=y,
    groups=groups,
    title=title,
    project=project,
    date_range=date_range_dt,
    palette=label_colors,
    year=year,
    figure_name=figure_name,
)

```{glue:} "mean_plate_count_no_rain"
```

In [4]:
# Summary statistics of the charted value for the Bioindicator rows.
rep_data.loc[rep_data.label == "Bioindicator", y].describe()

count    189.000000
mean      35.449735
std       72.658000
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max      400.000000
Name: per/100ml, dtype: float64

In [5]:
# Summary statistics of the charted value for the Coliform rows.
rep_data.loc[rep_data.label == "Coliform", y].describe()

count      189.000000
mean      3642.857143
std       2966.069519
min          0.000000
25%       1700.000000
50%       3200.000000
75%       4700.000000
max      21400.000000
Name: per/100ml, dtype: float64

In [6]:
# Bioindicator mean of the value column for each combination of `groups`.
mean_sample_day_location_category = (
    rep_data.loc[rep_data.label == "Bioindicator"]
    .groupby(groups, as_index=False)[y]
    .mean()
)

# Glue key and title for figure two.
figure_name = "mean_plate_count_bioindicator_rain"
title = "Bioindicator mean plate count by sample day and mm of rain"


def scatterplot_date_label_rain(data, rain_data, y,  date_range_dt, groups, label, title, project, palette, year, figure_name):
    """Chart the mean of ``y`` for one label against rain, save it and glue it.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``label`` column, the ``groups`` columns (including ``date``)
        and the ``y`` column.
    rain_data : pandas.DataFrame
        Needs ``date`` and ``mm`` columns; drawn as bars on a second y axis.
    y : str
        Name of the value column to average.
    date_range_dt : sequence
        Two dates; the span between them is shaded and both ends get a
        vertical line.
    groups : list
        Columns the mean is grouped by.
    label : str
        The label whose rows are charted; also the legend entry.
    title, project : str
        Combined as ``"{project}\\n{title}"`` for the chart title.
    palette : dict
        Label -> colour. The colour for ``label`` is taken from it, with
        "dodgerblue" as the fallback when the label is missing or the
        palette is not a dict.
    year : list
        ``year[0]`` is used in the output file name.
    figure_name : str
        Key the figure is glued under.
    """
    chart_data = data[data.label == label].groupby(groups, as_index=False)[y].mean()

    # Use the caller's palette for this label instead of a fixed colour.
    point_color = palette.get(label, "dodgerblue") if isinstance(palette, dict) else "dodgerblue"

    fig, ax = plt.subplots()

    # Second y axis sharing the date axis, used for the rain bars.
    ax2 = ax.twinx()

    ax.axvspan(date_range_dt[0], date_range_dt[1], color="black", alpha=0.2, label="Dates of interest")
    ax.scatter(data=chart_data, x="date", y=y, color=point_color, label=label)
    ax2.bar(data=rain_data, x="date", height="mm", color="b", alpha=.1, label="rain")

    ax = major_and_minor_ticks(ax)
    ax.axvline(x=date_range_dt[0], ymin=0, ymax=1)
    ax.axvline(x=date_range_dt[1], ymin=0, ymax=1)
    ax.set_ylabel("Average colonies per 100 ml", labelpad=20)
    ax2.set_ylabel("Millimeters of rain")
    ax.set_xlabel("")

    # One legend on the primary axes that also carries a single rain entry.
    ax2h, ax2l = ax2.get_legend_handles_labels()
    rain_handle = ax2h[:1]
    # Only add the rain label when there is a rain handle to pair it with.
    rain_label = ["rain mm"][:len(rain_handle)]
    handles, labels = ax.get_legend_handles_labels()
    ax.legend([*handles, *rain_handle], [*labels, *rain_label], loc="upper right")

    ax.set_title(f"{project}\n{title}", loc="left")

    file_name = f"resources/charts/figure_two_{year[0]}.jpg"

    # Operate on `fig` directly so another open figure cannot be hit by mistake.
    fig.tight_layout()
    fig.savefig(file_name)

    glue(figure_name, fig, display=False)

    plt.close(fig)

scatterplot_date_label_rain(
    data=rep_data,
    rain_data=rain_data,
    y=y,
    date_range_dt=date_range_dt,
    groups=groups,
    label="Bioindicator",
    title=title,
    project=project,
    palette=label_colors,
    year=year,
    figure_name=figure_name,
)

```{glue:} "mean_plate_count_bioindicator_rain"
```

In [7]:
# Figure three (Bioindicator results): output path, glue key and title.
file_name = f"resources/charts/figure_three_{year[0]}.jpg"
figure_name3 = f"location_summary_bio_indicators_{year[0]}"
title = "Bioindicator averages +/- standard deviation"


def location_summary(data, value_col: str = None):
    """Summarise the value column per location, date and label.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``location``, ``date``, ``label``, ``sample``,
        ``fail`` and the value column.
    value_col : str, optional
        Name of the value column. When omitted, the module-level ``y`` is
        used, which is what the function did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per (location, date, label) with, in order: the group keys,
        the value column holding the group mean, ``sample`` (number of
        distinct samples), ``fail`` (sum of the fail flags), ``std`` (group
        standard deviation of the value column), ``weight`` (the group's
        share of the summed ``sample`` column) and ``mean`` (a copy of the
        group mean).
    """
    col = y if value_col is None else value_col

    grouped = data.groupby(["location", "date", "label"])
    by_location = grouped.agg({col: "mean", "sample": "nunique", "fail": "sum"})
    # Same grouping, so the result aligns on the group index; no per-row lookup needed.
    by_location["std"] = grouped[col].std()
    by_location["weight"] = by_location["sample"] / by_location["sample"].sum()
    by_location["mean"] = by_location[col]

    return by_location.reset_index()

# Per-location, per-day summary of the report data.
by_location = location_summary(data=rep_data)


def location_summary_charts(data, date_range, locations, project, title, figure_name, file_name):
    """Chart the Bioindicator mean +/- std for each location, save and glue it.

    Parameters
    ----------
    data : pandas.DataFrame
        Summary frame with ``location``, ``date``, ``label``, ``mean`` and
        ``std`` columns (the columns ``format_df_for_display_mean_std`` reads).
    date_range : sequence
        Two dates; the span between them is shaded.
    locations : list
        Sites to draw. Each must be a key of the module-level
        ``location_colors`` and ``location_markers``.
    project, title : str
        Combined as ``"{project}\\n{title}"`` for the chart title.
    figure_name : str
        Key the figure is glued under.
    file_name : str
        Path the figure is saved to.
    """
    fig, ax = plt.subplots()

    # Shade the window of interest.
    ax.axvspan(date_range[0], date_range[1], color="black", alpha=.1, label="Dates of interest")

    # One series per site, read from the frame that was passed in.
    for site in locations:
        site_data = format_df_for_display_mean_std(data[data.location == site], category="Bioindicator")
        ax = scatter_plot_with_std(
            data=site_data,
            label=site,
            x="date",
            y_one="std -",
            y_two="mean",
            y_three="std +",
            ax=ax,
            color=location_colors[site],
            marker=location_markers[site],
        )

    ax = major_and_minor_ticks(ax)
    ax.set_ylabel("Average colonies per 100 ml")
    ax.set_xlabel("")

    # Legend: the shaded window plus one entry per site. Handles are looked up
    # by label, so this does not depend on how many artists each site added.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    legend_handles = [by_label["Dates of interest"]]
    legend_labels = ["event"]
    for site in locations:
        if site in by_label:
            legend_handles.append(by_label[site])
            legend_labels.append(site)

    # Drop the automatically created legend, if there is one, before adding ours.
    auto_legend = ax.get_legend()
    if auto_legend is not None:
        auto_legend.remove()
    ax.set_title(f"{project}\n{title}", loc="left")

    ax.legend(legend_handles, legend_labels, loc="upper right", fontsize=10)

    # Operate on `fig` directly so another open figure cannot be hit by mistake.
    fig.tight_layout()
    fig.savefig(file_name)

    glue(figure_name, fig, display=False)

    plt.close(fig)

# Glue under this cell's own key (figure_name3); `figure_name` still holds the
# key of the previous figure and would be reused otherwise.
location_summary_charts(by_location, date_range_dt, locations, project, title, figure_name3, file_name)

```{glue:} {{figure_name3}}
```

## Are the plate counts for bioindicators during the event greater than before or after the event?

In [8]:


# Bioindicator rows only.
data = rep_data[rep_data.label == "Bioindicator"]

# Values of the charted column, split by the three period flag columns.
before = data.loc[data["before event"] == True, y].values
during = data.loc[data["event"] == True, y].values
after = data.loc[data["after event"] == True, y].values

# Order matters: it sets the left-to-right order of the boxes.
d = [before, during, after]

# Title, glue key and output path for figure four.
title = "Bioindicator plate counts before, during and after jazz"
figure_name4 = f"before_during_after_{year[0]}"
file_name = f"resources/charts/figure_four_{year[0]}.jpg"

def boxplots_before_during_after(data, project, title, figure_name, file_name):
    """Draw three box plots (before, during, after), save and glue the figure.

    Parameters
    ----------
    data : list or tuple of array-like
        The three groups of values, in the order before, during, after.
        For compatibility with earlier use, anything that is not a list or
        tuple is ignored and the module-level list ``d`` is plotted instead.
    project, title : str
        Combined as ``"{project}\\n{title}"`` for the chart title.
    figure_name : str
        Key the figure is glued under.
    file_name : str
        Path the figure is saved to.
    """
    # Use the argument when it already is a sequence of groups; otherwise fall
    # back to the module-level `d`, which is what was always plotted before.
    samples = data if isinstance(data, (list, tuple)) else d

    fig, ax = plt.subplots()

    ax.boxplot(samples)
    ax.set_ylabel("Average colonies per 100 ml")
    ax.set_xticklabels(["before event", "during event", "after event"])

    ax.set_title(f"{project}\n{title}", loc="left")

    # Operate on `fig` directly so another open figure cannot be hit by mistake.
    fig.tight_layout()
    fig.savefig(file_name)

    glue(figure_name, fig, display=False)

    plt.close(fig)

```{glue:} {{figure_name4}}
```