In [1]:
import pandas as pd
from IPython.display import display
from sklearn.model_selection import StratifiedShuffleSplit
from diel_vector import set_x_markers, get_site_metrics, assess_df, get_dailies_for_all_metrics, pca_plot, write_rule_location_counts, write_metric_rule_counts
from tools.io import unpickle_data
from tools.plots import Plots
from tools.process_dataframe import partial_metrics, full_metrics

In [2]:
# load data
# `sscodes` is later indexed by band name ("broad", "fish", "invertebrate").
# NOTE(review): `unpickle_data` presumably wraps `pickle`; only load files from
# a trusted source — confirm.
sscodes = unpickle_data('data/formatted_sscodes.pkl')

In [3]:
# parameters
# Fixed seed so the stratified train/test split (and therefore every score
# derived from it) is identical after a kernel restart and full re-run.
random_seed = 42
n_day_groups = 5
test_proportion = 0.3
x_callback = set_x_markers
# used for plotting, can be overwritten later to set all y axes to the same values
axis_ranges = {metric: None for metric in partial_metrics + ['D']}
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=test_proportion,
    random_state=random_seed,
)

In [4]:
# Accumulators shared across the analysis cells: each `process_data` call
# appends to them, and the final cell hands them to the writer helpers.
n_rules_data, index_counts_data, time_counts_data = [], [], []

In [None]:
def _assess_and_record(features, target, output_name, description):
    """Assess one feature set, record its rule statistics and show its PCA plot.

    Runs ``assess_df`` with the notebook-level ``splitter``, appends the rule
    count and the index/time count results to the notebook-level containers
    (``n_rules_data``, ``index_counts_data``, ``time_counts_data``), prints the
    scores and displays the figure returned by ``pca_plot``.

    Args:
        features: Feature table passed to ``assess_df`` and ``pca_plot``.
        target: Labels passed to ``assess_df`` and ``pca_plot``.
        output_name: Name tagging the results; also passed to both helpers.
        description: Text inserted into the printed score line.

    Returns:
        The first value returned by ``assess_df`` (the scores).
    """
    scores, n_rules, counts, tcounts = assess_df(features, target, splitter, output_name)
    n_rules_data.append(n_rules)
    index_counts_data.append((output_name, counts))
    time_counts_data.append((output_name, tcounts))
    print(f"F1 scores for {description}: {scores}")
    display(pca_plot(features, target, output_name))
    return scores


def process_data(fltr):
    """Run the full per-band analysis for the ``sscodes`` entry named ``fltr``.

    Steps: cast every column in ``full_metrics`` to float, derive the
    ``scaled_group`` column, call ``get_site_metrics``, build the daily
    metrics, assess each metric individually and all metrics combined, then
    draw one bar plot per score type listed in ``order``.

    Side effects: prints scores, displays figures and appends to the
    notebook-level containers ``n_rules_data``, ``index_counts_data`` and
    ``time_counts_data``. Calling it twice for the same band therefore adds a
    second set of entries to those containers.

    Args:
        fltr: Key into ``sscodes``; also used as a prefix in all output names.
    """
    # Work on a copy so the frame stored in `sscodes` is never modified in
    # place; this keeps the loaded data pristine across repeated calls.
    data = sscodes[fltr].copy()
    for metric in full_metrics:
        data[metric] = data[metric].astype(float)

    # NOTE(review): the divisor 25 is presumably the upper bound of
    # `scaled_day`, giving group indices 0..n_day_groups-1 — confirm.
    data["scaled_group"] = data["scaled_day"].apply(lambda x: int(n_day_groups*x/25))
    get_site_metrics(data, fltr, x_callback, axis_ranges)
    daily_metrics, labels = get_dailies_for_all_metrics(data)

    accs = []
    for metric, metric_values in daily_metrics.items():
        accs.append(
            _assess_and_record(
                metric_values, labels[metric], f"{fltr}_{metric}", f"{metric} in {fltr}"
            )
        )

    # NOTE(review): the combined assessment reuses the labels of the 'lprms'
    # metric; presumably all metrics share the same labels — confirm.
    combined_metrics = pd.concat(list(daily_metrics.values()), axis=1)
    accs.append(
        _assess_and_record(
            combined_metrics, labels['lprms'], f"{fltr}_combined", f"combined metrics in {fltr}"
        )
    )

    # One bar plot per score type: transpose `accs` so each tuple holds the
    # same score across all tests. `order` names the scores by position.
    order = ['F1 micro', 'F1 macro', 'F1 weighted', 'Accuracy']
    tests = list(daily_metrics.keys()) + ["Combined"]
    print(tests)
    for i, v in enumerate(zip(*accs)):
        print(v)
        # NOTE(review): the y-axis label is "F1 Score" for every plot,
        # including the one named after 'Accuracy' — confirm this is intended.
        fig = Plots.categorical_bar_plot(tests, v, "", f"result_{fltr}_{order[i].replace(' ', '_')}_plot_gap", ("Metric", "F1 Score"), y_limits=(0, 1))
        display(fig)

In [6]:
# Run the analysis for the "broad" band. `process_data` appends to the shared
# containers, so re-running this cell adds a second set of entries.
filter_band = "broad"
process_data(filter_band)

TypeError: process_data() missing 1 required positional argument: 'data'

In [None]:
# Run the analysis for the "fish" band. `process_data` appends to the shared
# containers, so re-running this cell adds a second set of entries.
filter_band = "fish"
process_data(filter_band)

In [None]:
# Run the analysis for the "invertebrate" band. `process_data` appends to the
# shared containers, so re-running this cell adds a second set of entries.
filter_band = "invertebrate"
process_data(filter_band)

In [None]:
# Hand the accumulated results to the writer helpers: the rule counts first,
# then the "index" and "time" count collections, each tagged with its type.
write_rule_location_counts(n_rules_data)
for typ, counts_data in zip(("index", "time"), (index_counts_data, time_counts_data)):
    write_metric_rule_counts(typ, counts_data)