Create summaries for datasets
===

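We are interested in seeing the proportion of negative flows currently produced by the scripts. For every site in every dataset, this notebook counts the daily and monthly records whose impaired flow (`flow_cfs_impaired`) is below zero and reports them as a percentage of all records for that site.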

In [192]:
import pprint
import yaml
from pathlib import Path
import pandas as pd
import os

# READ YAML CONFIG:
# Every later cell reads `input_config`, so a failed load must stop the
# notebook here instead of printing a message and carrying on with an
# undefined (or stale, from an earlier kernel run) configuration.
INPUT_FILENAME = "inputs.yaml"
try:
    with open(INPUT_FILENAME) as config_file:
        input_config = yaml.safe_load(config_file)
except FileNotFoundError:
    print(f"Error loading '{INPUT_FILENAME}'. File not found.")
    raise
except yaml.YAMLError:
    print(f"Error parsing YAML file '{INPUT_FILENAME}'.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

# An empty file parses to None; fail with a clear message instead of a
# TypeError/KeyError when the 'datasets' list is iterated further down.
if not isinstance(input_config, dict) or 'datasets' not in input_config:
    raise ValueError(f"'{INPUT_FILENAME}' must define a top-level 'datasets' list.")

print("Input config:")
pprint.pprint(input_config)

Input config:
{'datasets': [{'basins_filepath': 'inputs/LOI_delineations/LOI_delineations/NC_sites_FINAL_watersheds_v2.shp',
               'flow_filepath': 'processed_inputs/NC_sites_FINAL_v2_unimpaired_flow.csv',
               'output_directory': 'outputs/nc_sites_final'},
              {'basins_filepath': 'inputs/LOI_delineations/LOI_delineations/Biosites_NCoast_delineations_QA.shp',
               'flow_filepath': 'processed_inputs/N_Coast_BioSites_Unimpaired_Flow.csv',
               'output_directory': 'outputs/biosites'},
              {'basins_filepath': 'inputs/LOI_delineations/LOI_delineations/All_gages_delineations_combined.shp',
               'flow_filepath': 'processed_inputs/All_gages_unimpaired_flow.csv',
               'output_directory': 'outputs/gages'},
              {'basins_filepath': 'processed_inputs/delineations/McBain_sites_snapped_NAD83_watersheds.shp',
               'flow_filepath': 'processed_inputs/McBain_Sites_Unimpaired_Flow.csv',
               'outpu

In [193]:
# Folder that receives the combined (all-dataset) outputs; created if missing.
OVERALL_DIR = Path('outputs', 'OVERALL')
OVERALL_DIR.mkdir(parents=True, exist_ok=True)

def summarize(data, flow_column='flow_cfs_impaired', group_levels=('dataset', 'siteID')):
    """Count total and negative flow records for each group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index contains every level named in ``group_levels`` and
        which has a numeric column named ``flow_column``.
    flow_column : str, default 'flow_cfs_impaired'
        Column holding the flow values to inspect.
    group_levels : sequence of str, default ('dataset', 'siteID')
        Index level names that define one group (one output row).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_levels`` with three columns:

        * ``negative_days`` - number of records with a flow below zero (integer).
        * ``total_days`` - number of non-null flow records (integer).
        * ``percentage_negative`` - ``100 * negative_days / total_days`` rounded
          to two decimals; NaN for a group with no non-null flow values.

        The column names say "days" for every input; when the records are
        monthly the counts are records (months), not days.

    Raises
    ------
    KeyError
        If ``flow_column`` or one of ``group_levels`` is not present in ``data``.
    """
    levels = list(group_levels)
    flows = data[flow_column]

    total_days = flows.groupby(level=levels).count()
    # A comparison with NaN is False, so missing flows are never counted as
    # negative. Summing an integer flag keeps the count an integer and gives
    # 0 (not NaN) for groups without any negative record, so no join/fillna
    # round-trip through floats is needed.
    negative_days = flows.lt(0).astype('int64').groupby(level=levels).sum()

    overall = pd.DataFrame({'negative_days': negative_days, 'total_days': total_days})
    overall['percentage_negative'] = (100 * overall['negative_days'] / overall['total_days']).round(2)
    return overall

# Accumulators for the per-dataset frames; a later cell concatenates them.
gather_daily, gather_monthly = [], []

for dataset in input_config['datasets']:
    output_path = Path(dataset['output_directory'])
    # The last component of the output directory doubles as the dataset label.
    dataset_name = os.path.basename(output_path)
    print("Working on", dataset_name)
    print()

    # Daily records: label with the dataset, index, collect, summarise, save.
    daily_data = (
        pd.read_csv(output_path / "daily.csv")
        .assign(dataset=dataset_name)
        .set_index(['dataset', 'siteID', 'date'])
    )
    gather_daily.append(daily_data)
    summary = summarize(daily_data)
    print("Daily (head)\n", summary.head())
    summary.to_csv(output_path.joinpath('daily_summary.csv'))

    # Monthly records: same steps; note the index puts 'date' before 'siteID'.
    monthly_data = (
        pd.read_csv(output_path / "monthly.csv")
        .assign(dataset=dataset_name)
        .set_index(['dataset', 'date', 'siteID'])
    )
    gather_monthly.append(monthly_data)
    summary = summarize(monthly_data)
    print("Monthly (head)\n", summary.head())
    summary.to_csv(output_path.joinpath('monthly_summary.csv'))

    print()

Working on nc_sites_final

Daily (head)
                               negative_days  total_days  percentage_negative
dataset        siteID                                                       
nc_sites_final NC_CH_1_0862             0.0        2981                 0.00
               NC_CH_1_11386            0.0        2647                 0.00
               NC_CH_1_13552            0.0        2647                 0.00
               NC_CH_1_14301            0.0        2647                 0.00
               NC_CH_1_15014          163.0        2647                 6.16
Monthly (head)
                               negative_days  total_days  percentage_negative
dataset        siteID                                                       
nc_sites_final NC_CH_1_0862             0.0          98                  0.0
               NC_CH_1_11386            0.0          87                  0.0
               NC_CH_1_13552            0.0          87                  0.0
               NC_C

In [194]:
# Stack the per-dataset daily frames row-wise and save the combined table.
overall_daily = pd.concat(gather_daily, axis=0)
overall_daily.to_csv(OVERALL_DIR.joinpath('daily_overall.csv'))

# Same for the monthly frames.
overall_monthly = pd.concat(gather_monthly, axis=0)
overall_monthly.to_csv(OVERALL_DIR.joinpath('monthly_overall.csv'))

In [201]:
# Shape of the per-(dataset, siteID) count table built from the combined daily data.
overall_daily.groupby(level=['dataset', 'siteID']).count().shape

(341, 5)

In [203]:
# Shape of the per-(dataset, siteID) count table built from the combined monthly data.
overall_monthly.groupby(level=['dataset', 'siteID']).count().shape

(341, 5)

In [197]:
# Negative-flow statistics per site over the combined daily data: preview, then save.
overall_daily_summary = summarize(overall_daily)
print(overall_daily_summary.head(n=5))
overall_daily_summary.to_csv(OVERALL_DIR.joinpath('daily_summary_overall.csv'))

# Same statistics over the combined monthly data.
overall_monthly_summary = summarize(overall_monthly)
print(overall_monthly_summary.head(n=5))
overall_monthly_summary.to_csv(OVERALL_DIR.joinpath('monthly_summary_overall.csv'))

                                negative_days  total_days  percentage_negative
dataset           siteID                                                      
SFEhighresolution SFE_2017_209            0.0        2647                 0.00
                  SFE_2017_221            0.0        2647                 0.00
                  SFE_2017_24             0.0        2647                 0.00
                  SFE_2017_25           177.0        2647                 6.69
                  SFE_2017_322          146.0        2647                 5.52
                                negative_days  total_days  percentage_negative
dataset           siteID                                                      
SFEhighresolution SFE_2017_209            0.0          87                  0.0
                  SFE_2017_221            0.0          87                  0.0
                  SFE_2017_24             0.0          87                  0.0
                  SFE_2017_25             2.0       