In [1]:
from datetime import datetime, timedelta
import os, argparse, sys
import matplotlib.pyplot as plt
import numpy as np
from numpy.polynomial import Polynomial
import pandas as pd
from glob import glob
from datetime import datetime, timedelta
from dateutil.parser import parse as parse_to_datetime
import imageio
import openpyxl
import sklearn

In [2]:
######################## PROFILE PLOT ####################################
## User input from the Excel configuration file

# The sheet is read with spreadsheet column A as the row index (setting names)
# and columns B and C as the two value columns.
config_path = "makeprofileplot_config.xlsx"  # Location of the Excel user file
config = pd.read_excel(config_path, skiprows=2, engine='openpyxl', usecols='A:C', index_col=0, nrows=22)  # .xlsx files need the openpyxl package
print(config)

# Stop early if the sheet cannot supply the value columns that are read below.
if len(config.columns) == 0:
    raise Exception("No Data found in configuration sheet ")
if len(config.columns) < 2:
    # Settings below are looked up in config.columns[1]; without it they would
    # fail with a bare IndexError.
    raise Exception("Configuration sheet must provide two value columns (found %d)." % len(config.columns))

first_value_column = config.columns[0]   # shared settings, observed-data settings, axis minimums
second_value_column = config.columns[1]  # model-data settings, axis maximums

# Shared / figure settings
skiprows = config.at['Skip Rows', first_value_column]
x_label = config.at['X Label', first_value_column]
x_min = config.at['X Axis (min, max)', first_value_column]
x_max = config.at['X Axis (min, max)', second_value_column]
y_label = config.at['Y Label', first_value_column]
y_min = config.at['Y Axis (min, max)', first_value_column]
y_max = config.at['Y Axis (min, max)', second_value_column]
start_day = config.at['Julian Start Day', first_value_column]
figure_title = config.at['Figure Title', first_value_column]

# Observed-data settings (first value column)
# NOTE: append "_column_name" to variables that store labels (names of columns)
obsdatapath = config.at['File', first_value_column]
obs_day_column_name = config.at['Date Column', first_value_column]
obs_variable = config.at['Variable Name', first_value_column]
obs_variable_units = config.at['Variable Units', first_value_column]
obs_variable_column_name = config.at['Variable Column Name', first_value_column]
obs_depth_column_name = config.at['Depth Column Name', first_value_column]
obs_result_column_name = config.at['Result Column Name', first_value_column]
obs_na_values = config.at['NA Values', first_value_column]

# Model-data settings (second value column)
modpath = config.at['File', second_value_column]
mod_day_column_name = config.at['Date Column', second_value_column]
mod_variable = config.at['Variable Name', second_value_column]
mod_variable_column_name = config.at['Variable Column Name', second_value_column]
mod_depth_column_name = config.at['Depth Column Name', second_value_column]
mod_result_column_name = config.at['Result Column Name', second_value_column]
mod_na_values = config.at['NA Values', second_value_column]
mod_variable_units = config.at['Variable Units', second_value_column]

# Output locations
profileplotfolder = config.at['Profile Plots Folder', first_value_column]
statsfolder = config.at['Statistic Output Folder', first_value_column]

                                                                   Unnamed: 1  \
$ 3/11/22                                                                       
File                             data\Model Files\HaggRes_InletStreams_WQ.csv   
Sheet Name                                                                NaN   
Skip Rows                                                                 NaN   
Variable Column Name                                            Lab Parameter   
Variable Name                                                     Temperature   
Variable Units                                                        Celsius   
Result Column Name                                                Result_as#2   
Depth Column Name                                                     Depth_m   
NA Values                                                                -999   
Legend Label                                                         Observed   
Date Column                 

In [3]:
# #
# Config Problems - Check data and throw exceptions if missing
# #

# Note: I did not include sheet name nor skip rows, as they are currently empty

def isNaN(variable):
    """
    Report whether ``variable`` compares unequal to itself.

    NaN is the usual value with that property, so this works as a NaN test
    that can also be handed non-numeric values such as strings.

    Args:
        variable: Value to test.
    Return:
        The result of ``variable != variable``.
    """
    differs_from_itself = variable != variable
    return differs_from_itself

# Rows that must be filled in for BOTH value columns (an empty cell is read as NaN)
rows_needing_two_values = [
    'File',
    'Variable Column Name',
    'Variable Name',
    'Variable Units',
    'Result Column Name',
    'Depth Column Name',
    'NA Values',
    'Legend Label',
    'Date Column',
    'X Axis (min, max)',
    'Y Axis (min, max)',
]

# Rows that only need a value in the FIRST value column
rows_needing_one_value = [
    'Julian Start Day',
    # 'Figure Name',  # Uncomment to add
    'Figure Title',
    'X Label',
    'Y Label',
    'Mode',
    'Profile Plots Folder',
    'Statistic Output Folder',
]

for required_rows, value_count, expectation in (
    (rows_needing_two_values, 2, 'two values'),
    (rows_needing_one_value, 1, 'one value'),
):
    for row_name in required_rows:
        # A row that is absent altogether would otherwise surface as a bare KeyError.
        if row_name not in config.index:
            raise Exception(f"Missing '{row_name}' row in configuration sheet.")
        for index in range(0, value_count):
            if isNaN(config.at[row_name, config.columns[index]]):
                raise Exception(
                    f"Missing entry in '{row_name}' row in configuration sheet (expected {expectation})."
                )
    
##
# Trim all whitespace off of data inputs that are strings (not numbers or dates)
##

def _strip_if_text(value):
    """Return ``value`` without leading/trailing whitespace when it is a str; otherwise return it unchanged."""
    return value.strip() if isinstance(value, str) else value

# Each variable is reassigned explicitly: stripping a loop variable would only
# rebind that loop variable and leave the configuration values untouched.
x_label = _strip_if_text(x_label)
y_label = _strip_if_text(y_label)
obsdatapath = _strip_if_text(obsdatapath)
obs_day_column_name = _strip_if_text(obs_day_column_name)
obs_variable = _strip_if_text(obs_variable)
obs_variable_units = _strip_if_text(obs_variable_units)
obs_variable_column_name = _strip_if_text(obs_variable_column_name)
obs_depth_column_name = _strip_if_text(obs_depth_column_name)
obs_result_column_name = _strip_if_text(obs_result_column_name)
figure_title = _strip_if_text(figure_title)
modpath = _strip_if_text(modpath)
mod_day_column_name = _strip_if_text(mod_day_column_name)
mod_variable = _strip_if_text(mod_variable)
mod_variable_column_name = _strip_if_text(mod_variable_column_name)
mod_depth_column_name = _strip_if_text(mod_depth_column_name)
mod_result_column_name = _strip_if_text(mod_result_column_name)
mod_variable_units = _strip_if_text(mod_variable_units)
profileplotfolder = _strip_if_text(profileplotfolder)
statsfolder = _strip_if_text(statsfolder)

# The trimmed inputs, in the same order as before
input_list = [
    x_label,
    y_label,
    obsdatapath,
    obs_day_column_name,
    obs_variable,
    obs_variable_units,
    obs_variable_column_name,
    obs_depth_column_name,
    obs_result_column_name,
    figure_title,
    modpath,
    mod_day_column_name,
    mod_variable,
    mod_variable_column_name,
    mod_depth_column_name,
    mod_result_column_name,
    mod_variable_units,
    profileplotfolder,
    statsfolder
]

In [4]:
# #
# Linux Compatibility - Directory Handler
# #

# Windows and Linux environments use different formats for directories. Outside
# Windows, this cell prefixes each configured path with the working directory and
# changes '\' to '/'. It then makes sure the folders the paths point into exist.

import platform
import os

# If not Windows, add working directory path to variable and change '\'
if platform.system() != 'Windows':
    modpath = os.getcwd() + '/' + modpath.replace('\\', '/')
    obsdatapath = os.getcwd() + '/' + obsdatapath.replace('\\', '/')
    profileplotfolder = os.getcwd() + '/' + profileplotfolder.replace('\\', '/')
    statsfolder = os.getcwd() + '/' + statsfolder.replace('\\', '/')
    dir_split = '/'
else:
    dir_split = '\\'

path_list = [modpath, obsdatapath, profileplotfolder, statsfolder]

# modpath and obsdatapath name files, so only their parent folder is created.
# Deciding "file or folder" by looking for a '.' in the last path component
# mishandles folder names containing a dot and file names without an extension.
for data_file_path in (modpath, obsdatapath):
    parent_folder = os.path.dirname(data_file_path)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

# The two output settings name folders, so the full path is created.
for output_folder in (profileplotfolder, statsfolder):
    os.makedirs(output_folder, exist_ok=True)

In [5]:
##
# Check Data Files for misformatted data. This will treat pandas warnings as exceptions, catching if there are mixed types in a column (like letters in a data column)
## 

# NOTE: This will only flag mixed types, if the entire column is made of letters, this will not flag

import warnings
warnings.filterwarnings("error")

In [7]:
# Announce which files are about to be read, then load both tables.
print('loading model "%s"' % modpath, 'loading observed "%s"' % obsdatapath, sep='\n')

# Profile model output; cells matching mod_na_values are read as NaN
moddata = pd.read_csv(
    modpath,
    na_values=mod_na_values,
)

# Observed measurements; cells matching obs_na_values are read as NaN
obsdata = pd.read_csv(
    obsdatapath,
    na_values=obs_na_values,
)

loading model "data\Model Files\spr_wb1.csv"
loading observed "data\Model Files\HaggRes_InletStreams_WQ.csv"


DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.

In [None]:
# Build the (day, depth, result) tables for observed and modeled data.
#### This requires data to be in a specific format ####

# Observed table: rows for site 'V - Hagg Lake' and the configured variable.
# The selection is copied so obsdata itself is never altered.
observed_columns = [obs_day_column_name, obs_depth_column_name, obs_result_column_name]
observed_rows = (obsdata['Site'] == 'V - Hagg Lake') & (obsdata[obs_variable_column_name] == obs_variable)
profobs = obsdata.loc[observed_rows, observed_columns].copy()
profobs[obs_day_column_name] = profobs[obs_day_column_name].apply(np.floor)  # round days down
profobs = profobs.dropna()

# Model table: rows for the configured variable, days rounded down the same way.
model_columns = [mod_day_column_name, mod_depth_column_name, mod_result_column_name]
model_rows = moddata[mod_variable_column_name] == mod_variable
profmod = moddata.loc[model_rows, model_columns].copy()
profmod[mod_day_column_name] = profmod[mod_day_column_name].apply(np.floor)  # round days down
profmod = profmod.dropna()

# Uncomment to inspect the tables
#print('Observed data:')
#print(profobs)
#print('Model data:')
#print(profmod)

In [None]:
# Collect the distinct days of each table into a set, then keep the days present in both
observeddays = set(profobs[obs_day_column_name])
modeldays = set(profmod[mod_day_column_name])
days = modeldays & observeddays



In [None]:
# Interpolate the model profile onto the observed depths, one shared day at a time.
# In the future we may need to check that observed and modeled days match earlier in code.

# Column names of the interpolated table (named so it is clear they are interpolated!)
interpolated_df_day_column_name = mod_day_column_name
interpolated_df_depth_column_name = 'Depth' #mod_depth_column_name
interpolated_df_mod_result_column_name = mod_result_column_name
interpolated_df_obs_result_column_name = obs_result_column_name
interpolated_columns = [
    interpolated_df_day_column_name,
    interpolated_df_depth_column_name,
    interpolated_df_mod_result_column_name,
    interpolated_df_obs_result_column_name,
]

# One frame per day is collected here and concatenated once after the loop.
interpolated_frames = []

for i in sorted(days):
    # np.interp does not check that its x-coordinates are increasing and returns
    # meaningless values when they are not, so the model profile is ordered by depth first.
    profmod_i = profmod[(profmod[mod_day_column_name] == i)].sort_values(mod_depth_column_name, kind='mergesort')
    profobs_i = profobs[(profobs[obs_day_column_name] == i)]

    mod_depths = profmod_i[mod_depth_column_name]
    mod_results = profmod_i[mod_result_column_name]

    obs_depths = profobs_i[obs_depth_column_name]
    obs_results = profobs_i[obs_result_column_name]

    # np.interp cannot work with an empty profile, so such a day is skipped.
    if len(obs_depths) == 0 or len(mod_depths) == 0:
        print('no observed or model depths for day {}'.format(i))
        continue

    interp_mod_results = list(np.interp(obs_depths, mod_depths, mod_results))
    interp_mod_days = [i] * len(obs_depths)
    interpolated_frames.append(
        pd.DataFrame(
            list(zip(interp_mod_days, obs_depths, interp_mod_results, obs_results)),
            columns=interpolated_columns,
        )
    )

# Resulting table: day, observed depth, interpolated model result, observed result
if interpolated_frames:
    interpolated_df = pd.concat(interpolated_frames)
else:
    interpolated_df = pd.DataFrame(columns=interpolated_columns)

In [None]:
#Function to Caluclate Statistic Values
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Column layout of the statistics table (one row per day plus a summary row)
stats_columns = ['DAY', 'MAE', 'RMSE', 'ME', 'MODEL ST.DEV', 'PBIAS', 'MOD_MEAN', 'OBS_MEAN']


def make_empty_statsdf():
    """Return a statistics table that has the ``stats_columns`` headers and no rows."""
    empty_table = pd.DataFrame(columns=stats_columns)
    return empty_table

def concat_statsdf(statsdf, df, day):
    """
    Compute the error statistics for ``df`` and return ``statsdf`` with them added as one row.

    Args:
        statsdf: Statistics table collected so far (columns ``stats_columns``).
        df: Table holding the observed results in column
            ``interpolated_df_obs_result_column_name`` and the model results in
            column ``interpolated_df_mod_result_column_name``.
        day: Value stored in the 'DAY' column of the new row.
    Return:
        A new DataFrame: the rows of ``statsdf`` followed by the new row.
        Every statistic is NaN when ``df`` has no rows, and PBIAS is NaN when
        the observed values sum to zero.
    """
    # Coerce to float so columns stored with object dtype still support the math below.
    y_true = df[interpolated_df_obs_result_column_name].to_numpy(dtype=float)
    y_pred = df[interpolated_df_mod_result_column_name].to_numpy(dtype=float)

    if y_true.size == 0:
        # Nothing to compare: report NaN rather than dividing by zero.
        MAE = RMSE = ME = MOD_ST_DEV = PBIAS = MOD_MEAN = OBS_MEAN = np.nan
    else:
        residuals = y_pred - y_true  # model minus observed
        MOD_MEAN = y_pred.mean()
        OBS_MEAN = y_true.mean()
        RMSE = np.sqrt(np.mean(residuals ** 2))
        ME = np.mean(residuals)
        MAE = np.mean(np.absolute(residuals))
        MOD_ST_DEV = y_pred.std()
        observed_total = np.sum(y_true)
        # Percent bias is undefined when the observed values sum to zero.
        PBIAS = 100 * np.sum(y_true - y_pred) / observed_total if observed_total != 0 else np.nan

    new_row = pd.DataFrame(
        [[day, MAE, RMSE, ME, MOD_ST_DEV, PBIAS, MOD_MEAN, OBS_MEAN]],
        columns=stats_columns,
    )
    # Concatenating onto an empty frame adds nothing, and pandas has deprecated
    # how empty entries influence the result dtype.
    if statsdf.empty:
        return new_row
    return pd.concat([statsdf, new_row])


In [None]:
# Draw one profile plot per shared day and collect that day's statistics.

statsdf = make_empty_statsdf()

# Days are visited in ascending order, so the statistics rows come out sorted by day.
for i in sorted(days):
    interp_i = interpolated_df[(interpolated_df[interpolated_df_day_column_name] == i)]
    mod_i = profmod[(profmod[mod_day_column_name] == i)]
    depths = list(interp_i[interpolated_df_depth_column_name])
    mod_depths = list(mod_i[mod_depth_column_name])
    x_mod = list(mod_i[mod_result_column_name])
    x_obs = list(interp_i[interpolated_df_obs_result_column_name])
    date = (start_day) + timedelta(days=(i - 1))  # day 1 corresponds to start_day itself

    fig, ax = plt.subplots()
    try:
        ax.plot(x_mod, mod_depths, marker='', linestyle='-', label='Model')
        ax.plot(x_obs, depths, marker='*', linestyle='None', color='g', label='Observed')

        # Labels are set on this figure/axis rather than through the pyplot state machine
        fig.suptitle(f"{figure_title} {date.strftime('%B %d %Y')}")
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.legend(loc='lower right')

        # The y limits are passed as (y_max, y_min) so the y axis is drawn inverted.
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_max, y_min)
        ax.text(1, 2, f"Julian Day: {i}")

        plotname = f'profmod_{i}.jpg'

        # Attempt to save as a .jpg, else save as a .png
        try:
            fig.savefig(os.path.join(profileplotfolder, plotname))
        except ValueError:
            print('ERROR: Unable to save as .jpg, saving as .png instead')
            plotname = os.path.splitext(plotname)[0] + '.png'  # swap the file extension
            fig.savefig(os.path.join(profileplotfolder, plotname), format='png')
        except Exception as save_error:
            # Report the reason and keep going with the remaining days.
            print(f'Error: Unable to save {plotname}: {save_error}')
    finally:
        # Close this figure even when plotting failed, so figures do not pile up
        # (matplotlib warns once more than 20 are open).
        plt.close(fig)

    # Statistics for this day
    statsdf = concat_statsdf(statsdf, interp_i, i)

# Statistics over all interpolated rows, labelled 'AVG'
statsdf = concat_statsdf(statsdf, interpolated_df, 'AVG')
statsdf.to_csv(os.path.join(statsfolder, 'Statistics.csv'))

In [None]:
# Create Gif of profile plots (Cui Yong, 2020)
from pathlib import Path

############################################################

##
# Function for parsing day out of file names to later assist with sorting
##

import re

# A number (optionally with a fractional part) sitting directly before the file
# extension, e.g. the "123.0" in "profmod_123.0.jpg".
_day_before_extension = re.compile(r'(\d+(?:\.\d+)?)\.[A-Za-z]+$')


def numericalSort(raw):
    '''
        Sort key that orders plot files by the number at the end of their file name.

        The number is taken from directly before the file extension, so digits that
        appear elsewhere in the path (folder names, for example) do not affect the key.
        Args:
            raw (string or path-like): file name or path; converted with str()
        Return:
            float: the trailing number, or infinity when the name does not end in a
                   number, which places such files at the end of the gif
    '''
    found = _day_before_extension.search(str(raw))
    if found is None:
        return float('inf')  # Non-standard naming scheme - place file at end of gif
    return float(found.group(1))


numbers = re.compile(r'(\d+)')  # Regex for separating numbers from strings (no longer used by numericalSort; kept for existing references)
############################################################

image_path = Path(profileplotfolder)

# Collect the plot files ordered by numericalSort. .jpg files are preferred;
# .png files are only used when no .jpg exists.
images = sorted(image_path.glob('*.jpg'), key=numericalSort)
if len(images) == 0:
    images = sorted(image_path.glob('*.png'), key=numericalSort)

# Fail with a clear message instead of carrying on with nothing to animate.
if len(images) == 0:
    raise RuntimeError(f'No .jpg or .png profile plots found in "{profileplotfolder}"; cannot build the gif.')

# Read every frame, then write them out as a single animated gif.
image_list = [imageio.imread(file_name) for file_name in images]
imageio.mimwrite(os.path.join(profileplotfolder, 'profileplots.gif'), image_list , fps =4)
print(f'wrote {len(image_list)} frames to profileplots.gif (last frame: {images[-1]})')