In [None]:
from datetime import date, timedelta, datetime
import os
import glob
import time
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Water year of the simulation being monitored; passed to
# date_from_water_year_day when converting step counts to dates.
water_year = 2003

# ParFlow run name; output files are located as <runname>.out.* below.
runname = '<parflow_runname>'

# Directory that holds the ParFlow run outputs.
pf_output_dir = 'path/to/parflow/run/outputs/'

In [None]:
def date_from_water_year_day(water_year, day_of_water_year):
    """Convert a (water year, day-of-water-year) pair into a calendar date.

    A water year is labelled by the calendar year in which it ends and
    begins on October 1 of the preceding calendar year, so day 1 of water
    year 2024 is 2023-10-01.

    Args:
        water_year: Water year label (e.g., 2024 for the year starting Oct 1, 2023).
        day_of_water_year: 1-based day count within the water year (1-366).

    Returns:
        The ``datetime.date`` that corresponds to the requested day.
    """
    # Anchor on the day before the water year opens (Sep 30) so the 1-based
    # day count can be added without a "- 1" correction.
    day_zero = date(water_year - 1, 9, 30)
    return day_zero + timedelta(days=day_of_water_year)

# Example usage. The example keeps its own water-year variable so that the
# notebook-wide `water_year` chosen in the configuration cell is not silently
# reset before the run-status cells use it.
example_water_year = 2003
pf_time_step = 2764

# Whole days covered by the step count; 24 steps per day assumes hourly
# ParFlow time steps -- TODO confirm for your run.
day_of_water_year = pf_time_step // 24
result_date = date_from_water_year_day(example_water_year, day_of_water_year)
print(result_date)

### Find the current run status and the last file and date of run

In [None]:
# read value from last daily restart
path_to_tcl = f'{pf_output_dir}/clm_restart.tcl'
# Only the first line is needed; the context manager closes the handle
# instead of leaving it open until garbage collection.
with open(path_to_tcl, "r") as tcl_file:
    lines = tcl_file.readline()
# the restart step is the first purely numeric token on that line
istep = [int(i) for i in lines.split() if i.isdigit()][0]

# calculate day of water year
day_of_water_year = int(istep/24)
result_date = date_from_water_year_day(water_year, day_of_water_year)
print('last restart ',istep,' date: ', result_date)


def report_latest_file(pattern, label):
    """Print the most recently modified file matching a glob pattern.

    Args:
        pattern: Glob pattern to search.
        label: Text printed in front of the file path.

    Returns:
        The path of the newest match, or None when nothing matches.
    """
    # Rank by modification time so the file that is reported is the one
    # whose "Last modified" stamp is printed (getctime is the metadata-change
    # time on Unix and the creation time on Windows).
    latest = max(glob.iglob(pattern), key=os.path.getmtime, default=None)
    if latest is None:
        # An empty match would otherwise make max() raise ValueError.
        print(f"{label} no files match {pattern}")
        return None
    print(label, latest)
    print("Last modified:", time.ctime(os.path.getmtime(latest)))
    return latest


# calculate latest files that were written
LatestFile = report_latest_file(f"{pf_output_dir}/{runname}.out.*", "Most Recent File Written:")
LatestFile = report_latest_file(f"{pf_output_dir}/{runname}.out*log", "Most Recent Log File Written:")
LatestFile = report_latest_file(f"{pf_output_dir}/{runname}.out*pfb", "Most Recent Output File Written:")

### Create summary information from kinsol log for later plotting

In [None]:
#set the path to your kinsol log
kinsol_file = f"{pf_output_dir}/{runname}.out.kinsol.log"

# columns of the summary table, in output order
KINSOL_COLUMNS = ['Timestep', 'Convergence','Nonlin. Its.', 'Lin. Its.','Func. Evals.','PC Evals.','PC Solves','Lin. Conv. Fails','Beta Cond. Fails','Backtracks']

# (text searched for in a log line, column it fills), checked in this order
KINSOL_COUNTERS = [
    ('Nonlin. Its.:', 'Nonlin. Its.'),
    ('Lin. Its.:', 'Lin. Its.'),
    ('Func. Evals.', 'Func. Evals.'),
    ('PC Evals.', 'PC Evals.'),
    ('PC Solves', 'PC Solves'),
    ('Lin. Conv. Fails', 'Lin. Conv. Fails'),
    ('Beta Cond. Fails', 'Beta Cond. Fails'),
    ('Backtracks', 'Backtracks'),
]


def parse_kinsol_log(log_path):
    """Summarise a kinsol log as one row per completed timestep.

    A record is opened by a 'KINSOL starting step for time' line and closed
    by the 'Backtracks' line; a step whose 'Backtracks' line has not been
    written yet is not included.

    Args:
        log_path: Path of the kinsol log file.

    Returns:
        A DataFrame with the KINSOL_COLUMNS columns. Counters are integers
        taken from the second-to-last whitespace-separated token of their
        line (presumably the per-step value, with a running total last --
        confirm against your log).

    Raises:
        ValueError: If a counter token is not an integer.
    """
    records = []
    current = None
    with open(log_path, 'rt') as log_file:
        for line in log_file:
            if 'KINSOL starting step for time' in line:
                # the step time is the last token on the header line
                current = {'Timestep': float(line.split()[-1])}
                continue
            if current is None:
                # nothing to attach this line to (before the first header or
                # after the record was closed)
                continue
            if 'KINSol return value 1' in line:
                current['Convergence'] = 'yes'
            elif 'KINSol return value' in line:
                current['Convergence'] = 'no'
            else:
                for marker, column in KINSOL_COUNTERS:
                    if marker in line:
                        current[column] = int(line.split()[-2])
                        if column == 'Backtracks':
                            # Backtracks is the last entry for each timestep
                            records.append(current)
                            current = None
                        break
    # Build the frame once; appending rows to a DataFrame inside the loop
    # re-allocates on every timestep.
    return pd.DataFrame(records, columns=KINSOL_COLUMNS)


df = parse_kinsol_log(kinsol_file)

# save the dataframe to a csv file
df.to_csv('kinsol_summary.csv', index=False)

### Plot the number of nonlinear iterations per timestep

In [None]:
# Read back the kinsol summary. The path has to match the relative path the
# summary was saved to with df.to_csv ('kinsol_summary.csv'); a leading "/"
# would look in the filesystem root instead.
data = pd.read_csv("kinsol_summary.csv")
print(data.head())

#make a plot of the number of nonlinear iterations per timestep
plt.plot(data['Nonlin. Its.'])
plt.xlabel('Timestep')
plt.ylabel('Nonlinear iterations')
# make y axis readable
# plt.ylim must be called: assigning a tuple to it replaces the pyplot
# function and leaves the axis limits untouched. Adjust the cap as needed.
plt.ylim(0, 500)
plt.title('Number of nonlinear iterations per timestep')
plt.savefig('Nonlin_its.png')
plt.show()

#summary statistics of the performance of the solver
print(data.describe())

### Plot nonlinear iterations, linear iterations, and function evaluations over time

In [None]:
# One figure per solver metric, each drawn against simulation time:
# (column to plot, y-axis label, figure title)
metric_plots = (
    ('Nonlin. Its.', 'Nonlinear iterations', 'Number of nonlinear iterations over time'),
    ('Lin. Its.', 'Linear iterations', 'Number of linear iterations over time'),
    ('Func. Evals.', 'Function evaluations', 'Number of function evaluations over time'),
)

for metric_column, metric_label, metric_title in metric_plots:
    plt.plot(data['Timestep'], data[metric_column])
    plt.xlabel('Simulation Time [h]')
    plt.ylabel(metric_label)
    plt.title(metric_title)
    plt.show()

### Check on timesteps that have the largest number of iterations to convergence

In [None]:
# tell me what timestep the solver failed
nonconv = data[data['Convergence']=='no']
print(nonconv)

# Fixed function-evaluation cutoffs (absolute counts, not percentiles);
# tune these for your run.
large_func_evals = 600
report_func_evals = 200

#list the timesteps where the function evaluations exceed the larger fixed cutoff
large_function = data[data['Func. Evals.']>large_func_evals]

#list the timesteps where the linear iterations are greater than 75th percentile
large_lin = data[data['Lin. Its.']>data['Lin. Its.'].quantile(0.75)]

# plot the function evaluations as a function of linear iterations
plt.scatter(data['Func. Evals.'],data['Lin. Its.'],c='blue')
# plot the function evaluations as a function of nonlinear iterations
plt.scatter(data['Func. Evals.'],data['Nonlin. Its.'],c='red')
plt.xlabel('Function evaluations')
plt.ylabel('Iterations (Linear=Blue, NL=Red)')
plt.title('Performance for all timesteps')
plt.show()

# now repeat the above for the large_lin dataframe and color the points differently to see if there is a pattern
plt.scatter(large_lin['Func. Evals.'],large_lin['Lin. Its.'],c='blue')
plt.scatter(large_lin['Func. Evals.'],large_lin['Nonlin. Its.'],c='red')
plt.xlabel('Function evaluations')
plt.ylabel('Iterations (Linear=Blue, NL=Red)')
plt.title('Performance for timesteps with large number of linear iterations')
plt.show()

# plot a histogram of the number of linear iterations
plt.hist(data['Lin. Its.'], bins=150)
plt.xlabel('Linear iterations')
plt.ylabel('Frequency')
plt.title('Histogram of linear iterations')
plt.show()

# plot a histogram of the number of function evaluations
plt.hist(data['Func. Evals.'], bins=150)
plt.xlabel('Function evaluations')
plt.ylabel('Frequency')
plt.title('Histogram of function evaluations')
plt.show()

# list the timesteps where the function evaluations exceed the smaller fixed cutoff
Super_large_function = data[data['Func. Evals.']>report_func_evals]
print(Super_large_function)

# count the number of times the solver failed
print(data['Convergence'].value_counts())

### Plot nonlinear iterations and linear iterations as a function of the number of function evaluations

In [None]:
# nonlinear iterations against function evaluations, one point per row of the summary
func_evals = data['Func. Evals.']
nonlin_its = data['Nonlin. Its.']
plt.scatter(func_evals, nonlin_its)
plt.xlabel('Function evaluations')
plt.ylabel('Nonlinear iterations')
plt.show()

In [None]:
# linear iterations against function evaluations, one point per row of the summary
func_evals = data['Func. Evals.']
lin_its = data['Lin. Its.']
plt.scatter(func_evals, lin_its)
plt.xlabel('Function evaluations')
plt.ylabel('Linear iterations')
plt.show()

### Calculate the time between file writes. Plot comparison of solver iterations to run time.

In [None]:
## make a table with the time between file writes for each step
## use this to compare solver iterations to run time
def get_write_intervals(file_list):
    """Return the gaps between the modification times of successive files.

    Args:
        file_list: File paths, in the order the gaps should be taken.

    Returns:
        A list of differences (later minus earlier, in seconds as reported
        by os.path.getmtime) between each file and the one before it. Paths
        that do not exist are reported on stdout and left out, so the result
        has one entry fewer than the number of files that were found.
    """
    write_times = []
    for file_path in file_list:
        try:
            write_times.append(os.path.getmtime(file_path))
        except FileNotFoundError:
            print(f"File not found: {file_path}")

    # Pair every write time with its successor and take the difference.
    return [later - earlier for earlier, later in zip(write_times, write_times[1:])]

In [None]:
# start from log file above
start = int(data['Timestep'].iloc[0])
end = int(data['Timestep'].iloc[-1])

# Pressure files are looked up for timesteps start .. end-1. Only files that
# exist are kept, so the timestep list and the interval list always line up;
# a missing file would otherwise leave the two lists with different lengths
# and make the DataFrame constructor raise.
file_list = []
timestep_list = []
for ii in range(start, end):
    pfb_path = f'{pf_output_dir}/{runname}.out.press.{ii:05d}.pfb'
    if os.path.isfile(pfb_path):
        timestep_list.append(ii)
        file_list.append(pfb_path)
    else:
        print(f"File not found: {pfb_path}")

times = get_write_intervals(file_list)
# Where files are missing, one interval spans several timesteps; dividing by
# the timestep gap keeps 'Time' in seconds per timestep (the gap is 1, and the
# value unchanged, when no file is missing).
step_gaps = [later - earlier for earlier, later in zip(timestep_list, timestep_list[1:])]
times = [elapsed / gap for elapsed, gap in zip(times, step_gaps)]
print(len(timestep_list))
print(len(times))

run_speed = pd.DataFrame({'Timestep':timestep_list[1:],'Time':times})

In [None]:
# summary statistics of the run speeds
print(run_speed.describe())

# plot performance over time
plt.plot(run_speed['Timestep'],run_speed['Time'],c='red')
plt.xlabel('Simulation Time [h]')
plt.ylabel('Solution Time for each step [s]')
plt.title('Simulation Performance with timestep')
plt.show()

# plot performance metrics over time
plt.plot(data['Timestep'],data['Nonlin. Its.'])
plt.xlabel('Simulation Time [h]')
plt.ylabel('Nonlinear iterations')
plt.title('Number of nonlinear iterations over time')
plt.show()

# combined plot: three series share one axis, so each gets a legend entry
plt.plot(run_speed['Timestep'],run_speed['Time'],c='red',label='Time per timestep [s]')
plt.plot(data['Timestep'],data['Nonlin. Its.'],c='blue',label='Nonlinear iterations')
plt.plot(data['Timestep'],data['Lin. Its.'],c='green',label='Linear iterations')
plt.xlabel('Simulation Time [h]')
plt.ylabel('Nonlinear iterations and time for each timestep [s]')
plt.title('CONUS2.1 performance over time')
plt.legend()
plt.show()


### Estimate the amount of runtime required to finish the run

In [None]:
stop = 5787  ## should read this from pfidb file

# mean wall-clock seconds between consecutive pressure-file writes
average_speed = run_speed['Time'].mean()
# last timestep recorded in the kinsol summary
last_timestep = int(data['Timestep'].iloc[-1])
print('average speed per timestep [s]:',f'{average_speed:3.1f}')
print('last timestep written:',last_timestep)

# Clamp at zero so a run that is already past `stop` does not report a
# negative amount of time left.
remaining_steps = max(stop - last_timestep, 0)
est_time = remaining_steps*average_speed
print("time left in run [h]: ",f'{est_time/3600:4.2f}')

### Estimate when the run will finish

In [None]:
# Take the wall-clock time once so both printed values share the same reference.
now = datetime.now()

# est_time holds seconds; it is converted to hours for the timedelta offset.
new_time = now + timedelta(hours=est_time/3600)

print("It's currently:", now)
print("Run estimated to finish at:", new_time)