# In [20]:
## Data handlers
from dask_jobqueue import *
from dask.distributed import *
import xarray as xr
import numpy as np
import pandas as pd
import datetime
import netCDF4 
from dask import delayed
from dask import compute
from dask.diagnostics import*
from tqdm import tqdm
import dask

## Global config
import os, sys, glob
# os.path.join('..') is just '..', so this resolves to the parent of the
# current working directory.
module_path = os.path.abspath(os.path.join('..'))
# Make that directory importable (only once) before importing `config`;
# presumably config.py lives there -- confirm against the repository layout.
if module_path not in sys.path:
    sys.path.append(module_path)
import config 
import gc
import warnings
# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# Progress marker: all imports above succeeded.
print("Python modules load Complete")

## Collect every routing HIST file under the LIS output directory, in sorted path order
route_files = sorted(
    glob.glob('/home/civil/phd/cez218606/LISF1/PDDS/output/LSM_MERRA2_1/ROUTING/*/*HIST*')
)

## Find the valid stations: those with at least 5*365 non-missing streamflow observations. ##
## (The earliest year of observation is not determined in this step.) ##
## The per-station checks are wrapped in dask.delayed so they can run in parallel. ##

def valid_stations(read_dir, min_values, key):
    """Return the gauge ID of station ``key`` if it has enough observations.

    The file ``IWM-gauge-<key zero-padded to 3 digits>.csv`` is read from
    ``read_dir`` and the non-missing entries of its 'Streamflow (cumecs)'
    column are counted.

    Parameters
    ----------
    read_dir : str
        Directory holding the per-gauge CSV files. A trailing path separator
        is optional.
    min_values : int
        Minimum number of non-missing streamflow values required.
    key : int
        Station number used to build the gauge ID.

    Returns
    -------
    str or None
        The gauge ID when the station qualifies; ``None`` when the file is
        missing or unreadable, is not a parseable CSV, lacks the streamflow
        column, or holds fewer than ``min_values`` observations.
    """
    try:
        gauge_id = 'IWM-gauge-' + format(key, '03')
        # os.path.join tolerates read_dir with or without a trailing slash.
        station = pd.read_csv(os.path.join(read_dir, gauge_id + '.csv'))
        # Series.count() already ignores NaN, so no dropna is needed first.
        value_count = station['Streamflow (cumecs)'].count()
    except (OSError, KeyError, ValueError):
        # Expected data problems only (absent/unreadable file, missing column,
        # empty or malformed CSV). Anything else is a real bug and propagates.
        return None
    return gauge_id if value_count >= min_values else None
## Scan settings: where the gauge CSVs live, the minimum record length
## (five years of daily values) and how many station numbers to try.
read_dir = '/home/civil/phd/cez218606/LISF1/PDDS/output/Narmada/'
min_values = 5 * 365
n_stations = 3900

## One lazy task per station number; nothing is read until dask.compute runs.
dask_results = [
    dask.delayed(valid_stations)(read_dir, min_values, station_no)
    for station_no in range(n_stations)
]
dask_compute = dask.compute(*dask_results)

## Keep only the stations that qualified (the check returns None otherwise).
## NOTE: from here on the name `valid_stations` refers to this list of gauge
## IDs, no longer to the function defined above.
valid_stations = [gauge for gauge in dask_compute if gauge is not None]
## Copy each valid gauge file, re-indexed by 'Date', into the processed-data directory ##
for key in valid_stations:
    station = pd.read_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/Narmada/' + key + '.csv')
    station = station.set_index('Date')
    station.to_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-obs.csv')

## Store the list of valid gauge IDs as a one-column CSV ##
valid_df = pd.DataFrame({"stations": valid_stations})
valid_df.to_csv(
    '/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/valid_stations_list.csv',
    index=False,
)

## LIS file indices marking the batch boundaries. They were found by trial and
## error so that the LIS index lines up with the observation start date
## (1959-04-06). The earlier, longer list is kept below for reference. ##
#batch_index = [12053,12755,13512,14242,14973,15703,16434,17164,17894,18624,19354,20084,20814,21053,22274,23004,23734,24470]
batch_index = [0, 364, 730, 1095]

## Gauge metadata table, looked up by GaugeID ##
meta_file = (
    pd.read_csv("/home/civil/phd/cez218606/LISF1/PDDS/output/Narmada.csv")
    .reset_index(drop=True)
    .set_index('GaugeID')
)

## Variables to pull out of the routing output ##
route_vars = ['Streamflow_tavg']

## One empty slot per valid gauge for the per-batch and the accumulated extractions ##
batch_vars = [None for _ in valid_stations]
merged_vars = [None for _ in valid_stations]

#%%time
## Open all LIS routing files as one dataset with "open_mfdataset", reformat it
## with the project helper, then load it into RAM with ".compute" ##
routedat = xr.open_mfdataset(route_files, combine='by_coords', parallel=True)
routedat = config.reformat_LIS_output(routedat)
#routedat = routedat.chunk({'time':365})
routedat = routedat.compute()

## Bind the selection name up front so the clean-up below cannot raise
## NameError when there are no valid stations and the loop never runs ##
routedat_sel = None

## For every gauge, take the grid cell nearest to its coordinates and keep the
## requested simulated variables ##
for i, gauge_id in enumerate(valid_stations):
    gauge_lat = meta_file.loc[gauge_id, 'Latitude']
    gauge_lon = meta_file.loc[gauge_id, 'Longitude']
    routedat_sel = routedat.sel(lat=gauge_lat, lon=gauge_lon, method='nearest')
    ext_route = routedat_sel[route_vars].to_dataframe()

    # Drop duplicated columns and the lat/lon columns that to_dataframe()
    # carries along with the data variables.
    batch_vars[i] = (
        ext_route.loc[:, ~ext_route.columns.duplicated()]
        .drop(['lat', 'lon'], axis=1)
    )
    # pd.concat silently skips a None entry, so this works both for the first
    # batch (slot still None) and for appending to earlier batches.
    merged_vars[i] = pd.concat([merged_vars[i], batch_vars[i]])

## Purge the large objects to free up RAM ##
del routedat
del routedat_sel
gc.collect()
## Write each gauge's extracted simulation series to "<gauge>-sim.csv" ##
for key, gauge_frame in zip(valid_stations, merged_vars):
    gauge_frame.to_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-sim.csv')

## The extractions are on disk now, so the in-memory copies can go ##
del merged_vars

## Put the simulated columns next to the observed ones, restricted to the
## observed dates, and write the result to "<gauge>-merged.csv" ##
for key in valid_stations:
    obs = pd.read_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-obs.csv')
    obs = obs.set_index('Date')
    obs.index = pd.to_datetime(obs.index)

    sim = pd.read_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-sim.csv')
    sim = sim.set_index('time')
    sim.index = pd.to_datetime(sim.index)

    # Column-wise concat gives the union of dates; reindexing on the observed
    # index then keeps only rows that exist in the observations.
    side_by_side = pd.concat([obs, sim], axis=1)
    obs = side_by_side.reindex(obs.index)
    obs.to_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-merged.csv')

def _paired_values(x, y):
    """Return x and y as float arrays limited to positions where both are non-NaN."""
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    keep = ~(np.isnan(x_arr) | np.isnan(y_arr))
    return x_arr[keep], y_arr[keep]


def covariance(x, y):
    """Return the population covariance (divisor n) of two equal-length series.

    Values are paired by position. Pairs in which either value is NaN are
    ignored, so gaps in one series no longer turn the whole result into NaN.
    Returns NaN when no complete pair is left.
    """
    x_arr, y_arr = _paired_values(x, y)
    if x_arr.size == 0:
        return np.nan
    return np.dot(x_arr - x_arr.mean(), y_arr - y_arr.mean()) / x_arr.size


def corrrelation(x, y):
    """Return the Pearson correlation coefficient of two equal-length series.

    The standard deviations use the same divisor (n) and the same complete
    pairs as ``covariance``. Mixing in the pandas default sample standard
    deviation (ddof=1) would shrink the coefficient by (n - 1) / n.
    Returns NaN when there is no complete pair or either series is constant.
    """
    x_arr, y_arr = _paired_values(x, y)
    if x_arr.size == 0:
        return np.nan
    spread = x_arr.std() * y_arr.std()  # numpy std defaults to ddof=0
    if spread == 0:
        return np.nan
    return covariance(x_arr, y_arr) / spread


def R2(x, y):
    """Return the squared Pearson correlation of x and y."""
    return corrrelation(x, y) ** 2






# Inputs for the error-metric stage: the valid-station list and the gauge
# metadata table keyed by GaugeID.
stations_list = pd.read_csv(
    '/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/valid_stations_list.csv'
)
meta_file = (
    pd.read_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/Narmada.csv')
    .reset_index(drop=True)
    .set_index('GaugeID')
)

# Objective functions from the project config module and their labels.
# NOTE(review): the seventh label is 'MAPE' while the seventh function is
# config.rrmse -- confirm which one is intended.
obj_fn = [
    config.pbias,
    config.nashsutcliffe,
    config.kge,
    config.rmse,
    config.correlationcoefficient,
    config.mae,
    config.rrmse,
]
obj_fn_names = ['PBIAS', 'NSE', 'KGE', 'RMSE', 'CORR', 'MAE', 'MAPE']

# Empty error table indexed by GaugeID.
error_columns = [
    'GaugeID', 'Latitude', 'Longitude',
    'PBIAS', 'NSE', 'KGE', 'RMSE', 'CORR', 'MAE', 'RRMSE',
    'obs_avg', 'sim_avg', 'MAPE', 'R2',
]
error_df = pd.DataFrame(columns=error_columns).set_index('GaugeID')

# First station ID in the list.
key = stations_list.at[0, 'stations']



# Load every station's merged obs/sim table, index it by date and keep the
# 2001-01-01 .. 2002-12-30 window. Rows with NaNs are kept.
stations = []
for key in stations_list['stations']:
    station = pd.read_csv(
        '/home/civil/phd/cez218606/LISF1/PDDS/output/processed_data1/' + key + '-merged.csv'
    ).set_index('Date')
    station.index = pd.to_datetime(station.index)
    #station.dropna(inplace=True)
    stations.append(station.loc['2001-01-01':'2002-12-30'])

# Column layout of the error table. It is built once, before the loop, so the
# table exists with the right columns even when no station yields a row.
error_columns = ['GaugeID','Latitude','Longitude','PBIAS','NSE','KGE','CORR','Alpha','Beta','RMSE','MAE','RRMSE','obs_avg','sim_avg','NMAE','R2','r_var']
error_df = pd.DataFrame(columns = error_columns).set_index('GaugeID')

# One single-row frame per station; a slot stays None when that station fails.
new_row = [None]*len(stations)
for i, station in enumerate(stations):
    try:
        obs_flow = station['Streamflow (cumecs)']
        sim_flow = station['Streamflow_tavg']
        meta_key = stations_list.loc[i,'stations']
        s_info = {'lat':meta_file.at[meta_key,'Latitude'],'lon':meta_file.at[meta_key,'Longitude']}
        # Evaluate KGE once and reuse its four components; they fill the
        # KGE, CORR, Alpha and Beta columns in that order.
        # NOTE(review): return_all is passed as the string "True", not the
        # boolean -- left untouched because config.kge is not visible here.
        kge_parts = config.kge(obs_flow,sim_flow,return_all="True")
        new_row[i] = pd.DataFrame([[meta_key,s_info['lat'],s_info['lon'],
                                    round(config.pbias(obs_flow,sim_flow),3),
                                    round(config.nashsutcliffe(obs_flow,sim_flow),3),
                                    round(kge_parts[0],3),
                                    round(kge_parts[1],3),
                                    round(kge_parts[2],3),
                                    round(kge_parts[3],3),
                                    round(config.rmse(obs_flow,sim_flow),3),
                                    round(config.mae(obs_flow,sim_flow),3),
                                    round(config.rrmse(obs_flow,sim_flow),3),
                                    round(obs_flow.mean(),3),
                                    round(sim_flow.mean(),3),
                                    round(config.nmae(obs_flow,sim_flow),3),
                                    R2(obs_flow,sim_flow),
                                    # r_var: inverse variance of the residuals (sim - obs).
                                    (1/((sim_flow-obs_flow).std())**2)]],
                                  columns=error_columns).set_index('GaugeID')
    except Exception as e:
        # Best effort: report the problem and move on to the next station.
        print(str(e))
        continue

# DataFrame.append was removed in pandas 2.0, so stack the rows with pd.concat.
# Slots left as None by failed stations are skipped.
computed_rows = [row for row in new_row if row is not None]
if computed_rows:
    error_df = pd.concat(computed_rows)
# wf: each station's share of the total inverse residual variance.
error_df['wf'] = (error_df['r_var'])/(error_df['r_var'].sum())
error_df.to_csv('/home/civil/phd/cez218606/LISF1/PDDS/output/error_metrix/error_matrix1.csv')
print("Python Run Complete")

# Output:
# Python modules load Complete
# Python Run Complete
