In [1]:
%matplotlib inline

In [2]:
import pandas as pds
import numpy as np
import geopandas as gpd
import pickle
from pathlib import Path
pds.set_option('display.max_rows', 500)

In [3]:
# We need the following columns:
# ID, name, river, elev, lon, lat, country, obsbeg_d, obsend_d, obsbeg_h, obsend_h, gaps_h, typimpact, 
# degimpact, hierarchy, nextdownid 

# First we compute the fraction of gaps and start and end dates for unfiltered daily and hourly streamflow data

In [4]:
# Read streamflow pickle files
save_date = 'july20'
savepath = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\discharge_measurements\processed_by_hh\combined_gauges_LV_VI_raw_%s.p" % save_date)
combined_dict_npc_met_office = pickle.load(open( savepath, "rb" ) )
savepath = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\discharge_measurements\processed_by_hh\combined_gauges_LV_VI_raw_splitted_%s.p" % save_date)
splitted_gauge_dict = pickle.load(open( savepath, "rb" ) )
savepath = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\discharge_measurements\processed_by_hh\hourly_data_IMO_NPC_%s.p" % save_date)
hourly_dict = pickle.load(open( savepath, "rb" ) )
savepath = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\discharge_measurements\processed_by_hh\combined_gauges_LV_VI_highqual_splitted_%s.p" % save_date)
splitted_gauge_dict_filtered = pickle.load(open( savepath, "rb" ) )

# Read gauge shapefile
gauges_path = Path(r'C:\Users\hordurbhe\Dropbox\UW\lamah_ice\GIS\watersheds\final_watersheds\gauges_with_splitted_included_with_elevation.shp') 
gauges = gpd.read_file(gauges_path)
gauges = gauges.set_index('id')
cols = ['st_numer', 'st_nafn', 'vhm_numer', 'vatnsfall', 'elevation','geometry']
gauges_reduced = gauges[cols]
gauges_reduced.columns = ['V_no','name','VHM_no','river', 'elevation', 'geometry']
# gauges_reduced

In [5]:
# We update the key names in the hourly dict (for splitted gauges)
# Lists of old and new key names
old_key_names = ['V100', 'V112', 'V68']
new_key_names = ['V100_3', 'V112_2', 'V68_2']

# Create a new dictionary to store the updated keys
updated_hourly_dict = {}

# Loop through the old key names and update the keys
for old_key, new_key in zip(old_key_names, new_key_names):
    if old_key in hourly_dict:
        value = hourly_dict.pop(old_key)  # Extract the value of the old key
        updated_hourly_dict[new_key] = value  # Add a new entry with the updated key

# Add any remaining entries from the original dictionary
updated_hourly_dict.update(hourly_dict)

In [6]:
# First we create another dict without preceding and trailing nans
def drop_nan_boundaries(df):
    first_valid_idx = df['Value'].first_valid_index()
    last_valid_idx = df['Value'].last_valid_index()
    return df.loc[first_valid_idx:last_valid_idx]

# We remove nans from the front and back of the hourly series
gauge_data_nan_cleaned = dict()
for key in updated_hourly_dict.keys():
    gauge_data_nan_cleaned[key] = drop_nan_boundaries(updated_hourly_dict[key])
    
# We remove nans from the front and back of the daily series
gauge_data_nan_cleaned_daily = dict()
for key in splitted_gauge_dict.keys():
    gauge_data_nan_cleaned_daily[key] = drop_nan_boundaries(splitted_gauge_dict[key])
    
# We remove nans from the front and back of the daily filtered series
gauge_data_nan_cleaned_daily_filtered = dict()
for key in splitted_gauge_dict_filtered.keys():
    gauge_data_nan_cleaned_daily_filtered[key] = drop_nan_boundaries(splitted_gauge_dict_filtered[key])

In [7]:
# Now we calculate the fraction of missing data:

# Create lists to store gauge names and corresponding fraction of missing data
gauge_ids = []
fraction_missing_list = []

for gauge, df in gauge_data_nan_cleaned.items():
    gauge_id = gauges_reduced[gauges_reduced['V_no']==gauge].index[0]
    # Convert the index to datetime if not already in datetime format
    df.index = pds.to_datetime(df.index)
    
    # Calculate the total number of hours and the number of missing hours
    total_hours = len(df)
    missing_hours = df['Value'].isna().sum()
    
    # Calculate the fraction of missing data in percentage
    fraction_missing = (missing_hours / total_hours) * 1000
    
    # Append gauge name and fraction of missing data to the lists
    gauge_ids.append(gauge_id)
    fraction_missing_list.append(fraction_missing)

# Create a DataFrame from the lists
gaps_df = pds.DataFrame({'gaps_hourly': fraction_missing_list}, index=gauge_ids)
gaps_df = gaps_df.sort_index()
gauges_reduced['gaps_hourly'] = gaps_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [8]:
# add latitude and longitude columns
gauges_reduced['lat'] = gauges.geometry.apply(lambda p: int(p.y))
gauges_reduced['lon'] = gauges.geometry.apply(lambda p: int(p.x))

# replace NaNs in VHM_no with -1
gauges_reduced['VHM_no'] = pds.to_numeric(gauges_reduced['VHM_no'] , errors='coerce')
gauges_reduced['VHM_no']  = gauges_reduced['VHM_no'].fillna(-1)

# convert the column to integer data type
gauges_reduced['VHM_no'] = gauges_reduced['VHM_no'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [9]:
# Now we calculate the obsbeg and obsend

# First we process daily streamflow
indices = []
beginning_year = []
end_year = []
for idx in gauges_reduced.index:
    gauge = gauges_reduced.loc[idx]['V_no']
    if gauge in ['V100','V68','V112']:
        gauge=gauge+'_1'
    beginning_year.append(gauge_data_nan_cleaned_daily[gauge].index[0].year)
    end_year.append(gauge_data_nan_cleaned_daily[gauge].index[-1].year)
    indices.append(idx)
    
df = pds.DataFrame({'obsbeg_day': beginning_year,
                   'obsend_day': end_year},
                  index=indices)
gauges_reduced['obsbeg_day'] = df['obsbeg_day']
gauges_reduced['obsend_day'] = df['obsend_day']

# Now we process the hourly streamflow:
indices = []
beginning_year = []
end_year = []
for idx in gauges_reduced.index:
    gauge = gauges_reduced.loc[idx]['V_no']
    try:
        beginning_year.append(gauge_data_nan_cleaned[gauge].index[0].year)
        end_year.append(gauge_data_nan_cleaned[gauge].index[-1].year)
    except:
#         print('No hourly data for ID %s, %s' % (idx,gauge))
        beginning_year.append(-1)
        end_year.append(-1)
    indices.append(idx)
    
df = pds.DataFrame({'obsbeg_hr': beginning_year,
                   'obsend_hr': end_year},
                  index=indices)
gauges_reduced['obsbeg_hr'] = df['obsbeg_hr']
gauges_reduced['obsend_dr'] = df['obsend_hr']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [10]:
# Read gauge hierarchy and human impact .csv files
human_infl_path = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\lamah_ice\A_basins_total_upstrm\1_attributes\final_attributes\human_influence_cleaned_reordered.csv")
influence = pds.read_csv(human_infl_path,sep='\t',encoding='UTF-16')
influence = influence.set_index('id')

hierarchy_path = Path(r"C:\Users\hordurbhe\Dropbox\UW\lamah_ice\lamah_ice\B_basins_intermediate_all\1_attributes\Gauge_hierarchy.csv")
hier = pds.read_csv(hierarchy_path,sep=';')
hier = hier.set_index('ID')

In [11]:
gauges_reduced['typimpact'] = influence['typimpact']
gauges_reduced['degimpact'] = influence['Degree']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
gauges_reduced['country'] = 'ISL'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


# Export gauge attributes table as .csv and .shp 

In [47]:
save_path = Path(r"C:\Users\hordurbhe\Documents\Vinna\lamah\lamah_ice\lamah_ice\D_gauges\1_attributes\Gauge_attributes.csv")
gauges_reduced[['V_no','name','river','VHM_no',
       'gaps_hourly',  'obsbeg_day', 'obsend_day', 'obsbeg_hr',
       'obsend_dr', 'typimpact', 'degimpact','elevation','lat', 'lon','geometry']].to_csv(save_path)

In [18]:
# Also save a final version of the gauges shapefile
gauges_reduced.crs = 'EPSG:3057'
save_path = Path(r"C:\Users\hordurbhe\Documents\Vinna\lamah\lamah_ice\lamah_ice\D_gauges\3_shapefiles\gauges.shp")
gauges_reduced[['V_no','name','river','VHM_no',
       'gaps_hourly',  'obsbeg_day', 'obsend_day', 'obsbeg_hr',
       'obsend_dr', 'typimpact', 'degimpact','elevation','lat', 'lon','geometry']].to_file(save_path)

  'obsend_dr', 'typimpact', 'degimpact','elevation','lat', 'lon','geometry']].to_file(save_path)
