In [1]:
import glob
import os
from datetime import date, timedelta

import numpy as np
import pandas as pd

In [33]:
# Input locations: GRDC daily / monthly station exports and the watershed table folder
daily_folder = r"Z:\PhD_Datasets&Analysis\Info_Inputs\Streamflow_Stations\Climate_Sensitive_Stations-GRDC\2025-02-13_17-18_Daily"
monthly_folder = r"Z:\PhD_Datasets&Analysis\Info_Inputs\Streamflow_Stations\Climate_Sensitive_Stations-GRDC\2025-02-13_17-18_Monthly"
watersheds_folder = r"Z:\PhD_Datasets&Analysis\Info_Inputs\Streamflow_Sts_Drainage_Areas\GRDC_Watersheds"
ext = "*.txt"  # glob pattern used to list the station files

# TerraClimate available period (first and last year, both inclusive)
terra_st_yr, terra_ed_yr = 1958, 2023
sdate = date(year=terra_st_yr, month=1, day=1)    # first day of the period
edate = date(year=terra_ed_yr, month=12, day=31)  # last day of the period

In [34]:
# Every calendar day from sdate to edate (inclusive) as 'YYYY-MM-DD' strings
period_dates = pd.date_range(start=sdate, end=edate, freq="D").strftime('%Y-%m-%d').tolist()

# Column-less frame indexed by those dates; station columns are joined onto it later
daily_dates_frame = pd.DataFrame({"YYYY-MM-DD": period_dates})
daily_df_sts = daily_dates_frame.set_index("YYYY-MM-DD")
daily_df_sts

1958-01-01
1958-01-02
1958-01-03
1958-01-04
1958-01-05
...
2023-12-27
2023-12-28
2023-12-29
2023-12-30
2023-12-31


In [35]:
# Daily processing: parse every daily station file and join the stations on the TerraClimate dates
DAILY_HEADER_LINES = 37  # data rows start after this many leading lines in each daily file
DAILY_NO_DATA = -999     # value used in the files to flag a missing observation


def _read_daily_station(path, station_id, header_lines=DAILY_HEADER_LINES):
    """Parse one daily station file into a single-column DataFrame.

    Parameters
    ----------
    path : str
        Text file to read (decoded as ISO-8859-1, fields separated by ';').
    station_id : str
        Label given to the value column.
    header_lines : int
        Number of leading lines discarded before the data rows.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by the first field of each row ("YYYY-MM-DD") holding the
        last field as float; no-data entries are stored as NaN. None when the
        file contains no parsable row.
    """
    with open(path, 'r', encoding='ISO-8859-1') as in_file:
        data_lines = in_file.readlines()[header_lines:]

    dates, values = [], []
    for data_line in data_lines:
        fields = data_line.split(";")

        try:
            value = float(fields[-1])
        except ValueError:
            print(f"Skipping invalid value in {path}: {fields[-1]}")
            continue

        # float() accepts "nan"/"inf", which int() cannot convert; treat them as missing.
        # int() truncation is kept so any value in (-1000, -999] counts as no-data.
        if not np.isfinite(value) or int(value) == DAILY_NO_DATA:
            value = np.nan

        dates.append(fields[0])
        values.append(value)

    if not values:
        return None
    return pd.DataFrame({"YYYY-MM-DD": dates, station_id: values}).set_index("YYYY-MM-DD")


# Sorted so the column order does not depend on the file-system listing order
archivos = sorted(glob.glob(os.path.join(daily_folder, ext)))

daily_station_frames = []
for archivo in archivos:
    file_name = os.path.basename(archivo)
    print("Reading file: " + file_name)

    # The station id is the part of the file name before the first underscore
    id_station = file_name.split("_")[0]

    station_df = _read_daily_station(archivo, id_station)
    if station_df is None:  # This avoids saving files with no station data
        print(f"Skipping station {id_station} due to no data")
        continue
    daily_station_frames.append(station_df)

# Restart from the bare date index so re-running this cell does not fail with
# "columns overlap", then left-join all stations in a single call instead of
# one join per file.
daily_df_sts = daily_df_sts.iloc[:, :0]
if daily_station_frames:
    daily_df_sts = daily_df_sts.join(daily_station_frames)  # left join on index

# Drop columns with all NaN values
daily_df_cleaned = daily_df_sts.dropna(axis=1, how='all')
daily_df_cleaned

Reading file: 1257100_Q_Day.Cmd.txt
Reading file: 1309620_Q_Day.Cmd.txt
Reading file: 1769100_Q_Day.Cmd.txt
Reading file: 2106100_Q_Day.Cmd.txt
Reading file: 2178200_Q_Day.Cmd.txt
Reading file: 2178951_Q_Day.Cmd.txt
Reading file: 2178960_Q_Day.Cmd.txt
Reading file: 2180400_Q_Day.Cmd.txt
Reading file: 2180600_Q_Day.Cmd.txt
Reading file: 2180711_Q_Day.Cmd.txt
Reading file: 2180712_Q_Day.Cmd.txt
Reading file: 2181100_Q_Day.Cmd.txt
Reading file: 2181200_Q_Day.Cmd.txt
Reading file: 2181300_Q_Day.Cmd.txt
Reading file: 2181500_Q_Day.Cmd.txt
Reading file: 2182050_Q_Day.Cmd.txt
Reading file: 2182150_Q_Day.Cmd.txt
Reading file: 2182250_Q_Day.Cmd.txt
Reading file: 2335950_Q_Day.Cmd.txt
Skipping station 2335950 due to no data
Reading file: 2517500_Q_Day.Cmd.txt
Reading file: 2517550_Q_Day.Cmd.txt
Reading file: 2517600_Q_Day.Cmd.txt
Reading file: 2517610_Q_Day.Cmd.txt
Skipping station 2517610 due to no data
Reading file: 2517700_Q_Day.Cmd.txt
Skipping station 2517700 due to no data
Reading file: 25

Unnamed: 0_level_0,1257100,1309620,1769100,2106100,2178200,2178951,2178960,2180400,2180600,2180711,...,6854593,6854601,6854713,6854714,6854950,6855409,6855411,6855412,6870640,6887300
YYYY-MM-DD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1958-01-01,147.60,,,,,,,,,,...,4.9,,2.20,,,,,,,
1958-01-02,151.55,,,,,,,,,,...,4.7,,2.10,,,,,,,
1958-01-03,155.37,,,,,,,,,,...,4.7,,2.10,,,,,,,
1958-01-04,158.21,,,,,,,,,,...,4.7,,2.00,,,,,,,
1958-01-05,160.89,,,,,,,,,,...,4.5,,2.00,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,,,,,,,,,,,...,2.9,7.99,2.45,2.12,13.14,4.63,1.09,7.16,10.34,
2023-12-28,,,,,,,,,,,...,2.7,7.91,2.44,2.09,11.94,4.53,1.08,7.08,10.23,
2023-12-29,,,,,,,,,,,...,2.7,7.84,2.41,2.06,11.74,4.46,1.07,7.01,10.15,
2023-12-30,,,,,,,,,,,...,2.7,7.76,2.40,2.03,11.58,4.35,1.06,6.93,9.99,


In [36]:
# 'YYYY-MM' labels for every month of the TerraClimate period, in chronological order
years = range(terra_st_yr, terra_ed_yr + 1)
months = range(1, 13)
yr_mth = [f"{year}-{month:02d}" for year in years for month in months]

# Column-less frame indexed by those labels; station columns are joined onto it later
monthly_labels_frame = pd.DataFrame({"YYYY-MM": yr_mth})
monthly_df_sts = monthly_labels_frame.set_index("YYYY-MM")
monthly_df_sts

1958-01
1958-02
1958-03
1958-04
1958-05
...
2023-08
2023-09
2023-10
2023-11
2023-12


In [37]:
# Table describing the watersheds of the CSS stations (read with pandas' default UTF-8 decoding).
# Per the author's note it holds only 1,236 records, because GRDC did not provide
# delineated watersheds for 9 CSS stations.
# Column mapping: grdc_no == station_no, area == CATCHMENT_SIZE
watersheds_csv = watersheds_folder + "\\CSS-GRDC_Watersheds.csv"
drain_areas_df = pd.read_csv(watersheds_csv)
drain_areas_df

Unnamed: 0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source
0,1257100.0,OKAVANGO RIVER,RUNDU (64932101),97300.0,1060.0,-17.9000,19.7500,-17.9062,19.7479,0.7,103517.8,Medium,Automatic,Area difference 5-10% and distance <= 5 km,hydrosheds
1,1309620.0,OUED OUM ER RBI',TARHAT,1036.0,866.0,33.0000,-5.6700,32.9979,-5.6521,1.7,1019.1,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
2,1769100.0,NYANDO,AHERO BRIDGE,2625.0,-999.0,-0.1688,34.9146,-0.1688,34.9146,0.0,2956.1,Low,Manual,"Not sure, but could be ok",hydrosheds
3,2178200.0,FUTUN XI,SHAO-WU-HSIEN,2745.0,-999.0,27.3000,117.5000,27.3188,117.5271,3.4,3053.1,Low,Automatic,Area difference 10-50% and distance <= 5 km,hydrosheds
4,2178951.0,HUAI HE,CHANGTAIGUAN,3090.0,-999.0,32.3142,114.0606,32.3104,114.0646,0.6,3023.2,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231,6855409.0,VUOKSI,"HAAPAJOKI, PUTKULANKOSKI",596.0,155.0,63.0975,31.0295,63.0896,31.0396,1.0,500.8,Low,Automatic,Area difference 10-50% and distance <= 5 km,hydrosheds
1232,6855411.0,VUOKSI,KAJOONJARVI - OUTLET,125.0,167.0,63.1519,28.8992,63.1440,28.9050,0.9,126.5,High,Automatic,Area difference <= 5% and distance <= 5 km,merit
1233,6855412.0,VUOKSI,LOHNAJARVI - OUTLET,788.0,80.0,61.8351,28.3028,61.8320,28.2940,0.6,778.2,High,Automatic,Area difference <= 5% and distance <= 5 km,merit
1234,6870640.0,KOUTAJOKI,OULANKAJOKI,1986.0,160.0,66.3693,29.3152,66.3670,29.3230,0.4,1954.1,High,Automatic,Area difference <= 5% and distance <= 5 km,merit


In [None]:
# Two conditions must hold for a watershed to be kept:
#  1. Its area exceeds three TerraClimate pixels. With a 4 km pixel that is
#     4 x 4 x 3 = 48 square km. This eases the zonal statistics that are run after
#     the water balance to extract values from the resulting surfaces.
#  2. Its delineation quality is "High".
is_large_enough = drain_areas_df["area"].gt(48)
is_high_quality = drain_areas_df["quality"].eq("High")
filtered_drain_areas_df = drain_areas_df.loc[is_large_enough & is_high_quality]
filtered_drain_areas_df

Unnamed: 0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source
1,1309620.0,OUED OUM ER RBI',TARHAT,1036.0,866.0,33.0000,-5.6700,32.9979,-5.6521,1.7,1019.1,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
4,2178951.0,HUAI HE,CHANGTAIGUAN,3090.0,-999.0,32.3142,114.0606,32.3104,114.0646,0.6,3023.2,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
5,2178960.0,BAILU HE,BAIQUEYUAN,284.0,-999.0,31.7819,115.0944,31.7812,115.1021,0.7,290.7,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
6,2180400.0,HUANG SHUI,MINHE,15342.0,-999.0,36.3358,102.8197,36.3354,102.8187,0.1,15566.5,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
7,2180600.0,WEI HE,WUSHAN,8080.0,-999.0,34.7208,104.8858,34.7271,104.8854,0.7,8121.9,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227,6854601.0,IIJOKI,SUOLIJARVI -OUTLET,1313.0,150.0,65.1446,28.0690,65.1438,28.0646,0.2,1290.9,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds
1228,6854713.0,KEMIJOKI,OUNASJARVI - OUTLET,363.0,290.0,68.3961,23.7524,68.3960,23.7430,0.4,364.6,High,Automatic,Area difference <= 5% and distance <= 5 km,merit
1232,6855411.0,VUOKSI,KAJOONJARVI - OUTLET,125.0,167.0,63.1519,28.8992,63.1440,28.9050,0.9,126.5,High,Automatic,Area difference <= 5% and distance <= 5 km,merit
1233,6855412.0,VUOKSI,LOHNAJARVI - OUTLET,788.0,80.0,61.8351,28.3028,61.8320,28.2940,0.6,778.2,High,Automatic,Area difference <= 5% and distance <= 5 km,merit


In [39]:
# Integer station numbers of the watersheds that passed the area/quality filter
sts_ids = [int(station_no) for station_no in filtered_drain_areas_df["grdc_no"]]
print(sts_ids)

[1309620, 2178951, 2178960, 2180400, 2180600, 2180712, 2181200, 2181500, 2335950, 2517550, 2517610, 2517700, 2517710, 2517750, 2517850, 2517910, 2517920, 2517940, 2517950, 2817310, 3617110, 3617120, 3617811, 3617812, 3617814, 3617820, 3618051, 3618052, 3618115, 3618500, 3618700, 3618710, 3618711, 3618715, 3618720, 3618721, 3618722, 3618731, 3618950, 3618951, 3621200, 3623100, 3624200, 3624201, 3624250, 3624400, 3624500, 3625000, 3626000, 3627000, 3628400, 3628401, 3628701, 3628900, 3629001, 3630600, 3631100, 3632400, 3632401, 3632450, 3633120, 3633121, 3633122, 3633123, 3633160, 3633180, 3633300, 3633301, 3633320, 3634100, 3634150, 3634160, 3634200, 3634220, 3634320, 3634340, 3634350, 3634360, 3634370, 3634380, 3635010, 3635030, 3635035, 3635040, 3635041, 3635060, 3635100, 3635300, 3635301, 3635310, 3635340, 3635360, 3635402, 3635403, 3635408, 3635410, 3635420, 3635430, 3635440, 3635451, 3635551, 3635600, 3635650, 3635651, 3635660, 3636100, 3636200, 3636201, 3636202, 3636500, 3636501, 

In [40]:
# Monthly processing: parse the monthly files of the retained stations and join them
# on the TerraClimate months
MONTHLY_HEADER_LINES = 39  # data rows start after this many leading lines in each monthly file
MONTHLY_NO_DATA = -999     # value used in the files to flag a missing observation


def _read_monthly_station(path, station_id, header_lines=MONTHLY_HEADER_LINES):
    """Parse one monthly station file into a single-column DataFrame.

    Parameters
    ----------
    path : str
        Text file to read (decoded as ISO-8859-1, fields separated by ';').
    station_id : str
        Label given to the value column.
    header_lines : int
        Number of leading lines discarded before the data rows.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by "YYYY-MM" (first two '-'-separated parts of the first
        field) holding the second-to-last field as float; no-data entries are
        stored as NaN. None when the file contains no parsable row.
    """
    with open(path, 'r', encoding='ISO-8859-1') as in_file:
        data_lines = in_file.readlines()[header_lines:]

    dates, values = [], []
    for data_line in data_lines:
        fields = data_line.split(";")

        # A blank or truncated line has no second-to-last field; indexing it would raise IndexError
        if len(fields) < 2:
            if data_line.strip():
                print(f"Skipping malformed line in {path}: {data_line.strip()}")
            continue

        try:
            value = float(fields[-2])
        except ValueError:
            print(f"Skipping invalid value in {path}: {fields[-2]}")
            continue

        # float() accepts "nan"/"inf", which int() cannot convert; treat them as missing.
        # int() truncation is kept so any value in (-1000, -999] counts as no-data.
        if not np.isfinite(value) or int(value) == MONTHLY_NO_DATA:
            value = np.nan

        dates.append('-'.join(fields[0].split("-")[0:2]))
        values.append(value)

    if not values:
        return None
    return pd.DataFrame({"YYYY-MM": dates, station_id: values}).set_index("YYYY-MM")


# Sorted so the column order does not depend on the file-system listing order
archivos = sorted(glob.glob(os.path.join(monthly_folder, ext)))
watershed_station_ids = set(sts_ids)  # constant-time membership test inside the loop

monthly_station_frames = []
for archivo in archivos:
    file_name = os.path.basename(archivo)
    print("Reading file: " + file_name)

    # The station id is the part of the file name before the first underscore
    id_station = file_name.split("_")[0]

    if int(id_station) not in watershed_station_ids:  # This avoids reading files of stations that do not have their respective watersheds
        continue

    station_df = _read_monthly_station(archivo, id_station)
    if station_df is None:  # This avoids saving files with no station data
        print(f"Skipping station {id_station} due to no data")
        continue
    monthly_station_frames.append(station_df)

# Restart from the bare month index so re-running this cell does not fail with
# "columns overlap", then left-join all stations in a single call instead of
# one join per file.
monthly_df_sts = monthly_df_sts.iloc[:, :0]
if monthly_station_frames:
    monthly_df_sts = monthly_df_sts.join(monthly_station_frames)  # left join on index

# Drop columns with all NaN values
monthly_df_cleaned = monthly_df_sts.dropna(axis=1, how='all')
monthly_df_cleaned

Reading file: 1257100_Q_Month.txt
Reading file: 1309620_Q_Month.txt
Reading file: 1769100_Q_Month.txt
Reading file: 2106100_Q_Month.txt
Reading file: 2178200_Q_Month.txt
Reading file: 2178951_Q_Month.txt
Reading file: 2178960_Q_Month.txt
Reading file: 2180400_Q_Month.txt
Reading file: 2180600_Q_Month.txt
Reading file: 2180711_Q_Month.txt
Reading file: 2180712_Q_Month.txt
Reading file: 2181100_Q_Month.txt
Reading file: 2181200_Q_Month.txt
Reading file: 2181300_Q_Month.txt
Reading file: 2181500_Q_Month.txt
Reading file: 2182050_Q_Month.txt
Reading file: 2182150_Q_Month.txt
Reading file: 2182250_Q_Month.txt
Reading file: 2335950_Q_Month.txt
Reading file: 2517500_Q_Month.txt
Reading file: 2517550_Q_Month.txt
Reading file: 2517600_Q_Month.txt
Reading file: 2517610_Q_Month.txt
Reading file: 2517700_Q_Month.txt
Reading file: 2517710_Q_Month.txt
Reading file: 2517750_Q_Month.txt
Reading file: 2517850_Q_Month.txt
Reading file: 2517910_Q_Month.txt
Reading file: 2517920_Q_Month.txt
Reading file: 

Unnamed: 0_level_0,1309620,2178951,2178960,2180400,2180600,2180712,2181200,2181500,2517550,2517920,...,6742701,6744200,6744500,6830101,6830103,6854601,6854713,6855411,6855412,6870640
YYYY-MM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1958-01,,,,,,,,,,102.806,...,,101.384,67.277,,,,1.935,,,
1958-02,,,,,,,,,,102.607,...,,359.689,458.993,,,,1.771,,,
1958-03,,,,,,,,,,116.161,...,,353.968,283.710,,,,1.813,,,
1958-04,,,,,,,,,,228.267,...,,567.067,422.700,,,,1.650,,,
1958-05,,,,,,,,,,293.065,...,,510.806,298.016,,,,5.655,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08,,,,,,,,,,,...,,,,3.768,11.174,17.029,3.885,2.418,1.490,19.073
2023-09,,,,,,,,,,,...,,,,4.716,16.677,13.024,7.144,1.497,1.316,40.910
2023-10,,,,,,,,,,,...,,,,6.202,9.018,32.953,6.251,3.855,6.647,38.705
2023-11,,,,,,,,,,,...,,,,3.392,7.576,10.605,2.982,1.994,11.567,13.254


The following steps identify stations with at least 30 complete water years (Oct-Sep) in a DataFrame where:

- The index is in YYYY-MM format.
- The columns are stations, each holding its monthly values over the multi-year record.

1. Convert the index to a DateTime format\
Since the index is in YYYY-MM format, convert it to a proper datetime format for easier filtering and resampling.

In [41]:
# Parse the 'YYYY-MM' labels into timestamps (first day of each month), then attach
# them to a copy so monthly_df_cleaned keeps its original string index
monthly_timestamps = pd.to_datetime(monthly_df_cleaned.index, format='%Y-%m')
reindexed_monthly_df_cleaned = monthly_df_cleaned.copy()
reindexed_monthly_df_cleaned.index = monthly_timestamps
reindexed_monthly_df_cleaned

Unnamed: 0_level_0,1309620,2178951,2178960,2180400,2180600,2180712,2181200,2181500,2517550,2517920,...,6742701,6744200,6744500,6830101,6830103,6854601,6854713,6855411,6855412,6870640
YYYY-MM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1958-01-01,,,,,,,,,,102.806,...,,101.384,67.277,,,,1.935,,,
1958-02-01,,,,,,,,,,102.607,...,,359.689,458.993,,,,1.771,,,
1958-03-01,,,,,,,,,,116.161,...,,353.968,283.710,,,,1.813,,,
1958-04-01,,,,,,,,,,228.267,...,,567.067,422.700,,,,1.650,,,
1958-05-01,,,,,,,,,,293.065,...,,510.806,298.016,,,,5.655,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-01,,,,,,,,,,,...,,,,3.768,11.174,17.029,3.885,2.418,1.490,19.073
2023-09-01,,,,,,,,,,,...,,,,4.716,16.677,13.024,7.144,1.497,1.316,40.910
2023-10-01,,,,,,,,,,,...,,,,6.202,9.018,32.953,6.251,3.855,6.647,38.705
2023-11-01,,,,,,,,,,,...,,,,3.392,7.576,10.605,2.982,1.994,11.567,13.254


2. Define Water Years (Oct-Sep)\
The water year starts in October and ends in September of the following year. Each water year is labelled with the calendar year of its September (e.g., October 1999 to September 2000 is water year 2000).

In [42]:
# Label each month with its water year: January-September keep their calendar year,
# October-December belong to the water year that ends the following September
reindexed_monthly_df_cleaned['water_year'] = [
    month_start.year + 1 if month_start.month >= 10 else month_start.year
    for month_start in reindexed_monthly_df_cleaned.index
]
reindexed_monthly_df_cleaned

Unnamed: 0_level_0,1309620,2178951,2178960,2180400,2180600,2180712,2181200,2181500,2517550,2517920,...,6744200,6744500,6830101,6830103,6854601,6854713,6855411,6855412,6870640,water_year
YYYY-MM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1958-01-01,,,,,,,,,,102.806,...,101.384,67.277,,,,1.935,,,,1958
1958-02-01,,,,,,,,,,102.607,...,359.689,458.993,,,,1.771,,,,1958
1958-03-01,,,,,,,,,,116.161,...,353.968,283.710,,,,1.813,,,,1958
1958-04-01,,,,,,,,,,228.267,...,567.067,422.700,,,,1.650,,,,1958
1958-05-01,,,,,,,,,,293.065,...,510.806,298.016,,,,5.655,,,,1958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-01,,,,,,,,,,,...,,,3.768,11.174,17.029,3.885,2.418,1.490,19.073,2023
2023-09-01,,,,,,,,,,,...,,,4.716,16.677,13.024,7.144,1.497,1.316,40.910,2023
2023-10-01,,,,,,,,,,,...,,,6.202,9.018,32.953,6.251,3.855,6.647,38.705,2024
2023-11-01,,,,,,,,,,,...,,,3.392,7.576,10.605,2.982,1.994,11.567,13.254,2024


3. Count Complete Water Years for Each Station\
Now, we group by water year and count the number of non-null monthly values per station. A complete water year must have 12 valid values for a given station.

This method:

- Selects only stations with 30+ complete water years (each with all 12 months).

- Tolerates datasets with missing months, since an incomplete water year is simply not counted.

In [43]:
# Number of non-null monthly values per station within each water year
valid_months_per_wy = reindexed_monthly_df_cleaned.groupby('water_year').count()

# A water year is complete when all 12 months are present; keep stations with at least
# 30 such years. Per the author's note, this mirrors the approach of the TerraClimate
# authors for results validation.
complete_wy_per_station = valid_months_per_wy.eq(12).sum(axis=0)
stations_with_30_wy = complete_wy_per_station.ge(30)
selected_stations = [station for station, keep in stations_with_30_wy.items() if keep]
print(selected_stations)  # List of stations with at least 30 complete water years
print("The final amount of stations with at least 30 complete water years is", len(selected_stations))

['3617110', '3617811', '3617812', '3617814', '3618051', '3618052', '3618500', '3618700', '3618711', '3618720', '3618721', '3618722', '3618950', '3618951', '3621200', '3623100', '3625000', '3626000', '3627000', '3628400', '3628701', '3628900', '3629001', '3630600', '3631100', '3632400', '3633120', '3633123', '3633160', '3633180', '3633301', '3634150', '3634160', '3634320', '3634340', '3634350', '3634360', '3634370', '3635030', '3635035', '3635040', '3635041', '3635301', '3635310', '3635360', '3635402', '3635408', '3635430', '3635440', '3635451', '3635600', '3635650', '3635651', '3637180', '3637380', '3637771', '3637772', '3637773', '3637774', '3637775', '3637790', '3637810', '3637910', '3638050', '3638051', '3638700', '3649010', '3649030', '3649050', '3649110', '3649130', '3649150', '3649151', '3649160', '3649210', '3649211', '3649240', '3649251', '3649310', '3649311', '3649320', '3649321', '3649325', '3649409', '3649411', '3649412', '3649413', '3649416', '3649418', '3649419', '3649420'

In [44]:
# Export the final dataframes with the stations to be considered.
# Paths are built with os.path.join: the previous "\_DataFrames\J..." literals relied on
# invalid escape sequences, which newer Python versions warn about.
# Selecting with a list raises KeyError if a selected station has no column in the frame.
final_daily_df = daily_df_cleaned[selected_stations]
daily_out_dir = os.path.join(daily_folder, "_DataFrames")
os.makedirs(daily_out_dir, exist_ok=True)  # to_csv does not create a missing folder
final_daily_df.to_csv(os.path.join(daily_out_dir, "Joined_Daily_Sts_DFs.csv"))

final_monthly_df = monthly_df_cleaned[selected_stations]
monthly_out_dir = os.path.join(monthly_folder, "_DataFrames")
os.makedirs(monthly_out_dir, exist_ok=True)  # to_csv does not create a missing folder
final_monthly_df.to_csv(os.path.join(monthly_out_dir, "Joined_Monthly_Sts_DFs.csv"))

In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) of the daily series of each selected station
final_daily_df.describe()

Unnamed: 0,3617110,3617811,3617812,3617814,3618051,3618052,3618500,3618700,3618711,3618720,...,6742701,6744200,6744500,6830101,6830103,6854601,6854713,6855411,6855412,6870640
count,19044.0,13435.0,13148.0,16887.0,15467.0,15361.0,19235.0,13234.0,15939.0,15338.0,...,22280.0,23741.0,23741.0,19327.0,18818.0,22947.0,24088.0,17897.0,17871.0,21173.0
mean,7784.495688,2410.638053,490.291815,55.962123,17642.427799,12824.313716,2997.282294,93.42381,148.895264,1454.851321,...,76.808002,177.253658,124.772906,3.699646,8.03075,18.394925,4.120377,1.743149,6.379517,24.345363
std,5501.737962,1956.689287,285.548266,20.976846,7257.590305,5766.48926,2689.92114,106.036275,191.238582,1138.22696,...,114.796444,151.166746,142.266413,3.321394,15.385325,24.463669,5.031205,1.65051,5.448727,39.771033
min,644.78,228.602,126.191,22.161,1410.314,1046.547,128.277,3.129,4.897,70.0,...,3.7,17.3,4.9,0.64,0.9,2.5,0.3,0.13,0.21,3.1
25%,2511.64175,547.713,221.595,41.21,12165.797,8270.589,1010.181,29.906,39.282,602.01,...,26.3,80.2,45.9,1.51,2.14,6.5,1.69,0.81,2.75,7.0
50%,6789.361,1866.849,412.278,48.778,16428.832,11988.834,1929.331,58.347,78.514,1085.359,...,44.7,131.0,79.7,2.7,3.6,10.0,2.4,1.2,4.7,11.7
75%,12400.977,4113.269,762.132,65.221,22748.215,17317.873,4293.989,116.162,178.158,2028.656,...,81.7,219.0,148.0,4.8,7.9,19.1,4.5,2.07,8.1,24.0
max,30719.326,7666.857,1351.387,184.078,37100.773,28421.797,16085.612,1404.07,2051.142,6701.036,...,2642.0,2270.0,3216.0,43.0,340.0,263.0,66.0,14.5,44.0,462.0


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the monthly series of each selected station
final_monthly_df.describe()

Unnamed: 0,3617110,3617811,3617812,3617814,3618051,3618052,3618500,3618700,3618711,3618720,...,6742701,6744200,6744500,6830101,6830103,6854601,6854713,6855411,6855412,6870640
count,626.0,441.0,429.0,554.0,508.0,504.0,632.0,435.0,522.0,503.0,...,732.0,780.0,780.0,635.0,619.0,754.0,791.0,588.0,587.0,696.0
mean,7806.841605,2415.48066,491.534191,56.12185,17607.422844,12801.187534,2984.657657,93.317071,148.333632,1453.047423,...,76.702169,177.35824,125.062796,3.688691,7.982541,18.302406,4.109613,1.736526,6.372162,24.201412
std,5424.898172,1929.63551,281.064526,19.079074,7002.271063,5536.633958,2528.208037,75.579141,136.194831,1054.389435,...,76.314734,125.675334,99.869929,2.77206,9.786126,18.810085,3.805342,1.415775,4.843485,28.97966
min,717.629,256.013,140.316,28.471,3162.573,2193.154,168.572,6.751,9.171,94.835,...,5.04,26.726,12.306,0.645,1.1,2.671,0.329,0.154,0.262,3.487
25%,2614.34625,558.639,223.272,41.81775,12321.1145,8632.7385,1100.9375,38.4745,49.315,653.992,...,28.45025,87.8615,54.86,1.575,2.239,7.11575,1.705,0.83875,2.9215,7.2515
50%,6566.9195,1821.439,409.07,49.412,16390.11,11873.9975,1996.52,66.59,103.3195,1090.428,...,51.7465,140.349,94.5755,2.837,4.193,11.688,2.52,1.295,4.961,13.4975
75%,12337.8965,4015.821,761.45,67.3805,22662.6305,17335.23725,4541.4915,126.25,195.71725,2053.8145,...,93.02525,226.829,163.96125,4.8825,9.3015,21.4995,4.986,2.07625,8.4885,27.17125
max,28767.185,7549.287,1212.038,129.67,36018.936,27252.164,12367.627,436.095,829.255,5123.084,...,503.032,930.548,725.032,17.854,60.584,111.968,21.871,8.406,28.968,174.589


In [45]:
# final_daily_df = pd.read_csv(daily_folder + "\_DataFrames\Joined_Daily_Sts_DFs.csv", index_col="YYYY-MM-DD")
# final_daily_df

In [46]:
# final_monthly_df = pd.read_csv(monthly_folder + "\_DataFrames\Joined_Monthly_Sts_DFs.csv", index_col="YYYY-MM")
# final_monthly_df