In [1]:
import zipfile
import os
import sys
# Calculate the absolute path to the project root (one level up) to make src available
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
import pandas as pd
import numpy as np
import pathlib
import xarray as xr
import netCDF4 as nc
import dask
import json
import shutil
import src.utils.era5_prep as era5

## For Daily Data:

In [6]:
folder = '../data/era5_daily'


In [3]:
era5.unzip_files(folder)

In [7]:
# read each folder in unzipped folder and rename the files
era5.flatten_and_rename_nc_files(folder)

In [6]:
def join_dataframes(files, input_folder, identifier: str):
    # Filter files based on the identifier
    filtered_files = [os.path.join(input_folder, file) for file in files if file.startswith(identifier)]
    
    # Use xr.open_mfdataset to open multiple files
    x = xr.open_mfdataset(filtered_files, engine='netcdf4', combine='by_coords')
    
    # Convert to dataframe
    x_frame = x.to_dataframe().reset_index()
    
    # Group by and aggregate
    flattened = x_frame.groupby(['valid_time', 'latitude', 'longitude']).agg('sum').reset_index()
    
    return flattened

# total_precipitation_2_daily-meaXXX.nc
path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_files/'
files=os.listdir(path_renamed_files)

total_precipitation = join_dataframes(files, path_renamed_files, 'total_precipitation')
print(total_precipitation.head())

In [71]:
# total_precipitation_2_daily-meaXXX.nc
path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_files/'
files=os.listdir(path_renamed_files)

total_precipitation = join_dataframes(files, path_renamed_files, 'total_precipitation')
print(total_precipitation.head())

  valid_time  latitude  longitude   tp  number
0 2011-09-01     -86.0     -162.0  0.0       0
1 2011-09-01     -86.0     -161.5  0.0       0
2 2011-09-01     -86.0     -161.0  0.0       0
3 2011-09-01     -86.0     -160.5  0.0       0
4 2011-09-01     -86.0     -160.0  0.0       0


In [72]:
total_precipitation.describe()

Unnamed: 0,valid_time,latitude,longitude,tp,number
count,64233400,64233400.0,64233400.0,64233400.0,64233400.0
mean,2007-08-23 12:56:56.727273984,-62.0,-87.0,0.0002216148,0.0
min,1996-10-30 00:00:00,-86.0,-162.0,0.0,0.0
25%,2003-05-07 18:00:00,-74.0,-124.5,4.827976e-06,0.0
50%,2008-12-26 12:00:00,-62.0,-87.0,6.54459e-05,0.0
75%,2011-09-17 06:00:00,-50.0,-49.5,0.0002475466,0.0
max,2017-02-14 00:00:00,-38.0,-12.0,0.02416039,0.0
std,,14.0,43.44537,0.0004082066,0.0


In [73]:
total_precipitation['number'] = total_precipitation['number'].astype(float)
print(sum(total_precipitation['number']==0.0))
# number column can probably be dropped as it only contains zeros (its just the length of the frame)


64233400


In [74]:
total_precipitation = total_precipitation.drop(columns=['number'])
print(total_precipitation.head())

  valid_time  latitude  longitude   tp
0 2011-09-01     -86.0     -162.0  0.0
1 2011-09-01     -86.0     -161.5  0.0
2 2011-09-01     -86.0     -161.0  0.0
3 2011-09-01     -86.0     -160.5  0.0
4 2011-09-01     -86.0     -160.0  0.0


In [75]:
# 2m_temperature_stream-enda_daily-meaXXX.nc

path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_files/'
files=os.listdir(path_renamed_files)

temperature = join_dataframes(files, path_renamed_files, '2m_temperature') 
print(temperature.head())

  valid_time  latitude  longitude         t2m  number
0 2005-01-01     -86.0     -162.0  256.804840       0
1 2005-01-01     -86.0     -161.5  257.231110       0
2 2005-01-01     -86.0     -161.0  257.653473       0
3 2005-01-01     -86.0     -160.5  258.071930       0
4 2005-01-01     -86.0     -160.0  258.428131       0


In [76]:
temperature.describe()

Unnamed: 0,valid_time,latitude,longitude,t2m,number
count,64233400,64233400.0,64233400.0,64233400.0,64233400.0
mean,2007-08-23 12:56:56.727272704,-62.0,-87.0,223.0041,0.0
min,1996-10-30 00:00:00,-86.0,-162.0,0.0,0.0
25%,2003-05-07 18:00:00,-74.0,-124.5,247.5292,0.0
50%,2008-12-26 12:00:00,-62.0,-87.0,271.9107,0.0
75%,2011-09-17 06:00:00,-50.0,-49.5,279.56,0.0
max,2017-02-14 00:00:00,-38.0,-12.0,308.3194,0.0
std,,14.0,43.44537,92.51336,0.0


In [77]:
temperature = temperature.drop(columns=['number'])

In [78]:
# merge the two dataframes based on valid time, latitude and longitude
merged = pd.merge(total_precipitation, temperature, on=['valid_time', 'latitude', 'longitude'])

print(merged.head())

  valid_time  latitude  longitude   tp         t2m
0 2011-09-01     -86.0     -162.0  0.0  240.542679
1 2011-09-01     -86.0     -161.5  0.0  240.307816
2 2011-09-01     -86.0     -161.0  0.0  240.068314
3 2011-09-01     -86.0     -160.5  0.0  239.823929
4 2011-09-01     -86.0     -160.0  0.0  239.604202


In [82]:
# Double check if merge worked
print(temperature[(temperature['latitude'] == -86.0) & (temperature['longitude'] == -161.5) & (temperature['valid_time'] == '2011-09-01')])
print(total_precipitation[(total_precipitation['latitude'] == -86.0) & (total_precipitation['longitude'] == -161.5) & (total_precipitation['valid_time'] == '2011-09-01')])

  valid_time  latitude  longitude         t2m
1 2011-09-01     -86.0     -161.5  240.307816
  valid_time  latitude  longitude   tp
1 2011-09-01     -86.0     -161.5  0.0


In [83]:
# save to zip file
path_save = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/projectP/data/'

merged.to_csv(path_save + 'weather_daily_averages.zip', compression={'method': 'zip', 'archive_name': 'weather_daily_averages.csv'}, index=False)

# Try different approach to create one single NetCDF file to use for interpolation

In [5]:
# function
# def append_netcdf(files, input_folder, identifier: str, output_folder):
#     datasets = [xr.open_dataset(input_folder + file) for file in files if file.startswith(identifier)]
#     concatenated = xr.concat(datasets , dim="valid_time")
#     concatenated.to_netcdf(output_folder + identifier + ".nc")

def append_netcdf(input_folder, identifier: str, output_folder):
    # Open datasets that match the identifier
    files = os.listdir(input_folder)
    datasets = [xr.open_dataset(input_folder + file) for file in files if file.startswith(identifier)]
    
    # Concatenate datasets along the 'valid_time' dimension
    concatenated = xr.concat(datasets, dim="valid_time")
    
    # Save the concatenated dataset to a new NetCDF file
    concatenated.to_netcdf(output_folder + identifier + ".nc")

In [92]:
# temperature netcdf
path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_files/'
path_save = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/projectP/data/'

append_netcdf(path_renamed_files, '2m_temperature', path_save)

In [93]:
# create precipitation .nc file

path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_files/'
path_save = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/projectP/data/'


append_netcdf(path_renamed_files, 'total_precipitation', path_save)

In [98]:
precipitation = xr.open_dataset(path_save + 'total_precipitation.nc', engine='netcdf4')
temperature = xr.open_dataset(path_save + '2m_temperature.nc', engine='netcdf4')

precipitation_df = precipitation.to_dataframe()
temperature_df = temperature.to_dataframe()

precipitation_df = precipitation_df.drop(columns=['number'])
temperature_df = temperature_df.drop(columns=['number'])

print(precipitation_df.head())
print(temperature_df.head())

                                     tp
valid_time latitude longitude          
2011-09-01 -38.0    -162.0     0.000754
                    -161.5     0.000713
                    -161.0     0.000645
                    -160.5     0.000594
                    -160.0     0.000606
                                      t2m
valid_time latitude longitude            
2005-01-01 -38.0    -162.0     288.651276
                    -161.5     288.757233
                    -161.0     288.834137
                    -160.5     288.909088
                    -160.0     289.017975


In [99]:
# merge the two netcdf files so both variables are stored in one file
path_save = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/projectP/data/'

precipitation_xr = precipitation_df.to_xarray()
temperature_xr = temperature_df.to_xarray() 

merged = xr.merge([precipitation_xr, temperature_xr])


In [100]:
# to netcdf
merged.to_netcdf(path_save + 'merged_weather.nc')

In [101]:
data_test = xr.open_dataset(path_save + 'merged_weather.nc', engine='netcdf4')

data_test_df = data_test.to_dataframe()

print(data_test_df.head())



                                     tp         t2m
valid_time latitude longitude                      
1996-10-30 -38.0    -162.0     0.000010  286.694824
                    -161.5     0.000016  286.722168
                    -161.0     0.000021  286.678955
                    -160.5     0.000032  286.636230
                    -160.0     0.000057  286.636230


# ERA5 Cohort 2


In [8]:
path = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/ERA5_cohort2_zipped'
output_path = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/ERA5_2_unzipped/'

files=os.listdir(path)


unzip_files(path, output_path)

In [11]:
# read each folder in unzipped folder and rename the files
path_new = output_path
path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_2_files/'
                
rename_files(path_new, path_renamed_files)

In [18]:
# save netcdfs

path_renamed_files = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/era5_2_files/'
path_save = '/Users/philipp/Documents/02_Master_Uni/Uni_Tübingen/Semester_1/06 Data Literacy/02 Project/projectP/data/'
all_files = os.listdir(path_renamed_files)

identifiers = ['mean_snowmelt', 'sea_ice_cover', 'sea_surface_temp', 'snow_density', 'snow_depth']
for identifier in identifiers:
    append_netcdf(all_files, path_renamed_files, identifier, path_save)