In [1]:
import xarray as xr
import glob
import datetime

import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import numpy as np

from google.cloud import storage
import gcsfs
import zarr

import xskillscore as xs
import pandas as pd

import os

import re

In [3]:
def transform_dataset(meso, start_hour, end_hour):
    """Turn one lead-time window of 10-minute model output into a forecast-style dataset.

    The flat ``time`` axis of ``meso`` is folded into an initialisation date
    (returned as ``time``) and a lead time (returned as ``prediction_timedelta``).
    Processing order:

    1. Add ``total_precipitation_6hr_meso``: a trailing 36-step rolling sum of
       ``prate`` scaled by 600 s * 1000.
    2. Reshape every variable that has a ``time`` dimension to
       ``(date, time, ...)`` with 365 dates (2022-01-01 .. 2022-12-31) and
       144 ten-minute slots per date.
    3. Keep the full-hour slots, convert time-of-day to a timedelta, shift it by
       ``start_hour`` and keep only lead times ``start_hour, start_hour + 6, ...``
       up to ``end_hour``.
    4. Keep dates from 2022-01-12 onward.
    5. Extract 2 m temperature and 10 m wind components, copy ``ps`` unchanged
       into ``mean_sea_level_pressure_meso`` and drop the source variables.
    6. Drop lead times that are not strictly positive.

    Parameters
    ----------
    meso : xarray.Dataset
        Raw output for one lead-time window. The code reads ``prate``, ``TC``,
        ``u``, ``v``, ``ps`` and ``id``, and drops ``xf``, ``yf``, ``i``, ``j``,
        ``M``, ``phi`` and ``height_above_base_level`` (all must exist). It also
        reads the ``height_above_ground_level`` coordinate.
        NOTE: the dataset passed in is modified -- ``total_precipitation_6hr_meso``
        is added to it before the function rebinds ``meso`` to a new object.
    start_hour : int
        Lead time, in hours, assigned to the 00:00 slot of each date.
    end_hour : int
        Upper bound (inclusive) of the 6-hourly lead times to keep. Because only
        the hours 0..23 exist per date, ``start_hour + 24`` is never present.

    Returns
    -------
    xarray.Dataset
        Dataset with ``time`` (initialisation date) and ``prediction_timedelta``
        coordinates and the ``*_meso`` variables; any variable not explicitly
        dropped is carried along.
    """
    # Length of one model time step in hours (10 minutes); prate is treated as m/s
    time_delta_hours = 10 / 60  # 10 minutes = 1/6 hours

    # Define the rolling window size, which corresponds to 6 hours
    rolling_window_size = int(6 / time_delta_hours)  # 36 time steps

    # Trailing 6-hour rolling sum: rate * (step length in seconds) * 1000 -> mm
    # NOTE(review): the window runs over the flat time axis, before the axis is
    # folded into dates below, so the 00:00 slot of each date sums steps that
    # belong to the previous date's block -- confirm this is intended for the
    # windows where that slot is kept (start_hour > 0).
    total_precipitation_6hr_mm = (
        meso['prate']
        .rolling(time=rolling_window_size, center=False)
        .sum() * time_delta_hours * 3600 * 1000  # m/s to mm
    )

    # Add this as a new variable in the dataset (this modifies the caller's object)
    meso['total_precipitation_6hr_meso'] = total_precipitation_6hr_mm

    # Initialisation dates: every day of 2022 (the trailing 2023-01-01 is cut off)
    date_range = pd.date_range(start='2022-01-01', end='2023-01-01', freq='D')[:-1]

    # 144 ten-minute slots per day; the 2022-01-01 date part is a placeholder,
    # only the time of day is used further down
    time_points = pd.date_range('2022-01-01 00:00', periods=144, freq='10min')

    # Fold the time axis of every time-dependent variable into (date, time).
    # The reshape treats 'time' as the leading axis and needs exactly
    # len(date_range) * len(time_points) steps; other variables pass through.
    data_vars = {
        var_name: (
            ('date', 'time') + var_data.dims[1:], 
            var_data.values.reshape((len(date_range), len(time_points)) + var_data.shape[1:])
        )
        if 'time' in var_data.dims else var_data
        for var_name, var_data in meso.data_vars.items()
    }

    # Create the new dataset with date and time dimensions.
    # 'station' is passed as a bare array, so it becomes a coordinate on its own
    # 'station' dimension rather than on the dimension of 'id'.
    meso = xr.Dataset(
        data_vars=data_vars,
        coords={
            'date': date_range,
            'time': time_points,
            'height_above_ground_level': meso.coords['height_above_ground_level'],
            'station': meso['id'].data,
        }
    )

    # Drop unnecessary variables and coordinates
    meso = meso.drop_vars(['id', 'xf', 'yf', 'i', 'j', 'height_above_base_level'])

    # Keep only the slots that fall on a full hour (boolean mask on 'time')
    meso = meso.sel(time=meso['time'].dt.minute == 0)

    # Rename 'time' to 'prediction_timedelta' and reduce the placeholder
    # timestamps to a time-of-day timedelta (value modulo one day)
    meso = meso.rename({'time': 'prediction_timedelta'})
    prediction_timedelta = (
        meso['prediction_timedelta'].astype('datetime64[ns]')
        .astype('timedelta64[ns]') % np.timedelta64(1, 'D')
    )

    # Shift time of day by the window start so 00:00 maps to start_hour
    prediction_timedelta += np.timedelta64(start_hour, 'h')

    # Assign the adjusted prediction_timedelta back to the dataset
    meso = meso.assign_coords(prediction_timedelta=prediction_timedelta)

    # Rename 'date' to 'time' (the initialisation date of each forecast)
    meso = meso.rename({'date': 'time'})

    # Keep 6-hourly lead times in [start_hour, end_hour]; values that do not
    # exist on the axis (such as start_hour + 24) simply select nothing
    meso = meso.sel(prediction_timedelta=meso['prediction_timedelta'].isin(
        [np.timedelta64(i, 'h') for i in range(start_hour, end_hour + 1, 6)]
    ))

    # Slice for start date 2022-01-12
    meso = meso.sel(time=slice('2022-01-12', None))

    # Extract 2m temperature and 10m wind components
    meso['2m_temperature_meso'] = meso['TC'].sel(height_above_ground_level=2)
    meso['10m_u_component_of_wind_meso'] = meso['u'].sel(height_above_ground_level=10)
    meso['10m_v_component_of_wind_meso'] = meso['v'].sel(height_above_ground_level=10)

#     # Convert surface level pressure to mean sea level pressure and convert to hPa
#     meso['mean_sea_level_pressure_meso'] = (
#     (meso['ps'] / 100) + 
#     meso['height_above_sea_level'].sel(height_above_ground_level=2) - 
#     2 * 9.81 / 1000 * meso['height_above_sea_level'].sel(height_above_ground_level=2)
# )

    # Despite the name, this holds the unmodified 'ps' field at this point; the
    # reduction to sea level is applied by later notebook cells
    meso['mean_sea_level_pressure_meso'] = meso['ps']

    # Drop original variables and unnecessary coordinates
    meso = meso.drop_vars(['TC', 'u', 'v', 'ps', 'M', 'phi', 'prate', 'height_above_ground_level'])

    # Keep strictly positive lead times only (removes the 0 h entry when start_hour == 0)
    meso = meso.sel(prediction_timedelta=meso['prediction_timedelta'] > np.timedelta64(0, 'h'))

    return meso

def add_96_hour_lead_time(meso):
    """Append a 96 h lead time that repeats the data of the last available lead time.

    The final entry along ``prediction_timedelta`` is deep-copied, relabelled
    as 96 hours and concatenated to the end of ``meso``. No new data is
    computed: the 96 h values equal those of the previous last lead time
    (90 h for the 72-96 h window as it is built in this notebook).

    Parameters
    ----------
    meso : xarray.Dataset
        Dataset with a ``prediction_timedelta`` dimension.

    Returns
    -------
    xarray.Dataset
        ``meso`` with one extra entry along ``prediction_timedelta``.
    """
    # Independent copy of the final lead time, relabelled as 96 h
    padded_step = (
        meso.isel(prediction_timedelta=-1)
        .copy(deep=True)
        .assign_coords(prediction_timedelta=np.timedelta64(96, 'h'))
    )

    # Attach the relabelled copy after the existing lead times
    return xr.concat([meso, padded_step], dim='prediction_timedelta')

# Load the four lead-time windows (0-24 h, 24-48 h, 48-72 h, 72-96 h)
meso_0_24 = xr.open_dataset('/net/shared/student-projects/koenr/mesograsp_2022/extractorOutTFMetmast.000_leadtime-00-24.nc')
meso_24_48 = xr.open_dataset('/net/shared/student-projects/koenr/mesograsp_2022/extractorOutTFMetmast.000_leadtime-24-48.nc')
meso_48_72 = xr.open_dataset('/net/shared/student-projects/koenr/mesograsp_2022/extractorOutTFMetmast.000_leadtime-48-72.nc')
meso_72_96 = xr.open_dataset('/net/shared/student-projects/koenr/mesograsp_2022/extractorOutTFMetmast.000_leadtime-72-96.nc')

# Reshape each window into (time, prediction_timedelta) form
meso_0_24_transformed = transform_dataset(meso_0_24, start_hour=0, end_hour=24)
meso_24_48_transformed = transform_dataset(meso_24_48, start_hour=24, end_hour=48)
meso_48_72_transformed = transform_dataset(meso_48_72, start_hour=48, end_hour=72)
meso_72_96_transformed = transform_dataset(meso_72_96, start_hour=72, end_hour=96)

# The last window has no 96 h entry of its own; pad it with a copy of its last lead time
meso_72_96_extended = add_96_hour_lead_time(meso_72_96_transformed)

# Concatenate the datasets along the prediction_timedelta dimension
meso_combined = xr.concat([
    meso_0_24_transformed,
    meso_24_48_transformed,
    meso_48_72_transformed,
    meso_72_96_extended
], dim='prediction_timedelta')

# Station ids sit on their own 'station' dimension; re-attach them to 'index'
# (positionally) so the two can be swapped
if 'index' in meso_combined.coords and 'station' in meso_combined.coords:
    meso_combined = meso_combined.assign_coords(station=('index', meso_combined['station'].values))

# Now swap 'index' with 'station' and drop 'index'
meso_combined = meso_combined.swap_dims({'index': 'station'}).drop_vars('index')

# drop_sel returns a new dataset rather than modifying in place, so the result
# has to be assigned for station '06252' to actually be removed
meso_combined = meso_combined.drop_sel(station='06252')

meso_combined

In [4]:
# Reduce the surface pressure stored in meso_combined to mean sea level:
#     p_msl = p_surface * exp(g * h / (R * T))
# NOTE(review): the result keeps the units of 'ps' here, whereas the 0-24 h
# processing cell further down divides by 100 -- confirm which unit is wanted.

g = 9.80665  # gravitational acceleration in m/s^2
R = 287.05   # specific gas constant for dry air in J/(kg·K)

# 'mean_sea_level_pressure_meso' still holds the raw surface pressure at this point
P_surface = meso_combined['mean_sea_level_pressure_meso']

# 2 m temperature, Celsius -> Kelvin
T_kelvin = meso_combined['2m_temperature_meso'] + 273.15

# Terrain height taken at height_above_ground_level=0
# NOTE(review): transform_dataset drops the 'height_above_ground_level'
# coordinate, so this selection is presumably positional (first level) -- verify.
surface_height = meso_combined['height_above_sea_level'].sel(height_above_ground_level=0)

# Barometric reduction to sea level
mean_sea_level_pressure = P_surface * np.exp((g * surface_height) / (R * T_kelvin))

# Store the reduced pressure and remove the height field that is no longer needed
meso_combined = (
    meso_combined
    .assign(mean_sea_level_pressure_meso=mean_sea_level_pressure)
    .drop_vars('height_above_sea_level')
)

# Final combined dataset
meso_combined


In [16]:
# Re-open the combined dataset from disk. The only visible writer of
# 'meso_2022.nc' is the commented-out to_netcdf call in the last cell.
meso_ondisk = xr.open_dataset('meso_2022.nc')

# Spot-check one forecast: a single init date, lead time and station
meso_ondisk.sel(
    {'time': '2022-01-12', 'prediction_timedelta': '18h', 'station': '06380'}
).compute()


In [18]:
# Plain-text summary (dimensions, coordinates, data variables) of the dataset read back from disk
print(meso_ondisk)

<xarray.Dataset> Size: 5MB
Dimensions:                       (time: 354, prediction_timedelta: 16,
                                   station: 47)
Coordinates:
  * time                          (time) datetime64[ns] 3kB 2022-01-12 ... 20...
  * prediction_timedelta          (prediction_timedelta) timedelta64[ns] 128B ...
  * station                       (station) <U5 940B '06201' '06203' ... '06212'
Data variables:
    total_precipitation_6hr_meso  (time, prediction_timedelta, station) float32 1MB ...
    2m_temperature_meso           (time, prediction_timedelta, station) float32 1MB ...
    10m_u_component_of_wind_meso  (time, prediction_timedelta, station) float32 1MB ...
    10m_v_component_of_wind_meso  (time, prediction_timedelta, station) float32 1MB ...
    mean_sea_level_pressure_meso  (time, prediction_timedelta, station) float32 1MB ...


In [20]:
# Open the processed 0-24 h file here so this cell does not depend on a cell
# further down having been run first (it raised NameError on a fresh kernel).
meso_24_processed = xr.open_dataset('meso_24_processed.nc')

# Spot-check one valid time at one station
meso_24_processed.sel(valid_time='2022-01-13T12', station='06203').compute()

In [5]:
# Open the processed 0-24 h product and the 'meso_24.nc' file for side-by-side inspection
meso_24_processed, meso_24 = (
    xr.open_dataset(nc_path)
    for nc_path in ('meso_24_processed.nc', 'meso_24.nc')
)

In [6]:
# Display the dataset opened from 'meso_24.nc'
meso_24

In [39]:
# add the first datapoint of the 24-48 dataset to the 0-24 dataset
meso_0_24_incl = xr.concat([meso_0_24_transformed, meso_24_48_transformed.isel(prediction_timedelta=0)], dim='prediction_timedelta')

# Constants
g = 9.80665  # Acceleration due to gravity (m/s^2)
R = 287.05   # Specific gas constant for dry air (J/(kg·K))

# Extract surface pressure
P_surface = meso_0_24_incl['mean_sea_level_pressure_meso']

# Convert 2m temperature to Kelvin
T_kelvin = meso_0_24_incl['2m_temperature_meso'] + 273.15

# Extract the surface height above sea level (assuming height_above_ground_level == 0)
surface_height = meso_0_24_incl['height_above_sea_level'].sel(height_above_ground_level=0)

# Calculate mean sea level pressure using the barometric formula
mean_sea_level_pressure = P_surface * np.exp((g * surface_height) / (R * T_kelvin))

# Add the new variable to the dataset
meso_0_24_incl['mean_sea_level_pressure_meso'] = mean_sea_level_pressure /100


# drop height_above_sea_level
meso_0_24_incl = meso_0_24_incl.drop_vars('height_above_sea_level')

meso_0_24_incl = meso_0_24_incl.drop_sel(station='06252')
meso_0_24_incl = meso_0_24_incl.drop_sel(index=16)

# string the forecasts toegether along the prediction_timedelta dimension, the new index is time_contd = time + prediction_timedelta
meso_0_24_incl_stacked = meso_0_24_incl.stack(time_contd=('time', 'prediction_timedelta'))	

# create a valid_time dimension by adding the prediction_timedelta to the time dimension
meso_0_24_incl_stacked['valid_time'] = meso_0_24_incl_stacked.time + meso_0_24_incl_stacked.prediction_timedelta

# replace the time_contd index with the valid_time index
meso_0_24_incl_stacked = meso_0_24_incl_stacked.swap_dims({'time_contd': 'valid_time'})

# drop time_contd as a coordinate
meso_0_24_incl_stacked = meso_0_24_incl_stacked.drop_vars('time_contd')


# Ensure 'station' is correctly aligned with 'index'
if 'index' in meso_0_24_incl_stacked.coords and 'station' in meso_0_24_incl_stacked.coords:
    meso_0_24_incl_stacked = meso_0_24_incl_stacked.assign_coords(station=('index', meso_0_24_incl_stacked['station'].values))

# Now swap 'index' with 'station' and drop 'index'
meso_0_24_incl_stacked = meso_0_24_incl_stacked.swap_dims({'index': 'station'}).drop_vars('index')

print(meso_0_24_incl)

meso_0_24_incl_stacked.to_netcdf('meso_24_processed.nc')

<xarray.Dataset> Size: 1MB
Dimensions:                       (time: 354, prediction_timedelta: 4,
                                   index: 47, station: 47)
Coordinates:
  * index                         (index) int32 188B 0 1 2 3 4 ... 44 45 46 47
  * time                          (time) datetime64[ns] 3kB 2022-01-12 ... 20...
  * station                       (station) <U5 940B '06201' '06203' ... '06212'
  * prediction_timedelta          (prediction_timedelta) timedelta64[ns] 32B ...
Data variables:
    total_precipitation_6hr_meso  (time, prediction_timedelta, index) float32 266kB ...
    2m_temperature_meso           (time, prediction_timedelta, index) float32 266kB ...
    10m_u_component_of_wind_meso  (time, prediction_timedelta, index) float32 266kB ...
    10m_v_component_of_wind_meso  (time, prediction_timedelta, index) float32 266kB ...
    mean_sea_level_pressure_meso  (time, prediction_timedelta, index) float32 266kB ...


In [41]:
# Spot-check one sea-level-pressure value: position 30 on both 'index' and
# 'station', first initialisation time, first lead time.
# NOTE(review): the ValueError recorded below lists only 'station' and
# 'valid_time' as dimensions, which suggests `meso_0_24_incl` was bound to a
# stacked dataset when this cell last ran -- re-run after the processing cell.
meso_0_24_incl.isel(station=30, time=0, prediction_timedelta=0, index=30)['mean_sea_level_pressure_meso']
# meso_0_24_incl['height_above_sea_level'].isel(height_above_ground_level=0,index=0)

ValueError: Dimensions {'prediction_timedelta', 'time', 'index'} do not exist. Expected one or more of FrozenMappingWarningOnValuesAccess({'station': 47, 'valid_time': 1416})

In [16]:
# Write the '_nobarometric' variant to disk.
# NOTE(review): `meso_0_24_incl_stacked_nobarometric` is never built in the
# visible notebook (the suffix-renaming cell only rebinds it), and that cell
# ran before this one (In [15] vs In [16]) although it appears below --
# confirm where the dataset comes from and reorder the cells.
meso_0_24_incl_stacked_nobarometric.to_netcdf('meso_24_processed_nobarometric.nc')

In [15]:
suffix = '_nobarometric'

# Add the suffix to every data variable. Names that already end with it are
# left unchanged, so re-running this cell does not append the suffix twice.
new_vars = {
    (name if name.endswith(suffix) else name + suffix): meso_0_24_incl_stacked_nobarometric[name]
    for name in meso_0_24_incl_stacked_nobarometric.data_vars
}

# Rebuild the dataset from the renamed variables (coordinates travel with each variable)
meso_0_24_incl_stacked_nobarometric = xr.Dataset(new_vars)

meso_0_24_incl_stacked_nobarometric

In [None]:
# Write the stacked 0-24 h dataset to disk (same call as at the end of the processing cell)
meso_0_24_incl_stacked.to_netcdf('meso_24_processed.nc')

In [None]:
# Choose variables and times for comparison
variables_to_check = ['2m_temperature', '10m_u_component_of_wind', 'total_precipitation_6hr']
times_to_check = ['2022-01-13', '2022-01-15']

# Define prediction times for 0-24 and 24-48 hours
prediction_times_0_24 = [ '6h', '12h', '18h']
prediction_times_24_48 = ['24h', '30h', '36h', '42h']

# Function to compare values
def compare_values(dataset1, dataset2, var, time, prediction_timedelta):
    value1 = dataset1[var].sel(time=time, prediction_timedelta=prediction_timedelta).values
    value2 = dataset2[var].sel(time=time, prediction_timedelta=prediction_timedelta).values
    return np.array_equal(value1, value2)

# Iterate over the selected variables, times, and prediction times
for var in variables_to_check:
    for time in times_to_check:
        # Compare values for the 0-24 hour predictions
        for pred_time in prediction_times_0_24:
            if compare_values(meso_combined, meso_0_24_transformed, var, time, pred_time):
                print(f"Values for {var} at time {time} and prediction_timedelta={pred_time} match in meso_0_24.")
            else:
                print(f"Mismatch found for {var} at time {time} and prediction_timedelta={pred_time} in meso_0_24.")
        
        # Compare values for the 24-48 hour predictions
        for pred_time in prediction_times_24_48:
            if compare_values(meso_combined, meso_24_48_transformed, var, time, pred_time):
                print(f"Values for {var} at time {time} and prediction_timedelta={pred_time} match in meso_24_48.")
            else:
                print(f"Mismatch found for {var} at time {time} and prediction_timedelta={pred_time} in meso_24_48.")


KeyError: "No variable named '2m_temperature'. Variables on the dataset include ['total_precipitation_6hr_meso', '2m_temperature_meso', '10m_u_component_of_wind_meso', '10m_v_component_of_wind_meso', 'mean_sea_level_pressure_meso', 'time', 'prediction_timedelta', 'station']"

In [None]:
# Uncomment to write the combined dataset to disk; the `meso_ondisk` cell reads this file back.
# meso_combined.to_netcdf('meso_2022.nc')