In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import os

#  Step 1: Load the data

In [2]:
summary_dir = '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/summary'

# merged_feather_path = os.path.join(summary_dir, 'local_hour_adjusted_variables.feather')
merged_feather_path = os.path.join(summary_dir, 'local_hour_adjusted_variables_with_location_ID.feather')

local_hour_adjusted_df = pd.read_feather(merged_feather_path)
local_hour_adjusted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58245960 entries, 0 to 58245959
Data columns (total 23 columns):
 #   Column        Dtype         
---  ------        -----         
 0   lat           float32       
 1   lon           float32       
 2   time          datetime64[ns]
 3   UHI           float32       
 4   UWBI          float32       
 5   WIND          float32       
 6   RAIN          float32       
 7   SNOW          float32       
 8   HW            float64       
 9   Q2M_R         float32       
 10  Q2M_U         float32       
 11  VAPOR_PRES_R  float32       
 12  VAPOR_PRES_U  float32       
 13  hour          int32         
 14  month         int32         
 15  year          int32         
 16  UHI_avg       float32       
 17  UWBI_avg      float32       
 18  UHI_diff      float32       
 19  UWBI_diff     float32       
 20  local_time    datetime64[ns]
 21  local_hour    int32         
 22  location_ID   int64         
dtypes: datetime64[ns](2), float32(

#  Step 2: Create event ID

In [3]:
import numpy as np
import pandas as pd

# Sort the DataFrame by location_ID and time
local_hour_adjusted_df = local_hour_adjusted_df.sort_values(['location_ID', 'time'])

# Create a boolean mask to identify the start of each heatwave event within each location
mask = local_hour_adjusted_df.groupby('location_ID')['time'].diff() > pd.Timedelta(hours=1)

# Create a new column 'event_ID' and assign unique event IDs within each location
local_hour_adjusted_df['event_ID'] = (
        mask.groupby(local_hour_adjusted_df['location_ID']).cumsum().astype(int)
        + local_hour_adjusted_df.groupby('location_ID').ngroup() * local_hour_adjusted_df['location_ID'].nunique()
)

##  Step 2.2 Check the continuity of dates within each event

In [5]:
# Check if hours are continuous within each event
is_continuous = local_hour_adjusted_df.groupby('event_ID')['time'].apply(
    lambda x: x.diff().fillna(pd.Timedelta(hours=0)).eq(pd.Timedelta(hours=1)).all()
).all()
print("Hours are continuous within each event:", is_continuous)

Hours are continuous within each event: False


##  Step 2.3 Check the uniqueness of event IDs within each location:

In [4]:
# Check if event IDs are unique across all locations and continuous heatwave periods
is_unique = local_hour_adjusted_df['event_ID'].nunique() == local_hour_adjusted_df.groupby(['location_ID', 'event_ID']).ngroups
print("Event IDs are unique across all locations and continuous heatwave periods:", is_unique)


Event IDs are unique across all locations and continuous heatwave periods: True


In [7]:
if not is_continuous.all():
    print("Hours are not continuous within each event.")

    # Convert is_continuous to a pandas Series
    is_continuous_series = pd.Series(is_continuous, index=local_hour_adjusted_df['event_ID'].unique())

    # Get the event_IDs where hours are not continuous
    non_continuous_events = is_continuous_series[~is_continuous_series].index

    # Print the data for the first 20 non-continuous events
    for event_id in non_continuous_events[:20]:
        event_data = local_hour_adjusted_df[local_hour_adjusted_df['event_ID'] == event_id]
        print(f"Event ID: {event_id}")
        print(event_data[['location_ID', 'event_ID', 'time']])
        print()
else:
    print("Hours are continuous within each event.")

Hours are not continuous within each event.
Event ID: 0
         location_ID  event_ID                time
8807640        13770         0 1989-02-03 00:00:00
8807643        13770         0 1989-02-03 01:00:00
8807646        13770         0 1989-02-03 02:00:00
8807649        13770         0 1989-02-03 03:00:00
8807652        13770         0 1989-02-03 04:00:00
...              ...       ...                 ...
8807699        13770         0 1989-02-05 19:00:00
8807702        13770         0 1989-02-05 20:00:00
8807705        13770         0 1989-02-05 21:00:00
8807708        13770         0 1989-02-05 22:00:00
8807711        13770         0 1989-02-05 23:00:00

[72 rows x 3 columns]

Event ID: 1
          location_ID  event_ID                time
14280936        13770         1 1992-01-22 00:00:00
14280939        13770         1 1992-01-22 01:00:00
14280942        13770         1 1992-01-22 02:00:00
14280945        13770         1 1992-01-22 03:00:00
14280948        13770         1 1992

## Step 2.4 Manually inspect a few events

In [8]:
# Inspect a few events manually
sample_events = local_hour_adjusted_df.groupby(['location_ID', 'event_ID']).head(1).sort_values('event_ID')
print(sample_events[['location_ID', 'event_ID', 'local_time']])

          location_ID  event_ID          local_time
8807640         13770         0 1989-02-03 19:00:00
14280936        13770         1 1992-01-22 19:00:00
15691512        13770         2 1993-12-22 19:00:00
32783424        13770         3 2002-01-17 19:00:00
53262480        13770         4 2012-02-10 19:00:00
...               ...       ...                 ...
39193752        46987  13704800 2004-07-17 03:00:00
50604504        46987  13704801 2010-08-05 03:00:00
50604576        47265  13708504 2010-07-29 02:00:00
45755304        47566  13712208 2008-07-21 03:00:00
58245888        48138  13715912 2013-06-25 03:00:00

[293889 rows x 3 columns]


In [9]:
sample_events[['location_ID', 'event_ID', 'local_time']]

Unnamed: 0,location_ID,event_ID,local_time
8807640,13770,0,1989-02-03 19:00:00
14280936,13770,1,1992-01-22 19:00:00
15691512,13770,2,1993-12-22 19:00:00
32783424,13770,3,2002-01-17 19:00:00
53262480,13770,4,2012-02-10 19:00:00
...,...,...,...
39193752,46987,13704800,2004-07-17 03:00:00
50604504,46987,13704801,2010-08-05 03:00:00
50604576,47265,13708504,2010-07-29 02:00:00
45755304,47566,13712208,2008-07-21 03:00:00


In [17]:
local_hour_adjusted_df

Unnamed: 0,lat,lon,time,UHI,UWBI,WIND,RAIN,SNOW,HW,Q2M_R,...,month,year,UHI_avg,UWBI_avg,UHI_diff,UWBI_diff,local_time,local_hour,location_ID,event_ID
8807640,-45.706806,292.5,1989-02-03 00:00:00,0.540588,-0.162042,7.673144,0.0,0.0,1.0,0.005918,...,2,1989,0.454097,-0.031426,0.086491,-0.130616,1989-02-03 19:00:00,19,13770,0
8807643,-45.706806,292.5,1989-02-03 01:00:00,0.656219,0.010044,7.079582,0.0,0.0,1.0,0.006022,...,2,1989,0.527253,0.077270,0.128967,-0.067226,1989-02-03 20:00:00,20,13770,0
8807646,-45.706806,292.5,1989-02-03 02:00:00,0.625397,0.057197,6.486020,0.0,0.0,1.0,0.006199,...,2,1989,0.522004,0.104143,0.103393,-0.046946,1989-02-03 21:00:00,21,13770,0
8807649,-45.706806,292.5,1989-02-03 03:00:00,0.565552,0.059347,6.313677,0.0,0.0,1.0,0.006253,...,2,1989,0.490119,0.105526,0.075433,-0.046178,1989-02-03 22:00:00,22,13770,0
8807652,-45.706806,292.5,1989-02-03 04:00:00,0.489014,0.040320,6.281740,0.0,0.0,1.0,0.006269,...,2,1989,0.457829,0.099154,0.031185,-0.058834,1989-02-03 23:00:00,23,13770,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58245947,67.382202,52.5,2013-06-27 19:00:00,0.888916,0.327215,5.298829,0.0,0.0,1.0,0.005154,...,6,2013,0.938771,0.185750,-0.049855,0.141465,2013-06-27 22:00:00,22,48138,13715912
58245950,67.382202,52.5,2013-06-27 20:00:00,0.931152,0.385791,4.657486,0.0,0.0,1.0,0.004895,...,6,2013,0.935369,0.253186,-0.004216,0.132605,2013-06-27 23:00:00,23,48138,13715912
58245953,67.382202,52.5,2013-06-27 21:00:00,1.093048,0.406535,4.410651,0.0,0.0,1.0,0.004919,...,6,2013,0.941973,0.285870,0.151075,0.120665,2013-06-28 00:00:00,0,48138,13715912
58245956,67.382202,52.5,2013-06-27 22:00:00,1.264160,0.445448,4.295317,0.0,0.0,1.0,0.005003,...,6,2013,0.957070,0.314096,0.307091,0.131353,2013-06-28 01:00:00,1,48138,13715912
