In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import os
import cftime
import duckdb


In [2]:
# Absolute path of the hw95_summary results directory for this case.
# NOTE: `summary_dir` is assigned a different directory again further down
# (the cell that loads location_IDs.nc), so its value depends on which cells have run.
summary_dir = '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/hw95_summary'



In [3]:
# Read the HW table (HW95.feather) and the NO_HW table (no_hw_HW95.feather)
# from the hw95_summary directory, in that order.
df_hw, df_no_hw = (
    pd.read_feather(feather_path)
    for feather_path in (
        '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/hw95_summary/HW95.feather',
        '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/hw95_summary/no_hw_HW95.feather',
    )
)

In [4]:
def calculate_uhi_diff(df_hw, df_no_hw_avg):
    """
    Subtract the averaged NO_HW UHI/UWBI values from the HW values, row by row.

    Rows are paired with an inner join on ('lat', 'lon', 'year', 'hour'), so HW
    rows that have no matching NO_HW average are dropped from the result.

    Args:
        df_hw (pd.DataFrame): DataFrame containing HW data; must provide the four
            key columns plus 'UHI' and 'UWBI'.
        df_no_hw_avg (pd.DataFrame): DataFrame containing averaged NO_HW data with
            the same key columns plus 'UHI' and 'UWBI'. It must hold at most one
            row per key combination.

    Returns:
        pd.DataFrame: The matched HW rows with the NO_HW averages attached as
        'UHI_avg' / 'UWBI_avg' and the differences as 'UHI_diff' / 'UWBI_diff'.

    Raises:
        pandas.errors.MergeError: If df_no_hw_avg contains more than one row for
            a key combination (which would otherwise silently duplicate HW rows).
    """
    key_cols = ['lat', 'lon', 'year', 'hour']
    value_cols = ['UHI', 'UWBI']

    merged_df = pd.merge(
        df_hw,
        df_no_hw_avg[key_cols + value_cols],
        on=key_cols,
        suffixes=('', '_avg'),
        # Many HW rows may share one averaged row, never the other way round.
        validate='m:1',
    )

    # HW value minus the NO_HW average for each variable.
    for col in value_cols:
        merged_df[f'{col}_diff'] = merged_df[col] - merged_df[f'{col}_avg']
    return merged_df

In [5]:
# Mean NO_HW UHI and UWBI for every (lat, lon, year, hour) combination.
# The four grouping keys form the index of the result.
df_no_hw_avg = (
    df_no_hw
    .groupby(['lat', 'lon', 'year', 'hour'])[['UHI', 'UWBI']]
    .mean()
)


In [6]:
# Move the grouping keys out of the index and back into ordinary columns,
# then show the resulting schema.
df_no_hw_avg = df_no_hw_avg.reset_index()
df_no_hw_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2950344 entries, 0 to 2950343
Data columns (total 6 columns):
 #   Column  Dtype  
---  ------  -----  
 0   lat     float32
 1   lon     float32
 2   year    int32  
 3   hour    int32  
 4   UHI     float32
 5   UWBI    float32
dtypes: float32(4), int32(2)
memory usage: 67.5 MB


In [7]:
# Attach the NO_HW averages to the HW rows and compute the UHI/UWBI differences.
local_hour_adjusted_df = calculate_uhi_diff(
    df_hw=df_hw,
    df_no_hw_avg=df_no_hw_avg,
)

In [8]:
# Inspect row count, columns and dtypes of the merged HW / NO_HW-average frame.
local_hour_adjusted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495311 entries, 0 to 495310
Data columns (total 73 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   time             495311 non-null  datetime64[ns]
 1   lat              495311 non-null  float32       
 2   lon              495311 non-null  float32       
 3   APPAR_TEMP       495311 non-null  float32       
 4   APPAR_TEMP_R     495311 non-null  float32       
 5   APPAR_TEMP_U     495311 non-null  float32       
 6   EFLX_LH_TOT      495311 non-null  float32       
 7   EFLX_LH_TOT_R    495311 non-null  float32       
 8   EFLX_LH_TOT_U    495311 non-null  float32       
 9   FGR              495311 non-null  float32       
 10  FGR_R            495311 non-null  float32       
 11  FGR_U            495311 non-null  float32       
 12  FIRA             495311 non-null  float32       
 13  FIRA_R           495311 non-null  float32       
 14  FIRA_U           495

In [9]:
# local_hour_adjusted_df.drop(columns=['TOPO_x', 'TOPO_y'], inplace=True)
# local_hour_adjusted_df.drop(columns=['TOPO'], inplace=True)
# local_hour_adjusted_df.info()

In [10]:
import pandas as pd
import xarray as xr
import os

# Input directory for the location-ID grid and the topography file.
# Note: this rebinds `summary_dir`, which an earlier cell set to the hw95_summary directory.
summary_dir = '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/summary'

location_ID_path = os.path.join(summary_dir, 'location_IDs.nc')
heightdat = os.path.join(summary_dir, 'topodata_0.9x1.25_USGS_070110_stream_c151201.nc')

# Open both NetCDF files with 100 x 100 chunks along lat/lon.
ds_location_ID = xr.open_dataset(
    location_ID_path,
    engine='netcdf4',
    chunks=dict(lat=100, lon=100),
)
ds_height = xr.open_dataset(
    heightdat,
    engine='netcdf4',
    chunks=dict(lat=100, lon=100),
)

# Combine the location-ID dataset with TOPO taken at the first time index.
ds_merged = xr.merge([ds_location_ID, ds_height['TOPO'].isel(time=0)])
ds_merged

Unnamed: 0,Array,Chunk
Bytes,432.00 kiB,78.12 kiB
Shape,"(192, 288)","(100, 100)"
Dask graph,6 chunks in 2 graph layers,6 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 432.00 kiB 78.12 kiB Shape (192, 288) (100, 100) Dask graph 6 chunks in 2 graph layers Data type int64 numpy.ndarray",288  192,

Unnamed: 0,Array,Chunk
Bytes,432.00 kiB,78.12 kiB
Shape,"(192, 288)","(100, 100)"
Dask graph,6 chunks in 2 graph layers,6 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,432.00 kiB,78.12 kiB
Shape,"(192, 288)","(100, 100)"
Dask graph,6 chunks in 3 graph layers,6 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 432.00 kiB 78.12 kiB Shape (192, 288) (100, 100) Dask graph 6 chunks in 3 graph layers Data type float64 numpy.ndarray",288  192,

Unnamed: 0,Array,Chunk
Bytes,432.00 kiB,78.12 kiB
Shape,"(192, 288)","(100, 100)"
Dask graph,6 chunks in 3 graph layers,6 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [11]:
# Remove the leftover `time` entry from ds_merged (presumably the scalar
# coordinate kept by the earlier isel(time=0) selection) before tabulating.
# drop_vars replaces Dataset.drop, which is deprecated for dropping variables
# and emits a warning; errors='ignore' keeps this cell re-runnable once
# `time` has already been removed.
ds_merged = ds_merged.drop_vars('time', errors='ignore')

# Convert only necessary variables to a DataFrame
merged_df = ds_merged[['location_ID', 'TOPO']].to_dataframe().reset_index()
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55296 entries, 0 to 55295
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   lat          55296 non-null  float32
 1   lon          55296 non-null  float32
 2   location_ID  55296 non-null  int64  
 3   TOPO         55296 non-null  float64
dtypes: float32(2), float64(1), int64(1)
memory usage: 1.3 MB


  ds_merged = ds_merged.drop('time')


In [12]:

# # Ensure unique TOPO values per location_ID
# merged_df = merged_df.groupby('location_ID').first().reset_index()

# Merge TOPO values with existing DataFrame.
# how='left' keeps every existing row; validate='m:1' raises if a location_ID
# appears more than once in merged_df, so rows cannot be silently duplicated.
local_hour_adjusted_df = local_hour_adjusted_df.merge(
    merged_df[['location_ID', 'TOPO']],
    on='location_ID',
    how='left',
    validate='m:1',
)

# Free up memory
del ds_location_ID, ds_height, ds_merged, merged_df

# Display info of the final DataFrame.
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
local_hour_adjusted_df.info()

# Optionally, save the updated DataFrame
# local_hour_adjusted_df.to_feather(os.path.join(summary_dir, 'local_hour_adjusted_variables_with_TOPO.feather'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495311 entries, 0 to 495310
Data columns (total 74 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   time             495311 non-null  datetime64[ns]
 1   lat              495311 non-null  float32       
 2   lon              495311 non-null  float32       
 3   APPAR_TEMP       495311 non-null  float32       
 4   APPAR_TEMP_R     495311 non-null  float32       
 5   APPAR_TEMP_U     495311 non-null  float32       
 6   EFLX_LH_TOT      495311 non-null  float32       
 7   EFLX_LH_TOT_R    495311 non-null  float32       
 8   EFLX_LH_TOT_U    495311 non-null  float32       
 9   FGR              495311 non-null  float32       
 10  FGR_R            495311 non-null  float32       
 11  FGR_U            495311 non-null  float32       
 12  FIRA             495311 non-null  float32       
 13  FIRA_R           495311 non-null  float32       
 14  FIRA_U           495