In [4]:
import pandas as pd
import numpy as np
import xarray as xr
import os

#  Step 1: Load the data

In [5]:
# Directory that holds the summary Feather files for this case.
summary_dir = '/Trex/case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/summary'

# Input table: the variant that already carries a location_ID column
# (the variant without it is 'local_hour_adjusted_variables.feather').
merged_feather_path = os.path.join(
    summary_dir,
    'local_hour_adjusted_variables_with_location_ID.feather',
)

# Read the table and print its column summary.
local_hour_adjusted_df = pd.read_feather(merged_feather_path)
local_hour_adjusted_df.info()

#  Step 2: Create event ID

In [6]:
# Order rows chronologically within each location so that consecutive rows
# of one location are neighbours in time (the frame is sorted in place).
local_hour_adjusted_df.sort_values(by=['location_ID', 'time'], inplace=True)

# Hours elapsed since the previous row of the same location.
# The first row of every location has no predecessor and gets NaN.
local_hour_adjusted_df['time_diff'] = (
    local_hour_adjusted_df
    .groupby('location_ID')['time']
    .diff()
    .dt.total_seconds()
    .div(3600)
)

# A gap longer than one hour marks the first row of a new event.
# NaN compares as False, so each location's first row does not open a new event.
local_hour_adjusted_df['new_event'] = local_hour_adjusted_df['time_diff'].gt(1)

# Running count of event starts per location = 0-based event number within that location.
local_hour_adjusted_df['event_ID'] = (
    local_hour_adjusted_df
    .groupby('location_ID')['new_event']
    .cumsum()
)

# '<location_ID>_<event_ID>' identifies an event across all locations.
local_hour_adjusted_df['global_event_ID'] = (
    local_hour_adjusted_df['location_ID'].astype(str)
    + '_'
    + local_hour_adjusted_df['event_ID'].astype(str)
)

# The helper columns 'time_diff' and 'new_event' are left on the frame;
# drop them with DataFrame.drop(columns=[...]) if they are not needed downstream.


In [7]:
# Print the column summary, then display the first 200 rows as the cell output.
local_hour_adjusted_df.info()
local_hour_adjusted_df.head(200)

##  Step 2.2 Check the continuity of dates within each event

In [8]:
import pandas as pd

def validate_event_continuity(df):
    """Report every gap of more than one hour inside an event.

    Rows are grouped by ('location_ID', 'event_ID'); within each group the
    'time' values are sorted and consecutive differences are inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location_ID', 'event_ID' and a datetime-like 'time' column.

    Returns
    -------
    list of str
        One message per offending gap (an event with several gaps yields the
        same message several times). Empty when every event is continuous.
    """
    max_step_seconds = 3600
    errors = []

    for (location_id, event_id), event_rows in df.groupby(['location_ID', 'event_ID']):
        # Seconds between each timestamp and its predecessor; the first entry is NaN.
        step_seconds = event_rows['time'].sort_values().diff().dt.total_seconds()
        # NaN compares as False, so single-row events never report a gap.
        n_gaps = int((step_seconds > max_step_seconds).sum())
        errors.extend(
            f"Gap of over an hour found in event {event_id} for location {location_id}"
            for _ in range(n_gaps)
        )

    return errors

# Run the continuity check on the full table and report the outcome.
continuity_errors = validate_event_continuity(local_hour_adjusted_df)
if not continuity_errors:
    print("All events are continuous with no gaps of over an hour.")
else:
    print("Continuity Errors:")
    # One message per line.
    print("\n".join(continuity_errors))


##  Step 2.3 Check the uniqueness of event IDs within each location:

In [9]:
# Compare the number of distinct global event IDs with the number of
# (location_ID, global_event_ID) combinations; they match when no global ID
# is shared by more than one location.
# NOTE(review): each global_event_ID is built from its location_ID, so this
# comparison looks close to always-true — confirm it tests what is intended.
n_distinct_global_ids = local_hour_adjusted_df['global_event_ID'].nunique()
n_location_event_groups = (
    local_hour_adjusted_df
    .groupby(['location_ID', 'global_event_ID'])
    .ngroups
)
is_unique = n_distinct_global_ids == n_location_event_groups
print("Event IDs are unique across all locations and continuous heatwave periods:", is_unique)


## Step 2.4 Manually inspect a few events

In [10]:
# Take the first row of every (location, event) pair, order the result by
# event number, and print the identifying columns for a visual spot check.
first_row_per_event = local_hour_adjusted_df.groupby(['location_ID', 'event_ID']).head(1)
sample_events = first_row_per_event.sort_values('event_ID')
print(sample_events[['location_ID', 'event_ID', 'local_time']])

In [11]:
# sample_events[['location_ID', 'event_ID', 'local_time']]

In [12]:
# Display the first rows of the table, now including the event ID columns.
local_hour_adjusted_df.head()

## Step 2.5 Save the updated DataFrame

In [14]:
# Print column dtypes and non-null counts of the table that is about to be saved.
local_hour_adjusted_df.info()

In [15]:
# reset_index() moves the current (post-sort) index into a regular column and
# gives the frame a fresh 0..n-1 index before it is written out.
local_hour_adjusted_df_reset = local_hour_adjusted_df.reset_index()

# Output location for the table with location and event IDs.
# Note: this rebinds 'merged_feather_path', which previously held the input path.
merged_feather_path = os.path.join(
    summary_dir,
    'local_hour_with_location_id_event_id.feather',
)

# Write the Feather file.
local_hour_adjusted_df_reset.to_feather(merged_feather_path)


In [16]:
# Print the column summary of the frame that was written to Feather (after reset_index).
local_hour_adjusted_df_reset.info()

# Step 3: For each urban grid cell, identify heatwaves (HWs) with positive and negative UHI-HW interactions and calculate the mean UHI_diff value. Then compare the meteorological conditions (air temperature, humidity, wind, planetary boundary layer depth, etc.) between the positive UHI-HW-interaction events and the negative UHI-HW-interaction events.

# Question
How do I find the planetary boundary layer depth?

##  Step 3.1: Define daytime and nighttime
Daytime: 08:00 to 16:00 local time. (Keer paper)
Nighttime: 20:00 to 04:00 local time.

In [17]:
import pandas as pd

# Boolean row masks over local_hour_adjusted_df, keyed on the 'local_hour' column.
# Series.between is inclusive on both ends.
# NOTE(review): with inclusive bounds, rows with local_hour == 16 count as daytime
# and rows with local_hour == 4 count as nighttime — confirm this matches the
# intended 08:00-16:00 / 20:00-04:00 windows.
local_hour_values = local_hour_adjusted_df['local_hour']

# Daytime: local_hour from 8 through 16.
daytime_mask = local_hour_values.between(8, 16)

# Nighttime wraps past midnight, so it is the union of the early-morning
# range (0 through 4) and the late-evening range (20 through 24).
nighttime_mask = local_hour_values.between(0, 4) | local_hour_values.between(20, 24)




##  Step 3.2: Calculate the mean UHI_diff value for each event day and night

In [None]:
def compute_uhi_diff_averages(df, mask):
    """Mean 'UHI_diff' per event, using only the rows selected by ``mask``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'global_event_ID' and 'UHI_diff' columns.
    mask : boolean Series or array
        Row selector applied to ``df`` before grouping.

    Returns
    -------
    pandas.Series
        Named 'UHI_diff' and indexed by 'global_event_ID'. Events with no
        selected rows do not appear in the result.
    """
    selected_rows = df[mask]
    per_event = selected_rows.groupby('global_event_ID')
    return per_event['UHI_diff'].mean()

# Per-event mean UHI_diff computed separately from the daytime rows and the
# nighttime rows. An event with no rows in a period is absent from that series.
daytime_uhi_diff_avg = compute_uhi_diff_averages(local_hour_adjusted_df, daytime_mask)
nighttime_uhi_diff_avg = compute_uhi_diff_averages(local_hour_adjusted_df, nighttime_mask)

In [37]:

# Columns averaged over all hours of each event (no day/night split here).
columns_to_average = [
    'UHI_diff', 'UHI', 'UWBI', 'WIND', 'RAIN', 'SNOW',
    'Q2M_R', 'Q2M_U', 'VAPOR_PRES_R', 'VAPOR_PRES_U',
]

# One row per global_event_ID holding the mean of each selected column.
uhi_diff_avg_df = (
    local_hour_adjusted_df
    .groupby('global_event_ID')[columns_to_average]
    .mean()
)
uhi_diff_avg_df.info()

In [38]:

# Attach the daytime and nighttime UHI_diff means as two extra columns.
# Assignment aligns on the global_event_ID index, so an event that is missing
# from either series gets NaN in the corresponding column.
uhi_diff_avg_df['UHI_diff_daytime'] = daytime_uhi_diff_avg
uhi_diff_avg_df['UHI_diff_nighttime'] = nighttime_uhi_diff_avg

# 'uhi_diff_avg_df' now holds the per-event means plus the two day/night columns.

In [39]:
# Print the column summary, then display the first 300 per-event rows as the cell output.
uhi_diff_avg_df.info()
uhi_diff_avg_df.head(300)


In [40]:
# Per-column count of non-null values among events whose mean UHI_diff is negative.
uhi_diff_avg_df.query('UHI_diff< 0').count()

# Step 4: Data Analysis


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu

# Split the per-event means by the sign of the event's mean UHI_diff.
# Events whose mean is exactly 0 or NaN end up in neither group.
positive_uhi_diff = uhi_diff_avg_df.loc[uhi_diff_avg_df['UHI_diff'].gt(0)]
negative_uhi_diff = uhi_diff_avg_df.loc[uhi_diff_avg_df['UHI_diff'].lt(0)]

# Columns other than the UHI measures, used for the group comparisons.
non_uhi_columns = [
    'UWBI', 'WIND', 'RAIN', 'SNOW',
    'Q2M_R', 'Q2M_U', 'VAPOR_PRES_R', 'VAPOR_PRES_U',
]

# Descriptive statistics per group are available via, e.g.,
# negative_uhi_diff[non_uhi_columns].describe() and
# positive_uhi_diff[non_uhi_columns].describe().

In [41]:
# Display the events whose mean UHI_diff is negative.
negative_uhi_diff

In [42]:
# Show all hourly rows of one event; the ID format is '<location_ID>_<event_ID>'.
local_hour_adjusted_df.query('global_event_ID == "15782_1"')

In [43]:


# Correlation (DataFrame.corr default method) of every non-UHI column with the
# per-event mean UHI_diff; the 'UHI_diff' entry itself is the self-correlation.
correlation_columns = [*non_uhi_columns, 'UHI_diff']
correlations = uhi_diff_avg_df[correlation_columns].corr()['UHI_diff']

print("\nCorrelations with UHI_diff:")
print(correlations)


# Step 5: Which Variable is contributing to the UHI_diff

##  Step 5.1: Using logistic regression

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.preprocessing import StandardScaler

# Classify events by the sign of their mean UHI_diff with a logistic regression,
# then rank the features by mean |SHAP| for the positive class.
# Requires 'uhi_diff_avg_df' (per-event means) from the earlier cells.

# Features: per-event means of the non-UHI columns.
X = uhi_diff_avg_df[['UWBI', 'WIND', 'RAIN', 'SNOW', 'Q2M_R', 'Q2M_U', 'VAPOR_PRES_R', 'VAPOR_PRES_U']]
# Binary target: 1 when the event's mean UHI_diff is > 0, otherwise 0
# (events with a mean of exactly 0 or NaN are therefore labelled 0).
y = (uhi_diff_avg_df['UHI_diff'] > 0).astype(int)

# Hold out 20% of the events; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics from the training split only, so nothing
# about the test split leaks into the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Summarize the training data with 30 k-means centroids to keep KernelExplainer tractable.
background_data = shap.kmeans(X_train_scaled, k=30)

# Explain predict_proba in log-odds space.
# NOTE(review): 'n_jobs' is forwarded through **kwargs — confirm the installed
# shap's KernelExplainer actually honours it.
explainer = shap.KernelExplainer(model.predict_proba, background_data, link="logit", n_jobs = 32)
shap_values = explainer.shap_values(X_test_scaled)

# Select the SHAP values of the positive class (index 1) regardless of the
# container shap returns: older releases give a list with one
# (n_samples, n_features) array per class, newer releases give a single
# (n_samples, n_features, n_classes) array. Indexing the latter with [1]
# would pick sample #1 instead of class 1.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
else:
    shap_array = np.asarray(shap_values)
    positive_class_shap = shap_array[..., 1] if shap_array.ndim == 3 else shap_array

# Bar chart of mean |SHAP| per feature for the positive class.
shap.summary_plot(positive_class_shap, X_test_scaled, feature_names=X.columns, plot_type="bar")




In [45]:
import pandas as pd
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split

# Fit a gradient-boosted regressor on the per-event means and rank the
# features by mean |SHAP|. Requires 'uhi_diff_avg_df' from the earlier cells.

# Feature matrix: per-event means of the non-UHI columns.
X = uhi_diff_avg_df[[
    'UWBI', 'WIND', 'RAIN', 'SNOW',
    'Q2M_R', 'Q2M_U', 'VAPOR_PRES_R', 'VAPOR_PRES_U',
]]

# Regression target: the per-event mean of the 'UHI' column.
# NOTE(review): the section heading is about UHI_diff, but the column used
# here is 'UHI' — confirm which target is intended.
y = uhi_diff_avg_df['UHI']

# Hold out 20% of the events; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Squared-error regression objective.
model = xgb.XGBRegressor(objective='reg:squarederror')
model.fit(X_train, y_train)

# Compute SHAP values for the held-out events.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Bar chart of mean |SHAP| per feature.
shap.summary_plot(shap_values, X_test, plot_type="bar")


strong positive correlation between UHI and HW
negative event 
insignificant event 
global map 
for each grid # positive and negative interaction 
