In [None]:
import pandas as pd
import numpy as np
import xarray as xr
import os
import netCDF4
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import catboost
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


#  Data Loading

In [None]:
# Directory that holds the summary files read by this notebook.
# NOTE(review): absolute, machine-specific path; the commented line is an alternative location.
summary_dir = '/Trex/test_case_results/i.e215.I2000Clm50SpGs.hw_production.02/research_results/summary'
# summary_dir = '/Users/yguo/DataSpellProjects/hw/uhi'


# merged_feather_path = os.path.join(summary_dir, 'local_hour_adjusted_variables.feather')
# Feather file to load; judging by its name it also carries location and event IDs — TODO confirm.
merged_feather_path = os.path.join(summary_dir, 'local_hour_adjusted_variables_with_location_ID_event_ID.feather')

# Read the table into a DataFrame and print its column/dtype summary.
local_hour_adjusted_df = pd.read_feather(merged_feather_path)
local_hour_adjusted_df.info()

In [None]:
# Open the location-ID NetCDF file stored in `summary_dir`, using the netCDF4 backend.
location_ID_path = os.path.join(summary_dir, 'location_IDs.nc')
location_ID_ds = xr.open_dataset(location_ID_path, engine='netcdf4')

In [None]:
# Display the opened dataset as the cell output.
location_ID_ds

## Feature list loading

In [None]:
# Read the variable schema spreadsheet; rows whose 'X_vars2' column equals 'Y'
# supply the 'Variable' names used as model features.
df_daily_vars = pd.read_excel('../Data/hourlyDataSchema.xlsx')
daily_vars =df_daily_vars.loc[df_daily_vars['X_vars2'] == 'Y', 'Variable']
daily_var_lst = daily_vars.tolist()
# Display the selected feature names as the cell output.
daily_var_lst

In [None]:
# Print the schema table's column/dtype summary.
df_daily_vars.info()

In [None]:
def add_long_name(input_df, join_column='Feature', df_daily_vars=None):
    """Return a copy of ``input_df`` with the schema's 'Long Name' column appended.

    Each row of ``input_df`` is matched (left join) to the schema row whose
    'Variable' equals the row's ``join_column`` value. Rows without a match
    keep a missing 'Long Name'. ``input_df`` itself is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table containing ``join_column``.
    join_column : str, default 'Feature'
        Column of ``input_df`` holding the short variable names.
    df_daily_vars : pandas.DataFrame, optional
        Schema table with 'Variable' and 'Long Name' columns. When omitted,
        the notebook-level ``df_daily_vars`` is looked up at call time, so a
        reloaded schema is used without re-running this definition.

    Returns
    -------
    pandas.DataFrame
        The columns of ``input_df`` followed by 'Long Name', in the row order
        of ``input_df``.

    Raises
    ------
    NameError
        If no schema is passed and no global ``df_daily_vars`` exists.
    """
    if df_daily_vars is None:
        # The parameter shadows the global of the same name, hence the explicit lookup.
        try:
            df_daily_vars = globals()['df_daily_vars']
        except KeyError:
            raise NameError("df_daily_vars is not defined; load the schema or pass it explicitly") from None

    # Rename the schema key to the join column so the merge produces no redundant
    # 'Variable' column. This also keeps the key intact when join_column is
    # itself 'Variable' (the previous drop removed it in that case).
    name_lookup = df_daily_vars[['Variable', 'Long Name']].rename(columns={'Variable': join_column})
    return pd.merge(input_df, name_lookup, on=join_column, how='left')


##  Day/night masks

In [None]:
# Step 1: Define masks for daytime and nighttime
# Series.between is inclusive on both ends by default, so daytime selects
# local_hour 8..16 and nighttime selects 20..24 or 0..4.
# NOTE(review): presumably local_hour runs 0-23, in which case the upper bound 24
# never matches anything — confirm against the data.
daytime_mask = local_hour_adjusted_df['local_hour'].between(8, 16)
nighttime_mask = (local_hour_adjusted_df['local_hour'].between(20, 24) |
                  local_hour_adjusted_df['local_hour'].between(0, 4))


In [None]:
# Rows selected by the daytime mask, split into the feature matrix (schema-selected
# columns) and the 'UHI_diff' target; .info() prints a summary of each.
daytime_uhi_diff = local_hour_adjusted_df[daytime_mask]
X_day = daytime_uhi_diff[daily_var_lst]
y_day = daytime_uhi_diff['UHI_diff']
X_day.info()
y_day.info()

In [None]:
# Rows selected by the nighttime mask, split into the feature matrix (schema-selected
# columns) and the 'UHI_diff' target; .info() prints a summary of each.
nighttime_uhi_diff = local_hour_adjusted_df[nighttime_mask]
X_night = nighttime_uhi_diff[daily_var_lst]
y_night = nighttime_uhi_diff['UHI_diff']

X_night.info()
y_night.info()

#  Feature selection

In [None]:
# import catboost
# from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# 
# # Split your data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# 
# train_pool = Pool(X_train, y_train)
# validation_pool = Pool(X_val, y_val)

#  Running the feature selection algorithm

#   SHAP waterfall plot

#  Functions to create model and graph

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
import shap

def train_and_evaluate(time_uhi_diff, df_daily_vars, task_type='GPU'):
    """Fit a CatBoost regressor predicting 'UHI_diff' from the schema-selected features.

    The feature columns are the 'Variable' entries of ``df_daily_vars`` whose
    'X_vars2' flag equals 'Y'. The data are split 80/20 with a fixed seed; the
    held-out 20% is used only as the evaluation set for early stopping and
    best-iteration selection. Despite the name, no evaluation metric is
    computed or returned here.

    Parameters
    ----------
    time_uhi_diff : pandas.DataFrame
        Table containing the feature columns and the 'UHI_diff' target.
    df_daily_vars : pandas.DataFrame
        Schema table with 'Variable' and 'X_vars2' columns.
    task_type : str, default 'GPU'
        Passed to ``CatBoostRegressor``; use 'CPU' on machines without a GPU.

    Returns
    -------
    CatBoostRegressor
        The fitted model.
    """
    feature_columns = df_daily_vars.loc[df_daily_vars['X_vars2'] == 'Y', 'Variable'].tolist()

    # Select features and target
    X = time_uhi_diff[feature_columns]
    y = time_uhi_diff['UHI_diff']

    # Fixed test_size/random_state so the same validation rows can be rebuilt elsewhere.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.03,
        depth=6,
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=42,
        task_type=task_type,
        verbose=False
    )
    # Early stopping is specified once, at fit time. The previous code also passed
    # early_stopping_rounds=100 to the constructor; the fit-time value (50) is kept
    # because it is understood to take precedence in CatBoost — TODO confirm.
    model.fit(X_train, y_train, eval_set=(X_val, y_val),
              use_best_model=True, early_stopping_rounds=50,
              plot=True)
    return model

def importance_plot(model, validation_pool, df_daily_vars):
    """Draw two SHAP waterfall plots for ``model`` evaluated on ``validation_pool``.

    1. A waterfall of the mean absolute SHAP value per feature, normalised so
       the contributions sum to one.
    2. A waterfall explaining the first row of ``validation_pool``.

    Features are labelled with the schema's 'Long Name'; a feature missing
    from the schema keeps its short name.

    Parameters
    ----------
    model : fitted tree model accepted by ``shap.TreeExplainer``
    validation_pool : catboost.Pool
        Data on which the SHAP values are computed.
    df_daily_vars : pandas.DataFrame
        Schema table with 'Variable' and 'Long Name' columns.

    Returns
    -------
    None
    """
    # Map short variable names to long names, falling back to the short name so
    # that unmapped features stay distinguishable in the plots.
    feature_names = validation_pool.get_feature_names()
    feature_name_mapping = dict(zip(df_daily_vars['Variable'], df_daily_vars['Long Name']))
    long_feature_names = [feature_name_mapping.get(name, name) for name in feature_names]

    # Calculate SHAP values
    explainer = shap.TreeExplainer(model, feature_names=long_feature_names)
    shap_values = explainer.shap_values(validation_pool)

    # Mean |SHAP| per feature, normalised to sum to one (left unscaled if all zero).
    mean_abs_shap_values = np.abs(shap_values).mean(axis=0)
    total = mean_abs_shap_values.sum()
    normalized_shap_values = mean_abs_shap_values / total if total > 0 else mean_abs_shap_values

    # Aggregated importances: no per-row feature values are attached, because
    # these numbers do not belong to any single instance.
    # NOTE(review): the base value is still the model's expected value, as before;
    # with normalised shares a base of 0 may be more meaningful — confirm intent.
    shap.plots.waterfall(shap.Explanation(normalized_shap_values,
                                          base_values=explainer.expected_value,
                                          feature_names=long_feature_names))

    # Local explanation of the first instance. The waterfall plot needs a
    # single-row Explanation, not the raw 2-D SHAP array.
    shap.plots.waterfall(shap.Explanation(shap_values[0],
                                          base_values=explainer.expected_value,
                                          data=validation_pool.get_features()[0],
                                          feature_names=long_feature_names))


In [None]:
# Fit one model per time-of-day subset: nighttime rows first, then daytime rows.
night_model = train_and_evaluate(nighttime_uhi_diff, df_daily_vars=df_daily_vars)
day_model = train_and_evaluate(daytime_uhi_diff, df_daily_vars=df_daily_vars)


In [None]:
#saving models
# The file names record learning rate 0.03, 3000 iterations and depth 6, matching
# the values set in train_and_evaluate.
# NOTE(review): absolute, user-specific output directory.
night_model.save_model('/home/jguo/research/hw_global/results/model_night_model_lr003_it3000_depth6.cbm')
day_model.save_model('/home/jguo/research/hw_global/results/model_day_model_lr003_it3000_depth6.cbm')

## SHAP dependence analysis

In [None]:
# Wrap the complete (unsplit) daytime and nighttime feature/target sets in CatBoost Pools.
day_full_pool = Pool(X_day, y_day)
night_full_pool = Pool(X_night, y_night)

In [None]:
# day_full_pool = Pool(X_day, y_day)
# day_shap_interations = day_model.get_feature_importance(day_full_pool, type='ShapInteractionValues')
# day_shap_interations

In [None]:
def get_ordered_feature_importance(model: CatBoostRegressor, pool, type='FeatureImportance'):
    """Return the model's feature importances, largest first, with long names attached.

    For ``type='FeatureImportance'`` the importance is requested from the model
    without passing ``pool``; for any other ``type`` it is computed on ``pool``
    with that importance type. Feature names always come from ``pool``.

    Returns a DataFrame with columns 'Feature', 'Importance' and 'Long Name'.
    """
    importances = (
        model.get_feature_importance()
        if type == 'FeatureImportance'
        else model.get_feature_importance(pool, type=type)
    )

    ranked = pd.DataFrame({'Feature': pool.get_feature_names(), 'Importance': importances})
    ranked = ranked.sort_values(by='Importance', ascending=False)

    return add_long_name(ranked, join_column='Feature')

In [None]:
# Get feature importance
# Daytime model, default importance type of the helper.
get_ordered_feature_importance(day_model, day_full_pool)


In [None]:
# Get feature importance
# Daytime model, 'LossFunctionChange' importance computed on the full daytime pool.
get_ordered_feature_importance(day_model, day_full_pool, type='LossFunctionChange')


In [None]:
# Nighttime model, default importance type of the helper.
get_ordered_feature_importance(night_model, night_full_pool)  

In [None]:
# Nighttime model, 'LossFunctionChange' importance computed on the full nighttime pool.
get_ordered_feature_importance(night_model, night_full_pool, type='LossFunctionChange')

In [None]:
import catboost
# NOTE(review): wildcard import pulls every public catboost name into the notebook
# namespace; the names used elsewhere are already imported explicitly.
from catboost import *
import shap
# Load SHAP's JavaScript so interactive plots such as force_plot render in the notebook.
shap.initjs() 

In [None]:
# SHAP values of the daytime model on the full daytime data. The previous code
# referenced `X` and `y`, which are never defined at notebook level (they are
# locals of train_and_evaluate), so it failed on a fresh kernel.
shap_values = day_model.get_feature_importance(Pool(X_day, y_day), type='ShapValues')

# The last column is taken as the expected (base) value; the remaining columns
# are the per-feature SHAP values.
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]

# visualize the first prediction's explanation
shap.force_plot(expected_value, shap_values[0, :], X_day.iloc[0, :])

In [None]:
# Per-feature SHAP values of the first row.
shap_values[0,:]

In [None]:
# Display the SHAP value array as the cell output.
shap_values

In [None]:
# summarize the effects of all the features
# `X` is never defined at notebook level; the SHAP values come from day_model,
# so they are paired with the daytime feature matrix.
shap.summary_plot(shap_values, X_day)

In [None]:
# `X` is never defined at notebook level; use the daytime feature matrix that
# matches the daytime model's SHAP values.
shap.dependence_plot("U10", shap_values, X_day)

##  Dependence plots for all feature pairs

In [None]:
import shap
import matplotlib.pyplot as plt

# X_train, X_val and validation_pool only exist inside train_and_evaluate, so the
# validation split is rebuilt here with the same test_size and random_state used
# there, giving the same held-out daytime rows.
X_train, X_val, y_train, y_val = train_test_split(X_day, y_day, test_size=0.2, random_state=42)
validation_pool = Pool(X_val, y_val)

# Calculate SHAP values of the daytime model for the validation data
shap_values = day_model.get_feature_importance(validation_pool, type='ShapValues')

# Remove the last column from shap_values which is the base value
shap_values = shap_values[:, :-1]

# List of features for pairwise dependence plots
features = X_val.columns.tolist()

# One dependence plot per unordered feature pair: the first feature is plotted,
# coloured by the second. A tuple index is only meaningful for SHAP interaction
# values, so the plotted feature is passed by name instead.
for i in range(len(features)):
    for j in range(i + 1, len(features)):
        # show=False keeps the figure open so the title lands on this plot
        # instead of on a new empty figure.
        shap.dependence_plot(features[i], shap_values, X_val,
                             interaction_index=features[j], show=False)
        plt.title(f'Dependence Plot: {features[i]} vs {features[j]}')
        plt.show()
        plt.close()  # release the figure; the loop creates many of them


In [None]:
from matplotlib import pyplot as plt

# Scatter of the target against the U10 feature for the daytime rows. `X` and `y`
# are never defined at notebook level, so the daytime feature matrix and target
# are used, and the placeholder title/labels are replaced by the column names.
plt.figure(figsize=(10, 6))
plt.scatter(X_day['U10'], y_day)
plt.title('UHI_diff vs U10 (daytime)')
plt.xlabel('U10')
plt.ylabel('UHI_diff')
plt.grid(True)
plt.show()