# Mutual Information across Scales

In [2]:
import geopandas as gpd
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Lift pandas' display limits so DataFrames are shown with every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [4]:
# Function to calculate correlations and mutual information
def calculate_statistics(data, target_column, random_state=None):
    """Relate every column of ``data`` to ``target_column``.

    For each column other than the target, rows where either value is missing
    are dropped pairwise, then the Pearson correlation, Spearman rank
    correlation (each with its p-value) and the mutual information with the
    target are computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the candidate columns and the target column.
    target_column : str
        Name of the column every other column is compared against.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``mutual_info_regression``. Pass an integer to make the
        mutual-information estimate repeatable between runs.

    Returns
    -------
    pandas.DataFrame
        One row per non-target column with the columns 'Parameter',
        'Pearson Correlation', 'Pearson p-value', 'Spearman Correlation',
        'Spearman p-value' and 'Mutual Information'. A statistic is missing
        (None/NaN) when it could not be computed: the column is not numeric,
        two or fewer complete pairs remain, or (mutual information only)
        fewer than four complete pairs remain. The columns are present even
        when ``data`` contains nothing but the target.

    Raises
    ------
    ValueError
        If ``target_column`` is not a column of ``data``.
    """
    # Ensure the target column exists
    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame.")

    output_columns = [
        'Parameter',
        'Pearson Correlation',
        'Pearson p-value',
        'Spearman Correlation',
        'Spearman p-value',
        'Mutual Information',
    ]

    def _result_row(parameter, pearson=(None, None), spearman=(None, None), mi=None):
        """Build one output record; statistics default to None (not computable)."""
        return {
            'Parameter': parameter,
            'Pearson Correlation': pearson[0],
            'Pearson p-value': pearson[1],
            'Spearman Correlation': spearman[0],
            'Spearman p-value': spearman[1],
            'Mutual Information': mi,
        }

    results = []

    # Loop through each column except the target column
    for col in data.columns:
        if col == target_column:
            continue

        print(f"Calculating statistics for '{col}'...")

        # Text, categorical or geometry columns cannot be correlated; report
        # them as "not computable" instead of letting scipy raise mid-loop.
        if not pd.api.types.is_numeric_dtype(data[col]):
            results.append(_result_row(col))
            continue

        # Drop NA values for pairwise comparison
        valid_data = data[[col, target_column]].dropna()

        # Too few complete pairs for any of the statistics.
        if len(valid_data) <= 2:
            results.append(_result_row(col))
            continue

        x = valid_data[col]
        y = valid_data[target_column]

        pearson_corr, pearson_pval = pearsonr(x, y)
        spearman_corr, spearman_pval = spearmanr(x, y)

        # Mutual information is only estimated with at least four samples
        # (presumably because the estimator's default neighbour count is 3 -
        # confirm against the scikit-learn documentation).
        if len(x) < 4:
            mi = None
        else:
            mi = mutual_info_regression(
                x.values.reshape(-1, 1), y, random_state=random_state
            )[0]

        results.append(
            _result_row(
                col,
                pearson=(pearson_corr, pearson_pval),
                spearman=(spearman_corr, spearman_pval),
                mi=mi,
            )
        )

    # Passing the column list keeps the schema stable even when there are no rows.
    return pd.DataFrame(results, columns=output_columns)

# Demonstration on a small hand-made table
if __name__ == "__main__":
    # 'Temperature' is the target; the remaining columns are compared against it.
    target_column = 'Temperature'

    # Five toy observations of urban-form variables plus temperature
    data = pd.DataFrame(
        {
            'Building Area': [100, 150, 200, 250, 300],
            'Building Height': [10, 12, 15, 18, 20],
            'Vegetation Index': [0.3, 0.35, 0.4, 0.45, 0.5],
            'Temperature': [22, 24, 26, 25, 27],
        }
    )

    stats = calculate_statistics(data, target_column)
    print(stats)


Calculating statistics for 'Building Area'...
Calculating statistics for 'Building Height'...
Calculating statistics for 'Vegetation Index'...
          Parameter  Pearson Correlation  Pearson p-value  \
0     Building Area             0.904194         0.035082   
1   Building Height             0.882618         0.047417   
2  Vegetation Index             0.904194         0.035082   

   Spearman Correlation  Spearman p-value  Mutual Information  
0                   0.9          0.037386            0.250000  
1                   0.9          0.037386            0.183333  
2                   0.9          0.037386            0.250000  


## Temperature

In [5]:
# Read the station measurements, parse the timestamp column into datetimes,
# and keep only the rows whose variable is 'Ta_deg_C'.
temp = pd.read_csv('/Users/lisawink/Documents/paper1/data/gap_filled_data_ta_rh.csv')
temp = temp.assign(datetime_UTC=pd.to_datetime(temp['datetime_UTC']))
temp = temp.loc[temp['variable'] == 'Ta_deg_C']

In [6]:
# Turn the timestamps back into strings so that, after pivoting, a column can
# be selected with a plain string key such as '2023-08-23 15:00:00+00:00'.
# `assign` returns a new frame instead of writing into the filtered slice
# produced by the previous cell, which avoids pandas' SettingWithCopyWarning.
# Result: one row per station_id, one column per timestamp, cells from 'value'.
temp = (
    temp
    .assign(datetime_UTC=temp['datetime_UTC'].astype(str))
    .pivot(index='station_id', columns='datetime_UTC', values='value')
)

## Urban form

In [38]:
# Timestamp column of `temp` under analysis (alternative kept for reference):
#time = '2023-08-22 04:00:00+00:00'
time = '2023-08-23 15:00:00+00:00'
radii = [20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 400, 500, 750]

# Read the previously saved statistics table of every radius and index it by
# parameter name (the 'Parameter' column is kept as a regular column too).
stats_dict = {}
for i in radii:
    stats = pd.read_csv(
        f'/Users/lisawink/Documents/paper1/data/processed_data/processed_station_stats_{i}.csv'
    ).set_index('Parameter', drop=False)
    stats_dict[i] = stats

In [None]:
# Timestamp column of `temp` that the station parameters are compared against.
time = '2023-08-22 15:00:00+00:00'
radii = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 400, 500, 750]

# Columns dropped before computing statistics (judging by their names: station
# metadata, categorical descriptors and the geometry). Defined once, outside
# the loop, because the list does not depend on the radius.
to_remove = ['station_no','station_name','station_long_name','station_type','station_lat','station_lon','station_elevation','mounting_structure','sky_view_factor','dominant_land_use','local_climate_zone','urban_atlas_class','urban_atlas_code','geometry','SVF']

stats_dict = {}
for i in tqdm(radii):
    params = gpd.read_parquet(f'/Users/lisawink/Documents/paper1/data/processed_data/processed_station_params_{i}.parquet')
    params = params.set_index('station_id')
    # Radii without any station rows are skipped and get no entry in stats_dict.
    if params.empty:
        continue

    # Station 'FRTECH' is excluded from the analysis.
    # NOTE(review): the reason is not recorded in this notebook - confirm.
    params = params[params.index != 'FRTECH']

    params = params.drop(to_remove, axis=1)
    # Attach the temperature at `time`; the inner join keeps only stations
    # present in both tables.
    params = params.merge(temp[time], left_on='station_id', right_on='station_id', how='inner')

    stats = calculate_statistics(params, time)
    stats.index = stats['Parameter']
    # The original name was built with `{i,time}`, which formats a tuple and
    # yields "processed_station_stats_(10, '2023-08-22 15:00:00+00:00').csv".
    # Use the per-radius name that the loader cell reads back. The file is
    # therefore overwritten whenever this cell is re-run with another `time`.
    stats.to_csv(f'/Users/lisawink/Documents/paper1/data/processed_data/processed_station_stats_{i}.csv', index=False)
    stats_dict[i] = stats

In [46]:
# For each radius, print the three parameters with the highest mutual
# information and record the best one. A parameter that wins at several radii
# is recorded only once, so later plots draw one line per parameter.
# NOTE(review): the first radius is skipped via radii[1:]; the reason is not
# stated in this notebook - confirm it is intentional.
top_params = []
for i in radii[1:]:
    # Sort once and reuse the ranking for both the printout and the pick.
    ranked = stats_dict[i].sort_values(by='Mutual Information', ascending=False)
    print(i, ranked.head(3))
    best = ranked['Parameter'].iloc[0]
    if best not in top_params:
        top_params.append(best)

20                      Parameter  Pearson Correlation  Pearson p-value  \
Parameter                                                              
BuProx_min          BuProx_min             0.621347         0.378653   
BuRough_3D_MAD  BuRough_3D_MAD             0.352312         0.647688   
BuCCD_mean_MAD  BuCCD_mean_MAD             0.345258         0.654742   

                Spearman Correlation  Spearman p-value  Mutual Information  
Parameter                                                                   
BuProx_min                       0.4               0.6        2.220446e-16  
BuRough_3D_MAD                  -0.2               0.8        2.220446e-16  
BuCCD_mean_MAD                  -0.2               0.8        2.220446e-16  
30                  Parameter  Pearson Correlation  Pearson p-value  \
Parameter                                                          
BuPer_IQR        BuPer_IQR            -0.415853         0.109149   
BuCorDev_sum  BuCorDev_sum            -0.648

In [47]:
# For every radius (first one skipped, as above), keep the 'Mutual Information'
# values of the rows whose parameter appears in top_params.
mi_dict = {}
for i in radii[1:]:
    stats_i = stats_dict[i]
    is_top = stats_i['Parameter'].isin(top_params)
    mi_dict[i] = stats_i.loc[is_top, 'Mutual Information']
    

# Daytime

In [51]:
# Interactive line chart: one line per top parameter, mutual information vs. radius
import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure()
for i in top_params:
    fig.add_trace(
        go.Scatter(
            # radii that have an entry in mi_dict, in insertion order
            x=list(mi_dict),
            # value of parameter `i` at each of those radii
            y=[series[series.index == i].values[0] for series in mi_dict.values()],
            mode='lines',
            name=i,
        )
    )

fig.update_layout(
    title="Mutual Information of top parameters against radii",
    xaxis_title="Radii",
    yaxis_title="Mutual Information",
    font={
        'family': "Courier New, monospace",
        'size': 18,
        'color': "RebeccaPurple",
    },
)

fig.show()

# Nighttime

In [37]:
# Interactive line chart of mutual information against radius, one trace per
# entry of top_params. It plots whatever mi_dict currently holds.
import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure()
for i in top_params:
    radius_values = list(mi_dict)
    mi_values = [series[series.index == i].values[0] for series in mi_dict.values()]
    fig.add_trace(go.Scatter(x=radius_values, y=mi_values, mode='lines', name=i))

layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "RebeccaPurple",
}
fig.update_layout(
    title="Mutual Information of top parameters against radii",
    xaxis_title="Radii",
    yaxis_title="Mutual Information",
    font=layout_font,
)

fig.show()
