# Linear Regression - 150m

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
from shapely.wkt import loads
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Route Altair chart data through the "vegafusion" data transformer.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Import HTML from the public IPython.display API rather than the internal
# IPython.core.display module path.
from IPython.display import HTML

# Custom CSS to increase the width of the notebook content.
# NOTE(review): the `.container` selector presumably targets the classic Notebook
# layout -- confirm it has an effect in the front end actually used.
HTML("""
    <style>
        .container {
            width: 90% !important;  /* Set width to 90% of the browser window, adjust as needed */
            max-width: 2000px !important;  /* Optional: limit the max width */
            margin: 0 auto;  /* Center the content */
        }
    </style>
""")

### Choose night-time temperatures on a clear-sky day (22 August 2023)

In [75]:
# Read the station records, parse the UTC timestamps, and keep only the rows
# whose variable is 'Ta_deg_C'.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
temp = (
    pd.read_csv('/Users/lisawink/Documents/paper1/data/gap_filled_data_ta_rh.csv')
    .assign(datetime_UTC=lambda frame: pd.to_datetime(frame['datetime_UTC']))
    .loc[lambda frame: frame['variable'] == 'Ta_deg_C']
)

In [76]:
# Keep the records around the clear-sky day: 22 and 23 August.
# NOTE(review): the year is not filtered here -- presumably the file only covers 2023; confirm.
timestamps = temp['datetime_UTC']
in_clear_sky_window = timestamps.dt.month.eq(8) & timestamps.dt.day.isin([22, 23])
temp = temp[in_clear_sky_window]

In [None]:
#temp_result = temp.groupby('station_id').mean(numeric_only=True)

In [73]:
# Replace the row labels left over from filtering with a fresh 0..n-1 index.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column, which also keeps this cell safe to re-run.
temp = temp.reset_index(drop=True)

In [52]:
# Point selection on station ID, bound to the legend; toggling is gated on the shift key.
selection = alt.selection_point(
    fields=['station_id'],
    bind='legend',
    on='click',
    toggle='event.shiftKey',
)

# Selected stations keep their own colour at full opacity; the others are
# drawn in light grey and faded.
station_colour = alt.condition(
    selection,
    'station_id:N',
    alt.value('lightgray'),
    legend=alt.Legend(columns=2, symbolLimit=0),
)
station_opacity = alt.condition(selection, alt.value(1), alt.value(0.2))

# One line per station: value against UTC time.
chart = (
    alt.Chart(temp)
    .mark_line()
    .encode(
        x='datetime_UTC:T',
        y='value:Q',
        color=station_colour,
        opacity=station_opacity,
    )
    .add_params(selection)
    .properties(title='Temperature over time for each station')
    .interactive()
)

chart.show()

In [77]:
# Narrow the records to day 22, hour 4 of the UTC timestamp column.
on_day_22 = temp['datetime_UTC'].dt.day.eq(22)
at_hour_4 = temp['datetime_UTC'].dt.hour.eq(4)
temp = temp.loc[on_day_22 & at_hour_4]

In [78]:
# Inspect the filtered temperature records.
print(temp)

                    datetime_UTC station_id  variable  value data_type
349484 2023-08-22 04:00:00+00:00     FRASHA  Ta_deg_C  20.48  observed
349485 2023-08-22 04:00:00+00:00     FRBETZ  Ta_deg_C  21.61  observed
349486 2023-08-22 04:00:00+00:00     FRBRUH  Ta_deg_C  21.90  observed
349487 2023-08-22 04:00:00+00:00     FRDIET  Ta_deg_C  18.75  observed
349488 2023-08-22 04:00:00+00:00     FRDREI  Ta_deg_C  22.01  observed
349489 2023-08-22 04:00:00+00:00     FREBNE  Ta_deg_C  18.15  observed
349490 2023-08-22 04:00:00+00:00     FREICH  Ta_deg_C  21.71   imputed
349491 2023-08-22 04:00:00+00:00     FRFRIE  Ta_deg_C  20.95  observed
349492 2023-08-22 04:00:00+00:00     FRGART  Ta_deg_C  21.41  observed
349493 2023-08-22 04:00:00+00:00     FRGLBA  Ta_deg_C  19.60  observed
349494 2023-08-22 04:00:00+00:00     FRGUNT  Ta_deg_C  20.02  observed
349495 2023-08-22 04:00:00+00:00     FRHAID  Ta_deg_C  20.74  observed
349496 2023-08-22 04:00:00+00:00     FRHBHF  Ta_deg_C  22.70  observed
349497

## Add precalculated station parameters

In [103]:
# Load the precalculated station parameters, leave out station FRTECH, and
# index the rows by station ID (the 'station_id' column itself is kept).
params = (
    gpd.read_parquet('/Users/lisawink/Documents/paper1/data/processed_data/processed_station_params_150.parquet')
    .loc[lambda frame: frame['station_id'] != 'FRTECH']
    .set_index('station_id', drop=False)
)

In [104]:
# Column names selected for the analysis: '<metric>_<statistic>' columns, two
# single-column metrics, and finally 'value'. The list is generated from the
# metric prefixes so the naming pattern is explicit; the resulting order is
# the order in which the columns are selected.

# Statistics available for every multi-column metric, in output order.
_STAT_SUFFIXES = ('mean', 'std', 'median', 'MAD', 'IQR', 'skew')

# Metric prefixes that come before the single-column metrics.
_LEADING_METRICS = (
    'BuAre', 'BuHt', 'BuPer', 'BuLAL', 'BuCCD_mean', 'BuCor',
    'BuCWA', 'BuCon', 'BuElo', 'BuERI', 'BuFR', 'BuFF', 'BuFD',
    'BuRec', 'BuShI', 'BuSqC', 'BuSqu',
)

# Metrics that appear as one column without a statistic suffix.
_SINGLE_COLUMN_METRICS = ('BuAdj', 'BuIBD')

# Metric prefixes that come after the single-column metrics.
_TRAILING_METRICS = (
    'BuSWR', 'BuOri', 'BuAli',
    'StrAli', 'StrW', 'StrWD', 'StrOpe', 'StrHW', 'StrLen', 'StrCNS',
    'BpM', 'StrLin',
    'StrClo400', 'StrBet400', 'StrSCl', 'StrCyc400', 'StrENR400', 'StrGam400',
    'StrDeg', 'StrMes400',
)


def _expand_stats(prefixes):
    """Return '<prefix>_<stat>' names, cycling through _STAT_SUFFIXES for each prefix in turn."""
    return [prefix + '_' + stat for prefix in prefixes for stat in _STAT_SUFFIXES]


items = (
    ['BuAre_count', 'BuAre_sum']  # BuAre additionally has count and sum columns
    + _expand_stats(_LEADING_METRICS)
    + list(_SINGLE_COLUMN_METRICS)
    + _expand_stats(_TRAILING_METRICS)
    + ['value']
)

In [105]:
# Display the station parameter table (rows indexed by station_id).
params

Unnamed: 0_level_0,station_id,station_no,station_name,station_long_name,station_type,station_lat,station_lon,station_elevation,mounting_structure,sky_view_factor,...,StrSCl_median,StrSCl_std,StrSCl_min,StrSCl_max,StrSCl_sum,StrSCl_nunique,StrSCl_mode,StrSCl_IQR,StrSCl_MAD,StrSCl_skew
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FRASHA,FRASHA,1,Freiburg Alte Stadthalle,Station Freiburg Alte Stadthalle (FRASHA),Tier_I,47.986557,7.870209,296.0,Lamp post,0.721,...,0.0,0.041145,0.0,0.090909,0.167832,3.0,0.0,0.038462,0.0,0.9804667
FRBETZ,FRBETZ,2,Freiburg Betzenhausen,Station Freiburg Betzenhausen (FRBETZ),Tier_II,48.0049,7.817673,250.5,Lamp post,0.645,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
FRBRUH,FRBRUH,3,Freiburg Bruehl,Station Freiburg Bruehl (FRBRUH),Tier_I,48.031009,7.854189,237.6,Lamp post,0.73,...,,,,,,,,,,
FRDIET,FRDIET,4,Freiburg Dietenbach,Station Freiburg Dietenbach (FRDIET),Tier_II,48.013766,7.792878,230.0,Free-standing post,0.868,...,,,,,,,,,,
FRDREI,FRDREI,5,Freiburg Dreisam,Station Freiburg Dreisam (FRDREI),Tier_I,47.995872,7.826545,260.1,Lamp post,0.778,...,0.090909,0.0,0.090909,0.090909,0.363636,1.0,0.090909,0.0,0.0,
FRFRIE,FRFRIE,8,Freiburg Hauptfriedhof,Station Freiburg Hauptfriedhof (FRFRIE),Tier_II,48.010887,7.841538,257.0,Lamp post,0.762,...,,,,,,,,,,
FRGART,FRGART,9,Freiburg Gartenstadt,Station Freiburg Gartenstadt (FRGART),Tier_II,47.986798,7.824259,262.3,Lamp post,0.799,...,0.076923,0.04103,0.0,0.111111,0.616386,6.0,0.0,0.028571,0.023077,-0.9591944
FRGLBA,FRGLBA,10,Freiburg Glasbach,Station Freiburg Glasbach (FRGLBA),Tier_II,48.006918,7.868736,289.5,Lamp post,0.475,...,0.058824,,0.058824,0.058824,0.058824,1.0,0.058824,0.0,0.0,
FRGUNT,FRGUNT,11,Freiburg Guenterstal,Station Freiburg Guenterstal (FRGUNT),Tier_I,47.964012,7.858853,339.2,Lamp post,0.62,...,0.088889,0.031427,0.066667,0.111111,0.177778,2.0,0.066667,0.022222,0.022222,9.262305e-16
FRHAID,FRHAID,12,Freiburg Haid,Station Freiburg Haid (FRHAID),Tier_II,47.985761,7.785271,235.4,Lamp post,0.829,...,0.069264,0.030611,0.047619,0.090909,0.138528,2.0,0.047619,0.021645,0.021645,0.0


In [106]:
# Attach the temperature records to the station parameters by station ID,
# restore the station_id index, and keep only the columns listed in `items`.
params = params.merge(temp, left_on=params.index, right_on='station_id', how='left')
params.index = params['station_id']
params = params[items]

# Standardise every column; fit_transform returns a bare array, so wrap it
# again with the original row and column labels.
scaler = StandardScaler()
params_scaled = pd.DataFrame(
    scaler.fit_transform(params),
    index=params.index,
    columns=params.columns,
)

# Drop columns that are missing for more than 7 stations, then drop every
# station that still has a missing value.
min_present = len(params_scaled) - 7
params_scaled = params_scaled.dropna(axis=1, thresh=min_present).dropna()

In [107]:
# Define mapping of abbreviations to categories
prefix_to_category = {
    'BuAre': 'Dimension', 'BuHt': 'Dimension', 'BuPer': 'Dimension',
    'BuLAL': 'Dimension', 'BuCCD': 'Dimension', 'BuCor': 'Dimension',
    'CyAre': 'Dimension', 'CyInd': 'Dimension', 'BuCWA': 'Shape',
    'BuCon': 'Shape', 'BuElo': 'Shape', 'BuERI': 'Shape',
    'BuFR': 'Shape', 'BuFF': 'Shape', 'BuFD': 'Shape',
    'BuRec': 'Shape', 'BuShI': 'Shape', 'BuSqC': 'Shape',
    'BuSqu': 'Shape', 'BuAdj': 'Distribution', 'BuIBD': 'Distribution',
    'BuSWR': 'Distribution', 'BuOri': 'Orientation', 'BuAli': 'Orientation',
    'StrAli': 'Orientation', 'StrW': 'Distribution', 'StrWD': 'Distribution',
    'StrOpe': 'Distribution', 'StrHW': 'Distribution', 'StrLen': 'Dimension',
    'StrCNS': 'Dimension', 'BpM': 'Intensity', 'StrLin': 'Shape',
    'StrClo400': 'Connectivity', 'StrBet400': 'Connectivity', 
    'StrSCl': 'Connectivity', 'StrCyc400': 'Connectivity', 
    'StrENR400': 'Connectivity', 'StrGam400': 'Connectivity', 
    'StrDeg': 'Connectivity', 'StrMes400': 'Connectivity',
}

unique_prefixes = [item.split('_')[0] for item in items]

# Generate categories data dynamically
categories_data = [
    {'Category': prefix_to_category.get(prefix, 'Unknown'), 'Abbrev.': items[i]}
    for i,prefix in enumerate(unique_prefixes)
]

print(categories_data)

[{'Category': 'Dimension', 'Abbrev.': 'BuAre_count'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_sum'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_median'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_MAD'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_IQR'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_skew'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_median'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_MAD'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_IQR'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_skew'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_median'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_MAD'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_IQR'}, {'Category': 'Dimens

In [108]:
# Tabulate the category lookup: one row per entry of `items`, with columns 'Category' and 'Abbrev.'.
categories_df = pd.DataFrame(categories_data)

In [120]:
# Step 1: Collect regression results
# Fit one single-predictor linear regression of 'value' on each remaining
# column of params_scaled and record the slope, R-squared and fitted points.
target_name = 'value'
y = params_scaled[target_name]  # same response for every fit, so look it up once

# Select the predictors by name rather than assuming the target is the last column.
predictor_names = params_scaled.columns.drop(target_name)

results = []
for param in predictor_names:
    X = params_scaled[[param]]  # double brackets keep X two-dimensional

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Metrics
    gradient = model.coef_[0]
    r_squared = r2_score(y, y_pred)

    # Keep the observed and fitted points, tagged with the parameter name for lookup
    data = pd.DataFrame({'x': X[param], 'y': y, 'y_pred': y_pred})
    data['param'] = param
    results.append({'param': param, 'gradient': gradient, 'r_squared': r_squared, 'data': data})

# Summary table: one row per parameter, joined with its category
results_df = pd.DataFrame(results).drop(columns=['data'])
results_df = results_df.merge(categories_df, how='left', left_on='param', right_on='Abbrev.')

# Long table of all observed/fitted points, used to look up the selected parameter
all_data = pd.concat([res['data'] for res in results])

# Step 2: Altair selections
# `selection` picks a parameter by clicking a point; empty=False means nothing
# matches until a point has been clicked (boolean form of the older 'none').
selection = alt.selection_point(fields=['param'], empty=False, on='click', toggle='event.shiftKey')  # Selection on param
# Legend-bound selections on category and on statistic
selection1 = alt.selection_point(fields=['Category'], bind='legend', on='click', toggle='event.shiftKey')
selection2 = alt.selection_point(fields=['stats'], bind='legend', on='click', toggle='event.shiftKey')

In [127]:
# Carry the station IDs (the index of all_data) as a regular column so the
# charts can use them as labels.
all_data['station_id'] = all_data.index

# Statistic name = the text after the LAST underscore, e.g. 'BuHt_std' -> 'std'
# and 'BuCCD_mean_std' -> 'std'. Taking the second '_'-separated token instead
# would label every 'BuCCD_mean_*' parameter as 'mean'. Names without an
# underscore (e.g. 'BuAdj') yield NaN.
results_df['stats'] = results_df['param'].str.extract(r'_([^_]+)$', expand=False)

In [135]:
# Step 2: Upper panel -- one point per parameter, regression gradient against R-squared.
# Colour encodes the category and shape the statistic; clicking a point selects
# its parameter for the panel below.
category_colors = alt.Scale(scheme='category10')  # Use a predefined Altair color scheme

scatter_plot = alt.Chart(results_df).mark_point(size=100).encode(
    x=alt.X('gradient:Q', title='Gradient'),
    y=alt.Y('r_squared:Q', title='R-squared'),
    # Pass the colour scale explicitly so category_colors actually takes effect.
    color=alt.condition(selection1, 'Category:N', alt.value('lightgray'), scale=category_colors),
    shape=alt.Shape('stats:N', title='Statistic'),
    opacity=alt.condition(selection2, alt.value(1), alt.value(0.2)),
    tooltip=['param', 'Category', 'gradient', 'r_squared']
).add_params(
    selection, selection1, selection2
).properties(
    title='Gradient vs R-squared',
    width=400,
    height=300
).interactive()

# Step 3: Lower panel -- observed points, station labels and fitted line for the
# selected parameter. Both axes show standardised values; the temperature is
# the snapshot taken on 22 August at hour 4 (UTC column).
points = alt.Chart(all_data).transform_filter(
    selection
).mark_point().encode(
    x=alt.X('x:Q', title='X'),
    y=alt.Y('y:Q', title='Standardised air temperature (22 Aug, 04:00 UTC)'),
    tooltip=['x', 'y']
)

# Create the text labels for the station IDs
text_labels = alt.Chart(all_data).transform_filter(
    selection
).mark_text(
    align='left',
    baseline='middle',
    dx=5,  # Slightly offset the text so it doesn't overlap the point
).encode(
    x='x:Q',
    y='y:Q',
    text='station_id:N'  # Use station_id as the label
)

# Fitted regression line for the selected parameter
line = alt.Chart(all_data).transform_filter(
    selection
).mark_line(color='red').encode(
    x='x:Q',
    y='y_pred:Q'
)

regression_plot = (points + line + text_labels).properties(
    title='Linear Regression Plot',
    width=400,
    height=300
)

# Stack the two panels vertically
final_chart = alt.vconcat(scatter_plot, regression_plot)
final_chart.show()