# Linear Regression - 150m

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
from shapely.wkt import loads
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Use the VegaFusion data transformer for every Altair chart built in this notebook.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [3]:
from IPython.core.display import HTML

# Stylesheet that widens the `.container` element to 90% of the window
# (capped at 2000px) and centres it.
# NOTE(review): presumably `.container` is the classic Notebook page wrapper —
# confirm it has any effect in the front end actually used.
wide_layout_css = """
    <style>
        .container {
            width: 90% !important;  /* Set width to 90% of the browser window, adjust as needed */
            max-width: 2000px !important;  /* Optional: limit the max width */
            margin: 0 auto;  /* Center the content */
        }
    </style>
"""

# Last expression of the cell, so the HTML object is rendered as the cell output.
HTML(wide_layout_css)

### Select night-time temperatures in July

In [4]:
# Load the gap-filled station observations, parse the timestamp column and
# keep only the rows whose `variable` is 'Ta_deg_C'.
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine.
temp = (
    pd.read_csv('/Users/lisawink/Documents/paper1/data/gap_filled_data_ta_rh.csv')
    .assign(datetime_UTC=lambda frame: pd.to_datetime(frame['datetime_UTC']))
    .loc[lambda frame: frame['variable'].eq('Ta_deg_C')]
)

In [5]:
# Keep readings whose hour is 23 or 0-5 and whose month is July, then average
# the numeric columns per station.
# NOTE(review): the hour is read from `datetime_UTC`, so the 23:00-06:00 window
# is in UTC rather than local time — confirm this is the intended "night".
timestamps = temp['datetime_UTC'].dt
is_night = timestamps.hour.ge(23) | timestamps.hour.lt(6)
is_july = timestamps.month.eq(7)
temp = temp[is_night & is_july]
temp_result = temp.groupby('station_id').mean(numeric_only=True)

In [6]:
# Show the per-station mean computed in the previous cell (one `value` per station_id).
print(temp_result)

                value
station_id           
FRASHA      17.384516
FRBETZ      18.499631
FRBRUH      18.681705
FRDIET      16.922488
FRDREI      18.395069
FREBNE      15.556083
FREICH      16.675346
FRFRIE      18.341014
FRGART      18.357788
FRGLBA      16.420968
FRGUNT      16.746728
FRHAID      18.344839
FRHBHF      19.055853
FRHERD      19.007650
FRHOCH      16.778479
FRHOLZ      17.871567
FRIHOC      17.191336
FRINST      19.600138
FRKART      19.692673
FRLAND      17.699447
FRLORE      17.066313
FRMERZ      16.635161
FRMESS      18.795806
FROPFS      16.558940
FROWIE      18.364977
FRPDAS      18.493917
FRRIES      18.808387
FRSEEP      18.413134
FRSTGA      18.632488
FRSTGE      18.516221
FRSTUH      18.801060
FRTIEN      16.376590
FRUNIK      19.291567
FRUWIE      18.135991
FRVAUB      17.823272
FRWAHS      15.966452
FRWEIN      18.648525
FRWILD      17.125023
FRWITT      18.801705
FRWSEE      17.621198
FRZAHR      18.148940


In [7]:
# Load the processed station parameters, exclude station 'FRTECH', and index
# the rows by station_id (the station_id column itself is kept as well).
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine.
params = gpd.read_parquet('/Users/lisawink/Documents/paper1/data/processed_data/processed_station_params_150.parquet')
params = params.loc[params['station_id'].ne('FRTECH')]
params = params.set_index('station_id', drop=False)

In [8]:
# Columns of `params` that are used as candidate predictors.
# Each name is "<metric prefix>_<aggregate>" (count / sum / mean / std); 'BuAdj'
# and 'BuIBD' carry no aggregate suffix. The text before the first underscore is
# the key used later to look up the metric's category.
# NOTE(review): presumably Bu* = building, Cy* = courtyard and Str* = street
# metrics, and the 400 / 1200 suffixes are network radii — confirm against the
# code that produced the parquet file.
items = [
    'BuAre_count', 'BuAre_sum', 'BuAre_mean', 'BuAre_std', 
    'BuHt_mean', 'BuHt_std', 
    'BuPer_mean', 'BuPer_std', 
    'BuLAL_mean', 'BuLAL_std', 
    'BuCCD_mean_mean', 'BuCCD_mean_std', 
    'BuCor_mean', 'BuCor_std', 
    'CyAre_sum', 'CyAre_mean', 'CyAre_std', 
    'CyInd_mean', 'CyInd_std', 
    'BuCWA_mean', 'BuCWA_std', 
    'BuCon_mean', 'BuCon_std', 
    'BuElo_mean', 'BuElo_std', 
    'BuERI_mean', 'BuERI_std', 
    'BuFR_mean', 'BuFR_std', 
    'BuFF_mean', 'BuFF_std', 
    'BuFD_mean', 'BuFD_std', 
    'BuRec_mean', 'BuRec_std', 
    'BuShI_mean', 'BuShI_std', 
    'BuSqC_mean', 'BuSqC_std', 
    'BuSqu_mean', 'BuSqu_std', 
    'BuAdj', 
    'BuIBD', 
    'BuSWR_mean', 'BuSWR_std', 
    'BuOri_mean', 'BuOri_std', 
    'BuAli_mean', 'BuAli_std', 
    'StrAli_mean', 'StrAli_std', 
    'StrW_mean', 'StrW_std', 
    'StrWD_mean', 'StrWD_std', 
    'StrOpe_mean', 'StrOpe_std', 
    'StrHW_mean', 'StrHW_std', 
    'StrLen_mean', 'StrLen_std', 
    'StrCNS_mean', 'StrCNS_std', 
    'BpM_mean', 'BpM_std', 
    'StrLin_mean', 'StrLin_std', 
    'StrClo400_mean', 'StrClo400_std', 
    'StrBet400_mean', 'StrBet400_std', 
    'StrSCl_mean', 'StrSCl_std', 
    'StrCyc400_mean', 'StrCyc400_std', 
    'StrENR400_mean', 'StrENR400_std', 
    'StrGam400_mean', 'StrGam400_std', 
    'StrDeg_mean', 'StrDeg_std', 
    'StrMes400_mean', 'StrMes400_std',
    'StrClo1200_mean', 'StrClo1200_std', 
    'StrBet1200_mean', 'StrBet1200_std',  
    'StrCyc1200_mean', 'StrCyc1200_std', 
    'StrENR1200_mean', 'StrENR1200_std', 
    'StrGam1200_mean', 'StrGam1200_std', 
    'StrMes1200_mean', 'StrMes1200_std'
]

In [9]:
# A predictor column is discarded when more than this many stations lack a value.
MAX_MISSING_STATIONS = 7

# Keep only the predictor columns and attach each station's mean temperature
# (`value`); the inner join keeps stations present in both tables.
params = params[items].merge(temp_result, on='station_id', how='inner')

# Standardise every column, the `value` target included.
scaler = StandardScaler()
params_scaled = pd.DataFrame(
    scaler.fit_transform(params),
    index=params.index,
    columns=params.columns,
)

# First drop sparse columns (fewer than n - MAX_MISSING_STATIONS non-null
# entries), then drop every station that still has a missing value.
# The scaling statistics above were computed before these rows were removed.
min_non_null = len(params_scaled) - MAX_MISSING_STATIONS
params_scaled = params_scaled.dropna(axis='columns', thresh=min_non_null).dropna()

In [10]:
# Lookup table: metric prefix (the text before the first underscore) -> category label
prefix_to_category = {
    'BuAre': 'Dimension',
    'BuHt': 'Dimension',
    'BuPer': 'Dimension',
    'BuLAL': 'Dimension',
    'BuCCD': 'Dimension',
    'BuCor': 'Dimension',
    'CyAre': 'Dimension',
    'CyInd': 'Dimension',
    'BuCWA': 'Shape',
    'BuCon': 'Shape',
    'BuElo': 'Shape',
    'BuERI': 'Shape',
    'BuFR': 'Shape',
    'BuFF': 'Shape',
    'BuFD': 'Shape',
    'BuRec': 'Shape',
    'BuShI': 'Shape',
    'BuSqC': 'Shape',
    'BuSqu': 'Shape',
    'BuAdj': 'Distribution',
    'BuIBD': 'Distribution',
    'BuSWR': 'Distribution',
    'BuOri': 'Orientation',
    'BuAli': 'Orientation',
    'StrAli': 'Orientation',
    'StrW': 'Distribution',
    'StrWD': 'Distribution',
    'StrOpe': 'Distribution',
    'StrHW': 'Distribution',
    'StrLen': 'Dimension',
    'StrCNS': 'Dimension',
    'BpM': 'Intensity',
    'StrLin': 'Shape',
    'StrClo400': 'Connectivity',
    'StrBet400': 'Connectivity',
    'StrSCl': 'Connectivity',
    'StrCyc400': 'Connectivity',
    'StrENR400': 'Connectivity',
    'StrGam400': 'Connectivity',
    'StrDeg': 'Connectivity',
    'StrMes400': 'Connectivity',
    'StrClo1200': 'Connectivity',
    'StrBet1200': 'Connectivity',
    'StrCyc1200': 'Connectivity',
    'StrENR1200': 'Connectivity',
    'StrGam1200': 'Connectivity',
    'StrMes1200': 'Connectivity',
}

# Prefix of every entry in `items`, in the same order (repeats are kept,
# so this list is as long as `items`).
unique_prefixes = [name.partition('_')[0] for name in items]

# One record per item: its category ('Unknown' if the prefix is not mapped)
# together with the full column name.
categories_data = [
    {'Category': prefix_to_category.get(prefix, 'Unknown'), 'Abbrev.': name}
    for name, prefix in zip(items, unique_prefixes)
]

print(categories_data)


[{'Category': 'Dimension', 'Abbrev.': 'BuAre_count'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_sum'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuAre_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuHt_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuPer_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuLAL_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuLAL_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuCCD_mean_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuCCD_mean_std'}, {'Category': 'Dimension', 'Abbrev.': 'BuCor_mean'}, {'Category': 'Dimension', 'Abbrev.': 'BuCor_std'}, {'Category': 'Dimension', 'Abbrev.': 'CyAre_sum'}, {'Category': 'Dimension', 'Abbrev.': 'CyAre_mean'}, {'Category': 'Dimension', 'Abbrev.': 'CyAre_std'}, {'Category': 'Dimension', 'Abbrev.': 'CyInd_mean'}, {'Category': 'Dimension', 'Abbrev.': 'CyInd_std'}, {'Category':

In [11]:
# Tabular form of the records: columns 'Category' and 'Abbrev.' (the full column name).
categories_df = pd.DataFrame(categories_data)

In [12]:
# Step 1: Fit one single-predictor linear regression per parameter.
# Predictor and target both come from `params_scaled`, so gradients are in
# standardised units.
y = params_scaled['value']  # target is the same for every fit, so read it once

# Select predictors by name instead of relying on 'value' being the last column.
predictor_columns = params_scaled.columns.drop('value')

results = []
for param in predictor_columns:
    X = params_scaled[[param]]  # scikit-learn expects a 2-D feature matrix

    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Metrics
    gradient = model.coef_[0]
    r_squared = r2_score(y, y_pred)

    # Observed and fitted values for this parameter, tagged with its name so
    # the detail chart can filter on it.
    data = pd.DataFrame({'x': X[param], 'y': y, 'y_pred': y_pred})
    data['param'] = param
    results.append({'param': param, 'gradient': gradient, 'r_squared': r_squared, 'data': data})

# Summary table: one row per parameter, with its category attached via the
# full column name ('Abbrev.').
results_df = pd.DataFrame(results).drop(columns=['data'])
results_df = results_df.merge(categories_df, how='left', left_on='param', right_on='Abbrev.')

# Stack the per-parameter frames for lookup; the index is the station index of
# `params_scaled`, repeated once per parameter.
all_data = pd.concat([res['data'] for res in results])

# Step 2: Altair selection shared by the charts.
# `selection_point` replaces `selection_single`, deprecated since altair 5.0.0;
# `empty=False` is the equivalent of the old `empty='none'` (nothing is
# selected until a point is clicked).
selection = alt.selection_point(fields=['param'], empty=False)  # Selection on param

Deprecated since `altair=5.0.0`. Use selection_point instead.
  selection = alt.selection_single(fields=['param'], empty='none')  # Selection on param


In [22]:
# Copy the index into a regular `station_id` column so charts can reference it as a field.
all_data = all_data.assign(station_id=all_data.index)

In [24]:
# Inspect the stacked regression data: x, y, y_pred, param and station_id per row.
all_data

Unnamed: 0_level_0,x,y,y_pred,param,station_id
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FRASHA,-0.151218,-1.004757,0.199994,BuAre_count,FRASHA
FRBETZ,0.655708,0.371544,0.103867,BuAre_count,FRBETZ
FRDREI,-0.745796,0.242491,0.270825,BuAre_count,FRDREI
FRGART,1.972272,0.196478,-0.052973,BuAre_count,FRGART
FRGUNT,0.422124,-1.791930,0.131693,BuAre_count,FRGUNT
...,...,...,...,...,...
FRUWIE,-0.134762,-0.077269,0.162250,StrMes1200_std,FRUWIE
FRVAUB,0.139269,-0.463234,0.125439,StrMes1200_std,FRVAUB
FRWEIN,-0.398423,0.555312,0.197668,StrMes1200_std,FRWEIN
FRWSEE,-0.484781,-0.712638,0.209269,StrMes1200_std,FRWSEE


In [25]:
# Step 3: Summary scatter plot — one point per parameter, coloured by category.
category_colors = alt.Scale(scheme='category10')  # Use a predefined Altair color scheme

scatter_plot = alt.Chart(results_df).mark_point(size=100).encode(
    x=alt.X('gradient:Q', title='Gradient'),
    y=alt.Y('r_squared:Q', title='R-squared'),
    color=alt.Color('Category:N', scale=category_colors, title='Category'),
    tooltip=['param', 'Category', 'gradient', 'r_squared']
).add_params(
    # `add_params` replaces `add_selection`, deprecated since altair 5.0.0.
    selection
).properties(
    title='Gradient vs R-squared',
    width=400,
    height=300
)

# Step 4: Detail plot for the parameter picked in the scatter plot.
# All three layers share the same data and the same selection filter.
selected_data = alt.Chart(all_data).transform_filter(selection)

# Observed values. `y` is the standardised temperature from `params_scaled`,
# not degrees Celsius, so the axis title says so.
points = selected_data.mark_point().encode(
    x=alt.X('x:Q', title='X'),
    y=alt.Y('y:Q', title='Average July Night Time Temperature (standardised)'),
    tooltip=['x', 'y']
)

# Create the text labels for the station IDs
text_labels = selected_data.mark_text(
    align='left',
    baseline='middle',
    dx=5,  # Slightly offset the text so it doesn't overlap the point
).encode(
    x='x:Q',
    y='y:Q',
    text='station_id:N'  # Use station_id as the label
)

# Fitted regression line
line = selected_data.mark_line(color='red').encode(
    x='x:Q',
    y='y_pred:Q'
)

regression_plot = (points + line + text_labels).properties(
    title='Linear Regression Plot',
    width=400,
    height=300
)

# Stack the summary plot above the detail plot
final_chart = alt.vconcat(scatter_plot, regression_plot)
final_chart.show()

Deprecated since `altair=5.0.0`. Use add_params instead.
  ).add_selection(
