In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Build a synthetic "climate" dataset in which some columns are deliberately
# correlated with others.

# Seeded generator, so every run produces exactly the same draws
rng = np.random.default_rng(seed=42)

# Number of observations drawn for each variable
n_samples = 1000

# NOTE: all draws come from the same generator, so the order of the calls
# below determines the values; reordering them changes the dataset.
temperature = rng.normal(15, 10, n_samples)  # normal, mean 15°C, std 10°C
humidity = rng.normal(75, 15, n_samples)     # normal, mean 75%, std 15%

# Precipitation follows humidity: half the humidity plus zero-mean noise (std 10)
_precipitation_noise = rng.normal(0, 10, n_samples)
precipitation = 0.5 * humidity + _precipitation_noise

wind_speed = rng.normal(10, 5, n_samples)    # normal, mean 10 m/s, std 5 m/s

# Solar radiation follows temperature: 0.8 x temperature plus zero-mean noise (std 5)
_radiation_noise = rng.normal(0, 5, n_samples)
solar_radiation = 0.8 * temperature + _radiation_noise

# Assemble the labelled columns (in display order) into a single DataFrame
_labelled_columns = [
    ('Temperature (°C)', temperature),
    ('Humidity (%)', humidity),
    ('Precipitation (mm)', precipitation),
    ('Wind Speed (m/s)', wind_speed),
    ('Solar Radiation (W/m2)', solar_radiation),
]
climate_data = pd.DataFrame(dict(_labelled_columns))

# Preview the first rows
climate_data.head()

In [None]:
# Compute the *correlation matrix*: the pairwise correlation between every
# pair of columns (pandas' default method is Pearson).
correlation_matrix = climate_data.corr()

# Display the matrix with a diverging colour gradient.
# The colour scale is pinned to the full correlation range [-1, 1] so that 0
# sits at the neutral midpoint of 'coolwarm' and colours are comparable across
# the whole table. Without vmin/vmax the gradient is rescaled per column, so a
# column's smallest value (even one close to 0) would be drawn as deep blue.
# Values are shown with two decimals for readability.
(
    correlation_matrix.style
    .background_gradient(cmap='coolwarm', vmin=-1, vmax=1)
    .format('{:.2f}')
)

Which feature(s) might we remove given these correlations?

In [None]:
# Exercise: draw a scatter plot of two *different* variables that the correlation matrix shows to be highly correlated (do not plot a variable against itself).