### Install necessary libraries

In [None]:
pip install osmnx geopandas pydeck seaborn

### Visualizing buildings from Casablanca

### Given real data we can use the distribution to simulate other data

In [None]:
import osmnx as ox
import geopandas as gpd
import pydeck as pdk
import numpy as np
import pandas as pd

# Fetch building footprints for Casablanca
city_name = "Casablanca, Morocco"
gdf = ox.features_from_place(city_name, tags={"building": True})
# Keep plain polygons only (the footprint extraction below reads a single
# exterior ring). Take an explicit copy so the column assignments that follow
# write to an independent frame rather than to a filtered selection, which
# would make pandas emit SettingWithCopyWarning.
gdf = gdf[gdf.geometry.geom_type == "Polygon"].copy()

# Add random data for CO₂, water, and energy usage
n_buildings = len(gdf)
gdf["CO2_Usage"] = np.random.uniform(50, 500, n_buildings)  # CO₂ usage in kg/month
gdf["Water_Usage"] = np.random.uniform(1000, 10000, n_buildings)  # Water usage in liters/month
gdf["Energy_Consumption"] = np.random.uniform(500, 5000, n_buildings)  # Energy consumption in kWh/month

# Assign random heights (since OSM may not have height info)
gdf["height"] = np.random.uniform(10, 100, n_buildings)
# Each footprint is a list holding one ring: the exterior coordinates as lists
gdf["footprint"] = gdf["geometry"].apply(lambda geom: [[list(coord) for coord in geom.exterior.coords]])
# Dynamic color scaling based on CO₂ usage
def get_color(co2_usage, co2_min=None, co2_max=None):
    """Map a CO₂ value to an RGBA colour on a green-to-red gradient.

    Args:
        co2_usage: CO₂ value of a single building.
        co2_min: Lower end of the colour scale. When omitted, the minimum of
            ``gdf["CO2_Usage"]`` is used.
        co2_max: Upper end of the colour scale. When omitted, the maximum of
            ``gdf["CO2_Usage"]`` is used.

    Returns:
        ``[red, green, 0, 200]``: red grows and green shrinks as the value
        moves from ``co2_min`` towards ``co2_max``.
    """
    if co2_min is None:
        co2_min = gdf["CO2_Usage"].min()
    if co2_max is None:
        co2_max = gdf["CO2_Usage"].max()
    span = co2_max - co2_min
    # A zero span (every building has the same value) would divide by zero;
    # place such values at the low end of the gradient instead.
    normalized_co2 = (co2_usage - co2_min) / span if span else 0.0
    return [int(255 * normalized_co2), int(255 * (1 - normalized_co2)), 0, 200]  # Red to Green gradient


# Compute the scale bounds once: looking them up inside the per-row callback
# would rescan the whole column for every building.
co2_lo, co2_hi = gdf["CO2_Usage"].min(), gdf["CO2_Usage"].max()
gdf["color"] = gdf["CO2_Usage"].apply(get_color, co2_min=co2_lo, co2_max=co2_hi)

# Convert to DataFrame for pydeck (keeps only the columns the layer and tooltip use)
data = gdf[["footprint", "height", "CO2_Usage", "Water_Usage", "Energy_Consumption", "color"]].reset_index()

# Create a pydeck layer: one extruded polygon per building
building_layer = pdk.Layer(
    "PolygonLayer",
    data,
    get_polygon="footprint",
    get_elevation="height",
    elevation_scale=1,
    get_fill_color="color",  # Color based on CO₂
    extruded=True,
    pickable=True,
    auto_highlight=True,
)

# Set the view state: centre the camera on the mean building centroid.
# The centroids are computed once and reused for both coordinates.
centroids = gdf.geometry.centroid
view_state = pdk.ViewState(
    latitude=centroids.y.mean(),
    longitude=centroids.x.mean(),
    zoom=15,
    pitch=50,
)

# Create the deck
deck = pdk.Deck(
    layers=[building_layer],
    initial_view_state=view_state,
    tooltip={"text": "Height: {height}m\nCO₂: {CO2_Usage}kg\nWater: {Water_Usage}L\nEnergy: {Energy_Consumption}kWh"}
)

# Show the deck
deck.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load real data from your dataset
kpi_names = ["CO2_Usage", "Water_Usage", "Energy_Consumption"]
# Take an independent copy: later cells add columns to real_data, and doing
# that on a selection of gdf makes pandas emit SettingWithCopyWarning.
real_data = gdf[kpi_names].copy()

# Compute mean vector and covariance matrix dynamically
mean_vector = real_data.mean().values
cov_matrix = real_data.cov().values


# Generate synthetic data from a multivariate normal with the same moments
num_samples = 100
simulated_data = np.random.multivariate_normal(mean_vector, cov_matrix, num_samples)

# Convert to DataFrame
df_simulated = pd.DataFrame(simulated_data, columns=kpi_names)

# Ensure no negative values
df_simulated = df_simulated.clip(lower=0)

# Plot histograms of real vs. simulated data.
# Densities are used instead of raw counts because the simulated sample
# (num_samples rows) and the real dataset need not have the same size, so
# their counts are not directly comparable.
fig, axes = plt.subplots(1, 3, figsize=(8, 3))

for i, col in enumerate(kpi_names):
    sns.histplot(real_data[col], bins=30, kde=True, stat="density", color='blue', label="Real", ax=axes[i])
    sns.histplot(df_simulated[col], bins=30, kde=True, stat="density", color='red', label="Simulated", ax=axes[i], alpha=0.6)
    axes[i].set_title(col)
    axes[i].legend()

plt.tight_layout()
plt.show()

# Scatter plot to see relationships.
# The two frames carry unrelated indexes (real_data keeps the index of gdf,
# df_simulated has a default one), so a fresh index is built for the union.
sns.pairplot(
    pd.concat(
        [real_data.assign(Data="Real"), df_simulated.assign(Data="Simulated")],
        ignore_index=True,
    ),
    hue="Data", diag_kind="kde", markers=["o", "s"])
plt.show()


In [None]:
# Ensure all values are finite and numeric.
# dropna() on its own keeps +/-inf, so infinities are turned into NaN first.
df_simulated_clean = (
    df_simulated.select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Compute the correlation matrix
correlation_matrix_clean = df_simulated_clean.corr()

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix_clean, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix of Environmental Attributes")
plt.show()

# Pairplot for visualizing distributions and relationships
sns.pairplot(df_simulated_clean, diag_kind='kde', plot_kws={'alpha': 0.6, 's': 10})
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram with a KDE overlay per KPI, laid out side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, col in zip(axes, ["CO2_Usage", "Water_Usage", "Energy_Consumption"]):
    sns.histplot(real_data[col], bins=30, kde=True, ax=ax, color="blue")
    ax.set_title(f"Distribution of {col}")

fig.tight_layout()
plt.show()


### Some other plots

In [None]:
import scipy.stats as stats

# Normal QQ-plot of each KPI, one panel per column
qq_columns = ("CO2_Usage", "Water_Usage", "Energy_Consumption")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for position, column in enumerate(qq_columns):
    panel = axes[position]
    stats.probplot(real_data[column], dist="norm", plot=panel)
    panel.set_title(f"QQ-Plot of {column}")

plt.tight_layout()
plt.show()


### Or if we know the correlations between the 3 elements

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fix NumPy's global seed so the draws below are repeatable
np.random.seed(42)

# Feature order used by the mean vector, the covariance matrix and the columns
feature_names = ["CO2_Usage", "Water_Usage", "Energy_Consumption"]

# Assumed mean of each feature, in feature_names order
mean_vector = np.array([300, 5000, 2500])

# Assumed covariance matrix (rows and columns in feature_names order):
# variances on the diagonal, pairwise covariances off the diagonal
cov_matrix = np.array([
    [5000, 20000, 10000],
    [20000, 100000, 30000],
    [10000, 30000, 40000],
])

# Draw 1000 correlated samples from the multivariate normal
num_samples = 1000
simulated_data = np.random.multivariate_normal(mean_vector, cov_matrix, size=num_samples)

# Wrap the draws in a DataFrame and clip negative values to zero
df_simulated = pd.DataFrame(simulated_data, columns=feature_names).clip(lower=0)

# Preview the first rows
print(df_simulated.head())

# Correlations actually observed in the simulated sample
correlation_matrix = df_simulated.corr()

# Heatmap of those correlations
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix of Environmental Attributes")
plt.show()

# Pairwise scatter plots with KDE curves on the diagonal
sns.pairplot(df_simulated, diag_kind="kde", plot_kws={"alpha": 0.6, "s": 10})
plt.show()


# Mahalanobis-distance-based classification

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis
import plotly.express as px
import plotly.graph_objects as go


# Reference ("optimal") point: the smallest observed value of each KPI
kpi_features = ["CO2_Usage", "Water_Usage", "Energy_Consumption"]
optimal_point = np.array([real_data[name].min() for name in kpi_features])

# Covariance of the three KPIs and its inverse (needed by the Mahalanobis metric)
kpi_frame = real_data[kpi_features]
cov_matrix = kpi_frame.cov().values
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Thin wrapper around SciPy's Mahalanobis distance
def mahalanobis_distance(row, optimal_point, inv_cov_matrix):
    """Return the Mahalanobis distance between ``row`` and ``optimal_point``.

    Args:
        row: 1-D vector of values for one observation.
        optimal_point: 1-D reference vector of the same length.
        inv_cov_matrix: Inverse covariance matrix, handed to SciPy as ``VI``.

    Returns:
        The distance as computed by ``scipy.spatial.distance.mahalanobis``.
    """
    distance = mahalanobis(u=row, v=optimal_point, VI=inv_cov_matrix)
    return distance

# Apply the Mahalanobis distance calculation to each building
real_data["Mahalanobis_Distance"] = real_data.apply(
    lambda row: mahalanobis_distance(
        row[["CO2_Usage", "Water_Usage", "Energy_Consumption"]],
        optimal_point,
        inv_cov_matrix
    ),
    axis=1
)

# Assign classes based on Mahalanobis distance:
# six equal-width distance bands, A = closest to the optimal point
num_classes = 6  # A, B, C, D, E, F
real_data["Global_Class"] = pd.cut(
    real_data["Mahalanobis_Distance"],
    bins=num_classes,
    labels=['A', 'B', 'C', 'D', 'E', 'F']
)

# Define class colors
class_colors = {
    'A': 'green', 'B': 'lightgreen', 'C': 'yellow',
    'D': 'orange', 'E': 'red', 'F': 'darkred'
}

# Axis labels, shared by the scatter and the scene layout.
# Units follow the synthetic data generated at the top of the notebook
# (CO₂ in kg, water in liters, energy in kWh); the previous tCO2e / m³
# labels did not match it.
axis_labels = {
    'CO2_Usage': 'CO₂ Usage (kg)',
    'Water_Usage': 'Water Usage (L)',
    'Energy_Consumption': 'Energy Consumption (kWh)',
}

# Create the 3D scatter plot
fig = px.scatter_3d(
    real_data,
    x='CO2_Usage',
    y='Water_Usage',
    z='Energy_Consumption',
    color='Global_Class',
    color_discrete_map=class_colors,
    labels={**axis_labels, 'Global_Class': 'Class'},
    title="3D Classification of Buildings Based on Mahalanobis Distance",
    hover_data={
        'CO2_Usage': ':.2f',
        'Water_Usage': ':.2f',
        'Energy_Consumption': ':.2f',
        'Mahalanobis_Distance': ':.2f',
        'Global_Class': True
    }
)

# Update marker size and opacity
fig.update_traces(marker=dict(size=6, opacity=0.8))

# Add the optimal point as a marker
fig.add_trace(go.Scatter3d(
    x=[optimal_point[0]],
    y=[optimal_point[1]],
    z=[optimal_point[2]],
    mode='markers',
    marker=dict(size=10, color='blue', symbol='diamond'),
    name='Optimal Point'
))

# Update layout (update_layout returns the figure, so as the last expression
# of the cell it is also what gets displayed)
fig.update_layout(
    scene=dict(
        xaxis_title=axis_labels['CO2_Usage'],
        yaxis_title=axis_labels['Water_Usage'],
        zaxis_title=axis_labels['Energy_Consumption']
    ),
    legend_title_text='Class',
    margin=dict(l=0, r=0, b=0, t=40)
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Percentile-based grading of a single distance
def classify_building(distance, percentiles):
    """Grade one distance from "A" to "F" against percentile cut-offs.

    Args:
        distance: Value to grade.
        percentiles: Lookup keyed by 0.1, 0.3, 0.6, 0.8 and 0.9 (for example
            the Series returned by ``quantile``) giving the cut-off values.

    Returns:
        "A" when ``distance`` is at or below the 0.1 cut-off, then "B", "C",
        "D", "E" for the 0.3, 0.6, 0.8 and 0.9 cut-offs, and "F" otherwise.
    """
    grade_by_quantile = ((0.1, "A"), (0.3, "B"), (0.6, "C"), (0.8, "D"), (0.9, "E"))
    for quantile, grade in grade_by_quantile:
        # Cut-offs are checked from the smallest up; the first one reached wins
        if distance <= percentiles[quantile]:
            return grade
    return "F"

# Cut-offs: the 10th, 30th, 60th, 80th and 90th percentiles of the distances
distance_column = real_data["Mahalanobis_Distance"]
percentiles = distance_column.quantile([0.1, 0.3, 0.6, 0.8, 0.9])

# Grade every building against those cut-offs
real_data["Class"] = distance_column.apply(classify_building, percentiles=percentiles)



In [None]:
# Define class boundaries based on percentiles
percentiles = real_data["Mahalanobis_Distance"].quantile([0.1, 0.3, 0.6, 0.8, 0.9])

# Define a color map for classes
class_colors = {
    "A": "green",
    "B": "lightgreen",
    "C": "yellow",
    "D": "orange",
    "E": "red",
    "F": "darkred",
}

# Assign colors to each data point based on class
real_data["class_color"] = real_data["Class"].map(class_colors)

# Plot histogram with colored bars
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(
    data=real_data,
    x="Mahalanobis_Distance",
    bins=30,
    kde=True,
    hue="Class",
    hue_order=list(class_colors),  # keep the legend and stacking in A-F order
    palette=class_colors,
    multiple="stack",  # Stack bars for each class
)
plt.title("Distribution of Mahalanobis Distances ")
plt.xlabel("Mahalanobis Distance")
plt.ylabel("Frequency")
# seaborn has already drawn the hue legend. Calling plt.legend() here would
# replace it with an empty one because the bars carry no labels, so only the
# title of the existing legend is set.
hist_legend = hist_ax.get_legend()
if hist_legend is not None:
    hist_legend.set_title("Class")
plt.show()

In [None]:
# Pairplot for feature relationships
sns.pairplot(real_data, vars=["CO2_Usage", "Water_Usage", "Energy_Consumption"], hue="Class", palette=class_colors)
plt.suptitle("Pairplot of Features Colored by Class", y=1.02)
plt.show()

# 3D Scatter Plot
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
scatter = ax.scatter(
    real_data["CO2_Usage"],
    real_data["Water_Usage"],
    real_data["Energy_Consumption"],
    c=real_data["class_color"],
    s=50,
)
ax.set_xlabel("CO2 Usage")
ax.set_ylabel("Water Usage")
ax.set_zlabel("Energy Consumption")
plt.title("3D Scatter Plot of Buildings Colored by Class")
# scatter.legend_elements() only produces handles for colour-mapped numeric
# data; these points are coloured with explicit colour names, so it returns
# nothing. Build one proxy marker per class instead.
legend_handles = [
    Line2D([], [], marker="o", linestyle="", color=color, label=label)
    for label, color in class_colors.items()
]
plt.legend(handles=legend_handles)
plt.show()

# PCA Analysis

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -- 1. Normalize the KPIs (min-max scaling into "<name>_norm" columns) --
kpi_columns = ["CO2_Usage", "Water_Usage", "Energy_Consumption"]
norm_columns = [f"{col}_norm" for col in kpi_columns]
scaler = MinMaxScaler()
real_data[norm_columns] = scaler.fit_transform(real_data[kpi_columns])

# -- 2. Compute PCA on normalized KPIs (1 component) --
pca = PCA(n_components=1)
# fit_transform returns an (n_samples, 1) array; flatten it so a plain 1-D
# column is stored.
real_data["PC1"] = pca.fit_transform(real_data[norm_columns]).ravel()

# -- 3. Orient PC1 so that LOWER KPI => LOWER PC1 --
# The sign of a principal component is arbitrary. The classification below
# treats a low PC1 as good, so PC1 has to grow with the KPIs, i.e. be
# positively correlated with them. CO2_Usage_norm is used as the reference:
# when the correlation is negative, PC1 is negated.
corr_CO2 = np.corrcoef(real_data["PC1"], real_data["CO2_Usage_norm"])[0, 1]
if corr_CO2 < 0:
    real_data["PC1"] = -real_data["PC1"]

# -- 4. Classify buildings so that the LOWEST PC1 is Class A (best) --
# Percentile bands: lowest 10% = A, next 20% = B, next 30% = C,
# next 20% = D, next 10% = E, highest 10% = F.
def classify_pca(score, percentiles):
    """Grade one PC1 score from "A" (lowest band) to "F" (highest band).

    Args:
        score: PC1 value to grade.
        percentiles: Lookup keyed by 0.1, 0.3, 0.6, 0.8 and 0.9 giving the
            cut-off value of each quantile.

    Returns:
        The letter of the first cut-off that ``score`` does not exceed, or
        "F" when it exceeds all of them.
    """
    bands = zip((0.1, 0.3, 0.6, 0.8, 0.9), "ABCDE")
    return next(
        (grade for quantile, grade in bands if score <= percentiles[quantile]),
        "F",
    )

# Cut-offs of the PC1 distribution, then one grade per building
percentiles = real_data["PC1"].quantile([0.1, 0.3, 0.6, 0.8, 0.9])
real_data["PCA_Class"] = real_data["PC1"].apply(lambda x: classify_pca(x, percentiles))

# -- 5. Plot histogram of PC1 with color-coded classes --
pca_class_colors = {
    "A": "green",
    "B": "lightgreen",
    "C": "yellow",
    "D": "orange",
    "E": "red",
    "F": "darkred",
}

plt.figure(figsize=(10, 6))
pc1_ax = sns.histplot(
    data=real_data,
    x="PC1",
    bins=30,
    kde=True,
    hue="PCA_Class",
    hue_order=list(pca_class_colors),  # keep the legend and stacking in A-F order
    palette=pca_class_colors,
    multiple="stack"
)
plt.title("Distribution of PC1 Scores with PCA Class Coloring\n(Lower PC1 = Better Performance)")
plt.xlabel("PC1 Score")
plt.ylabel("Frequency")
# seaborn has already drawn the hue legend. Calling plt.legend() here would
# replace it with an empty one because the bars carry no labels, so only the
# title of the existing legend is changed.
pc1_legend = pc1_ax.get_legend()
if pc1_legend is not None:
    pc1_legend.set_title("PCA Class")
plt.show()


In [None]:
# Bar chart: number of buildings in each PCA class, in A-to-F order
plt.figure(figsize=(8, 6))
class_order = ["A", "B", "C", "D", "E", "F"]
sns.countplot(data=real_data, x="PCA_Class", order=class_order, palette="viridis").set(
    title="Distribution of PCA Classes",
    xlabel="PCA Class",
    ylabel="Number of Buildings",
)
plt.show()

In [None]:
# Pairwise KPI scatter plots, points coloured by PCA class
pairplot_options = {
    "vars": ["CO2_Usage", "Water_Usage", "Energy_Consumption"],
    "hue": "PCA_Class",
    "palette": pca_class_colors,
}
sns.pairplot(real_data, **pairplot_options)
plt.suptitle("Pairplot of KPIs Colored by PCA Class", y=1.02)
plt.show()

# Weighted-score-based classification

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------
# 1. Normalize the KPIs
# ------------------------------------
def normalize_kpis(data, kpi_columns):
    """Scale the KPI columns with MinMaxScaler into ``<name>_norm`` columns.

    The scaled columns are written into ``data`` itself.

    Args:
        data: DataFrame holding the KPI columns.
        kpi_columns: Names of the columns to scale.

    Returns:
        A ``(data, normalized_columns)`` tuple: the same DataFrame and the
        list of names of the added columns.
    """
    normalized_columns = list(map("{}_norm".format, kpi_columns))
    scaled_values = MinMaxScaler().fit_transform(data[kpi_columns])
    data[normalized_columns] = scaled_values
    return data, normalized_columns

# ------------------------------------
# 2. Assign weights & calculate global score
# ------------------------------------
def calculate_global_score(data, normalized_columns, weights):
    """Add a ``Global_Score`` column: the weighted sum of the normalized KPIs.

    A lower score means better performance.

    Args:
        data: DataFrame containing ``normalized_columns``; modified in place.
        normalized_columns: Names of the normalized KPI columns.
        weights: One weight per column, in the same order.

    Returns:
        The same DataFrame, with ``Global_Score`` added.
    """
    weighted_kpis = data[normalized_columns] * weights
    data["Global_Score"] = weighted_kpis.sum(axis=1)
    return data

# ------------------------------------
# 3. Classify buildings so that
#    LOWER Global_Score => Class A
# ------------------------------------
def classify_global_score(data, score_column):
    """Add a ``Global_Class`` column grading each score from "A" to "F".

    The cut-offs are the 10th, 30th, 60th, 80th and 90th percentiles of
    ``data[score_column]``: scores at or below the first cut-off get "A"
    (best), scores above the last one get "F" (worst).

    Args:
        data: DataFrame holding the scores; modified in place.
        score_column: Name of the column to grade.

    Returns:
        The same DataFrame, with ``Global_Class`` added.
    """
    cutoffs = data[score_column].quantile([0.1, 0.3, 0.6, 0.8, 0.9])
    # Pair each cut-off value with its letter, smallest cut-off first
    graded_cutoffs = list(zip("ABCDE", cutoffs))

    def classify_score(score):
        for grade, cutoff in graded_cutoffs:
            if score <= cutoff:
                return grade
        return "F"

    data["Global_Class"] = data[score_column].apply(classify_score)
    return data

# ------------------------------------
# 4. Visualize the results
# ------------------------------------
def plot_global_scores(data, score_column, class_column):
    """Show two histograms of the global score.

    The first is a plain histogram with a KDE curve; the second is stacked
    and coloured by class.

    Args:
        data: DataFrame holding the score and class columns.
        score_column: Name of the score column.
        class_column: Name of the class column (letters "A" to "F").
    """
    # Simple histogram of Global_Score
    plt.figure(figsize=(8, 6))
    sns.histplot(data[score_column], bins=30, kde=True, color="blue")
    plt.title("Distribution of Global Scores")
    plt.xlabel("Global Score")
    plt.ylabel("Frequency")
    plt.show()

    # Stacked histogram colored by Global_Class
    global_class_colors = {
        "A": "green",     # Best performance
        "B": "lightgreen",
        "C": "yellow",
        "D": "orange",
        "E": "red",
        "F": "darkred",   # Worst performance
    }

    plt.figure(figsize=(10, 6))
    stacked_ax = sns.histplot(
        data=data,
        x=score_column,
        bins=30,
        kde=True,
        hue=class_column,
        hue_order=list(global_class_colors),  # keep the legend and stacking in A-F order
        palette=global_class_colors,
        multiple="stack"
    )
    plt.title("Distribution of Global Scores with Global Class Coloring")
    plt.xlabel("Global Score")
    plt.ylabel("Frequency")
    # seaborn has already drawn the hue legend. Calling plt.legend() here
    # would replace it with an empty one because the bars carry no labels,
    # so only the title of the existing legend is changed.
    stacked_legend = stacked_ax.get_legend()
    if stacked_legend is not None:
        stacked_legend.set_title("Global Class")
    plt.show()

# ------------------------------------
# Main workflow example
# ------------------------------------
# Expects 'real_data' to already hold the columns CO2_Usage, Water_Usage
# and Energy_Consumption.
kpi_columns = ["CO2_Usage", "Water_Usage", "Energy_Consumption"]
weights = np.array([0.2, 0.3, 0.5])  # 20% CO2, 30% Water, 50% Energy

# Step 1: add the normalized KPI columns
real_data, normalized_columns = normalize_kpis(real_data, kpi_columns)

# Steps 2 and 3: weighted score (lower = better), then grade it
# (lowest scores => A, highest => F)
real_data = classify_global_score(
    calculate_global_score(real_data, normalized_columns, weights),
    "Global_Score",
)

# Step 4: preview the KPIs next to their score and class
preview_columns = kpi_columns + ["Global_Score", "Global_Class"]
print(real_data[preview_columns].head())

# Step 5: plot the score distributions
plot_global_scores(real_data, "Global_Score", "Global_Class")


# 3D classification with Euclidean distances

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

def plot_3d_classification(df):
    """Build a 3-D Plotly scatter of buildings graded A-F by Euclidean distance.

    The distance is the Euclidean norm of the three ``*_norm`` columns, i.e.
    the distance from the origin of the normalized KPI space. The observed
    distance range is cut into six equal-width bands (A = smallest distances,
    F = largest) and a translucent surface is drawn at the upper edge of each
    band.

    Args:
        df: DataFrame with the raw columns ``Energy_Consumption``,
            ``CO2_Usage`` and ``Water_Usage`` and their ``*_norm``
            counterparts. A ``distance`` column is added to it when missing.

    Returns:
        The Plotly figure, or ``None`` when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        print("No data available for visualization")
        return None

    # Calculate Euclidean distance (if not already in the dataframe)
    if 'distance' not in df.columns:
        df['distance'] = np.sqrt(
            df['Energy_Consumption_norm']**2 +
            df['CO2_Usage_norm']**2 +
            df['Water_Usage_norm']**2
        )

    # Sort by distance (ascending); this also yields a new frame, so the
    # class column below is not written back to the caller's DataFrame
    df = df.sort_values(by='distance')

    # Assign classes based on distance (A for best, F for worst)
    class_labels = ['A', 'B', 'C', 'D', 'E', 'F']
    num_classes = len(class_labels)
    df['Global_Class'] = pd.cut(
        df['distance'],
        bins=num_classes,
        labels=class_labels
    )

    # Define class colors
    class_colors = {
        'A': 'green', 'B': 'lightgreen', 'C': 'yellow',
        'D': 'orange', 'E': 'red', 'F': 'darkred'
    }

    # Create the 3D scatter plot (raw units on every axis).
    # Units follow the synthetic data of this notebook: CO₂ in kg, water in
    # liters, energy in kWh.
    fig = px.scatter_3d(
        df, x='Energy_Consumption', y='CO2_Usage', z='Water_Usage',
        color='Global_Class',  # Use the assigned classification column
        color_discrete_map=class_colors,
        labels={
            'Energy_Consumption': 'Energy (kWh)',
            'CO2_Usage': 'Carbon (kg)',
            'Water_Usage': 'Water (L)',
            'Global_Class': 'Class'
        },
        title="3D Classification of Buildings",
        hover_data={
            'Energy_Consumption': ':.2f',
            'CO2_Usage': ':.2f',
            'Water_Usage': ':.2f',
            'Global_Class': True
        }
    )

    # Update marker size and opacity
    fig.update_traces(marker=dict(size=6, opacity=0.8))

    # Define thresholds for class boundaries
    d_min = df['distance'].min()  # Best-performing building (minimum distance)
    d_max = df['distance'].max()  # Worst-performing building (maximum distance)
    thresholds = np.linspace(d_min, d_max, num_classes + 1)  # 6 classes (A-F)

    # The scatter is drawn in raw units while the distance is measured in
    # normalized space, so the boundaries must be mapped back to raw units.
    # NOTE(review): this assumes the *_norm columns are a min-max scaling of
    # the raw columns of this same frame (norm = (raw - min) / (max - min)),
    # which is how the notebook builds them - confirm for other inputs.
    # Under that assumption a sphere of radius t around the normalized origin
    # becomes an ellipsoid centred on the per-axis minima with semi-axes
    # t * (max - min).
    raw_axes = ['Energy_Consumption', 'CO2_Usage', 'Water_Usage']
    raw_min = [df[col].min() for col in raw_axes]
    raw_span = [df[col].max() - df[col].min() for col in raw_axes]

    # Unit-sphere patch, computed once. Only the first octant is drawn:
    # min-max-normalized values are never negative, so that is where the
    # data lies.
    u, v = np.mgrid[0:np.pi / 2:30j, 0:np.pi / 2:30j]
    unit_x = np.cos(u) * np.sin(v)
    unit_y = np.sin(u) * np.sin(v)
    unit_z = np.cos(v)

    # Add one boundary surface per class, at the upper edge of its band
    for label, threshold in zip(class_labels, thresholds[1:]):
        fig.add_trace(go.Surface(
            x=raw_min[0] + threshold * raw_span[0] * unit_x,
            y=raw_min[1] + threshold * raw_span[1] * unit_y,
            z=raw_min[2] + threshold * raw_span[2] * unit_z,
            opacity=0.2,  # Adjust opacity for better visibility
            showscale=False,
            # Two identical stops give the surface a single flat colour
            colorscale=[[0, class_colors[label]], [1, class_colors[label]]],
            name=f'Class {label} Boundary'
        ))

    # Update layout
    fig.update_layout(
        scene=dict(
            xaxis_title='Energy Consumption (kWh)',
            yaxis_title='Carbon Footprint (kg)',
            zaxis_title='Water Usage (L)'
        ),
        legend_title_text='Class',
        margin=dict(l=0, r=0, b=0, t=40)
    )

    return fig

In [None]:
fig = plot_3d_classification(real_data)
# plot_3d_classification returns None when there is no data to plot
if fig is not None:
    fig.show()