# Data Visualization and Visual Analytics: Problemset 2

**Note:** As always, I did some data exploration at the beginning, which I don't show in this notebook in order to save space. I also don't want it to distract from the storytelling of the visualizations.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### 1. Package Import and Data Cleaning:

In [2]:
# Load the raw measurements and parse the "date" column (YYYY-MM-DD) into datetimes:
airquality = pd.read_csv("airquality.csv", header=0)
airquality["date"] = pd.to_datetime(airquality["date"], format="%Y-%m-%d")

# Metadata that is set by hand for two sites, keyed by site_id.
# Each value is written to every row of the respective site.
manual_site_metadata = {
    "nl00644": {
        "site": "Cabauw-Wielsekade",
        "country": "netherlands",
        "site_type": "background",
        "site_area": "rural",
        "elevation": 1.0,
    },
    "pl0209a": {
        "site": "Gorzów Wlkp. ul. Kosynierów Gdyńskich",
        "country": "poland",
        "site_type": "background",
        "site_area": "urban",
        "elevation": 22.0,
    },
}

for station_id, station_metadata in manual_site_metadata.items():
    station_rows = airquality.site_id == station_id
    # One column at a time so that every column keeps its own dtype:
    for column_name, column_value in station_metadata.items():
        airquality.loc[station_rows, column_name] = column_value

### 2. Data Preparation for ALL visualizations and definition of helper functions:

In [3]:
# Step 1: Keep only sites that have NO₂ data on at least 90 % of the days in 2023.
# NOTE(review): the count below is taken over all rows of a site, so this presumes the
# dataset holds one row per site and day for 2023 only — confirm against the CSV.
MIN_NO2_COVERAGE = 0.9      # required share of days with a NO₂ value
DAYS_IN_2023 = 365
WHO_DAILY_NO2_LIMIT = 25    # WHO daily NO₂ limit in µg/m³

# Step 1.1: Group by site and count the number of non-null NO₂ values:
site_no2_count = airquality.groupby("site_id")["no2"].count()

# Step 1.2: Find sites reaching the threshold (0.9 * 365 = 328.5, i.e. at least 329 days)
# and filter the original dataset to keep only those sites:
list_valid_sites = site_no2_count[site_no2_count >= MIN_NO2_COVERAGE * DAYS_IN_2023].index
valid_sites_data = airquality[airquality["site_id"].isin(list_valid_sites)]

# Step 2: Add the "highlight" column ("yes"/"no") to flag Greek sites.
# Rows with a missing country compare unequal to "greece" and are therefore flagged "no".
sites_daily_no2 = valid_sites_data[["site_id", "site", "country", "date", "no2"]].copy()
sites_daily_no2["highlight"] = (
    sites_daily_no2["country"].eq("greece").map({True: "yes", False: "no"})
)

# Step 3: Group sites by country and calculate the daily average NO₂ over all sites for each country:
country_daily_no2_avg = sites_daily_no2.groupby(["country", "date"])["no2"].mean().reset_index()

# Step 4: For each Greek site count the days exceeding the WHO daily NO₂ limit and store the data.
# The exceedance flag is summed per site (instead of filtering rows before grouping) so that
# sites without a single exceedance are kept with a count of 0 rather than disappearing.
greek_sites_data = sites_daily_no2[sites_daily_no2["highlight"] == "yes"]
greek_sites_exceedances = (
    greek_sites_data
    .assign(exceedances=greek_sites_data["no2"] > WHO_DAILY_NO2_LIMIT)
    .groupby(["site_id", "site"])["exceedances"]
    .sum()
    .reset_index()
)

In [4]:
# Layout template shared by all figures (applied via fig.update_layout(**base_layout)):
base_layout = {
    "showlegend": False,
    "margin": {"l": 80, "r": 20, "t": 80, "b": 50},
    "plot_bgcolor": "white",
    "paper_bgcolor": "white",
    "title_font": {"size": 27, "family": "Times New Roman", "weight": "bold"},
    # General font settings; these also apply to the axis tick labels:
    "font": {"size": 15, "family": "Times New Roman", "color": "black"},
    "xaxis": {
        "title": "Year 2023",
        "title_font_size": 18,
        # Month abbreviations as tick labels:
        "tickformat": "%b",
        # One tick per month, shifted 15 days past the month start so it sits mid-month:
        "tickvals": pd.date_range("2023-01-01", "2023-12-01", freq="MS") + pd.Timedelta(days=15),
    },
    "yaxis": {
        "title": "NO₂ in µg/m³",
        "title_font_size": 18,
    },
}

In [5]:
# Helper that replaces the hover text shown on the time series with a custom template:
def update_hover_template(fig, template_type="default"):
    """
    Set one of two predefined hovertemplates on all traces of a figure.

    Parameters:
    - fig: figure object; its update_traces() method is called with the chosen hovertemplate.
    - template_type: which template to apply.
                     "country" -> country, date and NO₂ value (figure 1),
                     "site"    -> site, site ID, date and NO₂ value (figures 2 & 3).
                     Any other value, including the default "default", raises.

    Raises:
    - ValueError: if template_type is neither "country" nor "site".
    """
    # Both templates end with the same date and NO₂ lines; only the leading lines differ.
    shared_lines = (
        "<b>Date:</b> %{x|%b %d, %Y}<br>"
        "<b>NO₂:</b> %{y} µg/m³<br><extra></extra>"
    )

    if template_type == "country":
        leading_lines = "<b>Country:</b> %{customdata[0]}<br>"
    elif template_type == "site":
        leading_lines = (
            "<b>Site:</b> %{customdata[0]}<br>"
            "<b>Site ID:</b> %{customdata[1]}<br>"
        )
    else:
        raise ValueError("Invalid template_type. Use 'country' or 'site'.")

    fig.update_traces(hovertemplate=leading_lines + shared_lines)

In [6]:
# Helper for styling traces selected by name:
def set_trace_style(fig, trace_name, line_width):
    """
    Set the line width of every trace in fig.data whose name equals trace_name.

    Parameters:
    - fig: figure object whose .data is iterated.
    - trace_name: name a trace must have to be restyled.
    - line_width: width passed to the trace as line=dict(width=...).

    Traces with a different name are left untouched; nothing is returned.
    """
    matching_traces = (candidate for candidate in fig.data if candidate.name == trace_name)
    for matching_trace in matching_traces:
        matching_trace.update(line={"width": line_width})

### 3. Time Series of average NO₂ measurements from all sites grouped by country:

In [7]:
# Preparation: line colors (Greece blue, all other countries translucent light grey), the color
# map built from them, and country names with the first letter capitalized for hovering:
greece_line_color = "rgba(28, 117, 188, 1)"
other_countries_line_color = "rgba(211, 211, 211, 0.45)"
unique_countries = country_daily_no2_avg["country"].unique()
country_color_map = {
    country: greece_line_color if country == "greece" else other_countries_line_color
    for country in unique_countries
}
country_daily_no2_avg["hover_country"] = country_daily_no2_avg["country"].str.title()

# Plot grouped NO₂ time series by country, with Greece highlighted:
fig1 = px.line(
    country_daily_no2_avg,  # country_daily_no2_avg from Step 2 (Data Preparation)
    x="date",
    y="no2",
    color="country",
    range_y=(0, 65),
    title=f"<span style='color:{greece_line_color};'>NO₂ measurements in Greece were high in 2023</span><br>"
          "<sup>Is there a site in Greece that stands out?</sup>",
    color_discrete_map=country_color_map,
    hover_data={"country": True, "date": True, "no2": True},
    custom_data=["hover_country"]
)

update_hover_template(fig1, template_type="country")  # update hover data with predefined custom function

# Apply the layout template shared by all figures:
fig1.update_layout(**base_layout)

# Add the label "Greece" as annotation directly beside the time series (blue).
# The anchor is the latest day that actually has a value: a missing (NaN) average on the
# final day would otherwise leave the label without a usable y position.
greece_last_point = (
    country_daily_no2_avg[country_daily_no2_avg["country"] == "greece"]
    .dropna(subset=["no2"])
    .sort_values("date")
    .iloc[-1]
)
fig1.add_annotation(
    x=greece_last_point["date"],
    y=greece_last_point["no2"],
    text="Greece",
    showarrow=False,
    font=dict(color=greece_line_color, size=20, weight="bold"),
    align="right",
    xanchor="left"
)

# Add the label "Other" as annotation directly beside all other time series (lightgrey):
fig1.add_annotation(
    x=greece_last_point["date"],
    y=20,  # Position visually based on the range of other traces
    text="Other",
    showarrow=False,
    font=dict(color="rgba(211, 211, 211, 1)", size=20),
    align="right",
    xanchor="left"
)

# Reorder traces to make sure Greece is in the foreground (traces later in fig.data are drawn on top):
greece_trace = [t for t in fig1.data if t.name == "greece"]  # greek trace
set_trace_style(fig1, "greece", 2.5)                         # use custom function set_trace_style() (last cell of step 2)
other_traces = [t for t in fig1.data if t.name != "greece"]  # all other traces
fig1.data = tuple(other_traces + greece_trace)               # reorder: others first, greek last (foreground)

# Show plot:
fig1.show()

### 4. Time Series of NO₂ measurements of Greek sites:

In [8]:
# Preparation: line colors -> Piraeus-1 (gr0030a) red, all other Greek sites translucent blue in
# the background; plus a new column with title-cased site names for hovering:
piraeus_line_color = "rgba(201, 4, 4, 1)"
other_greek_line_color = "rgba(31, 119, 180, 0.16)"
sites_daily_no2["hover_site"] = sites_daily_no2["site"].str.title()

# Only Greek sites are shown in this figure, so the data is restricted to them before plotting
# instead of creating a trace for every site and discarding the non-Greek ones afterwards:
greek_sites_no2 = sites_daily_no2[sites_daily_no2["country"] == "greece"]
site_color_map = {
    site_id: piraeus_line_color if site_id == "gr0030a" else other_greek_line_color
    for site_id in greek_sites_no2["site_id"].unique()
}

# Plot NO₂ time series by site for all Greek sites, with Piraeus-1 highlighted:
fig2 = px.line(
    greek_sites_no2,  # Greek subset of sites_daily_no2 from Step 2 (Data Preparation)
    x="date",
    y="no2",
    color="site_id",
    range_y=(0, 110),
    title=f"<span style='color:{piraeus_line_color};'>Particularly the site PIREAUS-1 stands out</span>"
          "<span style='color:rgba(31, 119, 180, 0.35);'> from other greek sites</span><br>"
          "<sup>How does it compare to the WHO limit of 25 µg/m³?</sup>",
    color_discrete_map=site_color_map,
    hover_data={"hover_site": True, "site_id": True, "date": True, "no2": True},
    custom_data=["hover_site", "site_id"]
)

update_hover_template(fig2, template_type="site")  # update hover data with predefined custom function

# Apply the layout template shared by all figures:
fig2.update_layout(**base_layout)

# Add the label "Piraeus-1" as annotation directly beside the time series (red).
# The anchor is the latest day with a measurement: the raw rows are not guaranteed to be
# sorted by date, and a missing (NaN) value would leave the label without a usable y position.
piraeus_last_point = (
    sites_daily_no2[sites_daily_no2["site_id"] == "gr0030a"]
    .dropna(subset=["no2"])
    .sort_values("date")
    .iloc[-1]
)
fig2.add_annotation(
    x=piraeus_last_point["date"],
    y=piraeus_last_point["no2"],
    text="Piraeus-1",
    showarrow=False,
    font=dict(color=piraeus_line_color, size=20, weight="bold"),
    align="right",
    xanchor="left"
)

# Add the label "Other" as annotation directly beside the time series of the other Greek sites (blue):
fig2.add_annotation(
    x=piraeus_last_point["date"],
    y=25,  # Position visually based on the range of other traces
    text="Other",
    showarrow=False,
    font=dict(color="rgba(31, 119, 180, 0.37)", size=20),
    align="right",
    xanchor="left"
)

# Separate the Piraeus-1 trace from the traces of the other Greek sites:
gr0030a_trace = [t for t in fig2.data if t.name == "gr0030a"]  # Piraeus-1 trace (foreground)
set_trace_style(fig2, "gr0030a", 2.5)                          # use custom function set_trace_style() (last cell of step 2)
greek_traces = [t for t in fig2.data if t.name != "gr0030a"]   # other Greek sites (background)

# Reorder traces: other Greek sites first, Piraeus-1 last so that it is drawn on top (foreground):
fig2.data = tuple(greek_traces + gr0030a_trace)

# Show plot
fig2.show()


### 5. Time Series of PIREAUS-1 in comparison to the WHO limit:

In [9]:
# Select the rows of site gr0030a (Piraeus-1) and add a column with the title-cased site name
# for hovering. The name is derived from this frame's own "site" column so that the cell does
# not depend on index alignment with sites_daily_no2:
gr0030a_data = sites_daily_no2[sites_daily_no2["site_id"] == "gr0030a"].copy()
gr0030a_data["hover_site"] = gr0030a_data["site"].str.title()

# Days on which the WHO daily limit was met. They are computed before the figure is built so
# that the number stated in the subtitle always matches the black points that are drawn:
who_limit = 25  # WHO daily NO₂ limit in µg/m³
points_data = gr0030a_data[gr0030a_data["no2"] <= who_limit]
days_limit_met = len(points_data)

fig3 = px.line(
    gr0030a_data,
    x="date",
    y="no2",
    range_y=(0,110),
    title="<span style='color:rgba(201, 4, 4, 1);'>PIREAUS-1 consistently exceeded the WHO NO₂ limit in 2023</span><br>"
          f"<sup><span style='color:rgba(0, 0, 0, 1);'>The limit was only met on {days_limit_met} out of 365 days, indicated by black points</span></sup>",
    color="site_id",
    color_discrete_map={"gr0030a": "rgba(201, 4, 4, 1)"},
    hover_data={"hover_site": True, "site_id": True, "date": True, "no2": True},
    custom_data=["hover_site", "site_id"]
)

# Add black points on the days where the WHO limit is met (skipped if there is no such day):
if not points_data.empty:
    fig3.add_trace(
        px.scatter(
            points_data,
            x="date",
            y="no2",
            color_discrete_sequence=["rgba(0, 0, 0, 1)"],
            hover_data={"hover_site": True, "site_id": True, "date": True, "no2": True},
            custom_data=["hover_site", "site_id"]
        ).update_traces(marker=dict(size=6, symbol="circle"))
        .data[0]
    )

update_hover_template(fig3, template_type="site")  # update hover data with predefined custom function

# Apply the layout template shared by all figures:
fig3.update_layout(**base_layout)

# Add the horizontal line at the WHO limit as a background shape:
fig3.add_hline(
    y=who_limit,
    line=dict(color="grey", dash="dash"),
    annotation_text=f"WHO daily NO₂ limit = {who_limit} µg/m³",
    annotation_position="bottom right",
    annotation_font=dict(size=13, color="rgba(0, 0, 0, 1)"),
    layer="below"
)

# Point one of the black points out with an arrow. The sixth point is used as the reference;
# the index is clamped so that the cell still runs when fewer than six points exist.
if not points_data.empty:
    annotation_point = points_data.iloc[min(5, len(points_data) - 1)]
    fig3.add_annotation(
        x=annotation_point["date"],  # x-coordinate (date of the point)
        y=annotation_point["no2"],   # y-coordinate (value of the point; the text is offset via ax/ay)
        text="WHO limit met",        # annotation text
        showarrow=True,              # show arrow
        arrowhead=2,                 # style of the arrowhead
        arrowsize=1.5,               # size of the arrow
        font=dict(size=13),          # font size of annotation text
        ax=70,                       # horizontal offset for the arrow (adjust as needed)
        ay=-6,                       # vertical offset for the arrow (adjust as needed)
        standoff=5
    )

set_trace_style(fig3, "gr0030a", 2.2)

# Show the plot:
fig3.show()
