# Data Visualization Notebook


Version with all the topics

In [None]:
%pip install pandas plotly kaleido imageio

In [33]:
import json
from pathlib import Path
import pandas as pd
import plotly.express as px

# Root directory of the project checkout; the JSON inputs live under "Data/".
# TODO remove the hardcoded path
base_dir = Path("/app/repos/ECAI_DC_EDI_Submission_1").resolve()
if not base_dir.is_dir():
    # The hardcoded location may not exist outside the original environment.
    # Fall back to the working directory, which is also the default the later
    # cells use when `base_dir` is undefined.
    base_dir = Path.cwd()

# Nested JSON object, iterated below as {year: {topic: count}}.
topic_year_count_path = base_dir / "Data/topic_year_counts.json"
with open(topic_year_count_path, 'r', encoding='utf-8') as f:
    topic_counts = json.load(f)

# Flatten the nested {year: {topic: count}} mapping into one record per
# (year, topic) pair. Each record covers the whole calendar year on the time
# axis: from 1 January of that year to 1 January of the next one.
rows = []
for year_label, year_topics in topic_counts.items():
    calendar_year = int(year_label)
    for topic_code, raw_count in year_topics.items():
        occurrences = int(raw_count)
        # Pairs with a count of 1 or less are left out of the chart.
        if occurrences <= 1:
            continue
        record = {
            "topic": topic_code,
            "start": pd.Timestamp(calendar_year, 1, 1),
            "finish": pd.Timestamp(calendar_year + 1, 1, 1),
            "count": occurrences,
        }
        rows.append(record)

df = pd.DataFrame(rows)

# Load the topic-code -> display-name table. The context manager makes sure
# the file handle is closed as soon as the JSON has been parsed.
with open(base_dir / "Data/topic_label.json", 'r', encoding='utf-8') as f:
    topic_code_to_name = json.load(f)

# Replace codes with readable names. A code that is missing from the table
# keeps its raw value instead of turning into NaN, matching the
# `.get(code, code)` lookups used by the other cells.
df['topic'] = df['topic'].map(lambda code: topic_code_to_name.get(code, code))

# Timeline chart with one bar per (year, topic) record: each bar spans
# `start`..`finish` on the x axis, is placed at its `count` on the y axis and
# is coloured by topic.
fig = px.timeline(
    data_frame=df,
    x_start='start',
    x_end='finish',
    y='count',
    color='topic',
    color_discrete_sequence=px.colors.qualitative.Plotly,
    title='ECAI Topic Count Evolution by Year',
)

# Axis captions and spacing between bars.
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='N. of Occurrences by Topic',
    bargap=0.2,
)

fig.show()

There are some gaps due to data availability.

# Gif evolution

In [6]:
from pathlib import Path
from io import BytesIO
import json
import pandas as pd
import plotly.express as px
import imageio.v2 as imageio

# Reuse `base_dir` when an earlier cell already defined it; otherwise default
# to the current working directory so this cell can run on its own.
try:
    base_dir
except NameError:
    base_dir = Path.cwd()

# Per-year topic counts, iterated below as {year: {topic: count}}.
counts_path = base_dir / "Data/topic_year_counts.json"
topic_counts = json.loads(counts_path.read_text(encoding="utf-8"))

# Lookup table from topic code to display name.
labels_path = base_dir / "Data/topic_label.json"
topic_code_to_name = json.loads(labels_path.read_text(encoding="utf-8"))

# For every year, keep the ten topics with the highest counts. Topic codes are
# translated to display names; a code without a label is kept as-is.
rows = []
for year_label, year_topics in topic_counts.items():
    year_records = [
        {
            "year": int(year_label),
            "topic": topic_code_to_name.get(code, code),
            "count": int(raw_count),
        }
        for code, raw_count in year_topics.items()
    ]
    # Highest counts first; the sort is stable, so ties keep their input order.
    year_records.sort(key=lambda record: record["count"], reverse=True)
    rows += year_records[:10]

if not rows:
    raise ValueError("No topic counts were found to plot.")

# Chronological order across years, highest count first within each year.
top_10_df = pd.DataFrame(rows).sort_values(
    by=["year", "count"],
    ascending=[True, False],
)

# Animated horizontal bar chart with one frame per year. The x range is fixed
# to the overall maximum count so the scale does not change between frames.
highest_count = top_10_df["count"].max()
fig_top10 = px.bar(
    data_frame=top_10_df,
    x="count",
    y="topic",
    color="topic",
    orientation="h",
    animation_frame="year",
    range_x=[0, highest_count],
    title="Top 10 Topic Counts per Year",
)

# Axis captions; the legend is hidden.
fig_top10.update_layout(
    xaxis_title="Occurrences",
    yaxis_title="Topic",
    showlegend=False,
)

# Order the topic axis with plotly's "total ascending" category ordering.
fig_top10.update_yaxes(categoryorder="total ascending")

# NOTE: the animation timing is left at the plotly defaults. An earlier
# iteration tuned it through the play button and slider step arguments, e.g.
#   fig_top10.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
# together with a 500 ms transition duration.

fig_top10.show()


# Single Year Plots

In [17]:
from pathlib import Path
import json
import pandas as pd
import plotly.express as px

# Reuse `base_dir` when an earlier cell already defined it; otherwise default
# to the current working directory so this cell can run on its own.
try:
    base_dir
except NameError:
    base_dir = Path.cwd()

# Per-year topic counts, iterated below as {year: {topic: count}}.
counts_path = base_dir / "Data/topic_year_counts.json"
topic_counts = json.loads(counts_path.read_text(encoding="utf-8"))

# Lookup table from topic code to display name.
labels_path = base_dir / "Data/topic_label.json"
topic_code_to_name = json.loads(labels_path.read_text(encoding="utf-8"))

# Collect the display name of every topic that appears in any year
# (falling back to the raw code when no label exists), in sorted order.
topic_names = set()
for year_topics in topic_counts.values():
    topic_names.update(
        topic_code_to_name.get(topic_code, topic_code) for topic_code in year_topics
    )
all_topics = sorted(topic_names)

# Concatenate several qualitative palettes into one longer colour list.
palette = [
    *px.colors.qualitative.Plotly,
    *px.colors.qualitative.Safe,
    *px.colors.qualitative.Pastel,
    *px.colors.qualitative.Antique,
]

# Give each topic a fixed colour so it looks the same in every yearly chart;
# the palette wraps around when there are more topics than colours.
color_map = {}
for position, topic_name in enumerate(all_topics):
    color_map[topic_name] = palette[position % len(palette)]

# NOTE(review): nothing in this cell writes into this directory; presumably it
# is meant for exported figures - confirm.
output_dir = base_dir / "Visualisations"
output_dir.mkdir(parents=True, exist_ok=True)

# One horizontal bar chart per year, in chronological order, showing the ten
# most frequent topics of that year.
for year, topics in sorted(topic_counts.items(), key=lambda item: int(item[0])):
    if not topics:
        # A frame built from zero records has no "count" column, so sorting it
        # would raise KeyError before any emptiness check on the frame could
        # run. Skip years without topics up front instead.
        continue
    df_year = (
        pd.DataFrame(
            [
                {
                    # Unlabelled codes are shown under their raw code.
                    "topic": topic_code_to_name.get(topic_code, topic_code),
                    "count": int(count),
                }
                for topic_code, count in topics.items()
            ]
        )
        # Stable sort so that ties at the top-10 cut-off are resolved
        # deterministically.
        .sort_values("count", ascending=False, kind="stable")
        .head(10)
    )
    fig_year = px.bar(
        # Re-sorted ascending for plotting - presumably so the largest bar is
        # drawn at the top of the horizontal chart.
        df_year.sort_values("count", kind="stable"),
        x="count",
        y="topic",
        orientation="h",
        color="topic",
        # Shared mapping keeps a topic's colour identical across years.
        color_discrete_map=color_map,
        title=f"Top 10 Topics in {year}",
        labels={"count": "Occurrences", "topic": "Topic"},
    )
    fig_year.update_layout(yaxis_title="Topic", xaxis_title="Occurrences", showlegend=False)
    fig_year.show()