### This notebook consists of the code that creates the HTML files for the website each time the data is updated.

## Set-up

In [61]:
import json
from pathlib import Path
import numpy as np
import shutil
import pandas as pd
from jinja2 import Environment, FileSystemLoader

Defining Constants

In [62]:
# Group data directory; the website data and member folders live inside it.
GROUP_DATA_DIR = Path("../../group-data")
WEBSITE_DATA_PATH = GROUP_DATA_DIR.joinpath("website_data/")
MEMBERS_DIR_PATH = GROUP_DATA_DIR.joinpath("members/")
CONTENT_DIR_PATH = WEBSITE_DATA_PATH.joinpath("content")
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH.joinpath("role_hierarchy.json")

# Directories located next to the group data directory: templates in, generated site out.
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent.joinpath("groupwebsite_generator", "templates")
HOSTING_PATH = GROUP_DATA_DIR.parent.joinpath("kerzendorf-group.github.io")
SUB_RESEARCH_PATH = HOSTING_PATH.joinpath("sub_research")

# Article sources and the folder inside the generated site their media is copied to.
ARTICLE_DIR_PATH = Path("../../research_news/articles")
ARTICLE_IMAGE_PATH = Path("../../research_news/images")
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH.joinpath(
    "website_files", "images", "article_content"
)

# NOTE(review): INSTITUTION_FILTER, GROUP_FILTER and INDIVIDUAL_MEMBER_SECTION_MAP
# are used by later cells but are not defined in any cell of this notebook;
# they need to be defined here for a fresh-kernel "Run All" to succeed.

Setting up jinja environment

In [63]:
# Helper used by the notebook and the templates to build HTML file names
def page_link(a):
    """
    Return ``a`` with every blank space (" ") replaced by an underscore ("_").

    Parameters:
    ----------
    a : str
        Text to turn into a file-name fragment (e.g. an article id or a category).

    Returns:
    -------
    str
        The text with spaces replaced; text without spaces is returned unchanged.

    """
    # str.replace is already a no-op when there is no space, so no guard is needed.
    return a.replace(" ", "_")

# Jinja2 environment shared by every page render; templates are loaded from TEMPLATE_DIR_PATH.
_template_loader = FileSystemLoader(TEMPLATE_DIR_PATH)
environment = Environment(
    loader=_template_loader,
    extensions=["jinja2.ext.loopcontrols", "jinja2.ext.do"],
)
# Expose page_link as a global so templates can call it directly.
environment.globals.update(page_link=page_link)

In [64]:
def create_page(template, html, **kwargs):
    """
    Render a Jinja2 template and write the result below ``HOSTING_PATH``.

    Parameters:
    ----------
    template : str
        Name of the template, looked up through the module-level ``environment``.
    html : str
        Output file path relative to ``HOSTING_PATH``. Missing parent
        directories are created.
    **kwargs : dict
        Variables passed to the template when rendering.

    Returns:
    -------
    None

    """
    jinja_template = environment.get_template(template)
    # The number of "/" in the relative path is handed to the template as TEMPLATE_LEVEL.
    nesting_depth = html.count("/")
    output_path = HOSTING_PATH / html
    output_path.parent.mkdir(parents=True, exist_ok=True)
    rendered_html = jinja_template.render(TEMPLATE_LEVEL=nesting_depth, **kwargs)
    output_path.write_text(rendered_html, encoding="utf-8")

In [65]:
def loading_website_data(file_to_load, data_dir=None):
    """
    Load a single JSON file from the website data directory.

    Parameters:
    ----------
    file_to_load : str
        File name without the ``.json`` extension.
    data_dir : str or pathlib.Path, optional
        Directory to read from. Defaults to ``WEBSITE_DATA_PATH``.

    Returns:
    -------
    dict or list
        The decoded JSON content. An empty dict is returned (and a message is
        printed) when the file does not exist or cannot be decoded.

    """
    base_dir = WEBSITE_DATA_PATH if data_dir is None else Path(data_dir)
    json_path = base_dir / f"{file_to_load}.json"

    # A Path object is always truthy, so the file's existence must be checked
    # explicitly for the "not found" message to ever be reached.
    if not json_path.is_file():
        print(f"File '{file_to_load}.json' not found.")
        return {}

    try:
        with open(json_path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)
    except json.JSONDecodeError:
        print(f"Error decoding JSON in '{json_path}'.")
        return {}

In [66]:
def read_member_data_jsons(file_to_read, members_dir=None):
    """
    Collect one kind of per-member JSON file into a single DataFrame.

    For every ``<member>/info.json`` found, the file ``<member>/jsons/<file_to_read>``
    is read if it exists. It is expected to hold a list of dict entries; each
    entry is tagged with the member's ``id`` taken from ``info.json``.

    Parameters:
    ----------
    file_to_read : str
        File name inside each member's ``jsons`` folder, e.g. ``"education.json"``.
    members_dir : str or pathlib.Path, optional
        Directory holding one folder per member. Defaults to ``MEMBERS_DIR_PATH``.

    Returns:
    -------
    pandas.DataFrame
        All entries of all members, indexed by member ``id``. An empty
        DataFrame (without an ``id`` index) is returned when no member has
        any entry.

    """
    members_root = MEMBERS_DIR_PATH if members_dir is None else Path(members_dir)
    member_frames = []

    for single_info_file_path in members_root.glob("*/info.json"):
        with open(single_info_file_path, "r", encoding="utf-8") as f_info:
            member_unique_id = json.load(f_info)["id"]

        file_to_read_path = single_info_file_path.parent / "jsons" / file_to_read
        if not file_to_read_path.exists():
            continue

        with file_to_read_path.open("r", encoding="utf-8") as f_data:
            member_entries = json.load(f_data)
        # An empty list would add a column-less frame to the concat and, if
        # every file were empty, make set_index("id") fail.
        if not member_entries:
            continue

        for entry in member_entries:
            entry["id"] = member_unique_id
        member_frames.append(pd.DataFrame(member_entries))

    if not member_frames:
        return pd.DataFrame([])
    return pd.concat(member_frames, ignore_index=True).set_index("id")

In [67]:
def set_new_image_path(old_image_path):
    """
    Copy an article media file into the generated site and return its site-relative path.

    Parameters:
    ----------
    old_image_path : pathlib.Path
        Path whose file name identifies the media file inside ``ARTICLE_IMAGE_PATH``.

    Returns:
    -------
    str
        Path of the copied file starting at the ``website_files`` folder,
        always written with forward slashes so it can be used in HTML.

    Raises:
    ------
    ValueError:
        If ``website_files`` is not part of the destination path.
    FileNotFoundError:
        If the source file does not exist.

    """
    image_source = ARTICLE_IMAGE_PATH / old_image_path.name
    image_destination = ARTICLE_IMAGE_DESTINATION_DIR / old_image_path.name
    website_files_index = image_destination.parts.index("website_files")
    # as_posix() keeps the separator "/" on every platform; str() would give "\" on Windows.
    new_image_path = Path(*image_destination.parts[website_files_index:]).as_posix()

    # copy2 does not create missing folders, so make sure the target folder exists.
    image_destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(image_source, image_destination)
    print(new_image_path)
    return new_image_path

## Reading data

In [68]:
# Load the site-wide JSON documents used by the page templates.
general, homepage, contact, research, support = (
    loading_website_data(json_name)
    for json_name in (
        "general",
        "homepage",
        "contact",
        "research_categories",
        "support",
    )
)

In [69]:
# Reading all articles
article_content_list = []
# Only regular *.json files are parsed, so stray entries in the folder (for
# example a .ipynb_checkpoints directory) cannot break the load. Sorting keeps
# the article order independent of the filesystem.
for content_file_name in sorted(ARTICLE_DIR_PATH.glob("*.json")):
    if not content_file_name.is_file():
        continue
    with open(content_file_name, "r", encoding="utf-8") as fcontent:
        article_content = json.load(fcontent)
    # Only articles published on the "dti" platform are used for this website.
    if "dti" not in article_content["platforms"]:
        continue

    # Copy the cover and every embedded image/video into the site and point
    # the article at the copied files.
    article_content["cover_image"] = set_new_image_path(
        Path(article_content["cover_image"])
    )
    for content_key, content_value in article_content["content"].items():
        if "img" in content_key or "vid" in content_key:
            article_content["content"][content_key] = set_new_image_path(
                Path(content_value)
            )
    article_content_list.append(article_content)
article_content_df = pd.DataFrame(article_content_list)

article_content_df["date"] = pd.to_datetime(
    article_content_df["date"], format="%m-%d-%Y"
)

# Missing or blank cover sizes fall back to the default dimensions; the column
# is created first when no article specifies it at all.
for dimension_column, default_size in (
    ("cover_image_height", "330px"),
    ("cover_image_width", "520px"),
):
    if dimension_column not in article_content_df.columns:
        article_content_df[dimension_column] = default_size
    article_content_df[dimension_column] = (
        article_content_df[dimension_column]
        .fillna(default_size)
        .replace("", default_size)
    )


website_files/images/article_content/MAST_Poster.jpg
website_files/images/article_content/cmse_comp_vicente.jpg
website_files/images/article_content/bcbsm.jpeg
website_files/images/article_content/esa_internship.png
website_files/images/article_content/2BCAFnorway_geo.jpg
website_files/images/article_content/pacmanweb.png
website_files/images/article_content/proposals.mp4
website_files/images/article_content/img_PRUR.png
website_files/images/article_content/bea_midsure_poster.jpg


Categorizing news and research articles

In [70]:
# An article counts as news when its category is "News" or its tag list contains "news".
is_news_category = article_content_df["category"] == "News"
has_news_tag = article_content_df["tags"].apply(
    lambda tags: isinstance(tags, list) and "news" in tags
)
news_content_df = article_content_df.loc[is_news_category | has_news_tag].sort_values(
    by="date", ascending=False
)

# Research articles: everything outside the "News" category, grouped by
# category with the newest article first.
research_content_df = article_content_df.loc[~is_news_category].sort_values(
    by=["category", "date"], ascending=[True, False]
)

Reading personal member data from info.json

In [71]:
info_json_list = []
for single_info_file_path in MEMBERS_DIR_PATH.glob("*/info.json"):
    with open(single_info_file_path, "r", encoding="utf-8") as f_info:
        info_json_list.append(json.load(f_info))
info_json_df = pd.DataFrame(info_json_list).set_index("id")


def _member_full_name(row):
    """Build "<nick or first name> <last name>" for one member row."""
    # row.get tolerates data sets in which no member has a nick_name at all;
    # an empty string is treated like a missing nickname.
    nick_name = row.get("nick_name")
    has_nick_name = pd.notna(nick_name) and nick_name != ""
    given_name = nick_name if has_nick_name else row["first_name"]
    return given_name + " " + row["last_name"]


info_json_df["full_name"] = info_json_df.apply(_member_full_name, axis=1)
info_json_dict = info_json_df.to_dict("index")

Reading various json files

In [72]:
# Keep only the columns the member pages work with.
exp_df = read_member_data_jsons("experiences.json").loc[
    :, ["role", "start_date", "end_date", "institution", "group"]
]
edu_df = read_member_data_jsons("education.json").loc[
    :, ["start_date", "end_date", "institution", "subject", "degree"]
]

Reading social_links json

In [73]:
social_link_list = []
for single_member_file_path in MEMBERS_DIR_PATH.rglob("social_links.json"):
    with open(single_member_file_path, "r", encoding="utf-8") as f_social:
        member_social_link = json.load(f_social)
    # The member's info.json is expected two directory levels above the
    # social_links.json file; it provides the member id.
    info_json_file_path = single_member_file_path.parent.parent / "info.json"
    with open(info_json_file_path, "r", encoding="utf-8") as f_info:
        member_social_link["id"] = json.load(f_info)["id"]
    social_link_list.append(member_social_link)

if social_link_list:
    social_links_df = pd.DataFrame(social_link_list).set_index("id").fillna("")
else:
    # No social_links.json anywhere: keep an empty frame that still has the
    # "id" index so that to_dict("index") yields {} instead of set_index failing.
    social_links_df = pd.DataFrame(index=pd.Index([], name="id"))

## Homepage

In [74]:
# Newest article of every category: sort by category and date (newest first),
# then keep the first row of each category group.
recent_content = (
    article_content_df.sort_values(by=["category", "date"], ascending=[True, False])
    .groupby("category")
    .head(1)
    .copy()
)

In [75]:
# Variables the homepage template receives.
homepage_context = {
    "general": general,
    "homepage": homepage,
    "recent_content": recent_content.to_dict(orient="records"),
}
create_page("homepage.html.j2", "index.html", **homepage_context)

## Support Page

In [76]:
# Render the support page from the general and support JSON data.
create_page(
    template="support.html.j2",
    html="Support.html",
    general=general,
    support=support,
)

## Contact Page

In [77]:
# Render the contact page from the general and contact JSON data.
create_page(
    template="contact.html.j2",
    html="Contact.html",
    general=general,
    contact=contact,
)

## News Page

In [78]:
# Variables the news overview template receives.
news_page_context = {
    "general": general,
    "content": news_content_df,
    "category": "news",
    "member_data": info_json_dict,
}
create_page("news.html.j2", "News.html", **news_page_context)

## Individual News Page

In [79]:
# One page per news article, written to news/<article id>.html.
for _, ind_news_values in news_content_df.iterrows():
    # Lower-cased article id with spaces replaced; used as the file name.
    news_page_name = page_link(ind_news_values.article_id.lower())
    create_page(
        "news_page_no_twitter.html.j2",
        f"news/{news_page_name}.html",
        general=general,
        content=ind_news_values,
        member_data=info_json_dict,
        category="News",
    )

## Research Page

In [80]:
# Variables the research overview template receives.
research_page_context = {
    "general": general,
    "content": research_content_df,
    "current_members": info_json_dict,
    "research": research,
}
create_page("research.html.j2", "Research.html", **research_page_context)

## Sub-Research Page

In [81]:
SUB_RESEARCH_PATH.mkdir(parents=True, exist_ok=True)

# One front page per non-news article category.
# NOTE(review): the saved output of this cell is
# "UndefinedError: 'dict object' has no attribute 'Overview'". Presumably the
# template looks the category up in `research` and an "Overview" entry is
# missing from research_categories.json — confirm against the template.
non_news_categories = article_content_df.loc[
    article_content_df["category"] != "News", "category"
].unique()
for category in non_news_categories:
    category_page = f"sub_research/{page_link(category.lower())}.html"
    create_page(
        "sub_research_frontpage.html.j2",
        category_page,
        general=general,
        research=research,
        content=research_content_df,
        category=category,
        current_members=info_json_dict,
    )

UndefinedError: 'dict object' has no attribute 'Overview'

## Individual Research Page

In [None]:
# One page per research article.
for _, ind_research_values in research_content_df.iterrows():
    category_name = page_link(ind_research_values.category.lower())
    article_name = page_link(ind_research_values.article_id.lower())

    # "Software" articles are written directly into sub_research/; all other
    # articles go into a folder named after their category.
    if ind_research_values["category"] == "Software":
        destination_research_path = f"sub_research/{article_name}.html"
    else:
        destination_research_path = f"sub_research/{category_name}/{article_name}.html"

    folder_path = SUB_RESEARCH_PATH / category_name
    folder_path.mkdir(parents=True, exist_ok=True)
    create_page(
        "research_page_no_twitter.html.j2",
        destination_research_path,
        general=general,
        content=ind_research_values,
        member_data=info_json_dict,
        article_id=ind_research_values["article_id"],
    )

## Current members Page

In [None]:
# Experiences without an end date are treated as still ongoing.
# NOTE(review): unlike the alumni selection further down, no institution or
# group filter is applied here — confirm that this is intended.
filtered_exp_df = exp_df.loc[exp_df["end_date"].isna()]
def most_recent_row(group):
    """Return the rows of ``group`` whose ``start_date`` equals the group's maximum."""
    latest_start = group["start_date"].max()
    is_latest = group["start_date"] == latest_start
    return group[is_latest]
# Per member, keep the ongoing experience(s) with the latest start date.
# groupby().apply() puts the group key in front of the original index;
# droplevel(0) removes that extra level again.
latest_open_experience = filtered_exp_df.groupby("id").apply(most_recent_row)
filtered_exp_df_most_recent = latest_open_experience.droplevel(0)

In [None]:
# Education entries that are still running (no end date) at the filtered institution.
# NOTE(review): INSTITUTION_FILTER is not defined in any cell of this notebook.
education_is_ongoing = edu_df["end_date"].isna()
education_at_institution = edu_df["institution"] == INSTITUTION_FILTER
filtered_edu_df = edu_df[education_is_ongoing & education_at_institution]

In [None]:
exp_ids = filtered_exp_df_most_recent.index.unique()
edu_ids = filtered_edu_df.index.unique()
# Union of both id sets. Sorting makes the member order reproducible: the
# iteration order of a set of strings can differ from one run to the next.
common_ids = sorted(set(exp_ids).union(edu_ids))
# Explicit copy: columns are added to this frame in the following cells.
current_member_df = info_json_df.loc[common_ids].copy()

In [None]:
def _current_role_for(member_id):
    """
    Return the academic role of a current member.

    An ongoing degree decides the role first; otherwise the role of the most
    recent ongoing experience is used. An empty string is returned when
    neither gives a role, so no value can leak from a previously processed member.
    """
    if member_id in filtered_edu_df.index:
        # .loc[[id], ...] always returns a Series, also when a member has
        # several ongoing education entries.
        degrees = set(filtered_edu_df.loc[[member_id], "degree"])
        if degrees & {"PhD", "Masters"}:
            return "Graduate Student"
        # filtered_edu_df only holds rows of INSTITUTION_FILTER already.
        if "Bachelors" in degrees:
            return "Undergraduate Student"
    if member_id in filtered_exp_df_most_recent.index:
        role = filtered_exp_df_most_recent.loc[[member_id], "role"].iloc[0]
        if pd.notna(role):
            return role
    return ""


current_member_df = current_member_df.assign(
    academic_role=[_current_role_for(m_key) for m_key in current_member_df.index]
)

In [None]:
# All member projects, latest end date first.
# NOTE(review): sort_values places missing end dates last by default. If a
# missing end_date marks an ongoing project (as it does for experiences), the
# "current project" picked in the next cell would be the most recently
# finished one — TODO confirm.
all_projects_df = read_member_data_jsons("projects.json")
projects_df = all_projects_df.sort_values(by="end_date", ascending=False)

In [None]:
# Attach to every current member the title of the first project listed for
# them in projects_df (empty string when they have none).
for mem_key in current_member_df.index:
    current_project_title = ""
    if mem_key in projects_df.index:
        # .loc[[key], ...] always yields a Series, whether the member has one
        # project or several; its first element is the top-sorted project.
        current_project_title = projects_df.loc[[mem_key], "project_title"].iloc[0]
    current_member_df.loc[mem_key, "current_project_title"] = current_project_title

In [None]:
# Variables the current-members template receives.
current_members_context = {
    "general": general,
    "current_members": current_member_df.to_dict("index"),
    "socials": social_links_df.to_dict("index"),
}
create_page("current_members.html.j2", "current_members.html", **current_members_context)

## Alumni Members Page

In [None]:
def most_recent_row_end_date(group):
    """Return the rows of ``group`` whose ``end_date`` equals the group's maximum."""
    latest_end = group["end_date"].max()
    is_latest = group["end_date"] == latest_end
    return group[is_latest]

In [None]:
# Finished education entries at the filtered institution; per member keep the
# one(s) with the latest end date.
alumni_filtered_edu_df = edu_df.loc[edu_df["end_date"].notna()]
finished_edu_at_institution = alumni_filtered_edu_df["institution"] == INSTITUTION_FILTER
req_edu_df = alumni_filtered_edu_df[finished_edu_at_institution]
req_edu_df_most_recent = (
    req_edu_df.groupby("id").apply(most_recent_row_end_date).droplevel(0)
)

# Finished experiences at the filtered institution or in one of the filtered
# groups; per member keep the one(s) with the latest end date.
alumni_filtered_exp_df = exp_df.loc[exp_df["end_date"].notna()]
finished_exp_at_institution = alumni_filtered_exp_df["institution"] == INSTITUTION_FILTER
finished_exp_in_group = alumni_filtered_exp_df["group"].isin(GROUP_FILTER)
req_exp_df = alumni_filtered_exp_df[finished_exp_at_institution | finished_exp_in_group]
req_exp_df_most_recent = (
    req_exp_df.groupby("id").apply(most_recent_row_end_date).droplevel(0)
)

In [None]:
alumni_exp_ids = req_exp_df_most_recent.index.unique()
alumni_edu_ids = req_edu_df_most_recent.index.unique()
# Union of both id sets. Sorting makes the alumni order reproducible: the
# iteration order of a set of strings can differ from one run to the next.
alumni_common_ids = sorted(set(alumni_exp_ids).union(alumni_edu_ids))
# Explicit copy: this frame is filtered and extended in the following cells.
alumni_member_df = info_json_df.loc[alumni_common_ids].copy()

In [None]:
# Ids that appear both as current member and as alumnus
duplicate_indices = set(current_member_df.index).intersection(alumni_member_df.index)

# Keep only the alumni rows whose id is not among those duplicates
is_duplicate = alumni_member_df.index.isin(duplicate_indices)
alumni_member_df = alumni_member_df.loc[~is_duplicate]

In [None]:
def _alumni_role_for(member_id):
    """
    Return the academic role a former member last held.

    A finished degree decides the role first; otherwise the role of the most
    recently finished experience is used. An empty string is returned when
    neither gives a role, so no value can leak from a previously processed
    member and a missing role never becomes the text "nan".
    """
    if member_id in req_edu_df_most_recent.index:
        # .loc[[id], ...] always returns a Series, also when several rows
        # share the latest end date.
        degrees = set(req_edu_df_most_recent.loc[[member_id], "degree"])
        if degrees & {"PhD", "Masters"}:
            return "Graduate Student"
        # req_edu_df_most_recent only holds rows of INSTITUTION_FILTER already.
        if "Bachelors" in degrees:
            return "Undergraduate Student"
    if member_id in req_exp_df_most_recent.index:
        role = req_exp_df_most_recent.loc[[member_id], "role"].iloc[0]
        if pd.notna(role):
            return str(role)
    return ""


alumni_member_df = alumni_member_df.assign(
    academic_role=[_alumni_role_for(m_key) for m_key in alumni_member_df.index]
)

In [None]:
# Variables the alumni template receives.
alumni_page_context = {
    "general": general,
    "alumni_members": alumni_member_df.to_dict("index"),
}
create_page("alumni_members.html.j2", "alumni_members.html", **alumni_page_context)

## Individual People Page

In [None]:
def group_df(df):
    """
    Group the rows of a member-indexed DataFrame by member id.

    Parameters:
    ----------
    df : pandas.DataFrame
        Frame whose index (or a column) is named ``id``.

    Returns:
    -------
    dict
        ``{member_id: {"info": [row_dict, ...]}}`` with one record per row of
        that member. An empty frame gives an empty dict.

    """
    # An empty frame has no "id" level to group by; return early instead of
    # letting groupby raise a KeyError.
    if df.empty:
        return {}
    return {
        member_id: {"info": member_rows.to_dict(orient="records")}
        for member_id, member_rows in df.groupby("id")
    }

In [None]:
# Per-member documents, indexed by member id.
# NOTE(review): this frame is later converted with to_dict("index"), which
# requires a unique index — presumably each member has at most one entry in
# documents.json; confirm against the data.
document_df = read_member_data_jsons(file_to_read="documents.json")

In [None]:
outreach_df = read_member_data_jsons("outreach.json")
# Always define outreach_grouped so that no later cell can hit a NameError
# when there is no outreach data; an empty frame has no "id" level to group by.
outreach_grouped = {} if outreach_df.empty else group_df(outreach_df)

In [None]:
awards_df = read_member_data_jsons("awards.json")

# Per-member record lists for each section of the individual pages.
awards_grouped, exp_grouped, edu_grouped, projects_grouped = map(
    group_df, (awards_df, exp_df, edu_df, projects_df)
)

In [None]:
# Clean the member frames first, so the values copied below never carry the
# text "nan" or a missing value into info_json_df.
alumni_member_df = alumni_member_df.replace("nan", np.nan).fillna("")
current_member_df = current_member_df.fillna("")

# Make sure both columns exist for every member; those who are neither
# current nor alumni end up with empty strings rather than NaN.
for role_column in ("academic_role", "current_project_title"):
    if role_column not in info_json_df.columns:
        info_json_df[role_column] = ""

for member_id in info_json_df.index:
    if member_id in current_member_df.index:
        info_json_df.at[member_id, "academic_role"] = current_member_df.at[
            member_id, "academic_role"
        ]
        info_json_df.at[member_id, "current_project_title"] = current_member_df.at[
            member_id, "current_project_title"
        ]
    elif member_id in alumni_member_df.index:
        info_json_df.at[member_id, "academic_role"] = alumni_member_df.at[
            member_id, "academic_role"
        ]

info_json_df[["academic_role", "current_project_title"]] = info_json_df[
    ["academic_role", "current_project_title"]
].fillna("")

In [None]:
# These lookups are the same for every member, so they are built once instead
# of once per generated page. This assumes the template only reads them.
socials_by_member = social_links_df.to_dict("index")
documents_by_member = document_df.to_dict("index")
articles_by_index = article_content_df.to_dict("index")

# NOTE(review): INDIVIDUAL_MEMBER_SECTION_MAP is not defined in any cell of
# this notebook.
# NOTE(review): `outreach` receives the raw DataFrame while the other sections
# receive their grouped dicts — confirm which form the template expects.
for person_id, person_data in info_json_df.iterrows():
    create_page(
        "individual_person.html.j2",
        f"members/{person_id}/{person_id}.html",
        general=general,
        member_id=person_id,
        member_data=person_data,
        socials=socials_by_member,
        documents=documents_by_member,
        education=edu_grouped,
        experience=exp_grouped,
        projects=projects_grouped,
        awards=awards_grouped,
        outreach=outreach_df,
        section_headings=INDIVIDUAL_MEMBER_SECTION_MAP,
        content=articles_by_index,
    )