### This notebook consist of code for creating the html files for the website each time data is updated.

# Set-up

Importing classes

In [42]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from pathlib import Path
import shutil
from datetime import datetime, date
import numpy as np


Defining paths

In [43]:
GROUP_DATA_DIR = Path("../../group-data")
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
HOSTING_PATH = GROUP_DATA_DIR.parent / "kerzendorf-group.github.io"
ARTICLE_DIR_PATH = Path("../../research_news/articles")
ARTICLE_IMAGE_PATH = Path("../../research_news/images")
ARTICLE_IMAGE_DESTINATION_DIR = (HOSTING_PATH / "website_files" / "images" / "article_content")
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
CONTENT_DIR_PATH = WEBSITE_DATA_PATH / "content"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"

ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"

Setting up jinja environment

In [44]:
# Function to create proper HTML file names by replacing spaces with underscores
def page_link(a):
    """Return the HTML file name after replacing blank spaces(" ") with underscores("-")"""
    return a.replace(" ", "_") if " " in a else a

environment = Environment(
    loader=FileSystemLoader(TEMPLATE_DIR_PATH), extensions=["jinja2.ext.loopcontrols", "jinja2.ext.do"]
)
environment.globals["page_link"] = page_link

# Data Processing Setup

Data Processing Parameters

In [45]:
# Needed columns for articles
ARTICLE_METADATA_FIELDS = [
    "article_id",
    "category",
    "date",
    "tags",
    "title",
    "cover_image",
    "short_description"
]
# Groups and institution used in filtering data
GROUP_FILTER = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER = "Michigan State University"

# Map roles to standardized roles for consistency
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# Map degrees to standardized academic levels
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",  #  if end_date is present
    "Bachelors": "Undergraduate Student",
}

INDIVIDUAL_MEMBER_SECTION_MAP = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}

# Functions for Data Handling

In [46]:
def loading_website_data(file_to_load):
    """
    Load data from JSON files specified in a list of file names.

    Parameters:
    ----------
    json_data_list : list of str
        A list of file names (without extension) to load as JSON.

    Returns:
    -------
    dict
        A dictionary where keys are file names and values are the corresponding JSON data.

    Raises:
    ------
    FileNotFoundError:
        If a specified file does not exist.
    json.JSONDecodeError:
        If there's an issue decoding the JSON content from a file.

    """
    loaded_data = {}
    file_matches = WEBSITE_DATA_PATH/ f"{file_to_load}.json"
    if file_matches:
        try:
            with open(file_matches, "r") as json_file:
                loaded_data = json.load(json_file)
        except json.JSONDecodeError:
            print(f"Error decoding JSON in '{file_matches}'.")
    else:
        print(f"File '{file_to_load}.json' not found.")

    return loaded_data

In [47]:
def read_member_data_jsons(file_to_read):
    member_data_list = []
    member_data_df = pd.DataFrame([])
    for single_info_file_path in MEMBERS_DIR_PATH.glob("*/info.json"):
        with open(single_info_file_path, "r") as f_info:
            member_data = json.load(f_info)
        member_unique_id = member_data["id"]
        file_to_read_path = single_info_file_path.parent / "jsons" / file_to_read

        if file_to_read_path.exists():
            with file_to_read_path.open("r") as f_data:
                member_other_data = json.load(f_data)
            for entry in member_other_data:
                entry["id"] = member_unique_id
            member_data_list.append(
                pd.DataFrame(member_other_data)
            )
        # else:
        #     data_path_in_kl = KERZENDORF_GROUP_DATA / "members" / member_unique_id / "jsons" / file_to_read
        #     if data_path_in_kl.exists():
        #         with data_path_in_kl.open("r") as data_file:
        #             member_other_data_kl = json.load(data_file)
        #         for entry in member_other_data_kl:
        #             entry["id"] = member_unique_id
        #         member_data_list.append(
        #             pd.DataFrame(member_other_data_kl)
        #         )

    if member_data_list:
        member_data_df = pd.concat(
            member_data_list, ignore_index=True
        )
        member_data_df.set_index("id", inplace=True)

    return member_data_df

In [48]:
def set_new_image_path(old_image_path):
    image_source = ARTICLE_IMAGE_PATH / old_image_path.name
    image_destination = ARTICLE_IMAGE_DESTINATION_DIR / old_image_path.name
    website_files_index = image_destination.parts.index("website_files")
    new_image_path = Path(*image_destination.parts[website_files_index:])
    shutil.copy2(image_source, image_destination)
    return str(new_image_path)

# DataFrame Creation and Processing

Creating dataframes for articles which can be updated further 

In [49]:
# Reading all articles
article_content_list = []
today = date.today()
for content_file_name in ARTICLE_DIR_PATH.iterdir():
    with open(content_file_name, "r") as fcontent:
        article_content = json.load(fcontent)
    today_datetime = datetime.combine(today, datetime.min.time())
    article_date = datetime.strptime(article_content["date"], "%m-%d-%Y")
    if "kg" in article_content["platforms"] and article_date <= today_datetime:
        image_path = Path(article_content["cover_image"])
        article_content["cover_image"] = set_new_image_path(image_path)
        for content_key, content_value in article_content["content"].items():
            if "img" in content_key:
                new_content_value = set_new_image_path(Path(content_value))
                article_content["content"][content_key] = new_content_value
        article_content_list.append(article_content)
article_content_df = pd.DataFrame(article_content_list)

article_content_df["date"] = pd.to_datetime(
    article_content_df["date"], format="%m-%d-%Y"
)

article_content_df["cover_image_height"] = (
    article_content_df["cover_image_height"].fillna("330px").replace("", "330px")
)
article_content_df["cover_image_width"] = (
    article_content_df["cover_image_width"].fillna("520px").replace("", "520px")
)

#THis line is only for kerzendorf lab and is not needed on dti
article_content_df["category"] = article_content_df["category"].replace(
    "Overview", "Computational Metascience"
)

In [50]:
news_content_df = article_content_df[
    (article_content_df["category"] == "News")
    | (
        article_content_df["tags"].apply(
            lambda x: "news" in x if isinstance(x, list) else False
        )
    )
].sort_values(by=["date"], ascending=[False])

research_content_df = article_content_df[
    article_content_df["category"] != "News"
].sort_values(by=["category", "date"], ascending=[True, False])

In [51]:
info_json_list = []
for single_info_file_path in MEMBERS_DIR_PATH.glob("*/info.json"):
    with open(single_info_file_path, "r") as f_info:
        member_data = json.load(f_info)
    # if len(member_data.keys()) == 1:
    #     info_json_path = (
    #         KERZENDORF_GROUP_DATA / "members" / member_data["id"] / "info.json"
    #     )
    #     member_images_dir = HOSTING_PATH / "members" / member_data["id"] / "media"
    #     with open(info_json_path, "r") as f_info_kl:
    #         member_data_from_kl = json.load(f_info_kl)
    #     member_images_dir_source = (
    #         KERZENDORF_GROUP_DATA / "members" / member_data["id"] / "media"
    #     )

    #     shutil.copytree(member_images_dir_source, member_images_dir, dirs_exist_ok=True)
    #     info_json_list.append(member_data_from_kl)
    # else:
    info_json_list.append(member_data)
info_json_df = pd.DataFrame(info_json_list)
info_json_df.set_index("id", inplace=True)
info_json_df["full_name"] = info_json_df.apply(
    lambda row: (
        row["nick_name"] + " " + row["last_name"]
        if pd.notna(row["nick_name"])
        else row["first_name"] + " " + row["last_name"]
    ),
    axis=1,
)
info_json_dict = info_json_df.to_dict("index")

In [52]:
exp_df = read_member_data_jsons("experiences.json")[
    [
        "role",
        "start_date",
        "end_date",
        "institution",
        "group",
    ]
]
edu_df = read_member_data_jsons("education.json")[
    ["start_date", "end_date", "institution", "subject", "degree"]
]

In [53]:
edu_df['end_date'] = pd.to_datetime(edu_df['end_date'], format='%Y-%m-%d')
edu_df['start_date'] = pd.to_datetime(edu_df['start_date'], format='%Y-%m-%d')


def most_recent_row(group):
    return group[group["start_date"] == group["start_date"].max()]
edu_df_most_recent = (
    edu_df.groupby("id").apply(most_recent_row).droplevel(0)
)

edu_df_most_recent['academic_role'] = ""
for edu_mem_id, edu_mem_value in edu_df_most_recent.iterrows():
    if edu_mem_value['institution'] == INSTITUTION_FILTER:
        if edu_mem_value['degree'] == "Bachelors":
            edu_df_most_recent.at[edu_mem_id, 'academic_role'] = "Undergraduate Student"
        elif edu_mem_value['degree'] in ["PhD", "Masters"]:
            edu_df_most_recent.at[edu_mem_id, 'academic_role'] = "Graduate Student"
edu_df_most_recent_diff_suffix = edu_df_most_recent.add_suffix("_edu")

In [54]:
social_link_list = []
for single_member_file_path in MEMBERS_DIR_PATH.rglob("social_links.json"):
    with open(single_member_file_path, "r") as fname:
        member_social_link = json.load(fname)
    info_json_file_path = single_member_file_path.parent.parent / "info.json"
    with open(info_json_file_path, "r") as file_info:
        member_info_data = json.load(file_info)
    mem_id = member_info_data["id"]
    member_social_link["id"] = mem_id
    social_link_list.append(member_social_link)
social_links_df = pd.DataFrame(social_link_list)
social_links_df.set_index("id", inplace=True)
# social_links_df.fillna("", inplace=True)

In [55]:
recent_content = article_content_df.sort_values(
    by=["category", "date"], ascending=[True, False]
)
# Get the first row for each category using groupby and head
recent_content = recent_content.groupby("category").head(1).copy()

# Page Creation

Function to create a page

In [56]:
def create_page(template, html, **kwargs):
    """
    Create an HTML page using a Jinja2 template and save it to a specified path.

    Parameters:
    ----------
    template : str
        The filename of the Jinja2 template to be used.
    html : str
        The filename of the HTML file to be generated.
    **kwargs : dict
        Additional keyword arguments to be passed to the Jinja2 template for rendering.

    Returns:
    -------
    None

    """
    page_template = environment.get_template(template)
    template_level = html.count("/")
    page_html_path = HOSTING_PATH / html
    page_html_path.parent.mkdir(parents=True, exist_ok=True)
    page_content = page_template.render(TEMPLATE_LEVEL=template_level, **kwargs)
    with open(page_html_path, mode="w", encoding="utf-8") as page:
        page.write(page_content)

# Processing List Of JSON files

In [57]:
# Function Call
general = loading_website_data("general")
homepage = loading_website_data("homepage")
contact = loading_website_data("contact")
research = loading_website_data("research_categories")
support = loading_website_data("support")

# Homepage

Storing selected columns for Homepage only

In [58]:
create_page(
    "homepage.html.j2",
    "index.html",
    general=general,
    homepage=homepage,
    recent_content=recent_content.to_dict(orient="records"),
)

# Current Members Page

In [59]:
exp_df['end_date'] = pd.to_datetime(exp_df['end_date'], format='%Y-%m-%d')
exp_df = exp_df.fillna("")
filtered_exp_df = exp_df[(exp_df["end_date"].isna()) | (exp_df["end_date"].dt.date >= datetime.now().date())]
def most_recent_row(group):
    sorted_group = group.sort_values(by=['start_date', 'end_date'], ascending=[False, True])
    # Filter the sorted group by the condition that the group name is in GROUP_FILTER
    relevant_group = sorted_group[sorted_group['group'].str.contains('|'.join(GROUP_FILTER))]
    # Return the most recent relevant experience
    return relevant_group.head(1)
filtered_exp_df_most_recent = exp_df.groupby("id").apply(most_recent_row).droplevel(0)
exp_df_most_recent = exp_df.groupby("id").apply(most_recent_row).droplevel(0)
exp_df_most_recent_diff_suffix = exp_df_most_recent.add_suffix('_exp')

In [60]:
merged_edu_exp_df = exp_df_most_recent_diff_suffix.merge(edu_df_most_recent_diff_suffix, on='id', how='outer')

In [82]:
merged_edu_exp_df['isCurrent'] = False
merged_edu_exp_df['current_role'] = ""
for merged_mem_id, merged_mem_value in merged_edu_exp_df.iterrows():
    if merged_mem_value['institution_edu'] == INSTITUTION_FILTER and (pd.isna(merged_mem_value['end_date_edu']) or merged_mem_value['end_date_edu'] >= datetime.now()):
        merged_mem_value['isCurrent'] = True
        acad_role = merged_mem_value.get('academic_role_edu')
        if acad_role:
            merged_mem_value['current_role'] = merged_mem_value["academic_role_edu"]
    elif merged_mem_value['group_exp'] in GROUP_FILTER and (pd.isna(merged_mem_value['end_date_exp']) or merged_mem_value['end_date_exp'] >= datetime.now()):
            merged_mem_value['isCurrent'] = True
            merged_mem_value['current_role'] = merged_mem_value["role_exp"]
    else:
        merged_mem_value['isCurrent'] = False
        if pd.notna(merged_mem_value.get("role_exp")):
            merged_mem_value['current_role'] = merged_mem_value.get("role_exp")
        else:
            merged_mem_value['current_role'] = merged_mem_value.get("academic_role_edu")
    merged_edu_exp_df.loc[merged_mem_id] = merged_mem_value
merged_edu_exp_df['current_role'] = merged_edu_exp_df['current_role'].replace(ROLE_MAP)

merged_edu_exp_df


Unnamed: 0_level_0,role_exp,start_date_exp,end_date_exp,institution_exp,group_exp,start_date_edu,end_date_edu,institution_edu,subject_edu,degree_edu,academic_role_edu,isCurrent,current_role
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
abhinav_ohri,Research Software Engineer,2023-11-13,NaT,DeepThought Initiative,DTI,2021-01-01,2023-07-06,Sanatan Dharma College,Commerce(Honours),Bachelors,,True,Research Software Engineer
alexander_grunewald,Research Assistant,2023-01-01,NaT,,kerzendorf,2021-01-01,NaT,Michigan State University,Statistics and Data Science,Bachelors,Undergraduate Student,True,Undergraduate Student
andrew_fullard,Researcher,2024-01-01,NaT,Michigan State University,kerzendorf,2014-01-01,2020-12-31,University of Denver,Astronomy,PhD,,True,Researcher
anirban_dutta,Postdoctoral Researcher,2024-03-01,NaT,Michigan State University,kerzendorf,2018-01-01,NaT,Indian Institute of Astrophysics,Astrophysics,PhD,,True,Postdoctoral Researcher
atharva_arya,Research Software Engineer,2023-10-01,NaT,DeepThought Initiative,DTI,2019-01-01,2023-12-31,RCOEM,Engineering,Bachelors,,True,Research Software Engineer
bea_lu,Professorial Assistant,2021-09-01,NaT,Michigan State University,kerzendorf,2021-09-01,NaT,Michigan State University,Computational Data Science,Bachelors,Undergraduate Student,True,Undergraduate Student
benjamin_mellon,Research Assistant,2023-11-13,NaT,,kerzendorf,2021-09-01,2025-05-01,Michigan State University,Physics and Computational Mathematics,Bachelors,Undergraduate Student,True,Undergraduate Student
cecelia_powers,Professorial Assistant,2023-09-01,NaT,Michigan State University,kerzendorf,2023-09-01,NaT,Michigan State University,Astrophysics,Bachelors,Undergraduate Student,True,Undergraduate Student
deeksha_mohanty,,,NaT,,,2023-08-28,2025-05-01,Michigan State University,Computer Science,Masters,Graduate Student,True,Graduate Student
erin_visser,Professorial Assistant,2023-08-28,NaT,Michigan State University,kerzendorf,2023-09-01,NaT,Michigan State University,"Physics and Mathematics, Advanced",Bachelors,Undergraduate Student,True,Undergraduate Student


In [62]:
current_member_df = merged_edu_exp_df[merged_edu_exp_df['isCurrent'] == True]

In [63]:
alumni_member_df = merged_edu_exp_df[merged_edu_exp_df['isCurrent'] == False]

In [64]:
projects_df = read_member_data_jsons("projects.json").sort_values(
    by=["end_date"], ascending=False
)

In [65]:
for mem_key, mem_value in current_member_df.iterrows():
    if mem_key in projects_df.index:
        mem_projects = projects_df.loc[mem_key]
        if not mem_projects.empty:
            if isinstance(mem_projects, pd.Series):
                current_project_title = mem_projects["project_title"]
            else:
                current_project_title = mem_projects.iloc[0]["project_title"]
    else:
        current_project_title = ""
    current_member_df.loc[mem_key, "current_project_title"] = current_project_title

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_member_df.loc[mem_key, "current_project_title"] = current_project_title


Function to sort the members on basis of their roles

In [66]:
def custom_role_sort(roles):
    """
    Sorts a list of roles based on academic role hierarchy and then by name.

    Parameters
    ----------
    roles : list[dict]
        List of dictionaries representing roles. Each dictionary should contain at least the keys 'academic_role' and 'name'.

    Returns
    -------
    list[dict]
        A sorted list of roles based on the academic role hierarchy and, in case of ties, sorted by name in ascending order.
    """
    with open(ROLE_HIERARCHY_PATH, "r") as file_name:  
        role_hierarchy = json.load(file_name)
    sorted_roles = sorted(roles, key=lambda x: (role_hierarchy.get(x['academic_role'], float('inf')), x['name']))
    return sorted_roles

Current Members Page

In [67]:
create_page(
    "current_members.html.j2",
    "current_members.html",
    general=general,
    current_members=current_member_df,
    socials=social_links_df.to_dict("index"),
)

Alumni Members Page

In [68]:
create_page(
    "alumni_members.html.j2",
    "alumni_members.html",
    general=general,
    alumni_members=alumni_member_df,
)

## Individual People Page

In [69]:
def group_df(df):
    new_df = (df.groupby("id")
    .apply(lambda x: x.to_dict(orient="records"))
    .reset_index(name="info")
    .set_index("id")
    .to_dict(orient="index"))
    return new_df

In [70]:
document_df = read_member_data_jsons("documents.json")

In [71]:
outreach_df = read_member_data_jsons("outreach.json")
if not outreach_df.empty:
    outreach_grouped = group_df(outreach_df)

In [72]:
awards_df = read_member_data_jsons("awards.json")
awards_grouped = group_df(awards_df)

exp_grouped = group_df(exp_df)
edu_grouped = group_df(edu_df)
projects_grouped = group_df(projects_df)

In [73]:
for member_id, member_data in info_json_df.iterrows():
    if member_id in current_member_df.index:
        info_json_df.at[member_id, "academic_role"] = current_member_df.loc[
            member_id, "current_role"
        ]
        info_json_df.at[member_id, "current_project_title"] = current_member_df.loc[
            member_id, "current_project_title"
        ]
    elif member_id in alumni_member_df.index:
        info_json_df.at[member_id, "academic_role"] = alumni_member_df.loc[
            member_id, "current_role"
        ]
alumni_member_df.replace("nan", np.nan, inplace=True)
alumni_member_df.fillna("", inplace=True)
current_member_df.fillna("", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_member_df.replace("nan", np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alumni_member_df.fillna("", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_member_df.fillna("", inplace=True)


In [74]:
for person_id, person_data in info_json_df.iterrows():
    create_page(
        "individual_person.html.j2",
        f"members/{person_id}/{person_id}.html",
        general=general,
        member_id=person_id,
        member_data=person_data,
        socials=social_links_df.to_dict("index"),
        documents=document_df.to_dict("index"),
        education=edu_grouped,
        experience=exp_grouped,
        projects=projects_grouped,
        awards=awards_grouped,
        outreach=outreach_df,
        section_headings=INDIVIDUAL_MEMBER_SECTION_MAP,
        content=article_content_df.to_dict("index"),
    )

# Contact Page

In [75]:
create_page(
    "contact.html.j2",
    "Contact.html",
    general=general,
    contact=contact
)

# Support Page

In [76]:
create_page(
    "support.html.j2",
    "Support.html",
    general=general,
    support=support
)

# Research Front Page

For adding more columns in dataframe to render front pages and individual article pages

In [77]:
create_page(
    "research.html.j2",
    "Research.html",
    general=general,
    content=research_content_df,
    research=research,
    current_members=info_json_dict,
)

In [78]:
SUB_RESEARCH_PATH.mkdir(parents=True, exist_ok=True)

for category in article_content_df.loc[
    article_content_df.category != "News", "category"
].unique():
    create_page(
        "sub_research_frontpage.html.j2",
        f"sub_research/{page_link(category.lower())}.html",
        general=general,
        research=research,
        content=research_content_df,
        category=category,
        current_members=info_json_dict,
    )

Individual Research Page


In [79]:
for ind_research_keys, ind_research_values in research_content_df.iterrows():
    destination_research_path = f"sub_research/{page_link(ind_research_values.category.lower())}/{page_link(ind_research_values.article_id.lower())}.html"
    if ind_research_values['category'] == "Software":
        destination_research_path = f"sub_research/{page_link(ind_research_values.article_id.lower())}.html"

    folder_path = SUB_RESEARCH_PATH / page_link(ind_research_values.category.lower())
    folder_path.mkdir(parents=True, exist_ok=True)
    create_page(
        "research_page_no_twitter.html.j2",
        destination_research_path,
        general=general,
        content=ind_research_values,
        member_data=info_json_dict,
        article_id=ind_research_values["article_id"],
    )

# News Page

In [80]:
create_page(
    "news.html.j2",
    "News.html",
    general=general,
    content=news_content_df,
    category="News",
    member_data=info_json_dict,
)

Individual News Page

In [81]:
for ind_news_keys, ind_news_values in news_content_df.iterrows():
    folder_path = HOSTING_PATH / "news" / page_link(ind_news_values.article_id.lower())
    create_page(
        "news_page_no_twitter.html.j2",
        f"news/{page_link(ind_news_values.article_id.lower())}.html",
        general=general,
        content=ind_news_values,
        member_data=info_json_dict,
        category="News"
    )