### This notebook consists of the code that creates the HTML files for the website each time the data is updated.

# Set-up

Importing classes

In [36]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from jinja2.exceptions import UndefinedError
from pathlib import Path
import shutil
from datetime import datetime, date

Defining paths

In [37]:
# Root of the group-data checkout; several locations below are derived from it or from its parent
GROUP_DATA_DIR = Path("../../group-data")
# Directory handed to the Jinja2 FileSystemLoader that backs `environment`
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
# Directory in which loading_json_files looks for "<name>.json"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
# Output root: create_page writes every generated HTML file below this directory
HOSTING_PATH = GROUP_DATA_DIR.parent / "kerzendorf-group.github.io"
# Directory iterated to read the article JSON files
ARTICLE_DIR_PATH = Path("../../research_news/articles")
# Source directory of the images copied by set_new_image_path
ARTICLE_IMAGE_PATH = Path("../../research_news/images")
# Copy target for article images; set_new_image_path relies on the "website_files" component being present
ARTICLE_IMAGE_DESTINATION_DIR = (HOSTING_PATH / "website_files" / "images" / "article_content")
# Globbed by fetch_member_data; member folders are expected to hold an "info.json"
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
CONTENT_DIR_PATH = WEBSITE_DATA_PATH / "content"
# Directory below HOSTING_PATH named "sub_research"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"

# JSON file read by custom_role_sort to rank academic roles
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"

Function to create proper HTML file names by replacing spaces with underscores

In [38]:
def page_link(a):
    """
    Build an HTML file name by replacing blank spaces with underscores.

    Parameters
    ----------
    a : str
        Page, category or article name that may contain blank spaces.

    Returns
    -------
    str
        ``a`` with every blank space (" ") replaced by an underscore ("_").
    """
    # str.replace leaves the text untouched when there is nothing to replace,
    # so no separate membership check is required.
    return a.replace(" ", "_")

Creating an instance of the Jinja2 `Environment` class that looks up templates in the template directory. `page_link` is registered as a global so that it can be accessed by all templates.

In [39]:
# Jinja2 environment shared by every page rendered in this notebook.
environment = Environment(
    loader=FileSystemLoader(TEMPLATE_DIR_PATH),
    extensions=[
        "jinja2.ext.loopcontrols",
        "jinja2.ext.do",
    ],
)
# Register page_link as a global so templates can call it directly.
environment.globals.update(page_link=page_link)

# Data Processing Setup

Data Processing Parameters

In [40]:
# Names (without the ".json" extension) of the files passed to loading_json_files;
# each name becomes a key of the resulting `data` dictionary
JSON_FILES_TO_LOAD = [
    "general",
    "homepage",
    "research_categories",
    "support",
    "contact",
]

# Article columns selected for the homepage; later cells extend this list with extra columns
ARTICLE_METADATA_FIELDS = [
    "article_id",
    "category",
    "date",
    "tags",
    "title",
    "cover_image",
    "short_description"
]
# Education/experience rows are kept when their group is in GROUP_FILTER
# or their institution equals INSTITUTION_FILTER
GROUP_FILTER = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER = "Michigan State University"

# Maps the "role" value of a record to the label shown as the member's academic role;
# roles missing from this map resolve to an empty string in extract_member_academic_role
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professor": "Professor",
    "Visualization Consultant": "Visualization Consultant",
    "Research Consultant": "Research Consultant",
    "Research Software Engineer": "Research Software Engineer",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher",
    "Postdoctoral Researcher": "Postdoctoral Researcher",
}

# Maps the "degree" value of a record to an academic level; used as a fallback
# when no role could be derived from ROLE_MAP
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",  # intended for a PhD record that has an end_date
    "Bachelors": "Undergraduate Student",
}

# Keys are the JSON file stems read per member by load_individual_member_data;
# the whole mapping is passed to the member template as `section_headings`
INDIVIDUAL_MEMBER_SECTION_MAP = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}

# Functions for Data Handling

Function for looping through JSON files and loading their content into the 'data' dictionary 

In [41]:
def loading_json_files(json_data_list, data_dir=None):
    """
    Load data from JSON files specified in a list of file names.

    Parameters:
    ----------
    json_data_list : list of str
        A list of file names (without extension) to load as JSON.
    data_dir : str or pathlib.Path, optional
        Directory that holds the ``<name>.json`` files. Defaults to
        ``WEBSITE_DATA_PATH`` when omitted.

    Returns:
    -------
    dict
        A dictionary where keys are file names and values are the corresponding JSON data.
        A name whose file is missing or cannot be decoded is left out of the
        dictionary, so callers must not assume every requested key is present.

    Notes:
    -----
    Missing files and JSON decoding problems do not raise: a message is printed
    and loading continues with the next name.

    """
    if data_dir is None:
        data_dir = WEBSITE_DATA_PATH
    data_dir = Path(data_dir)

    loaded_data = {}
    for data_id in json_data_list:
        # The exact file name is known, so a direct path check replaces globbing
        json_path = data_dir / f"{data_id}.json"

        if not json_path.is_file():
            print(f"File '{data_id}.json' not found.")
            continue

        try:
            # Explicit encoding keeps the result independent of the platform locale
            with open(json_path, "r", encoding="utf-8") as json_file:
                loaded_data[data_id] = json.load(json_file)
        except json.JSONDecodeError:
            print(f"Error decoding JSON in '{json_path}'.")

    return loaded_data

# DataFrame Creation and Processing

Creating dataframes for articles which can be updated further 

In [42]:
def set_new_image_path(old_image_path, source_dir=None, destination_dir=None):
    """
    Copy an article image into the website tree and return its site-relative path.

    Only the file name of ``old_image_path`` is used: the image is read from
    ``source_dir / <name>`` and copied to ``destination_dir / <name>``.

    Parameters
    ----------
    old_image_path : str or pathlib.Path
        Image path as stored in the article; only its final component matters.
    source_dir : str or pathlib.Path, optional
        Directory holding the original image. Defaults to ``ARTICLE_IMAGE_PATH``.
    destination_dir : str or pathlib.Path, optional
        Directory the image is copied into. Defaults to
        ``ARTICLE_IMAGE_DESTINATION_DIR``. It must contain a "website_files"
        component.

    Returns
    -------
    str
        The destination path starting at its "website_files" component.

    Raises
    ------
    ValueError
        If the destination path has no "website_files" component.
    FileNotFoundError
        If the source image does not exist.
    """
    if source_dir is None:
        source_dir = ARTICLE_IMAGE_PATH
    if destination_dir is None:
        destination_dir = ARTICLE_IMAGE_DESTINATION_DIR

    image_name = Path(old_image_path).name
    image_source = Path(source_dir) / image_name
    image_destination = Path(destination_dir) / image_name

    # Keep only the part of the destination from "website_files" onwards
    website_files_index = image_destination.parts.index("website_files")
    new_image_path = Path(*image_destination.parts[website_files_index:])

    # A fresh checkout of the hosting repository may not have the image folder yet
    image_destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(image_source, image_destination)
    return str(new_image_path)

In [44]:
# Reading all articles
article_content_list = []
today = date.today()
# Midnight of the current day; it is the same for every article, so compute it once
today_datetime = datetime.combine(today, datetime.min.time())

# Sorted so that the row order of the DataFrame is the same on every run
for content_file_name in sorted(ARTICLE_DIR_PATH.iterdir()):
    # Sub-directories and hidden files cannot be parsed as article JSON
    if not content_file_name.is_file() or content_file_name.name.startswith("."):
        continue
    with open(content_file_name, "r", encoding="utf-8") as fcontent:
        article_content = json.load(fcontent)
    article_date = datetime.strptime(article_content["date"], "%m-%d-%Y")
    # Keep articles that list the "kg" platform and are not dated in the future
    if "kg" in article_content["platforms"] and article_date <= today_datetime:
        image_path = Path(article_content["cover_image"])
        article_content["cover_image"] = set_new_image_path(image_path)
        # Content entries whose key contains "img" hold image paths that must be copied too
        for content_key, content_value in article_content["content"].items():
            if "img" in content_key:
                new_content_value = set_new_image_path(Path(content_value))
                article_content["content"][content_key] = new_content_value
        article_content_list.append(article_content)
article_content_df = pd.DataFrame(article_content_list)

article_content_df["date"] = pd.to_datetime(
    article_content_df["date"], format="%m-%d-%Y"
)

# Default cover image size for articles that give none (missing, NaN or empty string)
for dimension_column, default_size in (
    ("cover_image_height", "330px"),
    ("cover_image_width", "520px"),
):
    # The column is absent altogether when no article defines it
    if dimension_column not in article_content_df.columns:
        article_content_df[dimension_column] = default_size
    article_content_df[dimension_column] = (
        article_content_df[dimension_column].fillna(default_size).replace("", default_size)
    )

# This line is only for the kerzendorf lab site and is not needed on dti
article_content_df["category"] = article_content_df["category"].replace(
    "Overview", "Computational Metascience"
)

Extract the Latest Content for Each Category from a DataFrame

In [45]:
def get_latest_content_df(input_data):
    """
    Extract the latest content for each category from a DataFrame.

    Parameters
    ----------
    input_data : pandas.DataFrame
        The input DataFrame containing content information. It must have a
        "category" column and a "date" column holding either datetimes or
        strings in "%m-%d-%Y" format. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with one row per category (the row with the most recent
        date), ordered from the most recent date to the oldest. The "date"
        column is of datetime type.

    """
    # Parse the dates before sorting: ordering "%m-%d-%Y" strings would compare
    # them character by character (month first) instead of chronologically.
    dated_content = input_data.assign(
        date=pd.to_datetime(input_data["date"], format="%m-%d-%Y")
    )

    # Within each category put the newest row first, then keep that first row
    newest_first = dated_content.sort_values(
        by=["category", "date"], ascending=[True, False]
    )
    latest_data = newest_first.groupby("category").head(1)

    return latest_data.sort_values(by="date", ascending=False)

# Page Creation

Function to create a page

In [46]:
def create_page(template, html, **kwargs):
    """
    Render a Jinja2 template and write the result below the hosting directory.

    Parameters:
    ----------
    template : str
        Name of the Jinja2 template, resolved through the shared ``environment``.
    html : str
        Output file path relative to ``HOSTING_PATH``. Missing parent
        directories are created.
    **kwargs : dict
        Extra variables forwarded to the template. The template additionally
        receives ``TEMPLATE_LEVEL``, the number of "/" characters in ``html``.

    Returns:
    -------
    None

    """
    page_template = environment.get_template(template)
    # Depth of the output file below the hosting root
    nesting_depth = html.count("/")

    output_path = HOSTING_PATH / html
    output_path.parent.mkdir(parents=True, exist_ok=True)

    rendered_page = page_template.render(TEMPLATE_LEVEL=nesting_depth, **kwargs)
    output_path.write_text(rendered_page, encoding="utf-8")

# Processing List Of JSON files

In [47]:
# Load every file named in JSON_FILES_TO_LOAD; `data` is keyed by file name
# without extension (e.g. data["general"]). Files that could not be loaded
# are absent from the dictionary.
data = loading_json_files(JSON_FILES_TO_LOAD)

# Homepage

Storing selected columns for Homepage only

In [48]:
# Restrict the articles to the metadata columns listed in ARTICLE_METADATA_FIELDS
content_df = article_content_df[ARTICLE_METADATA_FIELDS]
# One row per category (its most recent article), most recent date first
latest_content_df = get_latest_content_df(content_df)

In [49]:
# Render the homepage template to "index.html" below HOSTING_PATH
create_page(
    "homepage.html.j2",
    "index.html",
    general=data["general"],
    homepage=data["homepage"],
    # One dict per row of latest_content_df
    recent_content=latest_content_df.to_dict(orient="records"),
)

# People Page

Filtering based on group and institution

In [50]:
def filter_edu_exp_data(df, valid_groups, valid_institution):
    """
    Filter education and experience data based on specified criteria.

    A row is kept when its "group" is one of ``valid_groups`` OR its
    "institution" equals ``valid_institution``. A criterion whose column is
    missing from ``df`` matches no row.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing education and experience data.
    valid_groups : list
        List of valid groups to include in the filtered data.
    valid_institution : str
        The valid institution to include in the filtered data.

    Returns
    -------
    pandas.DataFrame
        A filtered DataFrame containing only the rows that meet the specified
        criteria. It is empty (with the original columns) when neither the
        "group" nor the "institution" column exists.
    """
    # Start from an all-False boolean Series rather than the scalar False:
    # indexing with a scalar would be treated as a column lookup and fail
    # when neither column is present.
    final_mask = pd.Series(False, index=df.index, dtype=bool)

    if "group" in df.columns:
        final_mask = final_mask | df["group"].isin(valid_groups)

    if "institution" in df.columns:
        final_mask = final_mask | (df["institution"] == valid_institution)

    return df[final_mask]

Function to load education and experience data

In [51]:
def load_education_experience_data(directory):
    """
    Read a member's experience and education records, filtered and sorted.

    Parameters
    ----------
    directory : pathlib.Path
        Directory expected to hold "experiences.json" and "education.json".
        A missing file is reported with a printed message and skipped.

    Returns
    -------
    pandas.DataFrame
        The rows of both files that pass ``filter_edu_exp_data`` for
        ``GROUP_FILTER`` / ``INSTITUTION_FILTER``, with "start_date" converted
        to datetime (unparseable values become NaT) and sorted from the most
        recent start date to the oldest.
    """
    kept_frames = []
    for json_name in ("experiences.json", "education.json"):
        json_path = directory / json_name
        if not json_path.exists():
            print(f"{json_path} does not exist")
            continue

        # Load the file and keep only the rows tied to the group or institution
        all_rows = pd.read_json(json_path)
        kept_frames.append(
            filter_edu_exp_data(all_rows, GROUP_FILTER, INSTITUTION_FILTER)
        )

    combined = (
        pd.concat(kept_frames, ignore_index=True) if kept_frames else pd.DataFrame()
    )

    # Guarantee a start_date column so the conversion and sort below always work
    if "start_date" not in combined:
        combined["start_date"] = pd.NaT

    combined["start_date"] = pd.to_datetime(combined["start_date"], errors="coerce")

    return combined.sort_values(by="start_date", ascending=False)

Function to load social links directly

In [52]:
def load_social_links(social_dir):
    """
    Read a member's "social_links.json" file.

    Parameters
    ----------
    social_dir : pathlib.Path
        Directory that may contain "social_links.json".

    Returns
    -------
    dict or None
        The parsed JSON content, or None when the file is absent.
    """
    links_path = social_dir / "social_links.json"
    if not links_path.exists():
        return None

    with open(links_path, "r") as links_file:
        return json.load(links_file)

Function to load topmost project title

In [53]:
def load_latest_project_title(project_dir):
    """
    Return the title of the first project listed in "projects.json".

    Parameters
    ----------
    project_dir : pathlib.Path
        Directory that may contain "projects.json".

    Returns
    -------
    str or None
        The "project_title" of the first record, or None when the file is
        absent, holds no records, or the first record has no such field.
    """
    projects_path = project_dir / "projects.json"
    if not projects_path.exists():
        return None

    projects = pd.read_json(projects_path)
    if projects.empty:
        return None

    # The first row of the file is treated as the topmost project
    first_project = projects.iloc[0]
    return first_project.get("project_title")

Function to parse member data

In [54]:
def parse_member_data(member_dir):
    """
    Gather the data stored in a member's "jsons" sub-directory.

    Parameters
    ----------
    member_dir : pathlib.Path
        Directory of a single member; its "jsons" sub-directory is read.

    Returns
    -------
    tuple
        ``(education_experience_df, social_links, current_project_title)``:
        the filtered education/experience DataFrame, the social links
        (or None), and the title of the first listed project (or None).
    """
    json_dir = member_dir / "jsons"

    edu_exp_records = load_education_experience_data(json_dir)
    project_title = load_latest_project_title(json_dir)
    links = load_social_links(json_dir)

    return edu_exp_records, links, project_title

Function to extract academic roles from education and experience data

In [55]:
def extract_member_academic_role(education_experience_df):
    """
    Extract the current academic role of a member based on education and experience data.

    Rows are examined in the order given. The first row that yields a non-empty
    role decides the result:

    - the row's "role" is translated through ``ROLE_MAP``;
    - an ongoing (no end_date) "PhD" or "Masters" degree overrides it with
      "Graduate Student";
    - an ongoing "Bachelors" degree at ``INSTITUTION_FILTER`` overrides it with
      the undergraduate label;
    - otherwise, when no role was found, the degree is translated through
      ``DEGREE_MAP`` (e.g. a finished PhD).

    Parameters
    ----------
    education_experience_df : pandas.DataFrame
        DataFrame containing education and experience data. Missing
        "end_date", "group" and "institution" columns are added to this
        DataFrame in place, filled with None.

    Returns
    -------
    Tuple[str, bool]
        A tuple containing:
        - str: The current academic role of the member. None when the
          DataFrame has no rows, "" when no row produced a role.
        - bool: True if the member is currently active (at least one record
          has no end date), False otherwise.
    """

    # Check if these columns exist in dataframe
    for column in ["end_date", "group", "institution"]:
        if column not in education_experience_df.columns:
            education_experience_df[column] = None

    current_academic_role = None

    for _, row in education_experience_df.iterrows():
        role = row.get("role", None)
        degree = row.get("degree", None)
        member_institution = row.get("institution", None)
        member_group = row.get("group", None)
        is_ongoing = pd.isna(row["end_date"])

        current_academic_role = ROLE_MAP.get(role, "")

        # Both degrees must be ongoing to count as a graduate student. Written
        # as a membership test so that the end_date condition applies to the
        # PhD case as well ("A or B and C" would parse as "A or (B and C)").
        if degree in ("PhD", "Masters") and is_ongoing:
            current_academic_role = "Graduate Student"

        elif degree == "Bachelors" and is_ongoing:
            if member_institution == INSTITUTION_FILTER:
                if member_group in GROUP_FILTER:
                    current_academic_role = "Undergraduate Student"
                else:
                    current_academic_role = DEGREE_MAP[degree]
        elif not current_academic_role and degree in DEGREE_MAP:
            current_academic_role = DEGREE_MAP[degree]

        # Later rows cannot change a role that has already been determined
        if current_academic_role:
            break

    # A member is current as soon as one record is still open (no end date)
    is_current_member = bool(education_experience_df["end_date"].isna().any())

    return current_academic_role, is_current_member

Function to sort the members on basis of their roles

In [56]:
def custom_role_sort(roles):
    """
    Order member records by academic role rank, then by name.

    The rank of each role is looked up in the JSON file at
    ``ROLE_HIERARCHY_PATH``; roles missing from that file are placed last.

    Parameters
    ----------
    roles : list[dict]
        Member records; each needs the keys 'academic_role' and 'name'.

    Returns
    -------
    list[dict]
        A new list sorted by role rank and, for equal ranks, by name in
        ascending order.
    """
    with open(ROLE_HIERARCHY_PATH, "r") as hierarchy_file:
        role_hierarchy = json.load(hierarchy_file)

    def rank_then_name(member):
        # Unknown roles get an infinite rank so they sort after every known role
        rank = role_hierarchy.get(member['academic_role'], float('inf'))
        return rank, member['name']

    return sorted(roles, key=rank_then_name)

Function to store data for current and alumni members

In [57]:
def fetch_member_data():
    """
    Collect the page data of every member found below ``MEMBERS_DIR_PATH``.

    Entries without an "info.json" file are skipped. Each remaining member is
    classified as current or alumni from their education/experience records.

    Returns
    -------
    Tuple[list, list]
        A tuple containing two lists, each sorted with ``custom_role_sort``:
        1. List of dictionaries representing current members' data.
        2. List of dictionaries representing alumni members' data.

    """
    current_members = []
    alumni_members = []

    for member_dir in MEMBERS_DIR_PATH.glob("*"):
        member_info_fname = member_dir / "info.json"
        if not member_info_fname.exists():
            continue

        with open(member_info_fname, "r") as info_file:
            member_info = json.load(info_file)

        education_experience_df, social_links, current_project_title = parse_member_data(
            member_dir
        )
        current_academic_role, is_current_member = extract_member_academic_role(
            education_experience_df
        )

        first_name = member_info["first_name"]
        last_name = member_info["last_name"]
        # A nickname, when given, replaces the first name in the displayed name
        shown_first_name = member_info.get("nick_name") or first_name

        member_data = {
            "name": f"{shown_first_name} {last_name}",
            "academic_role": current_academic_role,
            "id": member_info["id"],
            "current_project_title": current_project_title,
            "image_path": member_info["image_path"],
            "cover_image_path": member_info["cover_image_path"],
        }

        # The key is only present for members that have a social links file
        if social_links is not None:
            member_data["social_links"] = social_links

        target_list = current_members if is_current_member else alumni_members
        target_list.append(member_data)

    return custom_role_sort(current_members), custom_role_sort(alumni_members)

In [58]:
# Build the member lists once; later cells reuse them for the people and article pages
current_people_page_list, alumni_people_page_list = fetch_member_data()
all_people_page_list = current_people_page_list + alumni_people_page_list
# Lookup table from member id to that member's page data
all_people_data = {person["id"]: person for person in all_people_page_list}

../../group-data/members/jack_o_brien/jsons/experiences.json does not exist
../../group-data/members/hayden_monk/jsons/experiences.json does not exist
../../group-data/members/vicente_amado/jsons/experiences.json does not exist
../../group-data/members/yuki_matsumura/jsons/experiences.json does not exist
../../group-data/members/alexander_grunewald/jsons/experiences.json does not exist


  combined_edu_exp_records["start_date"] = pd.to_datetime(combined_edu_exp_records["start_date"], errors="coerce")


Current Members Page

In [59]:
# Render the list of current members (already sorted by role, then name)
create_page(
    "current_members.html.j2",
    "current_members.html",
    general=data["general"],
    current_members=current_people_page_list,
)

Alumni Members Page

In [60]:
# Render the list of alumni members (already sorted by role, then name)
create_page(
    "alumni_members.html.j2",
    "alumni_members.html",
    general=data["general"],
    alumni_members=alumni_people_page_list,
)

In [61]:
def load_individual_member_data(member_id):
    """
    Read the profile data of one member.

    Parameters
    ----------
    member_id : str
        Name of the member's directory below ``MEMBERS_DIR_PATH``.

    Returns
    -------
    tuple
        A tuple containing:
        - dict: Content of the member's 'info.json'.
        - dict: One entry per key of INDIVIDUAL_MEMBER_SECTION_MAP for which
            a "<key>.json" file exists in the member's "jsons" directory; the
            value is the parsed content of that file.
    """
    member_dir = MEMBERS_DIR_PATH / member_id

    with open(member_dir / "info.json", "r") as info_file:
        basic_info = json.load(info_file)

    jsons_dir = member_dir / "jsons"
    sections = {}
    for category in INDIVIDUAL_MEMBER_SECTION_MAP:
        section_path = jsons_dir / f"{category}.json"
        # Sections without a file are simply left out of the result
        if not section_path.exists():
            continue
        with open(section_path, "r") as section_file:
            sections[category] = json.load(section_file)

    return basic_info, sections

In [62]:
columns = ["people_involved_ids", "category", "date", "title", "article_id"]
content_df = article_content_df[columns]
# The article records are identical for every member, so build them once
# instead of once per rendered page. Templates should treat them as read-only.
member_article_records = content_df.to_dict(orient="records")

# One profile page per member, written to members/<id>/<id>.html
for person_id, person_data in all_people_data.items():
    basic_info, member_all_data = load_individual_member_data(person_id)
    create_page(
        "individual_person.html.j2",
        f"members/{person_id}/{person_id}.html",
        general=data["general"],
        member_id=person_id,
        member_data=person_data,
        basic_info=basic_info,
        category_data=member_all_data,
        section_headings=INDIVIDUAL_MEMBER_SECTION_MAP,
        content=member_article_records,
    )

# Contact Page

In [63]:
# Render the contact page from the data loaded for "contact"
create_page(
    "contact.html.j2",
    "Contact.html",
    general=data["general"],
    contact=data["contact"]
)

# Support Page

In [64]:
# Render the support page from the data loaded for "support"
create_page(
    "support.html.j2",
    "Support.html",
    general=data["general"],
    support=data["support"]
)

# Research Front Page

For adding more columns in dataframe to render front pages and individual article pages

In [65]:
# Article metadata plus the author id
columns_extended = ARTICLE_METADATA_FIELDS + ["author_id"]
content_df = article_content_df[columns_extended]
# Every article outside the "News" category, ordered by category and, within a
# category, from the most recent date to the oldest
research_content_df = content_df[content_df["category"] != "News"].sort_values(
    by=["category", "date"], ascending=[True, False]
)
# NOTE(review): no later cell visible in this notebook reads latest_content_df
# after this reassignment — confirm whether it is still needed
latest_content_df = get_latest_content_df(content_df)

In [66]:
# Render the research landing page
create_page(
    "research.html.j2",
    "Research.html",
    general=data["general"],
    # The DataFrame itself (not a list of records) is handed to the template
    content=research_content_df,
    research=data["research_categories"],
    current_members=current_people_page_list,
)

In [67]:
# Use the shared SUB_RESEARCH_PATH constant (HOSTING_PATH / "sub_research")
# instead of rebuilding the same path by hand
folder_path = SUB_RESEARCH_PATH
folder_path.mkdir(parents=True, exist_ok=True)

# One front page per research category, written to sub_research/<category>.html
for category in content_df.loc[content_df.category != "News", "category"].unique():
    create_page(
        "sub_research_frontpage.html.j2",
        f"sub_research/{page_link(category.lower())}.html",
        general=data["general"],
        research=data["research_categories"],
        content=research_content_df,
        category=category,
        current_members=current_people_page_list,
    )

Individual Research Page


In [68]:
columns_extended = ARTICLE_METADATA_FIELDS + [
    "author_id",
    "people_involved_ids",
    "links",
    "content",
    "long_description",
]
content_df = article_content_df[columns_extended]

# Research articles only, ordered by category and newest first within a category
is_research_article = content_df["category"] != "News"
ind_research_content_df = content_df[is_research_article].sort_values(
    by=["category", "date"], ascending=[True, False]
)

# One page per research article: sub_research/<category>/<article_id>.html
for _, research_article in ind_research_content_df.iterrows():
    category_slug = page_link(research_article.category.lower())
    folder_path = Path(HOSTING_PATH) / "sub_research" / category_slug
    folder_path.mkdir(parents=True, exist_ok=True)

    article_slug = page_link(research_article.article_id.lower())
    create_page(
        "research_page_no_twitter.html.j2",
        f"sub_research/{category_slug}/{article_slug}.html",
        general=data["general"],
        content=research_article,
        member_data=all_people_data,
        article_id=research_article['article_id'],
    )

# News Page

In [69]:
columns_extended = ARTICLE_METADATA_FIELDS + [
    "author_id",
    "people_involved_ids",
    "content",
    "long_description",
]
content_df = article_content_df[columns_extended]

# An article counts as news when its category is "News" or when its tag list
# contains "news"; tag values that are not lists never match
in_news_category = content_df["category"] == "News"
has_news_tag = content_df["tags"].apply(
    lambda tags: "news" in tags if isinstance(tags, list) else False
)
news_content_df = content_df[in_news_category | has_news_tag].sort_values(
    by=["date"], ascending=[False]
)

# Render the news overview page, newest article first
create_page(
    "news.html.j2",
    "News.html",
    general=data["general"],
    research=data["research_categories"],
    content=news_content_df,
    member_data=all_people_data,
    category="News",
)

Individual News Page

In [70]:
# One page per news article, written to news/<article_id>.html.
# create_page creates the "news" directory itself, so no folder handling is
# needed here.
for _, news_article in news_content_df.iterrows():
    news_slug = page_link(news_article.article_id.lower())
    create_page(
        "news_page_no_twitter.html.j2",
        f"news/{news_slug}.html",
        general=data["general"],
        content=news_article,
        member_data=all_people_data,
        category="News",
    )