### This notebook consist of code for creating the html files for the website each time data is updated.

### Set-up

Importing classes

In [75]:
import json
import os
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from jinja2.exceptions import UndefinedError
from pathlib import Path

Defining paths

In [76]:
GROUP_DATA_DIR = Path("../..//group-data")
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
CONTENT_DIR_PATH = WEBSITE_DATA_PATH / "content"
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
HOSTING_PATH = GROUP_DATA_DIR.parent / "abhinav.github.io"

Function to create proper HTML file names by replacing spaces with underscores

In [77]:
def page_link(a):
    """Return the HTML file name after replacing blank spaces(" ") with underscores("-")"""
    return a.replace(" ", "_") if " " in a else a

 Creating an instance of the Environment class that looks for templates. Page_link is set to the global variable so that it can be accessed by all templates

In [78]:
environment = Environment(
    loader=FileSystemLoader(TEMPLATE_DIR_PATH), extensions=["jinja2.ext.loopcontrols"]
)
environment.globals["page_link"] = page_link

Function for looping through JSON files and loading their content into the 'data' dictionary 

In [79]:
def loading_json_files(json_file_names):
    """
    Load data from JSON files specified in a list of file names.

    Parameters:
    ----------
    json_file_names : list of str
        A list of file names (without extension) to load as JSON.

    Returns:
    -------
    dict 
        A dictionary where keys are file names and values are the corresponding JSON data.
    
    """
    data = {}
    for json_file in json_file_names:
        json_file_path = WEBSITE_DATA_PATH / f"{json_file}.json"
    
        try:
            with open(json_file_path, "r") as json_var:
                data[json_file] = json.load(json_var)
        except (FileNotFoundError, json.JSONDecodeError):
            pass
    return data

Function to create a page

In [80]:
def create_page(template, html, **kwargs):
    """
    Create an HTML page using a Jinja2 template and save it to a specified path.

    Parameters:
    ----------
    template : str
        The filename of the Jinja2 template to be used.
    html : str
        The filename of the HTML file to be generated.
    **kwargs : dict
        Additional keyword arguments to be passed to the Jinja2 template for rendering.

    Returns:
    -------
    None
    
    """
    page_template = environment.get_template(template)
    page_html_path = HOSTING_PATH / html
    page_content = page_template.render(**kwargs)
    with open(page_html_path, mode='w', encoding='utf-8') as page:
        page.write(page_content)

Creating dataframes for articles which can be updated further 

In [81]:
def load_content_from_files(columns):
    """
    Load content data from JSON files into a Pandas DataFrame.

    Parameters
    ----------
    columns : list of str
        A list of column names to extract from the JSON files.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the specified columns from the loaded JSON files.

    """
    content_data = {col: [] for col in columns}

    for json_file in os.listdir(CONTENT_DIR_PATH):
        if json_file.endswith(".json"):
            json_path = os.path.join(CONTENT_DIR_PATH, json_file)
            with open(json_path, "r") as file:
                info = json.load(file)
                # Only load those articles where display is True
                if info.get("display"):
                    for col in columns:
                        content_data[col].append(info.get(col))

    content_df = pd.DataFrame(content_data)
    content_df["date"] = pd.to_datetime(content_df["date"], format="%m-%d-%Y")
    return content_df


Extract the Latest Content for Each Category from a DataFrame

In [82]:
def get_latest_content_df(content_df):
    """
    Extract the latest content for each category from a DataFrame.

    Parameters
    ----------
    content_df : pandas.DataFrame
        The input DataFrame containing content information.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the latest content for each category.
    
    """
    # Sort the entire DataFrame by "category" and "date" in descending order
    sorted_content_df = content_df.sort_values(
        by=["category", "date"], ascending=[True, False]
    )

    # Get the first row for each category using groupby and head
    latest_content_df = sorted_content_df.groupby("category").head(1).copy()
    latest_content_df["date"] = pd.to_datetime(
        latest_content_df["date"], format="%m-%d-%Y"
    )
    latest_content_df = latest_content_df.sort_values(by="date", ascending=False)

    return latest_content_df

### List of JSON files to be processed

In [83]:
json_files = [
    "general",
    "homepage",
    "research",
    "support",
    "contact",
]

#Function Call
data = loading_json_files(json_files)

### Homepage

Storing selected columns for Homepage only

In [84]:
# Needed columns for homepage
article_columns_initial = [
    "article_id",
    "category",
    "date",
    "tags",
    "title",
    "cover_image",
    "short_description",
]
content_df = load_content_from_files(article_columns_initial)
latest_content_df = get_latest_content_df(content_df)
latest_content_df

Unnamed: 0,article_id,category,date,tags,title,cover_image,short_description
1,reu_student_announcement,News,2023-06-23,"[news, research, Internship]",Summer REU Students Join Kerzendorf Group,website_files/images/article_content/nsflogo.jpg,Two undergraduate research assistants have joi...
3,snr0509_josh_paper,Astrophysical Transients,2023-05-28,"[research, news]",A comprehensive SN Ia companion search in SNR ...,website_files/images/article_content/snr0509_v...,A search for a surviving companion to a 400 ye...
4,midsure22_poster_bea,Computational Metascience,2022-07-22,[research],MIDSURE 2022,website_files/images/article_content/bea_midsu...,Poster presentation at the Mid-Michigan Sympos...


In [85]:
create_page('homepage.html.j2', 'Index.html', general=data["general"], homepage=data["homepage"], recent_content=latest_content_df.to_dict(orient="records"))

### People Page

Function to parse member data from JSON files

In [86]:
#Load education data
def load_education_experience_data(edu_exp_dir):
    edu_exp_list = []
    file_names = ["experiences.json", "education.json"]
    valid_groups = ["DTI", "TARDIS", "ICER", "kerzendorf"]
    valid_institution = "Michigan State University"
    for file_name in file_names:
        file_path = edu_exp_dir / file_name
        if file_path.exists():
            # Reading JSON data directly into a DataFrame
            df = pd.read_json(file_path)
            
            # filtering based on group and institution
            filtered_df = filter_data(df, valid_groups, valid_institution)
            
            edu_exp_list.append(filtered_df)
        else:
            print(f"{file_path} does not exist")

    if edu_exp_list:
        edu_exp_df = pd.concat(edu_exp_list, ignore_index=True)
    else:
        edu_exp_df = pd.DataFrame()

    # if start_date column exists, fill with NaN if it doesn't
    if 'start_date' not in edu_exp_df:
        edu_exp_df['start_date'] = pd.NaT
    
    # Convert start_date to datetime format
    edu_exp_df['start_date'] = pd.to_datetime(edu_exp_df['start_date'], errors='coerce')
    
    # Sort the DataFrame based on start_date
    edu_exp_df = edu_exp_df.sort_values(by='start_date', ascending=False)
    return edu_exp_df

# filtering based on group and institution
def filter_data(df, valid_groups, valid_institution):
    mask = df.apply(lambda x: x.get('group') in valid_groups or x.get('institution') == valid_institution, axis=1)
    return df[mask]

# Load social links directly
def load_social_links(social_dir):
    social_links=None
    social_links_file_path = social_dir / "social_links.json"
    if social_links_file_path.exists():
        with open(social_links_file_path, "r") as f:
            social_links = json.load(f)
    return social_links

# Load topmost project title
def load_latest_project_title(project_dir):
    projects_file_path = project_dir / "projects.json"
    topmost_project_title= None
    if projects_file_path.exists():
        projects_df = pd.read_json(projects_file_path)
        if not projects_df.empty:
            topmost_project_title = projects_df.iloc[0].get("project_title")
    return topmost_project_title

def parse_member_data(member_dir):
    member_json_dir = member_dir / "jsons"
    education_experience_df = load_education_experience_data(member_json_dir)
    current_project_title = load_latest_project_title(member_json_dir)
    social_links = load_social_links(member_json_dir)
    return education_experience_df, social_links, current_project_title

In [87]:
# Function to extract academic roles from education and experience data
def extract_member_academic_role(education_experience_df):
    """
    Extract the current academic role of a member based on education and experience data.

    Parameters
    ----------
    education_experience_df : pandas.DataFrame
        DataFrame containing education and experience data.

    Returns
    -------
    Tuple[str, bool]
        A tuple containing:
        - str: The current academic role of the member.
        - bool: True if the member is currently active, False otherwise.
    """

    # Check if these columns exist in dataframe
    for column in ["end_date", "group", "institution"]:
        if column not in education_experience_df.columns:
            education_experience_df[column] = None

    current_academic_role = None

    role_map = {
        "Assistant Professor": "Professor",
        "Professor": "Professor",
        "Visualization Consultant": "Visualization Consultant",
        "Research Consultant": "Research Consultant",
        "Research Software Engineer": "Research Software Engineer",
        "Professorial Assistant": "Undergraduate",
        "Visiting Researcher": "Postdoctoral Researcher",
        "Postdoctoral Researcher": "Postdoctoral Researcher",
    }

    degree_map = {
        "Masters": "Graduate Student",
        "PhD": "Postdoctorate",  #  if end_date is present
        "Bachelors": "Graduate Student",
    }

    for _, row in education_experience_df.iterrows():
        role = row.get("role", None)
        degree = row.get("degree", None)

        if not current_academic_role:
            current_academic_role = role_map.get(role, "")

            if degree == "PhD" and pd.isna(row["end_date"]):
                current_academic_role = "Graduate Student"  # if end_date is NaN
            elif degree == "Bachelors" and pd.isna(row["end_date"]):
                current_academic_role = "Undergraduate Student"
            elif not current_academic_role and degree in degree_map:
                current_academic_role = degree_map[degree]

    # Check for end dates outside the loop
    has_end_date = all(
        not pd.isna(date) for date in education_experience_df["end_date"]
    )
    is_current_member = not has_end_date

    return current_academic_role, is_current_member

In [88]:
# Lists to store data for current and alumni members
def fetch_member_data():
    """
    Fetch and process member data from directories in the specified members' directory.

    Returns
    -------
    Tuple[list, list]
        A tuple containing two lists:
        1. List of dictionaries representing current members' data.
        2. List of dictionaries representing alumni members' data.

    """
    current_people_page_list = []
    alumni_people_page_list = []
    # Looping through member directories to fetch and process member data
    for member_dir in MEMBERS_DIR_PATH.glob("*"):
        print(member_dir)
        if not (member_info_fname := member_dir / "info.json").exists():
            continue
        else:
            member_info = json.load(open(member_info_fname, "r"))
        education_experience_df, social_links, current_project_title = parse_member_data(
            member_dir
        )
        current_academic_role, is_current_member = extract_member_academic_role(
            education_experience_df
        )
    
        first_name = member_info["first_name"]
        last_name = member_info["last_name"]
        nickname = member_info.get("nick_name", None)
        id = member_info["id"]
        image_path = member_info["image_path"]
        cover_image_path = member_info["cover_image_path"]
    
        name = f"{nickname if nickname else first_name} {last_name}"
    
        member_data = {
            "name": name,
            "academic_role": current_academic_role,
            "id": id,
            "current_project_title": current_project_title,
            "image_path": image_path,
            "cover_image_path": cover_image_path,
        }
    
        if social_links is not None:
            member_data.update(social_links)
    
        if is_current_member:
            current_people_page_list.append(member_data)
        else:
            alumni_people_page_list.append(member_data)
    return current_people_page_list, alumni_people_page_list

In [89]:
current_people_page_list, alumni_people_page_list = fetch_member_data()

../../group-data/members/sofia_biriouk
../../group-data/members/jack_o_brien
../../group-data/members/jack_o_brien/jsons/experiences.json does not exist
../../group-data/members/josh_shields
../../group-data/members/hayden_monk
../../group-data/members/hayden_monk/jsons/experiences.json does not exist
../../group-data/members/vicente_amado
../../group-data/members/vicente_amado/jsons/experiences.json does not exist
../../group-data/members/jing_lu
../../group-data/members/kevin_cawley
../../group-data/members/yuki_matsumura
../../group-data/members/yuki_matsumura/jsons/experiences.json does not exist
../../group-data/members/atharva_arya
../../group-data/members/wolfgang_kerzendorf
../../group-data/members/abhinav_ohri
../../group-data/members/sona_chitchyan
../../group-data/members/sona_chitchyan/jsons/education.json does not exist
../../group-data/members/bea_lu
../../group-data/members/alexander_grunewald
../../group-data/members/alexander_grunewald/jsons/experiences.json does not e

In [90]:
create_page('people.html.j2', 'People.html',  general=data["general"], current_members=current_people_page_list, alumni_members=alumni_people_page_list)

### Contact Page

In [91]:
create_page('contact.html.j2', 'Contact.html', general=data["general"], contact=data["contact"])

### Support Page

In [92]:
create_page('support.html.j2', 'Support.html', general=data["general"], support=data["support"])

### Research Front Page

For adding more columns in dataframe to render front pages and individual article pages

In [93]:
columns_extended = article_columns_initial + ["author_id"]
content_df = load_content_from_files(columns_extended)
research_content_df = content_df[content_df['category'] != 'News'].sort_values(by=['category', 'date'], ascending=[True, False])
latest_content_df = get_latest_content_df(content_df)

In [94]:
create_page('research.html.j2', 'Research.html', general=data["general"], content=research_content_df, research=data["research"] , current_members=current_people_page_list)

In [95]:
sub_research_template = environment.get_template("sub_research_frontpage.html.j2")


for category in content_df.loc[content_df.category != "News", "category"].unique():
        sub_research_content = sub_research_template.render(general=data["general"], 
                                                            research=data["research"], 
                                                            content = latest_content_df,
                                                            category = category,
                                                            current_members=current_people_page_list
                                                            )
        folder_path = f"{HOSTING_PATH}/sub_research/{page_link(category.lower())}"
        os.makedirs(folder_path, exist_ok=True)
        with open(f"{folder_path}.html", mode="w", encoding="utf-8") as sub_research:
            sub_research.write(sub_research_content)