In [1]:
import json
import pandas as pd
from pathlib import Path
import shutil
import re
from datetime import datetime

# Repository locations, all relative to the notebook's working directory.
GROUP_DATA_DIR = Path("..", "..", "group-data")
ARTICLE_DIR_PATH = Path("..", "..", "research_news", "articles")
# The hosting repository is a sibling directory of the group-data checkout.
HOSTING_PATH = GROUP_DATA_DIR.with_name("kerzendorf-lab.github.io")
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH.joinpath("website_files", "images", "article_content")

# Cover-image size applied when an article leaves the value missing or empty.
DEFAULT_COVER_IMAGE_HEIGHT = "330px"
DEFAULT_COVER_IMAGE_WIDTH = "520px"

In [17]:

# Utility function
def urlize_content(content_text, members_df, current_members_df):
    """Swap every ``[member_id]`` token in ``content_text`` for a display name.

    * id found in ``members_df`` and in ``current_members_df``: an ``<a>`` tag
      pointing at the member page, labelled with the member's ``full_name``.
    * id found only in ``members_df``: the plain ``full_name``.
    * id not found in ``members_df``: the id itself with underscores turned
      into spaces and title-cased.

    Both frames are looked up by index, so they must be indexed by member id.
    Returns the rewritten text.
    """
    token_pattern = re.compile(r'\[(\w+)\]')

    def _render(token):
        member_id = token.group(1)

        # Unknown id: fall back to a prettified version of the id.
        if member_id not in members_df.index:
            return member_id.replace('_', ' ').title()

        full_name = members_df.loc[member_id, 'full_name']

        # Known but not current: no member page to link to.
        if member_id not in current_members_df.index:
            return full_name

        return f'<a href="../members/{member_id}/{member_id}.html" target="_blank">{full_name}</a>'

    return token_pattern.sub(_render, content_text)


In [18]:

class ArticleDataLoader:
    """Collect article ``info.json`` files into DataFrames for the website build.

    After :meth:`load_all_articles` has run, three attributes are available:

    * ``articles_df`` - every accepted article, indexed by ``article_id``
    * ``news_df``     - news articles, newest first
    * ``research_df`` - all other articles, ordered by category, then newest first

    Parameters
    ----------
    article_dir : Path
        Directory searched recursively for ``info.json`` files.
    image_dest_dir : Path
        Directory that local article images are copied into.
    members_df, current_members_df : pd.DataFrame
        Member tables indexed by member id; passed to ``urlize_content`` when
        linking names inside news paragraphs.
    platform_filter : str
        Only articles whose ``platforms`` entry contains this value are kept.
        For ``"kg"`` the category "Overview" is renamed to
        "Computational Metascience".
    """

    # Columns this class itself reads or writes. Used to shape the empty result
    # so later cells still find them when no article qualifies.
    _CORE_COLUMNS = (
        "date",
        "category",
        "tags",
        "cover_image",
        "cover_image_height",
        "cover_image_width",
        "image_name",
    )

    def __init__(self, article_dir: Path, image_dest_dir: Path, members_df: pd.DataFrame, current_members_df: pd.DataFrame, platform_filter: str = "kg"):
        self.article_dir = article_dir
        self.image_dest_dir = image_dest_dir
        self.members_df = members_df
        self.current_members_df = current_members_df
        self.platform_filter = platform_filter
        self.category_replacements = {"Overview": "Computational Metascience"} if platform_filter == "kg" else {}

    @staticmethod
    def _has_news_tag(tags):
        """Return True when ``tags`` is a list containing the tag ``"news"``."""
        return isinstance(tags, list) and "news" in tags

    def _copy_image(self, source_dir, image_path_str):
        """Copy one article image into ``image_dest_dir`` and return its site path.

        ``source_dir`` is, despite its name, the path of the article's
        ``info.json``: the image is looked up in ``media/images`` next to that
        file. Only the file name of ``image_path_str`` is used. http(s) URLs are
        returned untouched and nothing is copied.

        The returned path always starts with ``website_files/images/article_content/``
        regardless of ``image_dest_dir``. Raises whatever ``shutil.copy2`` raises
        (e.g. ``FileNotFoundError``) when the source image does not exist.
        """
        # Remote images need no copy.
        if image_path_str.startswith(('http://', 'https://')):
            return image_path_str

        image_name = Path(image_path_str).name
        source = source_dir.parent / "media" / "images" / image_name
        # NOTE(review): images are stored flat by file name, so two articles
        # using the same file name would overwrite each other - confirm names
        # are unique across articles.
        dest = self.image_dest_dir / image_name
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, dest)
        return f"website_files/images/article_content/{image_name}"

    def _process_images(self, article, source_dir):
        """Copy the cover image and every content image of ``article`` in place.

        Content entries are treated as images when their key contains ``"img"``.
        Missing or empty ``cover_image`` / ``content`` entries are skipped.
        ``source_dir`` is forwarded unchanged to :meth:`_copy_image`.
        """
        cover = article.get("cover_image")
        if cover:
            article["cover_image"] = self._copy_image(source_dir, cover)

        content = article.get("content") or {}
        for key, val in content.items():
            # Only existing keys are reassigned, so mutating during iteration is safe.
            if "img" in key and val:
                content[key] = self._copy_image(source_dir, val)

    def split_news_research(self):
        """Split ``articles_df`` into ``news_df`` and ``research_df``.

        An article is news when its category is "News" or its tag list contains
        "news". News is sorted newest first; everything else by category, then
        newest first.
        """
        frame = self.articles_df

        if "tags" in frame.columns:
            # astype(bool) keeps the mask boolean even for an empty frame.
            tagged_news = frame["tags"].apply(self._has_news_tag).astype(bool)
        else:
            tagged_news = False
        is_news = (frame["category"] == "News") | tagged_news

        self.news_df = frame[is_news].sort_values("date", ascending=False)
        self.research_df = frame[~is_news].sort_values(["category", "date"], ascending=[True, False])

    def load_all_articles(self):
        """Load every ``info.json`` under ``article_dir`` into the three frames.

        Articles are skipped when ``platform_filter`` is not in their
        ``platforms`` entry or when their date (``MM-DD-YYYY``) lies in the
        future. For kept articles the images are copied, and for news articles
        the ``[member_id]`` tokens in paragraph entries (keys containing
        ``"para"``) are replaced via ``urlize_content``.

        When no article qualifies, all three frames are empty but still carry
        the core columns and an ``article_id`` index.
        """
        records = []
        now = datetime.now()

        # rglob order is not guaranteed; sorting keeps the row order of
        # articles_df (and the exported CSV) reproducible between runs.
        for info_json in sorted(self.article_dir.rglob('info.json')):
            # JSON is UTF-8; do not depend on the platform's locale encoding.
            article = json.loads(info_json.read_text(encoding="utf-8"))

            if self.platform_filter not in article["platforms"]:
                continue

            article_date = pd.to_datetime(article["date"], format="%m-%d-%Y")
            if article_date > now:
                continue

            article["date"] = article_date
            self._process_images(article, info_json)

            # Same news rule as split_news_research.
            if article["category"] == "News" or self._has_news_tag(article.get("tags")):
                content = article.get("content") or {}
                for key, val in content.items():
                    if "para" in key:
                        content[key] = urlize_content(val, self.members_df, self.current_members_df)

            records.append(article)

        if not records:
            # pd.DataFrame([]) has no 'article_id' column, so set_index would
            # raise KeyError; build the empty result explicitly instead.
            empty = pd.DataFrame(columns=list(self._CORE_COLUMNS)).rename_axis("article_id")
            self.articles_df = empty
            self.news_df = empty.copy()
            self.research_df = empty.copy()
            return

        frame = pd.DataFrame(records).set_index('article_id')

        # A size column is absent when no article defines it at all.
        for column, default in (
            ("cover_image_height", DEFAULT_COVER_IMAGE_HEIGHT),
            ("cover_image_width", DEFAULT_COVER_IMAGE_WIDTH),
        ):
            if column in frame.columns:
                frame[column] = frame[column].fillna(default).replace("", default)
            else:
                frame[column] = default

        if self.category_replacements:
            frame["category"] = frame["category"].replace(self.category_replacements)

        if "cover_image" not in frame.columns:
            frame["cover_image"] = ""
        # Articles without a cover (NaN / None) get an empty image name.
        frame['image_name'] = frame['cover_image'].apply(
            lambda cover: Path(cover).name if isinstance(cover, str) else ""
        )

        self.articles_df = frame
        self.split_news_research()



In [19]:
# Load both member tables with their first CSV column (the member id) as index.
members_df, current_members_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("members.csv", "current_members.csv")
)


In [10]:
# Preview the first rows of the full member table.
members_df.head(n=3)

Unnamed: 0_level_0,first_name,last_name,image_path,cover_image_path,introduction,full_name,github_handle,linkedin,email,nick_name,website,orcid,twitter_handle,linkedin_handle,ads,academic_role,current_project_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
gracie_tvrdik,Gracie,Tvrdik,media/images/gracie.jpg,media/images/cover.jpg,I am an undergraduate student at Bowling Green...,Gracie Tvrdik,gracietv,www.linkedin.com/in/grayson-tvrdik-34b7872a7,graysontvrdik1@gmail.com,,,,,,,REU student,
josh_shields,Joshua,Shields,media/images/josh_photo.jpg,media/images/cover.jpg,Josh is a senior graduate student in astrophys...,Josh Shields,jvshields,,shield90@msu.edu,Josh,https://jvshields.github.io/,0000-0002-1560-5286,,,,Graduate Student,
anirban_dutta,Anirban,Dutta,media/images/anirban_dutta.jpg,media/images/cover.jpg,Hi there! This is Anirban.,Anirban Dutta,Knights-Templars,,anirbaniamdutta@gmail.com,,https://sites.google.com/view/anirbaniamdutta,0000-0002-7708-3831,Anirban29Dutta,anirban-dutta-6a0377238,,Postdoctoral Researcher,


In [11]:
# Preview the first rows of the current-member table.
current_members_df.head(n=3)

Unnamed: 0,current_role,first_name,last_name,image_path,cover_image_path,introduction,full_name,github_handle,linkedin,email,nick_name,website,orcid,twitter_handle,linkedin_handle,ads,current_project_title
wolfgang_kerzendorf,Professor,Wolfgang,Kerzendorf,media/images/wolfgang.jpg,media/images/cover.jpg,I am an astrophysicist deeply intrigued by nuc...,Wolfgang Kerzendorf,wkerzendorf,,wkerzend@msu.edu,,https://wolfgangkerzendorf.com,0000-0002-0479-7235,wkerzendorf,wolfgang-kerzendorf-598a0466,,Supernovae & Computational Metaresearch
connor_mcclellan,Postdoctoral Researcher,Connor,McClellan,media/images/profile.png,media/images/cover.jpg,I joined the TARDIS group in 2025 as a post-do...,Connor McClellan,,,,,,,,,,
jing_lu,Postdoctoral Researcher,Jing,Lu,media/images/jing.jpg,media/images/cover.jpg,I will be joining TARDIS group in summer 2023 ...,Jing Lu,DeerWhale,,lujingeve158@gmail.com,,,0000-0002-3900-1452,,jing-lu-bb89211bb,,Explore the hidden Helium in Type Ic Supernovae


In [9]:
# Build the loader (platform_filter keeps its "kg" default), then populate
# articles_df, news_df and research_df.
article_loader = ArticleDataLoader(
    article_dir=ARTICLE_DIR_PATH,
    image_dest_dir=ARTICLE_IMAGE_DESTINATION_DIR,
    members_df=members_df,
    current_members_df=current_members_df,
)
article_loader.load_all_articles()


In [12]:
# Preview the combined article table.
article_loader.articles_df.head(n=3)

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,research_id,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
geonintern_international_ben,Unveiling Earth's Secrets with AI: Our Undergr...,benjamin_mellon,True,2024-03-18,News,"[undergraduate, internship]","[kg, dti]","This upcoming August, Benjamin Mellon and fell...",website_files/images/article_content/2BCAFnorw...,"{'1_para': 'This upcoming August, Benjamin Mel...",[benjamin_mellon],{},,330px,520px,,2BCAFnorway_geo.jpg
reu_student_announcement,Summer REU Students Join Kerzendorf Group,richard_dow,True,2023-06-23,News,"[New Team Member, undergraduate]",[kg],Two undergraduate research assistants have joi...,website_files/images/article_content/nsflogo.jpg,{'1_para': 'Tripp Dow and Iliomar Rodriguez Ra...,"[richard_dow, iliomar_rodriguez_ramos]",{},,330px,520px,,nsflogo.jpg
prur_conference,Peer Review Under Review - Workshop at Europea...,vicente_amado,True,2023-02-12,News,"[Metascience, Conference]","[dti, kg]",DeepThought Initiative and collaborators organ...,website_files/images/article_content/img_PRUR.png,{'1_para': 'Wolfgang Kerzendorf and collaborat...,"[vicente_amado, wolfgang_kerzendorf]",{'NASA ADS': 'https://ui.adsabs.harvard.edu/ab...,,330px,520px,,img_PRUR.png


In [13]:
# Preview the news articles (sorted newest first).
article_loader.news_df.head(n=3)

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,research_id,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
stardis_release,Introducing STARDIS - An Open and Modular Stel...,josh_shields,True,2025-08-28,News,"[paper, Astrophysics]","[kg, tardis]","We introduce STARDIS, a new open-source Python...",website_files/images/article_content/halpha_so...,{'1_para': 'We are excited to announce the rel...,"[josh_shields, wolfgang_kerzendorf, ryan_grone...",{'manuscript': 'https://iopscience.iop.org/art...,,330px,520px,,halpha_sol.png
tardis_summer_school_25,TARDIS Summer School 2025: Explosive Transient...,josh_shields,True,2025-08-15,News,"[Education, Summer School, TARDIS, Radiative T...","[kg, tardis]",We hosted a week-long summer school where 14 p...,website_files/images/article_content/cachedIma...,{'1_para': 'We successfully hosted the TARDIS ...,"[josh_shields, wolfgang_kerzendorf, jing_lu, a...",{},,330px,520px,,cachedImage.PNG
thesis_defense_deeksha,Deeksha Mohanty Defends Master's Thesis on Enh...,deeksha_mohanty,True,2025-07-07,News,"[Master's Thesis, talk]","[kg, tardis]",Deeksha Mohanty successfully defended her mast...,website_files/images/article_content/defense_b...,{'1_para': 'We congratulate Deeksha Mohanty on...,[deeksha_mohanty],{},,390px,520px,,defense_before.jpeg


In [15]:
# Preview only the first rows, like the other preview cells, instead of
# rendering the whole frame (which grows with every new research article).
article_loader.research_df.head(3)

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,research_id,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
midsure22_poster_bea,MIDSURE 2022,bea_lu,True,2022-07-22,Computational Metascience,[research],"[kg, dti]",Poster presentation at the Mid-Michigan Sympos...,website_files/images/article_content/bea_midsu...,{'1_para': 'Abstract: Interdisciplinary scient...,"[bea_lu, vicente_amado, wolfgang_kerzendorf]",{},,330px,520px,,bea_midsure_poster.jpg
uuraf21_poster_vicente,MSU UURAF 2021,vicente_amado,True,2021-04-19,Computational Metascience,[research],"[kg, dti]",Poster presentation for MSU's University Under...,website_files/images/article_content/MAST_Post...,{'1_para': 'Abstract: The modern scientific co...,"[vicente_amado, wolfgang_kerzendorf, jack_o_br...",{},,330px,520px,,MAST_Poster.jpg


In [16]:
# Export the combined, news-only and research-only frames as CSV files in the
# current working directory.
# NOTE(review): list/dict columns such as `tags` and `content` are written as
# their string representation - confirm the consumer of these files parses them.
article_loader.articles_df.to_csv("articles.csv")
article_loader.news_df.to_csv("news.csv")
article_loader.research_df.to_csv("research.csv")
