In [None]:
import json
import pandas as pd
from pathlib import Path
import shutil
import re
from datetime import datetime

# Load configuration.
# The path is relative, so it is resolved against the notebook's working directory.
CONFIG_PATH = Path("..") / "config.json"
# JSON text is UTF-8; pass the encoding explicitly so the result does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open(CONFIG_PATH, 'r', encoding="utf-8") as f:
    config = json.load(f)
CONFIG_DIR = CONFIG_PATH.parent

# Constants - resolve paths relative to config directory
GROUP_DATA_DIR = (CONFIG_DIR / config["paths"]["group_data_dir"]).resolve()
ARTICLE_DIR_PATH = (CONFIG_DIR / config["paths"]["article_dir"]).resolve()
HOSTING_PATH = (CONFIG_DIR / config["paths"]["hosting_path"]).resolve()
# Destination for article images copied by ArticleDataLoader._copy_image
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH / "website_files" / "images" / "article_content"

# Fallback cover dimensions, used when an article leaves them blank or unset
DEFAULT_COVER_IMAGE_HEIGHT = config.get("default_cover_image_height", "330px")
DEFAULT_COVER_IMAGE_WIDTH = config.get("default_cover_image_width", "520px")

In [2]:

# Utility function
def urlize_content(content_text, members_df, current_members_df):
    """Expand ``[member_id]`` placeholders found in ``content_text``.

    Each bracketed run of word characters is replaced as follows:

    * id present in both ``members_df.index`` and ``current_members_df.index``:
      an ``<a>`` tag pointing at the member page, labelled with ``full_name``;
    * id present only in ``members_df.index``: the plain ``full_name``;
    * id unknown: the id itself with underscores turned into spaces, title-cased.

    Returns the text with all placeholders substituted.
    """
    placeholder_pattern = re.compile(r'\[(\w+)\]')

    def _render(placeholder):
        member_id = placeholder.group(1)

        # Unknown id: fall back to a prettified form of the id itself.
        if member_id not in members_df.index:
            return member_id.replace('_', ' ').title()

        display_name = members_df.loc[member_id, 'full_name']

        # Only current members get a link; other known members get the name only.
        if member_id not in current_members_df.index:
            return display_name
        return f'<a href="../members/{member_id}/{member_id}.html" target="_blank">{display_name}</a>'

    return placeholder_pattern.sub(_render, content_text)


In [3]:
class ArticleDataLoader:
    """Build article DataFrames from the ``info.json`` files under a directory tree.

    Calling ``load_all_articles`` sets three attributes:

    * ``articles_df``: every accepted article, indexed by ``article_id``
    * ``news_df``: the news subset, newest first
    * ``research_df``: everything else, sorted by category, then newest first
    """

    def __init__(self, article_dir: Path, image_dest_dir: Path, members_df: pd.DataFrame, current_members_df: pd.DataFrame, platform_filter: str = None, category_replacements: dict = None):
        """Store the loader settings.

        Args:
            article_dir: Root directory searched recursively for ``info.json`` files.
            image_dest_dir: Directory that article images are copied into.
            members_df: Member table, forwarded to ``urlize_content``.
            current_members_df: Current-member table, forwarded to ``urlize_content``.
            platform_filter: Only articles listing this platform are loaded.
                When ``None``, falls back to ``config["platform_filter"]`` or ``"kg"``.
            category_replacements: Mapping applied to the ``category`` column.
                When ``None``, falls back to ``config["category_replacements"]`` or ``{}``.
        """
        self.article_dir = article_dir
        self.image_dest_dir = image_dest_dir
        self.members_df = members_df
        self.current_members_df = current_members_df
        self.platform_filter = platform_filter if platform_filter is not None else config.get("platform_filter", "kg")
        self.category_replacements = category_replacements if category_replacements is not None else config.get("category_replacements", {})

    @staticmethod
    def _has_news_tag(tags):
        """Return True only when ``tags`` is a list that contains ``"news"``."""
        return isinstance(tags, list) and "news" in tags

    def _copy_image(self, source_dir, image_path_str):
        """Copy image from article media to destination, return new path.

        Args:
            source_dir: Path whose *parent* contains ``media/images``. Within this
                class it is the path of the article's ``info.json`` file.
            image_path_str: Image reference from the article. Only its file name
                is used to locate the source file.

        Returns:
            ``image_path_str`` unchanged for http(s) URLs, otherwise the
            site-relative path ``website_files/images/article_content/<name>``.

        Raises:
            OSError: Propagated from ``shutil.copy2``, e.g. when the source image
                does not exist.
        """
        # Skip URLs
        if image_path_str.startswith(('http://', 'https://')):
            return image_path_str

        image_name = Path(image_path_str).name
        source = source_dir.parent / "media" / "images" / image_name
        # NOTE(review): the destination is flat, so two articles that use the same
        # image file name overwrite each other's copy.
        dest = self.image_dest_dir / image_name
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, dest)
        return f"website_files/images/article_content/{image_name}"

    def _process_images(self, article, source_dir):
        """Process all images in article (cover + content), rewriting paths in place.

        A falsy ``cover_image`` is left untouched. In ``content``, every entry
        whose key contains ``"img"`` and whose value is truthy is copied.
        """
        if article["cover_image"]:
            article["cover_image"] = self._copy_image(source_dir, article["cover_image"])

        for key, val in article["content"].items():
            if "img" in key and val:
                article["content"][key] = self._copy_image(source_dir, val)

    def split_news_research(self):
        """Split ``articles_df`` into ``news_df`` and ``research_df``.

        An article is news when its category equals ``"News"`` or its tags are a
        list containing ``"news"``.
        """
        is_news = (
            (self.articles_df["category"] == "News") |
            self.articles_df["tags"].apply(self._has_news_tag)
        )

        self.news_df = self.articles_df[is_news].sort_values("date", ascending=False)
        self.research_df = self.articles_df[~is_news].sort_values(["category", "date"], ascending=[True, False])

    def load_all_articles(self):
        """Load articles filtered by platform and date.

        Articles are skipped when ``platform_filter`` is not in their
        ``platforms`` or when their date (``MM-DD-YYYY``) lies in the future.
        For accepted articles, images are copied, and for news articles the
        ``content`` entries whose key contains ``"para"`` are passed through
        ``urlize_content``.

        When nothing is accepted, all three result attributes are set to empty
        DataFrames.
        """
        articles = []
        today = datetime.now()

        # Sorted so the row order does not depend on filesystem enumeration order.
        for info_json in sorted(self.article_dir.rglob('info.json')):
            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            article = json.loads(info_json.read_text(encoding="utf-8"))

            if self.platform_filter not in article["platforms"]:
                continue

            article_date = pd.to_datetime(article["date"], format="%m-%d-%Y")
            if article_date > today:
                continue

            article["date"] = article_date
            # The info.json path is passed on; _copy_image uses its parent directory.
            self._process_images(article, info_json)

            # Same news rule as split_news_research, so non-list tags neither
            # crash nor substring-match here.
            if article["category"] == "News" or self._has_news_tag(article["tags"]):
                for key, val in article["content"].items():
                    if "para" in key:
                        article["content"][key] = urlize_content(val, self.members_df, self.current_members_df)

            articles.append(article)

        if articles:
            self.articles_df = pd.DataFrame(articles).set_index('article_id')

            cover_defaults = (
                ("cover_image_height", DEFAULT_COVER_IMAGE_HEIGHT),
                ("cover_image_width", DEFAULT_COVER_IMAGE_WIDTH),
            )
            for column, default in cover_defaults:
                if column in self.articles_df.columns:
                    self.articles_df[column] = self.articles_df[column].fillna(default).replace("", default)
                else:
                    # No article set this optional field: use the default for all rows.
                    self.articles_df[column] = default

            self.articles_df["category"] = self.articles_df["category"].replace(self.category_replacements)
            # A missing cover (None/NaN) yields an empty image name instead of a TypeError.
            self.articles_df['image_name'] = self.articles_df['cover_image'].apply(
                lambda x: Path(x).name if isinstance(x, str) else ""
            )
            self.split_news_research()
        else:
            self.articles_df = pd.DataFrame()
            self.news_df = pd.DataFrame()
            self.research_df = pd.DataFrame()


In [4]:
# Output directory (relative to the working directory); created when missing
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Member tables are read back from CSV files in that directory
# (presumably written by an earlier step - verify). The first CSV column becomes the index.
members_csv = OUTPUT_DIR / "members.csv"
current_members_csv = OUTPUT_DIR / "current_members.csv"
members_df = pd.read_csv(members_csv, index_col=0)
current_members_df = pd.read_csv(current_members_csv, index_col=0)

print(
    f"Loaded {len(members_df)} members",
    f"Loaded {len(current_members_df)} current members",
    sep="\n",
)

Loaded 10 members
Loaded 4 current members


In [5]:
# Preview the first three rows of the member table
members_df.head(3)

Unnamed: 0_level_0,first_name,last_name,image_path,cover_image_path,introduction,full_name,github_handle,email,website,twitter_handle,linkedin_handle,orcid,linkedin,nick_name,academic_role,current_project_title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
abhinav_ohri,Abhinav,Ohri,media/images/abhinav_ohri.jpg,media/images/cover.jpg,Hi there! This is Abhinav.,Abhinav Ohri,KasukabeDefenceForce,abhinavohri13@gmail.com,,,,,,,Research Software Engineer,
atharva_arya,Atharva,Arya,media/images/atharva.jpg,media/images/cover.jpg,I joined TARDIS as a GSoC 2021 student and I h...,Atharva Arya,atharva-2001,aryaatharva18@gmail.com,https://www.atharvaarya.tech/,2001_atharva,atharva-arya,,,,Research Software Engineer,Mitigating open science sustainability issues
bea_lu,Bea,Lu,media/images/bea_lu.jpg,media/images/cover.jpg,"Hi, my name is Bea and I am currently a studen...",Bea Lu,bumblebealu,lubangji@msu.edu,,,,0000-0002-3393-2424,,,Undergraduate Student,


In [6]:
# Preview the first three rows of the current-member table
current_members_df.head(3)

Unnamed: 0,current_role,first_name,last_name,image_path,cover_image_path,introduction,full_name,github_handle,email,website,twitter_handle,linkedin_handle,orcid,linkedin,nick_name,current_project_title
wolfgang_kerzendorf,Assistant Professor,Wolfgang,Kerzendorf,media/images/wolfgang.jpg,media/images/cover.jpg,I am an astrophysicist deeply intrigued by nuc...,Wolfgang Kerzendorf,wkerzendorf,wkerzend@msu.edu,https://wolfgangkerzendorf.com,wkerzendorf,wolfgang-kerzendorf-598a0466,0000-0002-0479-7235,,,Supernovae & Computational Metaresearch
vicente_amado,Graduate Student,Vicente,Amado Olivo,media/images/ESD_headshot.jpg,media/images/cover.jpg,I am a graduate student at Michigan State Univ...,Vicente Amado Olivo,,amadovic@msu.edu,,vamadolivo,,0000-0003-2248-0941,,,Development Of A Global Registry For Peer Revi...
atharva_arya,Research Software Engineer,Atharva,Arya,media/images/atharva.jpg,media/images/cover.jpg,I joined TARDIS as a GSoC 2021 student and I h...,Atharva Arya,atharva-2001,aryaatharva18@gmail.com,https://www.atharvaarya.tech/,2001_atharva,atharva-arya,,,,Mitigating open science sustainability issues


In [7]:
# Build the loader from the configured directories and the member tables,
# then read every article from disk
article_loader = ArticleDataLoader(
    article_dir=ARTICLE_DIR_PATH,
    image_dest_dir=ARTICLE_IMAGE_DESTINATION_DIR,
    members_df=members_df,
    current_members_df=current_members_df,
)
article_loader.load_all_articles()

# Report the size of each resulting table
table_sizes = (
    ("Articles", article_loader.articles_df),
    ("News articles", article_loader.news_df),
    ("Research articles", article_loader.research_df),
)
for label, frame in table_sizes:
    print(f"{label}: {len(frame)}")

Articles: 12
News articles: 8
Research articles: 4


In [8]:
# Preview the first three rows of the combined article table
article_loader.articles_df.head(3)

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
geonintern_international_ben,Unveiling Earth's Secrets with AI: Our Undergr...,benjamin_mellon,True,2024-03-18,News,"[undergraduate, internship]","[kg, dti]","This upcoming August, Benjamin Mellon and fell...",website_files/images/article_content/2BCAFnorw...,"{'1_para': 'This upcoming August, Benjamin Mel...",[benjamin_mellon],{},,330px,520px,2BCAFnorway_geo.jpg
prur_conference,Peer Review Under Review - Workshop at Europea...,vicente_amado,True,2023-02-12,News,"[Metascience, Conference]","[dti, kg]",DeepThought Initiative and collaborators organ...,website_files/images/article_content/img_PRUR.png,{'1_para': 'Wolfgang Kerzendorf and collaborat...,"[vicente_amado, wolfgang_kerzendorf]",{'NASA ADS': 'https://ui.adsabs.harvard.edu/ab...,,330px,520px,img_PRUR.png
uuraf21_poster_vicente,MSU UURAF 2021,vicente_amado,True,2021-04-19,Overview,[research],"[kg, dti]",Poster presentation for MSU's University Under...,website_files/images/article_content/MAST_Post...,{'1_para': 'Abstract: The modern scientific co...,"[vicente_amado, wolfgang_kerzendorf, jack_o_br...",{},,330px,520px,MAST_Poster.jpg


In [9]:
# Preview the first three rows of the news table (sorted by date, newest first)
article_loader.news_df.head(3)

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
identify_telescope_machine_vicente,Identifying Telescope Usage in Astrophysics P...,vicente_amado,True,2024-11-12,News,"[paper, Metaresearch]","[kg, dti]","Vicente, our graduate student and collaborator...",website_files/images/article_content/mastvsnom...,"{'1_para': '<a href=""../members/vicente_amado/...","[josh_shields, wolfgang_kerzendorf, vicente_am...",{'full-paper': 'https://arxiv.org/abs/2411.009...,,330px,520px,mastvsnomast.png
gracie_at_acres_msu_nsf,Our Student Gracie wins best talk at the ACRES...,gracie_tvrdik,True,2024-07-30,News,"[undergraduate, Award, conference]","[kg, dti]",Gracie Tvrdik presented her research on the ev...,website_files/images/article_content/image1.png,{'1_para': 'Gracie Tvrdik presented her resea...,[],{},,330px,520px,image1.png
ICSSI_poster_vicente,Vicente Amado Olivo Bridges Machine Learning a...,vicente_amado,True,2024-07-01,News,"[poster, metaresearch, conference]","[kg, dti]",Who's Behind the Paper? Machine learning enabl...,website_files/images/article_content/icssi_pos...,"{'1_para': 'Our graduate student <a href=""../m...","[vicente_amado, wolfgang_kerzendorf]",{},,330px,520px,icssi_poster.jpeg


In [10]:
# Show the whole research table (no head() limit), sorted by category then newest first
article_loader.research_df

Unnamed: 0_level_0,title,author_id,display,date,category,tags,platforms,short_description,cover_image,content,people_involved_ids,links,twitter,cover_image_height,cover_image_width,image_name
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
midsure22_poster_bea,MIDSURE 2022,bea_lu,True,2022-07-22,Overview,[research],"[kg, dti]",Poster presentation at the Mid-Michigan Sympos...,website_files/images/article_content/bea_midsu...,{'1_para': 'Abstract: Interdisciplinary scient...,"[bea_lu, vicente_amado, wolfgang_kerzendorf]",{},,330px,520px,bea_midsure_poster.jpg
uuraf21_poster_vicente,MSU UURAF 2021,vicente_amado,True,2021-04-19,Overview,[research],"[kg, dti]",Poster presentation for MSU's University Under...,website_files/images/article_content/MAST_Post...,{'1_para': 'Abstract: The modern scientific co...,"[vicente_amado, wolfgang_kerzendorf, jack_o_br...",{},,330px,520px,MAST_Poster.jpg
dti_reviewer,DTI Reviewer,,True,2025-06-16,Software,[research],[dti],<a href='http://31.97.40.150:3000/'>DTI Review...,website_files/images/article_content/dti_revie...,{'1_para': '<a href='http://31.97.40.150:3000/...,"[abhinav_ohri, vicente_amado, wolfgang_kerzend...",{'DTI Reviewer': 'http://31.97.40.150:3000/'},,330px,520px,dti_reviewer.gif
pacmanweb,PACMan Web Application,,True,2024-03-18,Software,[research],[dti],Web application for PACMan a machine-learning ...,website_files/images/article_content/pacmanweb...,{'1_para': '<a href='https://github.com/spacet...,"[abhinav_ohri, atharva_arya, vicente_amado, wo...",{},,330px,520px,pacmanweb.png


In [11]:
# Write each table to its CSV file in the output directory
exports = (
    ("articles.csv", article_loader.articles_df, "articles"),
    ("news.csv", article_loader.news_df, "news articles"),
    ("research.csv", article_loader.research_df, "research articles"),
)

# All files are written first; the summary is printed only after every save succeeded
for filename, frame, _ in exports:
    frame.to_csv(OUTPUT_DIR / filename)

for filename, frame, noun in exports:
    print(f"Saved {filename} ({len(frame)} {noun})")

Saved articles.csv (12 articles)
Saved news.csv (8 news articles)
Saved research.csv (4 research articles)
