In [None]:
import json
import pandas as pd
from pathlib import Path
import shutil
import re
from datetime import datetime
from PIL import Image
from jinja2 import Environment, FileSystemLoader

# Constants

In [None]:
# --- Filesystem locations -------------------------------------------------
# NOTE(review): these are absolute paths on one developer's machine; the
# notebook will not run elsewhere until they are changed. Consider deriving
# them from a single configurable root.
GROUP_DATA_DIR = Path("/Users/atharva/workspace/code/tardis-main/lab/group-data")
# One sub-directory per member (read by MemberDataLoader).
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
# Root searched recursively for article info.json files (ArticleDataLoader).
ARTICLE_DIR_PATH = Path("/Users/atharva/workspace/code/tardis-main/website/research_news/articles")
# Output root: every generated page is written below this directory.
HOSTING_PATH = Path("/Users/atharva/workspace/code/tardis-main/lab/kerzendorf-lab.github.io")
# Article images are copied here, flattened by file name.
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH / "website_files" / "images" / "article_content"
# Jinja2 templates loaded by the shared environment.
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
# JSON content for the static pages (general, homepage, contact, ...).
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
# Mapping of role name -> rank used to order current members.
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"
GALLERY_CONTENT_SOURCE = WEBSITE_DATA_PATH / "content" / "gallery"
# Static assets copied verbatim into HOSTING_PATH / "assets".
SOURCE_ASSETS = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "assets"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"
OPPORTUNITIES_PATH = WEBSITE_DATA_PATH / "content" / "opportunities.json"

# Data Mapping Constants
# Role names rewritten before display (applied to `current_role` with
# Series.replace in CurrentMemberProcessor).
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably degree -> role mapping kept for the templates or legacy code.
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",
    "Bachelors": "Undergraduate Student",
}

# Data key -> section heading; passed to the individual member template as
# `section_headings`.
INDIVIDUAL_MEMBER_SECTION_MAP = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}

# Filtering Constants
# Group names that mark an experience as belonging to this lab.
GROUP_FILTER = ["DTI", "TARDIS", "ICER", "kerzendorf"]
# Institution whose students are candidates for "current member" status.
INSTITUTION_FILTER = "Michigan State University"

# Tag name (lower case) -> hex color; looked up by get_tag_color and exposed
# to the templates as the `tag_colors` global.
TAG_COLORS = {
    'paper': '#FF6B6B',
    'poster': '#4ECDC4',
    'talk': '#45B7D1',
    'award': '#96CEB4',
    'new team member': '#FFBE0B',
    'phd': '#9B5DE5',
    'conference': '#FF006E',
    'undergraduate': '#8338EC',
    'event': '#3A86FF',
    'achievement': '#FB5607',
    'astrophysics': '#2EC4B6',
    'machine learning': '#FF9F1C',
    'software': '#E71D36',
    'research': '#011627',
    'news': '#41EAD4'
}

# Fallback cover image size for articles that leave the fields empty/missing.
DEFAULT_COVER_IMAGE_HEIGHT = "330px"
DEFAULT_COVER_IMAGE_WIDTH = "520px"


# Utility functions

In [None]:

# Helper Functions
def page_link(a):
    """Build an HTML file stem from a title by swapping spaces for underscores."""
    if " " not in a:
        # Nothing to substitute; hand the input back untouched.
        return a
    return a.replace(" ", "_")

def group_df(df):
    """Collect the rows of each index key into a list of record dicts.

    Missing values are replaced by empty strings first. The result maps every
    index value to ``{"info": [row_dict, ...]}``, the nested shape the
    templates iterate over.
    """
    filled = df.fillna("")
    records_per_key = filled.groupby(level=0).apply(lambda rows: rows.to_dict('records'))
    return records_per_key.to_frame('info').to_dict('index')

def urlize_content(content_text, members_df, current_members_df):
    """Expand ``[member_id]`` placeholders in article text.

    - id of a current member  -> link to that member's profile page
    - id of any other known member -> plain full name
    - unknown id -> the id itself with underscores turned into spaces, title-cased
    """
    def _render(placeholder):
        member_id = placeholder.group(1)
        if member_id not in members_df.index:
            return member_id.replace('_', ' ').title()

        display_name = members_df.loc[member_id, 'full_name']
        if member_id not in current_members_df.index:
            # Only current members get a profile link.
            return display_name
        return f'<a href="../members/{member_id}/{member_id}.html" target="_blank">{display_name}</a>'

    placeholder_pattern = re.compile(r'\[(\w+)\]')
    return placeholder_pattern.sub(_render, content_text)

def get_tag_color(tag):
    """Return the hex color registered for ``tag`` (case-insensitive), or grey if unknown."""
    fallback_color = '#6c757d'
    normalized = tag.lower()
    if normalized in TAG_COLORS:
        return TAG_COLORS[normalized]
    return fallback_color

def create_page(template, html, **kwargs):
    """Render a Jinja2 template and write the result below ``HOSTING_PATH``.

    ``template`` is the template name looked up in the shared ``environment``.
    ``html`` is the output path relative to the hosting root; the number of
    "/" characters it contains is handed to the template as ``TEMPLATE_LEVEL``.
    All remaining keyword arguments are forwarded to the template. Missing
    parent directories of the output file are created.
    """
    page_template = environment.get_template(template)

    output_path = HOSTING_PATH / html
    output_path.parent.mkdir(parents=True, exist_ok=True)

    rendered = page_template.render(TEMPLATE_LEVEL=html.count("/"), **kwargs)
    output_path.write_text(rendered, encoding="utf-8")


# Setup Jinja2 environment

In [None]:
# Shared Jinja2 environment: templates come from TEMPLATE_DIR_PATH, with the
# loop-control (break/continue) and `do` statement extensions enabled.
environment = Environment(
    loader=FileSystemLoader(TEMPLATE_DIR_PATH),
    extensions=["jinja2.ext.loopcontrols", "jinja2.ext.do"],
)

# Helpers and lookup tables every template may call/read.
environment.globals.update(
    page_link=page_link,
    tag_colors=TAG_COLORS,
    get_tag_color=get_tag_color,
)


# Member Data Loader

In [None]:

class MemberDataLoader:
    """Load every member's JSON files into pandas DataFrames.

    Each sub-directory of ``members_dir`` is expected to contain an
    ``info.json`` (with at least an ``id`` key) and, optionally, a ``jsons/``
    folder holding ``social_links.json`` plus one list-of-records file per
    data type (``education.json``, ``experiences.json``, ...).

    After :meth:`load_all_data` the instance exposes ``members_df`` (indexed
    by ``id``) and one ``<data_type>_df`` attribute per data type (indexed by
    ``member_id``).
    """

    # Data type -> date fields to convert to timestamps in "<data_type>.json".
    # An empty tuple means the file is loaded without any date parsing.
    # Awards support both a single 'date' and a 'start_date'/'end_date' pair.
    _DATE_FIELDS = {
        'education': ('start_date', 'end_date'),
        'experiences': ('start_date', 'end_date'),
        'projects': ('start_date', 'end_date'),
        'awards': ('date', 'start_date', 'end_date'),
        'outreach': ('start_date', 'end_date'),
        'documents': (),
        'posters': (),
        'publications': ('date',),
    }

    def __init__(self, members_dir: Path = MEMBERS_DIR_PATH):
        self.members_dir = members_dir

    def _load_records(self, jsons_dir, filename, member_id):
        """Read ``jsons_dir / filename`` and tag every record with ``member_id``.

        Returns an empty list when the file does not exist.
        """
        path = jsons_dir / filename
        if not path.exists():
            return []
        records = json.loads(path.read_text(encoding="utf-8"))
        for record in records:
            record['member_id'] = member_id
        return records

    def _parse_dates(self, records, date_fields, member_id=None):
        """Convert the given date fields of each record to timestamps, in place.

        Empty values become ``pd.NaT``. A value pandas cannot parse is left
        untouched; the error is printed when ``member_id`` is given so the
        offending member can be found. Returns the same ``records`` list.
        """
        for record in records:
            for field in date_fields:
                if field not in record:
                    continue

                raw_value = record[field]
                if not raw_value:
                    record[field] = pd.NaT
                    continue

                try:
                    record[field] = pd.to_datetime(raw_value)
                except ValueError as error:
                    if member_id:
                        print(f"Could not parse {field}={raw_value!r} for member {member_id}: {error}")
        return records

    @staticmethod
    def _indexed_frame(records, index_column):
        """Build a DataFrame from ``records`` indexed by ``index_column``.

        With no records at all there is no such column to index by, so an
        empty frame carrying just that index is returned instead of raising.
        """
        frame = pd.DataFrame(records)
        if frame.empty and index_column not in frame.columns:
            frame = pd.DataFrame(columns=[index_column])
        return frame.set_index(index_column)

    def load_all_data(self):
        """Read all member directories and populate the ``*_df`` attributes."""
        data = {data_type: [] for data_type in self._DATE_FIELDS}
        members_data = []

        # Sorted so the row order does not depend on filesystem enumeration.
        for member_dir in sorted(self.members_dir.glob("*")):
            if not member_dir.is_dir():
                # Stray files next to the member folders (e.g. .DS_Store).
                continue

            info_path = member_dir / "info.json"
            if not info_path.is_file():
                print(f"Skipping {member_dir.name}: no info.json found")
                continue

            member_info = json.loads(info_path.read_text(encoding="utf-8"))
            member_id = member_info["id"]

            # A non-empty nickname takes the place of the first name.
            shown_first_name = member_info.get('nick_name') or member_info.get('first_name', '')
            member_info['full_name'] = f"{shown_first_name} {member_info.get('last_name', '')}".strip()

            jsons_dir = member_dir / "jsons"

            social_path = jsons_dir / "social_links.json"
            if social_path.exists():
                # Social link keys are merged into the member's own record.
                member_info.update(json.loads(social_path.read_text(encoding="utf-8")))

            members_data.append(member_info)

            for data_type, date_fields in self._DATE_FIELDS.items():
                records = self._load_records(jsons_dir, f"{data_type}.json", member_id)
                if date_fields:
                    # member_id is always passed so bad dates are reported
                    # for every file type, not only publications/awards.
                    records = self._parse_dates(records, list(date_fields), member_id)
                data[data_type].extend(records)

        self.members_df = self._indexed_frame(members_data, 'id')

        for data_type, records in data.items():
            setattr(self, f"{data_type}_df", self._indexed_frame(records, 'member_id'))


# Current/Alumni Member Processing

In [None]:

class CurrentMemberProcessor:
    """Split members into current and alumni and derive the role shown for each.

    Inputs are the frames produced by ``MemberDataLoader`` (education,
    experiences and projects indexed by member id). :meth:`process` fills:

    - ``current_members_with_info``: member info plus ``current_role`` and
      ``current_project_title``, ordered by the role hierarchy
    - ``alumni_members_with_info``: ``current_role`` and ``full_name``
    """

    def __init__(self, members_df, education_df, experiences_df, projects_df):
        self.members_df = members_df
        self.education_df = education_df
        self.experiences_df = experiences_df
        self.projects_df = projects_df

        # Role name -> rank; used by _sort_by_hierarchy.
        with open(ROLE_HIERARCHY_PATH, "r", encoding="utf-8") as hierarchy_file:
            self.role_hierarchy = json.load(hierarchy_file)

    @staticmethod
    def _is_open_ended(end_date):
        """True when no end date is recorded (NaN/NaT/None or an empty string).

        The empty string is accepted because ``process_experiences`` runs
        ``fillna("")`` on the experience frame before the merge.
        """
        return bool(pd.isna(end_date)) or end_date == ""

    @staticmethod
    def _is_ongoing(end_date, now):
        """True when the end date is missing or not earlier than ``now``."""
        return CurrentMemberProcessor._is_open_ended(end_date) or bool(end_date >= now)

    @staticmethod
    def _pick_role(academic_role, experience_role):
        """Prefer the academic role; fall back to the experience role.

        After the outer merge the academic role is NaN for members without an
        education row. NaN is truthy, so a plain truthiness test would select
        it; missing values are therefore checked explicitly.
        """
        if pd.notna(academic_role) and academic_role:
            return academic_role
        return experience_role

    def process_education(self):
        """Pick each member's most recent education row and derive an academic role.

        Rows are ordered by latest start date, then earliest end date. The
        academic role is only set for degrees at ``INSTITUTION_FILTER``.
        """
        def most_recent_row(group):
            sorted_group = group.sort_values(by=['start_date', 'end_date'], ascending=[False, True])
            return sorted_group.iloc[0:1]

        self.edu_most_recent = self.education_df.groupby(level=0).apply(most_recent_row).droplevel(0)

        self.edu_most_recent['academic_role'] = ""
        msu_mask = self.edu_most_recent['institution'] == INSTITUTION_FILTER
        bachelors_mask = msu_mask & (self.edu_most_recent['degree'] == "Bachelors")
        grad_mask = msu_mask & (self.edu_most_recent['degree'].isin(["PhD", "Masters"]))

        self.edu_most_recent.loc[bachelors_mask, 'academic_role'] = "Undergraduate Student"
        self.edu_most_recent.loc[grad_mask, 'academic_role'] = "Graduate Student"

    def process_experiences(self):
        """Pick one experience row per member, preferring lab-related groups.

        Among a member's experiences (latest start first) the first one whose
        ``group`` contains any ``GROUP_FILTER`` name wins; otherwise the most
        recent experience overall is used.
        """
        self.experiences_df = self.experiences_df.fillna("")

        # Names are escaped so they are matched literally, not as regex syntax.
        group_pattern = '|'.join(re.escape(name) for name in GROUP_FILTER)

        def most_recent_row(group):
            sorted_group = group.sort_values(by=['start_date', 'end_date'], ascending=[False, True])
            in_lab = sorted_group['group'].str.contains(group_pattern, na=False)
            relevant_group = sorted_group[in_lab]
            return relevant_group.iloc[0:1] if not relevant_group.empty else sorted_group.iloc[0:1]

        self.exp_most_recent = self.experiences_df.groupby(level=0).apply(most_recent_row).droplevel(0)

    def _merge_edu_exp(self):
        """Outer-join the per-member experience and education rows on member id.

        Columns get ``_exp`` / ``_edu`` suffixes; members present on only one
        side have NaN in the other side's columns.
        """
        exp_suffixed = self.exp_most_recent.add_suffix('_exp')
        edu_suffixed = self.edu_most_recent.add_suffix('_edu')
        return exp_suffixed.merge(edu_suffixed, left_index=True, right_index=True, how='outer')

    def _determine_status_and_role(self, row):
        """Return ``isCurrent`` and ``current_role`` for one merged member row."""
        now = datetime.now()
        fallback_role = self._pick_role(row['academic_role_edu'], row['role_exp'])

        if row['institution_edu'] == INSTITUTION_FILTER:
            # Students: current while the degree is running and the selected
            # experience has no recorded end date.
            is_current_edu = self._is_ongoing(row['end_date_edu'], now)
            has_ended_exp = not self._is_open_ended(row['end_date_exp'])
            is_current = is_current_edu and not has_ended_exp
            return pd.Series({'isCurrent': bool(is_current), 'current_role': fallback_role})

        # NOTE(review): this is an exact match against GROUP_FILTER, whereas
        # process_experiences uses a substring match - confirm that is intended.
        if row['group_exp'] in GROUP_FILTER and self._is_ongoing(row['end_date_exp'], now):
            return pd.Series({'isCurrent': True, 'current_role': row['role_exp']})

        return pd.Series({'isCurrent': False, 'current_role': fallback_role})

    def _add_projects(self, df):
        """Add a ``current_project_title`` column to ``df`` in place.

        Members with project records get the first non-null ``project_title``
        in record order; everyone else keeps an empty string.
        """
        df["current_project_title"] = ""

        common_members = df.index.intersection(self.projects_df.index)
        projects_first = self.projects_df.loc[common_members].groupby(level=0).first()
        df.loc[common_members, "current_project_title"] = projects_first["project_title"]

    def _sort_by_hierarchy(self, df):
        """Return a copy of ``df`` ordered by the rank of ``current_role``.

        Roles missing from the hierarchy sort last. The sort is stable, so
        members sharing a rank keep their incoming order, and the caller's
        frame is left unmodified.
        """
        ranked = df.assign(rank=df['current_role'].map(self.role_hierarchy))
        return ranked.sort_values(by='rank', kind='stable').drop(columns='rank')

    def merge_and_determine_status(self):
        """Merge education/experience, classify members and attach member info."""
        merged = self._merge_edu_exp()
        status_role = merged.apply(self._determine_status_and_role, axis=1)
        merged = pd.concat([merged, status_role], axis=1)
        merged['current_role'] = merged['current_role'].replace(ROLE_MAP)

        # Force a real boolean mask so `~` is a logical (not bitwise) negation.
        is_current = merged['isCurrent'].astype(bool)
        self.current_members = merged.loc[is_current, ["current_role"]]
        self.alumni_members = merged.loc[~is_current, ["current_role"]]

        self.current_members_with_info = pd.merge(self.current_members, self.members_df, left_index=True, right_index=True, how='inner')
        self.alumni_members_with_info = pd.merge(self.alumni_members, self.members_df, left_index=True, right_index=True, how='inner')[['current_role', 'full_name']]

        self._add_projects(self.current_members_with_info)
        self.current_members_with_info = self._sort_by_hierarchy(self.current_members_with_info)

    def process(self):
        """Run the full pipeline: education, experiences, then merge/classify."""
        self.process_education()
        self.process_experiences()
        self.merge_and_determine_status()



# Gallery Data Loader (Unused)

In [None]:

class GalleryDataLoader:
    """Load gallery events and copy their images into the hosted site.

    ``gallery_dir`` is searched recursively for ``info.json`` files, one per
    event. ``image_dest_dir`` receives each event's ``media/images`` folder
    under ``<event_id>/media/images``. Loaded events are kept in ``events``.
    """

    def __init__(self, gallery_dir: Path, image_dest_dir: Path):
        self.gallery_dir = gallery_dir
        self.image_dest_dir = image_dest_dir
        self.events = []

    def _scale_image_dimensions(self, image_path, scale_factor=0.7):
        """Return ``(width, height)`` of the image scaled by ``scale_factor``, as ints."""
        with Image.open(image_path) as img:
            width, height = img.size
            return int(width * scale_factor), int(height * scale_factor)

    def load_all_events(self):
        """Load every gallery event, copy its images and record scaled sizes.

        The event list is rebuilt from scratch on each call, so running this
        twice on the same instance does not duplicate events.
        """
        self.events = []

        # Sorted so the event order does not depend on filesystem enumeration.
        for event_file in sorted(self.gallery_dir.rglob("info.json")):
            event_data = json.loads(event_file.read_text(encoding="utf-8"))
            event_data["date"] = pd.to_datetime(event_data["date"])

            event_id = event_data["event_id"]
            event_dir = event_file.parent

            dest_image_dir = self.image_dest_dir / event_id / "media" / "images"
            dest_image_dir.mkdir(parents=True, exist_ok=True)

            source_image_dir = event_dir / "media" / "images"
            if source_image_dir.exists():
                shutil.copytree(source_image_dir, dest_image_dir, dirs_exist_ok=True)

            for image in event_data["images"]:
                # Resolve against the folder that actually holds this
                # info.json - the same folder the images are copied from.
                image_path = event_dir / image["image_path"]
                scaled_width, scaled_height = self._scale_image_dimensions(image_path)
                image["scaled_width"] = scaled_width
                image["scaled_height"] = scaled_height

            self.events.append(event_data)



# Article Data Loading

In [None]:

class ArticleDataLoader:
    """Load article ``info.json`` files and prepare news/research DataFrames.

    Articles are kept when ``platform_filter`` is among their platforms and
    their date is not in the future. After :meth:`load_all_articles` the
    instance exposes ``articles_df`` (indexed by ``article_id``), ``news_df``,
    ``research_df`` and ``recent_content_df``.
    """

    def __init__(self, article_dir: Path, image_dest_dir: Path, members_df: pd.DataFrame, current_members_df: pd.DataFrame, platform_filter: str = "kg"):
        self.article_dir = article_dir
        self.image_dest_dir = image_dest_dir
        self.members_df = members_df
        self.current_members_df = current_members_df
        self.platform_filter = platform_filter
        # Category renames applied after loading (only for the "kg" platform).
        self.category_replacements = {"Overview": "Computational Metascience"} if platform_filter == "kg" else {}

    @staticmethod
    def _has_news_tag(tags):
        """True when ``tags`` is a list containing the tag ``"news"``."""
        return isinstance(tags, list) and "news" in tags

    @classmethod
    def _is_news(cls, article):
        """True for an article dict in the News category or tagged ``"news"``."""
        return article["category"] == "News" or cls._has_news_tag(article.get("tags"))

    @staticmethod
    def _cover_image_name(cover_image):
        """File name of a cover image path, or ``""`` when no cover image is set."""
        if isinstance(cover_image, str) and cover_image:
            return Path(cover_image).name
        return ""

    def _copy_image(self, source_dir, image_path_str):
        """Copy an article image into ``image_dest_dir`` and return its site path.

        ``source_dir`` is the path of the article's ``info.json``; the image is
        read from the sibling ``media/images`` folder by file name. URLs are
        returned unchanged.
        NOTE(review): destination files are flattened by name, so two articles
        using the same image file name overwrite each other.
        """
        # Skip URLs
        if image_path_str.startswith(('http://', 'https://')):
            return image_path_str

        image_name = Path(image_path_str).name
        source = source_dir.parent / "media" / "images" / image_name
        dest = self.image_dest_dir / image_name
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, dest)
        return f"website_files/images/article_content/{image_name}"

    def _process_images(self, article, source_dir):
        """Copy the cover image and every non-empty ``img`` content entry.

        The article dict is updated in place with the copied images' site paths.
        """
        if article["cover_image"]:
            article["cover_image"] = self._copy_image(source_dir, article["cover_image"])

        content = article["content"]
        for key, val in content.items():
            if "img" in key and val:
                content[key] = self._copy_image(source_dir, val)

    def split_news_research(self):
        """Split ``articles_df`` into ``news_df`` and ``research_df``.

        News is newest first; research is ordered by category, then newest first.
        """
        is_news = (
            (self.articles_df["category"] == "News") |
            self.articles_df["tags"].apply(self._has_news_tag)
        )

        self.news_df = self.articles_df[is_news].sort_values("date", ascending=False)
        self.research_df = self.articles_df[~is_news].sort_values(["category", "date"], ascending=[True, False])

    def get_recent_content(self):
        """Store the newest article of each category in ``recent_content_df``."""
        self.recent_content_df = self.articles_df.sort_values(
            ["category", "date"], ascending=[True, False]
        ).groupby("category").head(1)

    def load_all_articles(self):
        """Load articles filtered by platform and date, then build all frames."""
        articles = []
        today = datetime.now()

        # Sorted so the article order does not depend on filesystem enumeration.
        for info_json in sorted(self.article_dir.rglob('info.json')):
            article = json.loads(info_json.read_text(encoding="utf-8"))

            if self.platform_filter not in article["platforms"]:
                continue

            article_date = pd.to_datetime(article["date"], format="%m-%d-%Y")
            if article_date > today:
                # Scheduled for the future: not published yet.
                continue

            article["date"] = article_date
            self._process_images(article, info_json)

            if self._is_news(article):
                # Only news paragraphs get [member_id] placeholders expanded.
                content = article["content"]
                for key, val in content.items():
                    if "para" in key and isinstance(val, str):
                        content[key] = urlize_content(val, self.members_df, self.current_members_df)

            articles.append(article)

        self.articles_df = pd.DataFrame(articles).set_index('article_id')

        # Cover dimensions are optional per article; a missing column, null or
        # empty string all fall back to the defaults.
        cover_defaults = (
            ("cover_image_height", DEFAULT_COVER_IMAGE_HEIGHT),
            ("cover_image_width", DEFAULT_COVER_IMAGE_WIDTH),
        )
        for column, default in cover_defaults:
            if column in self.articles_df.columns:
                self.articles_df[column] = self.articles_df[column].fillna(default).replace("", default)
            else:
                self.articles_df[column] = default

        self.articles_df["category"] = self.articles_df["category"].replace(self.category_replacements)
        self.articles_df['image_name'] = self.articles_df['cover_image'].apply(self._cover_image_name)

        self.split_news_research()
        self.get_recent_content()


In [None]:
# Load every member's JSON data into DataFrames.
loader = MemberDataLoader()
loader.load_all_data()

# Record counts per data type, as a quick sanity check of the load.
print(f"Members: {len(loader.members_df)}")
for data_type in ("education", "experiences", "projects", "awards",
                  "outreach", "documents", "posters", "publications"):
    record_frame = getattr(loader, f"{data_type}_df")
    print(f"{data_type.capitalize()} records: {len(record_frame)}")


In [None]:

# Classify members as current/alumni and derive their roles.
processor = CurrentMemberProcessor(
    members_df=loader.members_df,
    education_df=loader.education_df,
    experiences_df=loader.experiences_df,
    projects_df=loader.projects_df,
)
processor.process()

current_info = processor.current_members_with_info
alumni_info = processor.alumni_members_with_info

# Write the derived role/project back onto the master member table
# (empty string for members that are in neither group).
for column in ("academic_role", "current_project_title"):
    loader.members_df[column] = ""

loader.members_df.loc[current_info.index, "academic_role"] = current_info["current_role"]
loader.members_df.loc[current_info.index, "current_project_title"] = current_info["current_project_title"]

# Alumni roles only for members that are not also listed as current.
alumni_only = alumni_info.index.difference(current_info.index)
loader.members_df.loc[alumni_only, "academic_role"] = alumni_info.loc[alumni_only, "current_role"]

# Templates expect empty strings rather than NaN / the literal "nan".
processor.alumni_members_with_info = alumni_info.replace("nan", pd.NA).fillna("")
processor.current_members_with_info = current_info.fillna("")


In [None]:
# Preview the first alumni rows (role and full name) as a visual sanity check.
processor.alumni_members_with_info.head()

In [None]:
# Preview the first current-member rows (already ordered by role hierarchy).
processor.current_members_with_info.head()

In [None]:
# Load articles for the default platform and copy their images into the site.
article_loader = ArticleDataLoader(
    article_dir=ARTICLE_DIR_PATH,
    image_dest_dir=ARTICLE_IMAGE_DESTINATION_DIR,
    members_df=loader.members_df,
    current_members_df=processor.current_members_with_info,
)
article_loader.load_all_articles()
print(f"Articles: {len(article_loader.articles_df)}")

In [None]:
# Per-member record lists in the nested dict shape the templates expect.
education, experience, projects, awards, outreach = (
    group_df(frame)
    for frame in (
        loader.education_df,
        loader.experiences_df,
        loader.projects_df,
        loader.awards_df,
        loader.outreach_df,
    )
)

In [None]:
# Social link fields shown on member pages. They come from each member's
# optional social_links.json, so a column may be absent when no member
# provides that field; reindex() adds such columns instead of raising KeyError.
social_cols = ['website', 'github_handle', 'twitter_handle', 'linkedin_handle', 'email', 'orcid']
socials = loader.members_df.reindex(columns=social_cols).fillna('').to_dict('index')

# Site-wide settings passed to every template.
general = json.loads((WEBSITE_DATA_PATH / "general.json").read_text())

documents = group_df(loader.documents_df)

# Plain-dict views of members and articles for template lookups; each article
# dict also carries its own id.
all_members_dict = loader.members_df.to_dict("index")
all_articles_dict = {
    aid: {**data, 'article_id': aid}
    for aid, data in article_loader.articles_df.to_dict("index").items()
}


In [None]:
# Context that is identical for every individual member page.
shared_member_context = dict(
    general=general,
    socials=socials,
    documents=documents,
    education=education,
    experience=experience,
    projects=projects,
    awards=awards,
    outreach=outreach,
    section_headings=INDIVIDUAL_MEMBER_SECTION_MAP,
    content=all_articles_dict,
)

# One profile page per member at members/<id>/<id>.html.
for person_id, person_data in loader.members_df.iterrows():
    create_page(
        "individual_person.html.j2",
        f"members/{person_id}/{person_id}.html",
        member_id=person_id,
        member_data=person_data,
        **shared_member_context,
    )

print(f"Created {len(loader.members_df)} individual member pages")

In [None]:
# Gallery page- unused
gallery_loader = GalleryDataLoader(
    gallery_dir=GALLERY_CONTENT_SOURCE,
    image_dest_dir=HOSTING_PATH / "website_files" / "images" / "gallery",
)
gallery_loader.load_all_events()

create_page(
    template="gallery.html.j2",
    html="Gallery.html",
    general=general,
    member_data=all_members_dict,
    events=gallery_loader.events,
)

In [None]:
# Copy the static assets into the hosted site (existing files are overwritten).
shutil.copytree(SOURCE_ASSETS, HOSTING_PATH / "assets", dirs_exist_ok=True)

# Content for the static pages, one JSON file each.
homepage, contact, support, research = (
    json.loads((WEBSITE_DATA_PATH / f"{content_name}.json").read_text())
    for content_name in ("homepage", "contact", "support", "research_categories")
)


In [None]:

# Homepage, featuring the newest article of each category.
recent_content_records = article_loader.recent_content_df.reset_index().to_dict(orient="records")
create_page(
    template="homepage.html.j2",
    html="index.html",
    general=general,
    homepage=homepage,
    recent_content=recent_content_records,
)


In [None]:
# Current members overview page.
create_page(
    template="current_members.html.j2",
    html="current_members.html",
    general=general,
    current_members=processor.current_members_with_info,
    socials=socials,
)


In [None]:

# Alumni overview page.
create_page(
    template="alumni_members.html.j2",
    html="alumni_members.html",
    general=general,
    alumni_members=processor.alumni_members_with_info,
)

In [None]:
# Contact page.
create_page(
    template="contact.html.j2",
    html="Contact.html",
    general=general,
    contact=contact,
)


In [None]:
# Support page.
create_page(
    template="support.html.j2",
    html="Support.html",
    general=general,
    support=support,
)


In [None]:
# Research landing page listing all research articles by category.
research_articles = article_loader.research_df.reset_index()
create_page(
    template="research.html.j2",
    html="Research.html",
    general=general,
    content=research_articles,
    research=research,
    current_members=all_members_dict,
)

In [None]:
# Make sure the sub_research output folder exists before pages are written
# into it (create_page also creates missing parent folders on its own).
SUB_RESEARCH_PATH.mkdir(parents=True, exist_ok=True)

In [None]:

# One front page per research category at sub_research/<category>.html.
research_categories_present = article_loader.research_df["category"].unique()
for category in research_categories_present:
    category_page = f"sub_research/{page_link(category.lower())}.html"
    create_page(
        template="sub_research_frontpage.html.j2",
        html=category_page,
        general=general,
        research=research,
        content=article_loader.research_df.reset_index(),
        category=category,
        current_members=all_members_dict,
    )

In [None]:
# One page per research article. Software articles live directly under
# sub_research/, all others under sub_research/<category>/.
for article_id, ind_research_values in article_loader.research_df.iterrows():
    category_slug = page_link(ind_research_values["category"].lower())
    article_slug = page_link(article_id.lower())

    if ind_research_values["category"] == "Software":
        destination_research_path = f"sub_research/{article_slug}.html"
    else:
        destination_research_path = f"sub_research/{category_slug}/{article_slug}.html"

    # The category folder is created for every article, Software included.
    folder_path = SUB_RESEARCH_PATH / category_slug
    folder_path.mkdir(parents=True, exist_ok=True)

    create_page(
        template="research_page_no_twitter.html.j2",
        html=destination_research_path,
        general=general,
        content=ind_research_values,
        member_data=all_members_dict,
        article_id=article_id,
    )

In [None]:
# News overview page (articles newest first).
news_articles = article_loader.news_df.reset_index()
create_page(
    template="news.html.j2",
    html="News.html",
    general=general,
    content=news_articles,
    category="News",
    member_data=all_members_dict,
)

In [None]:
# One page per news article at news/<article_id>.html.
news_dict_list = article_loader.news_df.reset_index().to_dict('records')
for news_item in news_dict_list:
    news_page_path = f"news/{page_link(news_item['article_id'].lower())}.html"
    create_page(
        template="news_page_no_twitter.html.j2",
        html=news_page_path,
        general=general,
        content=news_item,
        member_data=all_members_dict,
        category="News",
    )


In [None]:

# "Join Us" page listing the open opportunities.
opportunities = json.loads(OPPORTUNITIES_PATH.read_text())

create_page(
    template="join_us.html.j2",
    html="Join_Us.html",
    general=general,
    opportunities=opportunities,
)