In [1]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from pathlib import Path
import shutil
from datetime import datetime, date
import numpy as np
from dataclasses import dataclass

# Utility

### Paths

In [41]:
GROUP_DATA_DIR = Path("../../group-data")
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
HOSTING_PATH = GROUP_DATA_DIR.parent / "kerzendorf-lab.github.io"
ARTICLE_DIR_PATH = Path("../../research_news/articles")
ARTICLE_IMAGE_DESTINATION_DIR = (HOSTING_PATH / "website_files" / "images" / "article_content")
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"
OPPORTUNITIES_PATH = WEBSITE_DATA_PATH / "content" / "opportunities.json"
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"

GENERAL_TAGS = [
    "Paper",
    "Poster", 
    "Talk",
    "Award",
    "New Team Member",
    "PhD",
    "Conference",
    "Undergraduate",
    "Event",
    "Achievement"
]

# Define tag colors mapping
TAG_COLORS = {
    'paper': '#FF6B6B',  # Coral red
    'poster': '#4ECDC4', # Turquoise
    'talk': '#45B7D1',   # Light blue
    'award': '#96CEB4',  # Sage green
    'new team member': '#FFBE0B', # Golden yellow
    'phd': '#9B5DE5',    # Purple
    'conference': '#FF006E', # Pink
    'undergraduate': '#8338EC', # Violet
    'event': '#3A86FF',  # Royal blue
    'achievement': '#FB5607', # Orange
    'astrophysics': '#2EC4B6', # Teal
    'machine learning': '#FF9F1C', # Light orange
    'software': '#E71D36', # Bright red
    'research': '#011627', # Dark blue
    'news': '#41EAD4'    # Cyan
}

DATA_FILES = {
    'education': 'education.json',
    'experience': 'experiences.json',
    'project': 'projects.json',
    'award': 'awards.json',
    'outreach': 'outreach.json',
    'document': 'documents.json'
}

### Mappings

In [42]:
# Needed columns for articles
ARTICLE_METADATA_FIELDS = [
    "article_id",
    "category",
    "date",
    "tags",
    "title",
    "cover_image",
    "short_description"
]
# Groups and institution used in filtering data
GROUP_FILTER = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER = "Michigan State University"

# Map roles to standardized roles for consistency
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# Map degrees to standardized academic levels
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",  #  if end_date is present
    "Bachelors": "Undergraduate Student",
}

INDIVIDUAL_MEMBER_SECTION_MAP = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}

In [43]:
class MemberDataConfig:
    def __init__(self, group_data_dir: Path = None):
        self.group_data_dir = group_data_dir or Path("/Users/atharva/workspace/code/tardis-main/lab/group-data")
        self.members_dir = self.group_data_dir / "members"


In [44]:
config  = MemberDataConfig()
config.group_data_dir, config.members_dir

(PosixPath('/Users/atharva/workspace/code/tardis-main/lab/group-data'),
 PosixPath('/Users/atharva/workspace/code/tardis-main/lab/group-data/members'))

In [49]:
class MemberDataLoader:
    def __init__(self, config: MemberDataConfig = None):
        self.config = config or MemberDataConfig()

    def load_all_member_data(self) -> pd.DataFrame:
        all_rows = []

        for member_dir in self.config.members_dir.glob("*"):
            info_path = member_dir / "info.json"

            # Load member info
            member_info = json.loads(info_path.read_text())
            member_id = member_info["id"]
            
            # Add full name
            member_info["full_name"] = (
                f"{member_info.get('nick_name', member_info.get('first_name'))} {member_info['last_name']}"
                if pd.notna(member_info.get('nick_name'))
                else f"{member_info.get('first_name', '')} {member_info['last_name']}"
            )
            
            # Add info record
            info_row = member_info.copy()
            info_row['record_type'] = 'info'
            info_row['record_index'] = 0
            info_row['member_id'] = member_id
            all_rows.append(info_row)
            
            jsons_dir = member_dir / "jsons"
            
            # Load social links
            social_path = jsons_dir / "social_links.json"
            if social_path.exists():
                social_data = json.loads(social_path.read_text())
                social_data['record_type'] = 'social'
                social_data['record_index'] = 0
                social_data['member_id'] = member_id
                all_rows.append(social_data)
            
            
            for record_type, filename in DATA_FILES.items():
                file_path = jsons_dir / filename
                if file_path.exists():
                    records = json.loads(file_path.read_text())
                    
                    # Parse dates for specific record types
                    if record_type in ['education', 'experience', 'project']:
                        for record in records:
                            for date_col in ['start_date', 'end_date']:
                                if date_col in record and record[date_col]:
                                    record[date_col] = pd.to_datetime(record[date_col], format='%Y-%m-%d')

                    
                    for idx, record in enumerate(records):
                        record['record_type'] = record_type
                        record['record_index'] = idx
                        record['member_id'] = member_id
                        all_rows.append(record)

        
        if not all_rows:
            raise ValueError(f"No member data found in: {self.config.members_dir.resolve()}")
        
        # Create multi-index DataFrame
        result_df = pd.DataFrame(all_rows)
        result_df.set_index(['member_id', 'record_type', 'record_index'], inplace=True)
        result_df = result_df.fillna("")
        
        return result_df

In [50]:
config = MemberDataConfig()
loader = MemberDataLoader(config)


In [51]:
members_df = loader.load_all_member_data()

In [52]:
unique_members = members_df.index.get_level_values('member_id').unique()
unique_members

Index(['gracie_tvrdik', 'josh_shields', 'anirban_dutta', 'erin_visser',
       'abhinav_ohri', 'deeksha_mohanty', 'vicente_amado', 'yuki_matsumura',
       'ryan_groneck', 'andrew_fullard', 'isaac_smith', 'hayden_monk',
       'atharva_arya', 'richard_dow', 'bea_lu', 'sona_chitchyan',
       'morgan_sandler', 'connor_mcclellan', 'iliomar_rodriguez_ramos',
       'jaladh_singhal', 'jack_o_brien', 'cecelia_powers', 'clyde_watson',
       'benjamin_mellon', 'kevin_cawley', 'sofia_biriouk', 'jing_lu',
       'jared_goldberg', 'alexander_grunewald', 'harshul_gupta',
       'logan_mcclellan', 'wolfgang_kerzendorf', 'kiyah_young-wilson'],
      dtype='object', name='member_id')

In [53]:
record_types = members_df.index.get_level_values('record_type').value_counts()
for record_type, count in record_types.items():
            print(f"  {record_type}: {count}")

  experience: 52
  education: 42
  info: 33
  project: 31
  award: 31
  social: 27
  outreach: 3
  document: 1


In [54]:
first_member_id = unique_members[0]
members_df.xs(first_member_id, level='member_id').fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,first_name,last_name,image_path,cover_image_path,introduction,id,full_name,github_handle,linkedin,email,...,orcid,title,award_name,date,description,program_name,twitter_handle,linkedin_handle,document_name,link
record_type,record_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
info,0,Gracie,Tvrdik,media/images/gracie.jpg,media/images/cover.jpg,I am an undergraduate student at Bowling Green...,gracie_tvrdik,Gracie Tvrdik,,,,...,,,,,,,,,,
social,0,,,,,,,,gracietv,www.linkedin.com/in/grayson-tvrdik-34b7872a7,graysontvrdik1@gmail.com,...,,,,,,,,,,
education,0,,,,,,,,,,,...,,,,,,,,,,
experience,0,,,,,,,,,,,...,,,,,,,,,,
experience,1,,,,,,,,,,,...,,,,,,,,,,
project,0,,,,,,,,,,,...,,,,,,,,,,


In [55]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(members_df.sort_index(level='member_id').fillna(''))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,first_name,last_name,image_path,cover_image_path,introduction,id,full_name,github_handle,linkedin,email,degree,subject,institution,city,state,country,start_date,group,role,end_date,project_title,nick_name,website,orcid,title,award_name,date,description,program_name,twitter_handle,linkedin_handle,document_name,link
member_id,record_type,record_index,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
abhinav_ohri,education,0,,,,,,,,,,,Bachelors,Commerce(Honours),Sanatan Dharma College,Hoshiarpur,Punjab,India,2021-01-01 00:00:00,,,2023-07-06,,,,,,,,,,,,,
abhinav_ohri,experience,0,,,,,,,,,,,,,DeepThought Initiative,,,,2023-11-13 00:00:00,DTI,Research Software Engineer,2025-07-31,,,,,,,,,,,,,
abhinav_ohri,info,0,Abhinav,Ohri,media/images/abhinav_ohri.jpg,media/images/cover.jpg,Hi there! This is Abhinav.,abhinav_ohri,Abhinav Ohri,,,,,,,,,,,,,NaT,,,,,,,,,,,,,
abhinav_ohri,social,0,,,,,,,,KasukabeDefenceForce,,abhinavohri13@gmail.com,,,,,,,,,,NaT,,,,,,,,,,,,,
alexander_grunewald,education,0,,,,,,,,,,,Bachelors,Statistics and Data Science,Michigan State University,East Lansing,Michigan,USA,2021-01-01 00:00:00,,,NaT,,,,,,,,,,,,,
alexander_grunewald,experience,0,,,,,,,,,,,,,,,,,2023-01-01 00:00:00,kerzendorf,Research Assistant,2023-08-31,,,,,,,,,,,,,
alexander_grunewald,info,0,Alexander,Grunewald,media/images/alexander.jpg,media/images/cover.jpg,Alexander Grunewald is an undergraduate studen...,alexander_grunewald,Alexander Grunewald,,,,,,,,,,,,,NaT,,,,,,,,,,,,,
alexander_grunewald,project,0,,,,,,,,,,,,,,,,,,,,NaT,Emulator Project,,,,,,,,,,,,
alexander_grunewald,social,0,,,,,,,,AlexanderGrunewald,,grunew14@msu.edu,,,,,,,,,,NaT,,,,,,,,,,,alexander-grunewald-8007a51b5,,
andrew_fullard,education,0,,,,,,,,,,,PhD,Astronomy,University of Denver,Denver,Colorado,USA,2014-01-01 00:00:00,,,2020-12-31,,,,,,,,,,,,,
