In [7]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from pathlib import Path
import shutil
from datetime import datetime, date
import numpy as np
from PIL import Image
from dataclasses import dataclass

# Utility

### Paths

In [2]:
GROUP_DATA_DIR = Path("../../group-data")
TEMPLATE_DIR_PATH = GROUP_DATA_DIR.parent / "groupwebsite_generator" / "templates"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data/"
HOSTING_PATH = GROUP_DATA_DIR.parent / "kerzendorf-lab.github.io"
ARTICLE_DIR_PATH = Path("../../research_news/articles")
ARTICLE_IMAGE_DESTINATION_DIR = (HOSTING_PATH / "website_files" / "images" / "article_content")
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members/"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"
OPPORTUNITIES_PATH = WEBSITE_DATA_PATH / "content" / "opportunities.json"
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"

GENERAL_TAGS = [
    "Paper",
    "Poster", 
    "Talk",
    "Award",
    "New Team Member",
    "PhD",
    "Conference",
    "Undergraduate",
    "Event",
    "Achievement"
]

# Define tag colors mapping
TAG_COLORS = {
    'paper': '#FF6B6B',  # Coral red
    'poster': '#4ECDC4', # Turquoise
    'talk': '#45B7D1',   # Light blue
    'award': '#96CEB4',  # Sage green
    'new team member': '#FFBE0B', # Golden yellow
    'phd': '#9B5DE5',    # Purple
    'conference': '#FF006E', # Pink
    'undergraduate': '#8338EC', # Violet
    'event': '#3A86FF',  # Royal blue
    'achievement': '#FB5607', # Orange
    'astrophysics': '#2EC4B6', # Teal
    'machine learning': '#FF9F1C', # Light orange
    'software': '#E71D36', # Bright red
    'research': '#011627', # Dark blue
    'news': '#41EAD4'    # Cyan
}

### Mappings

In [4]:
# Needed columns for articles
ARTICLE_METADATA_FIELDS = [
    "article_id",
    "category",
    "date",
    "tags",
    "title",
    "cover_image",
    "short_description"
]
# Groups and institution used in filtering data
GROUP_FILTER = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER = "Michigan State University"

# Map roles to standardized roles for consistency
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# Map degrees to standardized academic levels
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",  #  if end_date is present
    "Bachelors": "Undergraduate Student",
}

INDIVIDUAL_MEMBER_SECTION_MAP = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}

In [13]:
class MemberDataConfig:
    def __init__(self, group_data_dir: Path = None):
        self.group_data_dir = group_data_dir or Path("/Users/atharva/workspace/code/tardis-main/lab/group-data")
        self.members_dir = self.group_data_dir / "members"


In [11]:
config  = MemberDataConfig()
config.group_data_dir, config.members_dir

(PosixPath('../../group-data'), PosixPath('../../group-data/members'))

In [None]:
class MemberDataLoader:
    def __init__(self, config: MemberDataConfig = None):
        self.config = config or MemberDataConfig()

    def load_all_member_data(self) -> pd.DataFrame:
        # Load base member info
        info_df = self.load_member_info()

        # Load all related data
        experiences_df = self.load_member_data_with_dates("experiences.json", "experiences")
        education_df = self.load_member_data_with_dates("education.json", "education")
        projects_df = self.load_member_data_with_dates("projects.json", "projects")
        awards_df = self.load_member_json_data("awards.json")
        outreach_df = self.load_member_json_data("outreach.json")
        documents_df = self.load_member_json_data("documents.json")
        social_links_df = self.load_social_links()

        merged_df = self.merge_all_data(
            info_df,
            experiences_df,
            education_df,
            projects_df,
            awards_df,
            outreach_df,
            documents_df,
            social_links_df
        )

        return merged_df

    def load_member_info(self) -> pd.DataFrame:
        info_list = []
        
        for info_path in self.config.members_dir.glob("*/info.json"):
            member_data = json.loads(info_path.read_text())
            info_list.append(member_data)

        df = pd.DataFrame(info_list)
        df = df.set_index("id")

        df["full_name"] = df.apply(
            lambda row: (
                f"{row.get('nick_name', row.get('first_name'))} {row['last_name']}"
                if pd.notna(row.get('nick_name'))
                else f"{row.get('first_name', '')} {row['last_name']}"
            ),
            axis=1,
        )

        return df.fillna("")

    def load_member_json_data(self, json_filename, prefix=None):
        data_frames = []

        for info_path in self.config.members_dir.glob("*/info.json"):
            try:
                member_info = json.loads(info_path.read_text())
                member_id = member_info["id"]
                data_path = info_path.parent / "jsons" / json_filename

                if data_path.exists():
                    member_data = json.loads(data_path.read_text())

                    # Add member_id to each entry
                    for entry in member_data:
                        entry["id"] = member_id

                    df = pd.DataFrame(member_data)

                    # Add prefix to columns if specified
                    if prefix:
                        df = df.rename(
                            columns={
                                col: f"{prefix}_{col}"
                                for col in df.columns if col != "id"
                            }
                        )

                    data_frames.append(df)

            except (json.JSONDecodeError, KeyError) as e:
                print(f"Error loading data from {info_path}: {e}")
                continue

        if not data_frames:
            return pd.DataFrame()

        combined_df = pd.concat(data_frames, ignore_index=True)
        combined_df.set_index("id", inplace=True)
        return combined_df.fillna("")

    

    def load_member_data_with_dates(self, json_filename, data_type):
        df = self.load_member_json_data(json_filename)
        if not df.empty:
            # Parse dates
            date_columns = ['start_date', 'end_date']
            for col in date_columns:
                if col in df.columns:
                    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='coerce')
        return df


    def load_social_links(self):
        social_links = []

        for social_path in self.config.members_dir.rglob("social_links.json"):
            try:
                links = json.loads(social_path.read_text())
                info_path = social_path.parent.parent / "info.json"
                info = json.loads(info_path.read_text())
                links["id"] = info["id"]
                social_links.append(links)
            except (json.JSONDecodeError, KeyError) as e:
                logger.warning(f"Error loading social links from {social_path}: {e}")
                continue

        if not social_links:
            return pd.DataFrame()

        df = pd.DataFrame(social_links)
        df.set_index("id", inplace=True)
        return df.fillna("") 
    
    def merge_all_data(
        self,
        info_df: pd.DataFrame,
        experiences_df: pd.DataFrame,
        education_df: pd.DataFrame,
        projects_df: pd.DataFrame,
        awards_df: pd.DataFrame,
        outreach_df: pd.DataFrame,
        documents_df: pd.DataFrame,
        social_links_df: pd.DataFrame
    ):
        all_rows = []

        # Process each member
        for member_id in info_df.index:
            member_info = info_df.loc[member_id].to_dict()

            # Add info record
            info_row = member_info.copy()
            info_row['record_type'] = 'info'
            info_row['record_index'] = 0
            info_row['member_id'] = member_id
            all_rows.append(info_row)

            # Add social links record
            if member_id in social_links_df.index:
                social_row = social_links_df.loc[member_id].to_dict()
                social_row['record_type'] = 'social'
                social_row['record_index'] = 0
                social_row['member_id'] = member_id
                all_rows.append(social_row)

            # Add multi-record data
            data_sources = {
                'education': education_df,
                'experience': experiences_df,
                'project': projects_df,
                'award': awards_df,
                'outreach': outreach_df,
                'document': documents_df
            }

            for record_type, df in data_sources.items():
                if df.empty:
                    continue

                # Get all records for this member
                member_records = df.loc[df.index == member_id] if member_id in df.index else pd.DataFrame()

                for idx, (_, record) in enumerate(member_records.iterrows()):
                    record_row = record.to_dict()
                    record_row['record_type'] = record_type
                    record_row['record_index'] = idx
                    record_row['member_id'] = member_id
                    all_rows.append(record_row)

        # Create DataFrame from all rows
        if not all_rows:
            return pd.DataFrame()

        result_df = pd.DataFrame(all_rows)

        # Set multi-index
        result_df.set_index(['member_id', 'record_type', 'record_index'], inplace=True)

        return result_df


In [28]:
config = MemberDataConfig()
loader = MemberDataLoader(config)


In [29]:
loader.load_all_member_data()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,first_name,last_name,image_path,cover_image_path,introduction,nick_name,full_name,github_handle,email,website,...,title,group,role,project_title,description,award_name,date,program_name,document_name,link
member_id,record_type,record_index,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
gracie_tvrdik,info,0,Gracie,Tvrdik,media/images/gracie.jpg,media/images/cover.jpg,I am an undergraduate student at Bowling Green...,,Gracie Tvrdik,,,,...,,,,,,,,,,
gracie_tvrdik,social,0,,,,,,,,gracietv,graysontvrdik1@gmail.com,,...,,,,,,,,,,
gracie_tvrdik,education,0,,,,,,,,,,,...,,,,,,,,,,
gracie_tvrdik,experience,0,,,,,,,,,,,...,,kerzendorf,REU student,,,,,,,
gracie_tvrdik,experience,1,,,,,,,,,,,...,,,Student Learning Analyst,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
kiyah_young-wilson,social,0,,,,,,,,Youngwilson,kiyahyoungwilson@yahoo.com,,...,,,,,,,,,,
kiyah_young-wilson,education,0,,,,,,,,,,,...,,,,,,,,,,
kiyah_young-wilson,experience,0,,,,,,,,,,,...,,kerzendorf,REU student,,,,,,,
kiyah_young-wilson,experience,1,,,,,,,,,,,...,,,Co-nuclear reaction simulator,,,,,,,


In [32]:
members_df.index.get_level_values('member_id').unique()

Index(['gracie_tvrdik', 'josh_shields', 'anirban_dutta', 'erin_visser',
       'abhinav_ohri', 'deeksha_mohanty', 'vicente_amado', 'yuki_matsumura',
       'ryan_groneck', 'andrew_fullard', 'isaac_smith', 'hayden_monk',
       'atharva_arya', 'richard_dow', 'bea_lu', 'sona_chitchyan',
       'morgan_sandler', 'connor_mcclellan', 'iliomar_rodriguez_ramos',
       'jaladh_singhal', 'jack_o_brien', 'cecelia_powers', 'clyde_watson',
       'benjamin_mellon', 'kevin_cawley', 'sofia_biriouk', 'jing_lu',
       'jared_goldberg', 'alexander_grunewald', 'harshul_gupta',
       'logan_mcclellan', 'wolfgang_kerzendorf', 'kiyah_young-wilson'],
      dtype='object', name='member_id')

In [35]:
record_types = members_df.index.get_level_values('record_type').value_counts()
for record_type, count in record_types.items():
            print(f"  {record_type}: {count}")

  experience: 52
  education: 42
  info: 33
  project: 31
  award: 31
  social: 27
  outreach: 3
  document: 1


In [41]:
first_member_id = unique_members[0]
members_df.xs(first_member_id, level='member_id').fillna('')

Unnamed: 0_level_0,Unnamed: 1_level_0,first_name,last_name,image_path,cover_image_path,introduction,nick_name,full_name,github_handle,email,website,...,title,group,role,project_title,description,award_name,date,program_name,document_name,link
record_type,record_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
info,0,Gracie,Tvrdik,media/images/gracie.jpg,media/images/cover.jpg,I am an undergraduate student at Bowling Green...,,Gracie Tvrdik,,,,...,,,,,,,,,,
social,0,,,,,,,,gracietv,graysontvrdik1@gmail.com,,...,,,,,,,,,,
education,0,,,,,,,,,,,...,,,,,,,,,,
experience,0,,,,,,,,,,,...,,kerzendorf,REU student,,,,,,,
experience,1,,,,,,,,,,,...,,,Student Learning Analyst,,,,,,,
project,0,,,,,,,,,,,...,,,,Identifying Co-Author Connections from Astroph...,,,,,,


In [43]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(members_df.sort_index(level='member_id').fillna(''))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,first_name,last_name,image_path,cover_image_path,introduction,nick_name,full_name,github_handle,email,website,twitter_handle,linkedin_handle,orcid,linkedin,degree,subject,institution,city,state,country,start_date,end_date,title,group,role,project_title,description,award_name,date,program_name,document_name,link
member_id,record_type,record_index,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
abhinav_ohri,education,0,,,,,,,,,,,,,,,Bachelors,Commerce(Honours),Sanatan Dharma College,Hoshiarpur,Punjab,India,2021-01-01 00:00:00,2023-07-06,,,,,,,,,,
abhinav_ohri,experience,0,,,,,,,,,,,,,,,,,DeepThought Initiative,,,,2023-11-13 00:00:00,2025-07-31,,DTI,Research Software Engineer,,,,,,,
abhinav_ohri,info,0,Abhinav,Ohri,media/images/abhinav_ohri.jpg,media/images/cover.jpg,Hi there! This is Abhinav.,,Abhinav Ohri,,,,,,,,,,,,,,,NaT,,,,,,,,,,
abhinav_ohri,social,0,,,,,,,,KasukabeDefenceForce,abhinavohri13@gmail.com,,,,,,,,,,,,,NaT,,,,,,,,,,
alexander_grunewald,education,0,,,,,,,,,,,,,,,Bachelors,Statistics and Data Science,Michigan State University,East Lansing,Michigan,USA,2021-01-01 00:00:00,NaT,,,,,,,,,,
alexander_grunewald,experience,0,,,,,,,,,,,,,,,,,,,,,2023-01-01 00:00:00,2023-08-31,,kerzendorf,Research Assistant,,,,,,,
alexander_grunewald,info,0,Alexander,Grunewald,media/images/alexander.jpg,media/images/cover.jpg,Alexander Grunewald is an undergraduate studen...,,Alexander Grunewald,,,,,,,,,,,,,,,NaT,,,,,,,,,,
alexander_grunewald,project,0,,,,,,,,,,,,,,,,,,,,,,NaT,,,,Emulator Project,,,,,,
alexander_grunewald,social,0,,,,,,,,AlexanderGrunewald,grunew14@msu.edu,,,alexander-grunewald-8007a51b5,,,,,,,,,,NaT,,,,,,,,,,
andrew_fullard,education,0,,,,,,,,,,,,,,,PhD,Astronomy,University of Denver,Denver,Colorado,USA,2014-01-01 00:00:00,2020-12-31,,,,,,,,,,
