In [None]:
import json
import pandas as pd
from pathlib import Path



In [None]:

# Locations are relative to the notebook's working directory.
GROUP_DATA_DIR = Path("../../group-data")
# One sub-directory per member is expected below this path (see MemberDataLoader).
MEMBERS_DIR_PATH = GROUP_DATA_DIR.joinpath("members/")

# Role title -> replacement role label.
# NOTE(review): not referenced in the visible cells; presumably used later to
# normalize member roles -- TODO confirm.
ROLE_MAP = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher",
}

# Degree label -> member category label.
# NOTE(review): not referenced in the visible cells -- TODO confirm intended use.
DEGREE_MAP = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",
    "Bachelors": "Undergraduate Student",
}


In [None]:

class MemberDataLoader:
    """Load per-member JSON files from a members directory into pandas DataFrames.

    Layout read by this class::

        <members_dir>/<member>/info.json                  (required per member)
        <members_dir>/<member>/jsons/social_links.json    (optional)
        <members_dir>/<member>/jsons/<data_type>.json     (optional, one per DATA_TYPES entry)

    After ``load_all_data()`` the instance exposes ``members_df`` and one
    ``<data_type>_df`` attribute per entry in ``DATA_TYPES``; every frame is
    indexed by ``member_id``.
    """

    # Each data type is read from "<name>.json" and exposed as "<name>_df".
    DATA_TYPES = (
        'education', 'experiences', 'projects', 'awards',
        'outreach', 'documents', 'posters', 'publications',
    )
    # Files whose records have 'start_date' / 'end_date' fields parsed.
    START_END_DATE_FILES = frozenset({'education.json', 'experiences.json', 'projects.json', 'outreach.json'})
    # Files whose records have a single 'date' field parsed.
    SINGLE_DATE_FILES = frozenset({'awards.json', 'publications.json'})

    def __init__(self, members_dir: Path = MEMBERS_DIR_PATH):
        """Remember the directory that holds one sub-directory per member."""
        self.members_dir = members_dir

    def load_records(self, jsons_dir, filename, member_id):
        """Read ``jsons_dir / filename`` and tag every record with ``member_id``.

        Returns an empty list when the file does not exist. The file content is
        iterated and each item gets a ``'member_id'`` key assigned, so it is
        expected to be a JSON array of objects.
        """
        path = jsons_dir / filename
        if not path.exists():
            # A member simply may not have this kind of data.
            return []
        # Explicit UTF-8: the default for read_text() is locale-dependent.
        records = json.loads(path.read_text(encoding="utf-8"))
        for record in records:
            record['member_id'] = member_id
        return records

    def parse_dates(self, records, date_fields, member_id=None):
        """Convert the given fields of each record to timestamps, in place.

        Missing or falsy fields are skipped. A value ``pd.to_datetime`` rejects
        is left unchanged; the error is printed together with ``member_id``
        only when a ``member_id`` is supplied. Returns the same ``records`` list.
        """
        for record in records:
            for field in date_fields:
                if field in record and record[field]:
                    try:
                        record[field] = pd.to_datetime(record[field])
                    except (ValueError, TypeError) as e:
                        # ValueError covers pandas' date-parsing failures;
                        # TypeError covers non-convertible value types.
                        # `is not None` so a falsy id such as 0 is still reported.
                        if member_id is not None:
                            print(e, member_id)
        return records

    @staticmethod
    def _build_full_name(member_info):
        """Return '<nick or first name> <last name>', treating missing/null parts as empty."""
        given = member_info.get('nick_name') or member_info.get('first_name') or ''
        family = member_info.get('last_name') or ''
        return f"{given} {family}".strip()

    @staticmethod
    def _to_indexed_frame(records, source_column):
        """Build a DataFrame from ``records`` indexed by ``source_column``, index named 'member_id'.

        An empty ``records`` list yields an empty frame instead of the KeyError
        that ``set_index`` raises when the column does not exist.
        """
        if not records:
            return pd.DataFrame(index=pd.Index([], name='member_id'))
        df = pd.DataFrame(records).set_index(source_column)
        df.index.name = 'member_id'
        return df

    def load_all_data(self):
        """Read every member directory and populate ``members_df`` and the ``<data_type>_df`` attributes.

        Entries of ``members_dir`` that are not directories are ignored, and
        directories without an ``info.json`` are skipped with a printed notice.

        Raises:
            FileNotFoundError: if ``members_dir`` is not an existing directory.
            KeyError: if a member's ``info.json`` has no ``'id'`` key.
        """
        members_dir = Path(self.members_dir)
        if not members_dir.is_dir():
            raise FileNotFoundError(f"Members directory not found: {members_dir}")

        data_config = {dt: f"{dt}.json" for dt in self.DATA_TYPES}
        data = {key: [] for key in data_config}
        members_data = []

        # sorted(): glob yields entries in arbitrary order; sorting keeps the
        # resulting frames reproducible between runs.
        for member_dir in sorted(members_dir.glob("*")):
            if not member_dir.is_dir():
                continue  # stray file next to the member directories
            info_path = member_dir / "info.json"
            if not info_path.is_file():
                print(f"Skipping {member_dir.name}: no info.json")
                continue

            member_info = json.loads(info_path.read_text(encoding="utf-8"))
            member_id = member_info["id"]
            member_info['full_name'] = self._build_full_name(member_info)

            jsons_dir = member_dir / "jsons"

            # Social links are merged into the member row as extra columns.
            social_path = jsons_dir / "social_links.json"
            if social_path.exists():
                member_info.update(json.loads(social_path.read_text(encoding="utf-8")))

            members_data.append(member_info)

            for key, filename in data_config.items():
                records = self.load_records(jsons_dir, filename, member_id)
                if filename in self.START_END_DATE_FILES:
                    # No member_id is passed here, so unparseable start/end values
                    # are kept as-is without a message.
                    # NOTE(review): presumably intentional for open-ended values -- confirm.
                    records = self.parse_dates(records, ['start_date', 'end_date'])
                elif filename in self.SINGLE_DATE_FILES:
                    records = self.parse_dates(records, ['date'], member_id)
                data[key].extend(records)

        self.members_df = self._to_indexed_frame(members_data, 'id')

        for key, records in data.items():
            setattr(self, f"{key}_df", self._to_indexed_frame(records, 'member_id'))


# Build the loader for the configured members directory, then read every
# member's files so the *_df attributes shown in the cells below exist.
loader = MemberDataLoader(members_dir=MEMBERS_DIR_PATH)
loader.load_all_data()



In [None]:
loader.members_df

In [None]:
loader.education_df

In [None]:
loader.experiences_df

In [None]:
loader.projects_df

In [None]:
loader.awards_df

In [None]:
loader.outreach_df

In [None]:
loader.documents_df

In [None]:
loader.posters_df

In [None]:
loader.publications_df