# Setup

In [2028]:
# Environment setup
import pandas as pd
import numpy as np
# import requests
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env from the project root (parent of data_cleaning)
load_dotenv(Path.cwd().resolve().parent / ".env")
GOOGLE_SHEETS_API_KEY = os.getenv('GOOGLE_SHEETS_API_KEY')
PROJECTS_SHEET_ID = os.getenv('PROJECTS_SHEET_ID')
ACTIVES_SHEET_ID = os.getenv('ACTIVES_SHEET_ID')
FALL_ATTENDANCE_SHEET_ID = os.getenv('FALL_ATTENDANCE_SHEET_ID')
BASE_URL = "https://sheets.googleapis.com/v4/spreadsheets"
load_dotenv(Path.cwd().resolve().parent / ".env")

True

In [2029]:
#def get_sheet_titles(spreadsheet_id: str, api_key: str) -> list[str]:
#    """Return all sheet/tab titles in the spreadsheet."""
#    resp = requests.get(
#        f"{BASE_URL}/{spreadsheet_id}",
#        params={
#            "fields": "sheets(properties(title))",
#            "key": api_key,
#        },
#        timeout=30,
#    )
#    resp.raise_for_status()
#    data = resp.json()
#    return [s["properties"]["title"] for s in data.get("sheets", [])]

In [2030]:
#def fetch_values_batch(spreadsheet_id: str, api_key: str, sheet_titles: list[str]) -> dict[str, list[list]]:
#    """Batch fetch values for provided sheet titles.
#    Returns mapping of title -> 2D list of cell values (including header row).
#    """
#    if not sheet_titles:
#        return {}
#    # Multiple 'ranges' params are supported by the API
#    params = [("key", api_key), ("valueRenderOption", "UNFORMATTED_VALUE"), ("dateTimeRenderOption", "FORMATTED_STRING")]
#    params.extend(("ranges", title) for title in sheet_titles)
#    resp = requests.get(
#        f"{BASE_URL}/{spreadsheet_id}/values:batchGet",
#        params=params,
#        timeout=60,
#    )
#    resp.raise_for_status()
#    payload = resp.json()

#    values_by_title: dict[str, list[list]] = {}
#    for vr, title in zip(payload.get("valueRanges", []), sheet_titles):
#        values_by_title[title] = vr.get("values", [])
#    return values_by_title

In [2031]:
#def values_to_dataframe(values: list[list]) -> pd.DataFrame:
#    """Convert a 2D list from Sheets API to a DataFrame, using first row as header.
#    Pads short rows so all rows match header length.
#    """
#    if not values:
#        return pd.DataFrame()
#    header = [str(h) for h in values[0]]
#    rows = values[1:] if len(values) > 1 else []
#    normalized_rows = [row + [""] * (len(header) - len(row)) for row in rows]
#    return pd.DataFrame(normalized_rows, columns=header)

In [2032]:
#def fetch_spreadsheet_as_dataframes(spreadsheet_id: str, api_key: str) -> dict[str, pd.DataFrame]:
#    """Fetch all sheets in a spreadsheet and return {sheet_title: DataFrame}."""
#    titles = get_sheet_titles(spreadsheet_id, api_key)
#    if not titles:
#        return {}
#    values_by_title = fetch_values_batch(spreadsheet_id, api_key, titles)
#    return {title: values_to_dataframe(values_by_title.get(title, [])) for title in titles}

# Load data

In [2033]:
# # fetch all data from Google Sheets into DataFrames per sheet
# # Build DataFrames per sheet for each spreadsheet ID found in .env
# PROJECTS_DFS = fetch_spreadsheet_as_dataframes(PROJECTS_SHEET_ID, GOOGLE_SHEETS_API_KEY) if PROJECTS_SHEET_ID else {}
# ACTIVES_DFS = fetch_spreadsheet_as_dataframes(ACTIVES_SHEET_ID, GOOGLE_SHEETS_API_KEY) if ACTIVES_SHEET_ID else {}
# FALL_ATTENDANCE_DFS = fetch_spreadsheet_as_dataframes(FALL_ATTENDANCE_SHEET_ID, GOOGLE_SHEETS_API_KEY) if FALL_ATTENDANCE_SHEET_ID else {}
# Load data from CSV URLs exported from Google Sheets
projects_df = pd.read_csv(os.getenv('PROJECTS_SHEET_URL'))
actives_df = pd.read_csv(os.getenv('ACTIVES_SHEET_URL'))
attendance_df = pd.read_csv(os.getenv('ATTENDANCE_URL'))
membership_df = pd.read_csv(os.getenv('MEMBERSHIP_URL'))

In [2034]:
projects_df['Quarter'] = projects_df['Quarter'].apply(lambda x: ("SU" + x[1:]) if isinstance(x, str) and x.startswith("S") and x[1:].isdigit() else x
)

In [2035]:
actives_df['Timestamp'] = pd.to_datetime(actives_df['Timestamp'])
actives_df['Name'] = actives_df['Name'].str.lower()
actives_df['Year'] = actives_df['Year'].map({'1st': 1, '2nd': 2, '3rd': 3, '4th': 4, '5th': 5}).fillna(actives_df['Year'])
actives_df['Are you planning to be an active member this quarter? (All actives have to pay dues)'] = actives_df.iloc[:, 3].apply(lambda x: True if x == 'Yes' else False)

In [2036]:
#actives_df.dtypes

In [2037]:
projects_df['Status'] = projects_df['Status'].fillna("")
projects_df.columns = ['quarter', 'company', 'point_of_contact', 'project_manager', 'associates',
       'nda', 'status', 'description']

In [2038]:
projects_df['nda'] = projects_df['nda'].apply(lambda x: True if x == "Yes" else False)

In [2039]:
projects_df.head()

Unnamed: 0,quarter,company,point_of_contact,project_manager,associates,nda,status,description
0,SU23,CARI Health,,Ashley Lee & Edmond Hong,,False,,The project focused on two primary objectives:...
1,FA23,UCSD Craft Center,,Sydney Jang,,True,,TCG identified strategies to enhance client en...
2,FA23,MD Revolution,,Dhathry Doppalapudi,,False,,The project focused on completing a competitiv...
3,FA23,Atmo Biosciences,,Isabel Wang,,True,,This project focused on conducting market rese...
4,WI24,Empirical,,Daniel Woo,,True,,Created a list of companies aligning with Empi...


In [2040]:
import re
import pandas as pd

# 1. Identify rows that contain "brain corp" (case-insensitive)
is_brain_corp = projects_df['company'].str.contains(r"brain\s*corp", case=False, na=False)

# 2. Clean company names (remove parentheses + inside), EXCEPT brain corp
cleaned_company = projects_df['company'].copy()

cleaned_company[~is_brain_corp] = (
    cleaned_company[~is_brain_corp]
    .str.replace(r"\s*\(.*?\)", "", regex=True)
    .str.strip()
)

# 3. Build unique company mapping *from cleaned names*
unique_companies = cleaned_company.unique()
companies_range = range(len(unique_companies))
company_inds = dict(zip(unique_companies, companies_range))

# 4. Apply mapping back to projects_df (brain corp keeps its cleaned name)
projects_df['company'] = cleaned_company.replace(company_inds)

# 5. Construct companies_df lookup table
companies_df = pd.DataFrame({
    "company_id": companies_range,
    "name": unique_companies
})


  projects_df['company'] = cleaned_company.replace(company_inds)


# company

In [2041]:
companies_df

Unnamed: 0,company_id,name
0,0,CARI Health
1,1,UCSD Craft Center
2,2,MD Revolution
3,3,Atmo Biosciences
4,4,Empirical
5,5,Kurin
6,6,Robust Physics
7,7,Brain Corp
8,8,Cordial
9,9,Persperion Diagnostics


# project

In [2042]:
projects_df['status'] = projects_df['status'].replace('', '-')
projects_df['donated'] = (projects_df['status'] == 'Donated')
projects_df['dnf'] = (projects_df['status'] == 'Did not finish')
projects_df['donated'] = (projects_df['status'] == 'Donated')
projects_df['point_of_contact'] = projects_df['point_of_contact'].replace(np.nan, '-')
projects_df['associates'] = projects_df['associates'].replace(np.nan, '-')

In [2043]:
#projects_df.head()

# quarter

In [2044]:
projects_df['quarter'] = (projects_df['quarter'].str.upper()
    .str.replace(r'\bS(\d{2})\b', r'SP\1', regex=True)
    .str.replace(r'\bF(\d{2})\b', r'FA\1', regex=True)
    .str.replace(r'\bW(\d{2})\b', r'WI\1', regex=True)
)
quarter_df = pd.DataFrame({"quarter_id": projects_df['quarter'].unique()})
#quarter_df.head()

In [2045]:
# projects_df['associates']

# member

In [2046]:
coffee = pd.read_csv('coffee.csv')
emails = pd.read_csv('emails.csv')
meme = pd.read_csv('members.csv')

In [2047]:
# Starting with membership_df (your raw form responses)
df = membership_df.copy()

# Create the members DataFrame with the correct columns
members = pd.DataFrame(columns=[
    'member_id', 'PID', 'name', 'quarter_entered', 'quarter_graduating',
    'role', 'ucsd_email', 'personal_email', 'track'
])

# Assign the fields
members['member_id'] = range(len(df))   # UID starting at 0
members['PID'] = df['PID:']

# Build full name and convert to Title Case
members['name'] = (
    df['First Name:'].str.strip() + " " + df['Last Name:'].str.strip()
).str.title()

members['quarter_entered'] = df['Quarter Entered TCG:']
members['quarter_graduating'] = df['Planned Graduation Quarter:']
members['role'] = df['Current Position in TCG (If you are a board member, type your role in Other)']
members['ucsd_email'] = df['UCSD Email: (include @ucsd.edu)']
members['personal_email'] = df['Primary Personal Email:']
members['track'] = df['Track:']

#members.head()


In [2048]:
import pandas as pd

# ============================================================
# 1. Normalize members + meme names for storage
# ============================================================
members['name'] = members['name'].astype(str).str.strip().str.title()
meme['name'] = meme['name'].astype(str).str.strip().str.title()

# ============================================================
# 2. Build fast lookup sets (DO NOT store these in the DF)
# ============================================================
name_set = set(members['name'].astype(str).str.strip().str.lower())
ucsd_email_set = set(members['ucsd_email'].fillna("").astype(str).str.strip().str.lower())
personal_email_set = set(members['personal_email'].fillna("").astype(str).str.strip().str.lower())

# ============================================================
# 3. Helper: check if email already exists anywhere
# ============================================================
def email_exists(email):
    if not isinstance(email, str):
        return False
    e = email.strip().lower()
    return (e in ucsd_email_set) or (e in personal_email_set)

# ============================================================
# 4. Prepare logging
# ============================================================
skipped = []   # store skipped rows for printing

# Next available unique member_id
next_id = members['member_id'].max() + 1

# ============================================================
# 5. Merge loop
# ============================================================
for _, row in meme.iterrows():

    raw_name = row['name'].strip().title()      # clean name
    clean_name = raw_name.lower()               # for comparison only

    # FIX: normalize email before using it
    email = row['email'] if isinstance(row['email'], str) else None
    if isinstance(email, str) and email.strip().lower() in ["", "-", "na", "n/a", "none", "null", "nan"]:
        email = None

    # --- Duplicate checks ---
    name_exists_flag = clean_name in name_set
    email_exists_flag = email_exists(email)

    # --- If duplicate, store in skipped list and continue ---
    if name_exists_flag or email_exists_flag:
        reason = []
        if name_exists_flag:
            reason.append("name match")
        if email_exists_flag:
            reason.append("email match")
        skipped.append({
            "name": raw_name,
            "email": email,
            "reason": ", ".join(reason)
        })
        continue


    # ============================================================
    # 6. Determine email placement
    # ============================================================
    if email and email.endswith("ucsd.edu"):
        ucsd_email = email
        personal_email = pd.NA
    elif email:
        ucsd_email = pd.NA
        personal_email = email
    else:
        ucsd_email = pd.NA
        personal_email = pd.NA

    # ============================================================
    # 7. Build and append the new row
    # ============================================================
    new_row = {
        'member_id': next_id,
        'PID': pd.NA,
        'name': raw_name,
        'quarter_entered': pd.NA,
        'quarter_graduating': pd.NA,
        'status': row.get('status', 0),
        'role': row.get('role', pd.NA),
        'ucsd_email': ucsd_email,
        'personal_email': personal_email,
        'track': pd.NA
    }

    members = pd.concat([members, pd.DataFrame([new_row])], ignore_index=True)

    # ============================================================
    # 8. Update matching sets so later rows detect duplicates
    # ============================================================
    name_set.add(clean_name)

    if isinstance(ucsd_email, str):
        ucsd_email_set.add(ucsd_email.strip().lower())
    if isinstance(personal_email, str):
        personal_email_set.add(personal_email.strip().lower())

    next_id += 1

# ============================================================
# 9. Print all skipped entries
# ============================================================
"""print("\n=== Skipped Entries (Not Added) ===")
for entry in skipped:
    print(f"- {entry['name']} | {entry['email']}  --> {entry['reason']}")"""


'print("\n=== Skipped Entries (Not Added) ===")\nfor entry in skipped:\n    print(f"- {entry[\'name\']} | {entry[\'email\']}  --> {entry[\'reason\']}")'

In [2049]:
members['status'] = True
members.loc[
    members['name'].str.contains("Aathi|Michael Wang|Veyna", case=False, na=False),
    'status'
] = False

members['role'] = members['role'].apply(lambda x: np.nan if x == '-' else x)

members

Unnamed: 0,member_id,PID,name,quarter_entered,quarter_graduating,role,ucsd_email,personal_email,track,status
0,0,A17991015,Noah Golder,FA25,SP27,Analyst,ngolder@ucsd.edu,nhgolder@gmail.com,Tech,True
1,1,A17845085,Holly Zhang,FA24,WI27,Associate,hoz022@ucsd.edu,hollyzhang05@gmail.com,Non-Tech,True
2,2,A17275299,Mihir Joshi,FA22,WI26,Associate,myjoshi@ucsd.edu,mihiryj@gmail.com,Tech,True
3,3,A18373882,Max Dreben,FA25,SP28,Analyst,mdreben@ucsd.edu,maxkd7735@gmail.com,Non-Tech,True
4,4,A18362535,Sonia Sahu,WI25,SP28,Associate,Sosahu@ucsd.edu,soniasahu00@gmail.com,Non-Tech,True
...,...,...,...,...,...,...,...,...,...,...
85,85,,Shray Kudva,,,,skudva@ucsd.edu,,,True
86,86,,Sparsh Jha,,,,,sparsh.jha12@gmail.com,,True
87,87,,Veyna Karanth,,,,,,,False
88,88,,Zachary Ishida,,,,,,,True


# assignment

In [2050]:
assignments = pd.DataFrame(columns=['member_id', 'project_id'])
assignments['member_id'] = members['member_id']
assignments['project_id'] = '-'
#assignments.head()

# GBM

In [2051]:
import pandas as pd
import datetime

# 1. Convert Timestamp → datetime
attendance_df['Timestamp'] = pd.to_datetime(attendance_df['Timestamp'], errors='coerce')

# 2. Extract unique calendar dates (sorted)
dates = sorted(attendance_df['Timestamp'].dt.date.unique())

# 3. Filter out adjacent days (keep earlier day only)
filtered_dates = []
for d in dates:
    if not filtered_dates:
        filtered_dates.append(d)
    else:
        prev = filtered_dates[-1]
        # Keep only if NOT the next day
        if (d - prev).days != 1:
            filtered_dates.append(d)

# 4. Convert date objects → datetime at midnight
final_dates = [datetime.datetime.combine(d, datetime.time.min) for d in filtered_dates]

# 5. Quarter logic (FA25 only: Sep–Dec)
def get_quarter(dt):
    if dt.month in [9, 10, 11, 12]:
        return 'FA25'
    return None

# 6. Build GBM table
gbm = pd.DataFrame({
    'gbm_id': range(len(final_dates)),
    'quarter_id': [get_quarter(d) for d in final_dates],
    'date': final_dates
})

gbm = gbm.sort_values('date').reset_index(drop=True)

gbm


Unnamed: 0,gbm_id,quarter_id,date
0,0,FA25,2025-10-02
1,1,FA25,2025-10-22
2,2,FA25,2025-10-29
3,3,FA25,2025-11-12
4,4,FA25,2025-11-19


# attendance

In [2052]:
# will take a bit to clean
attendance = pd.DataFrame(columns=['member_id', 'gbm_id', 'status'])
attendance['member_id'] = members['member_id']
attendance['gbm_id'] = gbm['gbm_id']
attendance['status'] = '-'
attendance.head()

attendance['status'] = np.random.choice([True, False], len(attendance))

#attendance

# enrollment

In [2053]:
enrollment = pd.DataFrame(columns=['member_id', 'quarter_id'])
enrollment['member_id'] = members[members['status'] == 1]['member_id']
enrollment['quarter_id'] = 'FA25'
enrollment.head()

Unnamed: 0,member_id,quarter_id
0,0,FA25
1,1,FA25
2,2,FA25
3,3,FA25
4,4,FA25


# assignment & project (2)

In [2054]:
#projects_df.head()

In [2055]:
#projects_df['associates']

In [2056]:
assig_arr = {"member_id": [], "project_id": []}

In [2057]:
def match_project_assignments(project_id, names_string):
    names = [n.strip() for n in names_string.split("\n") if n.strip()]
    for name in names:
        if (members['name'] == name).any():
            assig_arr['member_id'].append(members.loc[members['name'] == name].iloc[0]['member_id'])
            assig_arr['project_id'].append(project_id)
    return

In [2058]:
for index, row in projects_df.iterrows():
    match_project_assignments(index, row['associates'])

In [2059]:
assig_df = pd.DataFrame(assig_arr)
#assig_df

assig_df

Unnamed: 0,member_id,project_id
0,19,18
1,76,18
2,61,18
3,10,19
4,1,19
...,...,...
64,29,36
65,53,36
66,74,37
67,65,37


In [2060]:
projects_df['project_id'] = range(0, len(projects_df))
projects_df = projects_df.rename(columns={'quarter': 'quarter_id', 'company': 'company_id'})

In [2061]:
projects_df = projects_df.drop(['associates', 'status'], axis=1)
projects_df = projects_df[['project_id', 'quarter_id', 'company_id', 'point_of_contact', 'project_manager', 'nda', 'donated', 'dnf', 'description']]

In [2062]:
#### Add project_manager column to assig_df ####

# ==========================================
# 1. CLEAN NAMES FOR RELIABLE MATCHING
# ==========================================

member_name_map = {
    n.strip().lower(): mid
    for mid, n in zip(members['member_id'], members['name'])
}


# ==========================================
# 2. BUILD PROJECT → MANAGER_ID LIST (handles newline, &, and, comma)
# ==========================================

project_manager_map = {}

for project_id, pm_string in zip(projects_df['project_id'], projects_df['project_manager']):
    
    if pd.isna(pm_string) or str(pm_string).strip() in ['', '-']:
        project_manager_map[project_id] = []
        continue

    pm_string = str(pm_string)

    # Normalize all separators:
    # - newline → " & "
    # - " and " → " & "
    # - comma → " & "
    pm_string = (
        pm_string
        .replace("\n", " & ")
        .replace(" and ", " & ")
        .replace(",", " & ")
    )

    # Split into individual names
    pm_names = [n.strip() for n in pm_string.split("&") if n.strip()]

    manager_ids = []
    for name in pm_names:
        clean_name = name.strip().lower()
        if clean_name in member_name_map:
            manager_ids.append(member_name_map[clean_name])
        else:
            print(f"⚠ Warning: Project manager '{name}' not found in members")

    project_manager_map[project_id] = manager_ids


# ==========================================
# 3. ADD MISSING ASSIGNMENTS FOR PROJECT MANAGERS
# ==========================================

new_assignments = []

for project_id, manager_ids in project_manager_map.items():
    for manager_id in manager_ids:

        already_assigned = (
            (assig_df['member_id'] == manager_id) &
            (assig_df['project_id'] == project_id)
        ).any()

        if not already_assigned:
            new_assignments.append({
                'member_id': manager_id,
                'project_id': project_id,
                'project_manager': True
            })

if new_assignments:
    assig_df = pd.concat([assig_df, pd.DataFrame(new_assignments)], ignore_index=True)
    print(f"Added {len(new_assignments)} new assignment(s) for project managers")
else:
    print("No new project manager assignments were added.")


# ==========================================
# 4. UPDATE project_manager BOOLEAN COLUMN
# ==========================================

if 'project_manager' not in assig_df.columns:
    assig_df['project_manager'] = False

assig_df['project_manager'] = assig_df.apply(
    lambda r: r['member_id'] in project_manager_map.get(r['project_id'], []),
    axis=1
)

# Summary
print(f"Total assignments: {len(assig_df)}")
print(f"Project manager assignments: {assig_df['project_manager'].sum()}")


Added 27 new assignment(s) for project managers
Total assignments: 96
Project manager assignments: 27


In [2063]:
projects_df = projects_df.drop(columns = ['project_manager'])

## All the Dataframes

#### Projects ####

In [2064]:
#projects_df.head

In [2065]:
#### Parse point_of_contact to extract all contacts ####

import re

# Function to parse a single contact string like "Name <email@domain.com>"
def parse_contact(contact_str):
    """Parse a contact string to extract name and email.
    Format: 'Name <email@domain.com>'
    Returns: (name, email) tuple or (None, None) if parsing fails
    """
    contact_str = str(contact_str).strip()
    if not contact_str or contact_str == '-' or contact_str == 'nan':
        return None, None
    
    # Match pattern: Name <email>
    match = re.match(r'^(.+?)\s*<([^>]+)>$', contact_str)
    if match:
        name = match.group(1).strip()
        email = match.group(2).strip().lower()
        return name, email
    return None, None

# Collect all unique contacts and project-contact relationships
all_contacts = {}  # email -> name
project_contact_pairs = []  # (project_id, email) tuples

for idx, row in projects_df.iterrows():
    project_id = row['project_id']
    poc_str = row['point_of_contact']
    
    if pd.isna(poc_str) or poc_str == '-' or str(poc_str).strip() == '':
        continue
    
    # Split by newline for multiple contacts
    contacts = [c.strip() for c in str(poc_str).split('\n') if c.strip()]
    
    for contact in contacts:
        name, email = parse_contact(contact)
        if name and email:
            # Store contact info (use most recent name if email appears multiple times)
            all_contacts[email] = name
            # Add project-contact relationship
            project_contact_pairs.append((project_id, email))

print(f"Found {len(all_contacts)} unique contacts")
print(f"Found {len(project_contact_pairs)} project-contact relationships")


Found 21 unique contacts
Found 36 project-contact relationships


In [2066]:
#### Create contacts table ####

contacts_df = pd.DataFrame([
    {'email': email, 'name': name}
    for email, name in all_contacts.items()
])

# Set email as index
contacts_df = contacts_df.set_index('email').sort_index()

print(f"Created contacts table with {len(contacts_df)} rows")
#contacts_df.head(10)


Created contacts table with 21 rows


In [2067]:
#### Create project_contacts junction table ####

project_contacts_df = pd.DataFrame(project_contact_pairs, columns=['project_id', 'contact_email'])

# Set composite index (project_id, contact_email)
project_contacts_df = project_contacts_df.set_index(['project_id', 'contact_email']).sort_index()

print(f"Created project_contacts table with {len(project_contacts_df)} rows")
#project_contacts_df.head(10)


Created project_contacts table with 36 rows


In [2068]:
#### Remove point_of_contact from projects_df ####

projects_df = projects_df.drop(columns=['point_of_contact'])

print(f"Removed point_of_contact column from projects_df")
print(f"Projects table now has {len(projects_df.columns)} columns")
#projects_df.head()


Removed point_of_contact column from projects_df
Projects table now has 7 columns


In [2069]:
contacts_df = contacts_df.reset_index()


In [2070]:
attendance['gbm_id'] = attendance['gbm_id'].apply(lambda x: np.nan if pd.isna(x) else int(x))

In [2071]:
# ==========================================
# Prepare cleaned fields
# ==========================================

# Clean member emails
members['ucsd_email_clean'] = members['ucsd_email'].str.strip().str.lower()
members['personal_email_clean'] = members['personal_email'].str.strip().str.lower()

# Clean submitted attendance emails
attendance_df['email_clean'] = attendance_df['Email Address'].str.strip().str.lower()

# Clean names
members['name_clean'] = members['name'].str.strip().str.lower()
attendance_df['name_clean'] = attendance_df['Name'].str.strip().str.lower()


# ==========================================
# 1. MATCH BY EMAIL FIRST
# ==========================================

# Build master email → member_id map
email_map = pd.concat([
    members[['member_id', 'ucsd_email_clean']].rename(columns={'ucsd_email_clean': 'email_clean'}),
    members[['member_id', 'personal_email_clean']].rename(columns={'personal_email_clean': 'email_clean'})
])

email_map = email_map.dropna().drop_duplicates().set_index('email_clean')['member_id']

# Email match
attendance_df['member_id'] = attendance_df['email_clean'].map(email_map)


# ==========================================
# 2. MATCH REMAINING BY FULL CLEANED NAME
# ==========================================

missing = attendance_df['member_id'].isna()

name_map = members.set_index('name_clean')['member_id']

attendance_df.loc[missing, 'member_id'] = (
    attendance_df.loc[missing, 'name_clean'].map(name_map)
)


# ==========================================
# 3. MAP TO GBM BY DATE
# ==========================================

attendance_df['Timestamp'] = pd.to_datetime(attendance_df['Timestamp'], errors='coerce')
attendance_df['date_only'] = attendance_df['Timestamp'].dt.normalize()

gbm_dates = pd.DataFrame({
    'gbm_id': gbm['gbm_id'],
    'date_only': pd.to_datetime(gbm['date']).dt.normalize()
})
date_map = gbm_dates.set_index('date_only')['gbm_id']

attendance_df['gbm_id'] = attendance_df['date_only'].map(date_map)


# ==========================================
# 4. BUILD FINAL ATTENDANCE TABLE
# ==========================================

# Create member × gbm grid
attendance = members[['member_id']].merge(gbm[['gbm_id']], how='cross')
attendance['status'] = False

# Mark observed attendance
observed = (
    attendance_df[['member_id', 'gbm_id']]
    .dropna()
    .drop_duplicates()
)
observed['status'] = True

attendance = attendance.merge(
    observed,
    on=['member_id', 'gbm_id'],
    how='left',
    suffixes=('', '_obs')
)

attendance['status'] = attendance['status_obs'].fillna(False).astype(bool)
attendance = attendance[['member_id', 'gbm_id', 'status']]

temp_cols = [
    'ucsd_email_clean',
    'personal_email_clean',
    'name_clean'
]

members.drop(columns=temp_cols, inplace=True, errors='ignore')



  attendance['status'] = attendance['status_obs'].fillna(False).astype(bool)


In [2072]:
enrollment = enrollment.reset_index(drop=True)
project_contacts_df = project_contacts_df.reset_index()


In [2073]:
project_contacts_df = project_contacts_df.rename(columns = {'contact_email': 'email'})

In [2074]:
import pandas as pd
import numpy as np

# ------------------------------------------------
# LOOKUP MAPS
# ------------------------------------------------

# (1) Member-name → member_id
name_to_member_id = {
    n.strip().lower(): mid
    for mid, n in zip(members['member_id'], members['name'])
}

# (2) Company-name → company_id
company_to_company_id = {
    str(name).strip().lower(): int(cid)
    for name, cid in zip(companies_df['name'], companies_df['company_id'])
    if pd.notna(name)
}

# ------------------------------------------------
# FIX & VALIDATE projects_df BEFORE BUILDING MAPS
# ------------------------------------------------

# Ensure scalar integer company_id
projects_df['company_id'] = projects_df['company_id'].astype(int)

# Ensure scalar string quarter_id
projects_df['quarter_id'] = projects_df['quarter_id'].astype(str)

# Debug: identify malformed rows (Series / lists instead of scalars)
bad_rows = projects_df[
    ~projects_df['company_id'].apply(np.isscalar) |
    ~projects_df['quarter_id'].apply(np.isscalar)
]

if len(bad_rows):
    print("⚠️ BAD ROWS FOUND IN projects_df:")
    print(bad_rows)
    raise ValueError("company_id or quarter_id contains non-scalar values")

# ------------------------------------------------
# BUILD: (company_id, quarter_id) → [project_id list]
# ------------------------------------------------

company_quarter_to_project_ids = (
    projects_df
    .groupby(['company_id', 'quarter_id'])['project_id']
    .apply(list)
    .to_dict()
)

# ------------------------------------------------
# EXISTING ASSIGNMENTS
# ------------------------------------------------
existing_assignments = set(
    (int(r['member_id']), int(r['project_id']))
    for _, r in assig_df.iterrows()
)

new_assignments = []

# For printing/debug — optional
member_id_to_name = dict(zip(members['member_id'], members['name']))
company_id_to_name = dict(zip(companies_df['company_id'], companies_df['name']))
project_id_to_company = dict(zip(projects_df['project_id'], projects_df['company_id']))
project_id_to_quarter = dict(zip(projects_df['project_id'], projects_df['quarter_id']))

# ------------------------------------------------
# MAIN LOOP — PARSE MEMBERSHIP FORM & MATCH PROJECTS
# ------------------------------------------------

for _, row in membership_df.iterrows():

    # Build full name
    full_name = f"{row['First Name:'].strip().title()} {row['Last Name:'].strip().title()}"
    member_key = full_name.lower()

    if member_key not in name_to_member_id:
        continue

    member_id = int(name_to_member_id[member_key])

    # No projects
    projects_raw = row["Projects Worked On: (Choose N/A if no project yet)"]
    if pd.isna(projects_raw) or str(projects_raw).strip().lower() == "n/a":
        continue

    # Split into project entries
    project_entries = [p.strip() for p in str(projects_raw).split(",") if p.strip()]

    for entry in project_entries:

        # Remove "(PM X)" or anything inside parentheses
        clean_entry = entry.split("(", 1)[0].strip()

        # Normalize dash types to "-"
        for dash in ["—", "–", "‒", "−"]:
            clean_entry = clean_entry.replace(dash, "-")

        # Split into [quarter, company]
        parts = [p.strip() for p in clean_entry.split("-", 1) if p.strip()]
        if len(parts) < 2:
            continue

        quarter_id = parts[0]
        company_name_raw = parts[1]
        company_name = company_name_raw.lower().strip()

        # Company not found
        if company_name not in company_to_company_id:
            continue

        company_id = company_to_company_id[company_name]

        # Lookup projects
        key = (company_id, quarter_id)
        project_ids = company_quarter_to_project_ids.get(key, [])

        if not project_ids:
            continue

        # Add assignments
        for project_id in project_ids:
            pair = (member_id, int(project_id))

            if pair in existing_assignments:
                continue

            existing_assignments.add(pair)
            new_assignments.append(pair)

# ------------------------------------------------
# APPEND NEW ASSIGNMENTS
# ------------------------------------------------

for member_id, project_id in new_assignments:
    assig_df.loc[len(assig_df)] = {
        "member_id": member_id,
        "project_id": project_id,
        "project_manager": False
    }

# Remove any accidental duplicates
assig_df = assig_df.drop_duplicates(subset=['member_id', 'project_id'])


In [2075]:
# Add track column defaulting to 'non-tech'
projects_df['track'] = 'non-tech'

# List of (quarter_id, company name) pairs that should be TECH
tech_projects = [
    ('FA25', 'Brain Corp'),
    ('SP25', 'KlonIT'),
    ('SP25', 'OutOfTheBlue'),
    ('WI25', 'OutOfTheBlue'),
    ('SP24', 'Cordial'),
    ('WI24', 'Robust Physics')
]

# Build lookup: company_id -> cleaned company name
company_lookup = {
    cid: str(name).strip().lower()
    for cid, name in zip(companies_df['company_id'], companies_df['name'])
}

# Create a set of cleaned tech project keys: (quarter, cleaned company name)
tech_keys = {(q, c.strip().lower()) for q, c in tech_projects}

# Apply tech labels
track_values = []
for _, row in projects_df.iterrows():
    quarter = row['quarter_id']
    company_id = row['company_id']
    company_clean = company_lookup.get(company_id, None)

    if company_clean is not None and (quarter, company_clean) in tech_keys:
        track_values.append('tech')
    else:
        track_values.append('non-tech')

projects_df['track'] = track_values


In [2076]:
projects_df_before = projects_df.copy()

# =======================================================
# Fix Brain Corp grouping + tech track by merging back
# =======================================================

# 1. Merge to restore company names from companies_df
proj = projects_df.merge(
    companies_df[['company_id', 'name']],
    on='company_id',
    how='left'
)

# 2. Identify Brain Corp variants using merged 'name'
mask_brain = proj['name'].str.contains(r'brain corp', case=False, na=False)

# 3. Pick canonical Brain Corp company_id
brain_id = proj.loc[mask_brain, 'company_id'].min()

# Assign all Brain Corp rows the same ID
proj.loc[mask_brain, 'company_id'] = brain_id

# 4. Track = 'tech' if original name had "(Tech)"
mask_brain_tech = proj['name'].str.contains(r'\(.*tech.*\)', case=False, na=False)
proj.loc[mask_brain_tech, 'track'] = 'tech'

# 5. Update projects_df
projects_df = proj.drop(columns=['name']).copy()

# =======================================================
# Remove duplicate Brain Corp entries from companies_df
# =======================================================

# Normalize names to detect all Brain Corp variants
companies_df['name_clean'] = (
    companies_df['name']
    .str.replace(r"\s*\(.*?\)", "", regex=True)
    .str.strip()
    .apply(lambda x: 'Brain Corp' if 'brain corp' in x.lower() else x)
)

# Keep only canonical Brain Corp ID
canonical_id = companies_df.loc[
    companies_df['name_clean'] == 'Brain Corp', 'company_id'
].min()

# Remove all others
companies_df = companies_df[
    ~(
        (companies_df['name_clean'] == 'Brain Corp') &
        (companies_df['company_id'] != canonical_id)
    )
]

companies_df = companies_df.drop(columns=['name_clean']).reset_index(drop=True)


# ====================================================
# Deduplicate projects WITHOUT losing project_id
# ====================================================

project_id_map = {}   # old project_id → new project_id
final_projects = []

# group by company and quarter to detect duplicates
for (_, group) in projects_df.groupby(['company_id', 'quarter_id', 'track'], sort=False):
    
    # If only one project in the group → keep it as is
    if len(group) == 1:
        final_projects.append(group.iloc[0])
        continue
    
    # If duplicates exist → pick the best canonical project
    # Prefer non-null descriptions
    group_sorted = group.sort_values(by=['description'], na_position='last')
    
    survivor = group_sorted.iloc[0]          # the "real" project
    survivor_id = survivor['project_id']
    final_projects.append(survivor)
    
    # Record mappings for all dropped IDs
    for _, row in group_sorted.iloc[1:].iterrows():
        old_id = row['project_id']
        project_id_map[old_id] = survivor_id

# Rebuild projects_df
projects_df = pd.DataFrame(final_projects).reset_index(drop=True)

# ====================================================
# Remap assignment.project_id to canonical ones
# ====================================================
if project_id_map:
    assig_df['project_id'] = assig_df['project_id'].apply(
        lambda x: project_id_map.get(x, x)
    )

# ====================================================
# Remap ALL child tables that reference project_id
# ====================================================

# ====================================================
# Remap all remaining tables that reference project_id
# ====================================================

if project_id_map:

    # Fix assignments
    assig_df['project_id'] = assig_df['project_id'].apply(
        lambda x: project_id_map.get(x, x)
    )

    # Fix project_contacts
    project_contacts_df['project_id'] = project_contacts_df['project_id'].apply(
        lambda x: project_id_map.get(x, x)
    )




## All the Dataframes

#### Projects

In [2077]:
projects_df.head()

Unnamed: 0,project_id,quarter_id,company_id,nda,donated,dnf,description,track
0,0,SU23,0,False,False,False,The project focused on two primary objectives:...,non-tech
1,1,FA23,1,True,False,False,TCG identified strategies to enhance client en...,non-tech
2,2,FA23,2,False,False,False,The project focused on completing a competitiv...,non-tech
3,3,FA23,3,True,False,False,This project focused on conducting market rese...,non-tech
4,4,WI24,4,True,False,False,Created a list of companies aligning with Empi...,non-tech


#### Members ####

In [2078]:
members.head()

Unnamed: 0,member_id,PID,name,quarter_entered,quarter_graduating,role,ucsd_email,personal_email,track,status
0,0,A17991015,Noah Golder,FA25,SP27,Analyst,ngolder@ucsd.edu,nhgolder@gmail.com,Tech,True
1,1,A17845085,Holly Zhang,FA24,WI27,Associate,hoz022@ucsd.edu,hollyzhang05@gmail.com,Non-Tech,True
2,2,A17275299,Mihir Joshi,FA22,WI26,Associate,myjoshi@ucsd.edu,mihiryj@gmail.com,Tech,True
3,3,A18373882,Max Dreben,FA25,SP28,Analyst,mdreben@ucsd.edu,maxkd7735@gmail.com,Non-Tech,True
4,4,A18362535,Sonia Sahu,WI25,SP28,Associate,Sosahu@ucsd.edu,soniasahu00@gmail.com,Non-Tech,True


#### Assignments ####

In [2079]:
assig_df.head()

Unnamed: 0,member_id,project_id,project_manager
0,19,18,False
1,76,18,False
2,61,18,False
3,10,20,False
4,1,20,False


#### Companies ####

In [2080]:
companies_df

Unnamed: 0,company_id,name
0,0,CARI Health
1,1,UCSD Craft Center
2,2,MD Revolution
3,3,Atmo Biosciences
4,4,Empirical
5,5,Kurin
6,6,Robust Physics
7,7,Brain Corp
8,8,Cordial
9,9,Persperion Diagnostics


#### Attendance ####

In [2081]:
attendance.head()

Unnamed: 0,member_id,gbm_id,status
0,0,0,False
1,0,1,True
2,0,2,True
3,0,3,True
4,0,4,True


#### GBM ####

In [2082]:
gbm.head()

Unnamed: 0,gbm_id,quarter_id,date
0,0,FA25,2025-10-02
1,1,FA25,2025-10-22
2,2,FA25,2025-10-29
3,3,FA25,2025-11-12
4,4,FA25,2025-11-19


#### Enrollment ####

In [2083]:
enrollment.head()

Unnamed: 0,member_id,quarter_id
0,0,FA25
1,1,FA25
2,2,FA25
3,3,FA25
4,4,FA25


#### Quarter ####

In [2105]:
quarter_df.head()

Unnamed: 0,quarter_id
0,SU23
1,FA23
2,WI24
3,SP24
4,SU24


#### Point of Contact

In [2085]:
project_contacts_df.head()

Unnamed: 0,project_id,email
0,7,daniel.hoffman@braincorp.com
1,8,angela@cordial.io
2,9,luyin@persperiontech.com
3,11,taskin@anume.com.au
4,12,pmgodoy@ucsd.edu


#### Contacts

In [2086]:
contacts_df.head()

Unnamed: 0,email,name
0,alejandra.chaidez@sdcounty.ca.gov,Alejandra Chaidez
1,alexphan@ucsd.edu,Alex Phan
2,angela@cordial.io,Angela Wang
3,ank016@ucsd.edu,Anchit Kumar
4,brandon.lewis@sdcounty.ca.gov,Brandon Lewis


# Database Migration

In [2087]:
import psycopg2
from psycopg2.extras import execute_values

# Get database params
DATABASE_URL = os.getenv('DATABASE_URL')

# Connect to the database
try:
    conn = psycopg2.connect(DATABASE_URL)
    print("Connection successful!")
    
    # Create a cursor to execute SQL queries
    cur = conn.cursor()

except Exception as e:
    print(f"Failed to connect: {e}")

Connection successful!


In [2088]:
# Drop existing tables and create new ones
drop_tables_sql = """
DROP TABLE IF EXISTS attendance CASCADE;
DROP TABLE IF EXISTS enrollment CASCADE;
DROP TABLE IF EXISTS assignment CASCADE;
DROP TABLE IF EXISTS project_contacts CASCADE;
DROP TABLE IF EXISTS gbm CASCADE;
DROP TABLE IF EXISTS project CASCADE;
DROP TABLE IF EXISTS member CASCADE;
DROP TABLE IF EXISTS quarter CASCADE;
DROP TABLE IF EXISTS company CASCADE;
DROP TABLE IF EXISTS contacts CASCADE;
"""

create_tables_sql = """
-- Company table
CREATE TABLE IF NOT EXISTS company (
    company_id INTEGER PRIMARY KEY,
    name TEXT NOT NULL
);

-- Quarter table
CREATE TABLE IF NOT EXISTS quarter (
    quarter_id TEXT PRIMARY KEY
);

-- Member table
CREATE TABLE IF NOT EXISTS member (
    member_id INTEGER PRIMARY KEY,
    PID TEXT,
    name TEXT NOT NULL,
    quarter_entered TEXT,
    quarter_graduating TEXT,
    role TEXT,
    ucsd_email TEXT,
    personal_email TEXT,
    track TEXT,
    status BOOLEAN NOT NULL
);

-- Project table
CREATE TABLE IF NOT EXISTS project (
    project_id INTEGER PRIMARY KEY,
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    company_id INTEGER NOT NULL REFERENCES company(company_id),
    nda BOOLEAN NOT NULL,
    donated BOOLEAN NOT NULL,
    dnf BOOLEAN NOT NULL,
    description TEXT,
    track TEXT
);

-- Assignment table (many-to-many between member and project)
CREATE TABLE IF NOT EXISTS assignment (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    project_id INTEGER NOT NULL REFERENCES project(project_id),
    project_manager BOOLEAN NOT NULL,
    PRIMARY KEY (member_id, project_id)
);

-- GBM (General Body Meeting) table
CREATE TABLE IF NOT EXISTS gbm (
    gbm_id INTEGER PRIMARY KEY,
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    date TIMESTAMP NOT NULL
);

-- Attendance table
CREATE TABLE IF NOT EXISTS attendance (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    gbm_id INTEGER NOT NULL REFERENCES gbm(gbm_id),
    status BOOLEAN NOT NULL,
    PRIMARY KEY (member_id, gbm_id)
);

-- Enrollment table (which members are enrolled in which quarters)
CREATE TABLE IF NOT EXISTS enrollment (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    PRIMARY KEY (member_id, quarter_id)
);

-- Contacts (List of all contact emails and names)
CREATE TABLE IF NOT EXISTS contacts (
    email TEXT PRIMARY KEY,
    name TEXT NOT NULL
);

-- Project contacts (which contacts for which projects)
CREATE TABLE IF NOT EXISTS project_contacts (
    project_id INTEGER NOT NULL REFERENCES project(project_id),
    email TEXT NOT NULL REFERENCES contacts(email),
    PRIMARY KEY (project_id, email)
);

"""

cur.execute(drop_tables_sql)
cur.execute(create_tables_sql)
conn.commit()


In [2089]:
# Insert companies
companies_data = [(int(row['company_id']), str(row['name'])) 
                  for _, row in companies_df.iterrows()]
execute_values(cur, 
    "INSERT INTO company (company_id, name) VALUES %s",
    companies_data)
conn.commit()

In [2090]:
# Insert quarters
quarters_data = [(str(row['quarter_id']),) for _, row in quarter_df.iterrows()]
execute_values(cur, 
    "INSERT INTO quarter (quarter_id) VALUES %s",
    quarters_data)
conn.commit()

In [2091]:
members_data = [
    tuple(
        int(row[col])                                  # member_id only
        if col == 'member_id'
        else (
            bool(row[col])                             # status column stays boolean
            if col == 'status'
            else (
                str(row[col]) if pd.notna(row[col]) else None
            )
        )
        for col in members.columns
    )
    for _, row in members.iterrows()
]


execute_values(
    cur,
    """
    INSERT INTO member (
        member_id,
        PID,
        name,
        quarter_entered,
        quarter_graduating,
        role,
        ucsd_email,
        personal_email,
        track,
        status
    ) VALUES %s
    """,
    members_data
)

conn.commit()


In [2092]:
# Insert projects
projects_data = [
    (int(row['project_id']), str(row['quarter_id']), int(row['company_id']),
    bool(row['nda']), bool(row['donated']), bool(row['dnf']), str(row['description']), str(row['track']))
    for _, row in projects_df.iterrows()
]
execute_values(cur,
    "INSERT INTO project (project_id, quarter_id, company_id, nda, donated, dnf, description, track) VALUES %s",
    projects_data)
conn.commit()

In [2093]:
# Insert assignments
if len(assig_df) > 0:
    assignments_data = [(int(row['member_id']), int(row['project_id']), bool(row['project_manager'])) 
                       for _, row in assig_df.iterrows()]
    execute_values(cur,
        "INSERT INTO assignment (member_id, project_id, project_manager) VALUES %s ON CONFLICT DO NOTHING",
        assignments_data)
    conn.commit()
    print(f"Inserted {len(assignments_data)} assignments")
else:
    print("No assignments to insert")


Inserted 139 assignments


In [2094]:
# Insert GBMs
gbm_data = [(int(row['gbm_id']), str(row['quarter_id']), row['date']) 
            for _, row in gbm.iterrows()]
execute_values(cur,
    "INSERT INTO gbm (gbm_id, quarter_id, date) VALUES %s",
    gbm_data)
conn.commit()


In [2095]:
# Insert attendance (for ALL members)
attendance_data = [(int(row['member_id']), int(row['gbm_id']), bool(row['status']))
                   for _, row in attendance.iterrows()]
execute_values(cur,
    "INSERT INTO attendance (member_id, gbm_id, status) VALUES %s",
    attendance_data)
conn.commit()


In [2096]:
# Insert enrollments
enrollment_data = [(int(row['member_id']), str(row['quarter_id'])) 
                   for _, row in enrollment.iterrows()]
execute_values(cur,
    "INSERT INTO enrollment (member_id, quarter_id) VALUES %s",
    enrollment_data)
conn.commit()


In [2097]:
# Insert project_contacts_df
contacts_data = [(str(row['email']), str(row['name'])) 
                   for _, row in contacts_df.iterrows()]
execute_values(cur,
    "INSERT INTO contacts (email, name) VALUES %s",
    contacts_data)
conn.commit()

In [2098]:
project_contacts_data = [
    (int(row['project_id']), str(row['email']))
    for _, row in project_contacts_df.iterrows()
]

execute_values(
    cur,
    """
    INSERT INTO project_contacts (project_id, email)
    VALUES %s
    ON CONFLICT DO NOTHING
    """,
    project_contacts_data
)
conn.commit()


In [2099]:
# Verify data and show summary
print("Database Summary:")

tables = ['company', 'quarter', 'member', 'project', 'assignment', 'gbm', 'attendance', 'enrollment', 'contacts', 'project_contacts']
for table in tables:
    cur.execute(f"SELECT COUNT(*) FROM {table}")
    count = cur.fetchone()[0]
    print(f"{table.ljust(15)}: {count} rows")

# Close connection
cur.close()
conn.close()


Database Summary:
company        : 26 rows
quarter        : 10 rows
member         : 90 rows
project        : 37 rows
assignment     : 124 rows
gbm            : 5 rows
attendance     : 450 rows
enrollment     : 87 rows
contacts       : 21 rows
project_contacts: 34 rows
