# Setup

In [249]:
# Environment setup
import pandas as pd
import numpy as np
# import requests
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env from the project root (parent of data_cleaning).
# The file is loaded exactly once; the original cell called load_dotenv twice with the same path.
ENV_FILE = Path.cwd().resolve().parent / ".env"
env_loaded = load_dotenv(ENV_FILE)

# Google Sheets identifiers. In this notebook they are only referenced by the
# commented-out Sheets API helpers in the following cells.
GOOGLE_SHEETS_API_KEY = os.getenv('GOOGLE_SHEETS_API_KEY')
PROJECTS_SHEET_ID = os.getenv('PROJECTS_SHEET_ID')
ACTIVES_SHEET_ID = os.getenv('ACTIVES_SHEET_ID')
FALL_ATTENDANCE_SHEET_ID = os.getenv('FALL_ATTENDANCE_SHEET_ID')
BASE_URL = "https://sheets.googleapis.com/v4/spreadsheets"

# Echo the load result as the cell output so a missing/empty .env is visible immediately.
env_loaded

True

In [250]:
#def get_sheet_titles(spreadsheet_id: str, api_key: str) -> list[str]:
#    """Return all sheet/tab titles in the spreadsheet."""
#    resp = requests.get(
#        f"{BASE_URL}/{spreadsheet_id}",
#        params={
#            "fields": "sheets(properties(title))",
#            "key": api_key,
#        },
#        timeout=30,
#    )
#    resp.raise_for_status()
#    data = resp.json()
#    return [s["properties"]["title"] for s in data.get("sheets", [])]

In [251]:
#def fetch_values_batch(spreadsheet_id: str, api_key: str, sheet_titles: list[str]) -> dict[str, list[list]]:
#    """Batch fetch values for provided sheet titles.
#    Returns mapping of title -> 2D list of cell values (including header row).
#    """
#    if not sheet_titles:
#        return {}
#    # Multiple 'ranges' params are supported by the API
#    params = [("key", api_key), ("valueRenderOption", "UNFORMATTED_VALUE"), ("dateTimeRenderOption", "FORMATTED_STRING")]
#    params.extend(("ranges", title) for title in sheet_titles)
#    resp = requests.get(
#        f"{BASE_URL}/{spreadsheet_id}/values:batchGet",
#        params=params,
#        timeout=60,
#    )
#    resp.raise_for_status()
#    payload = resp.json()

#    values_by_title: dict[str, list[list]] = {}
#    for vr, title in zip(payload.get("valueRanges", []), sheet_titles):
#        values_by_title[title] = vr.get("values", [])
#    return values_by_title

In [252]:
#def values_to_dataframe(values: list[list]) -> pd.DataFrame:
#    """Convert a 2D list from Sheets API to a DataFrame, using first row as header.
#    Pads short rows so all rows match header length.
#    """
#    if not values:
#        return pd.DataFrame()
#    header = [str(h) for h in values[0]]
#    rows = values[1:] if len(values) > 1 else []
#    normalized_rows = [row + [""] * (len(header) - len(row)) for row in rows]
#    return pd.DataFrame(normalized_rows, columns=header)

In [253]:
#def fetch_spreadsheet_as_dataframes(spreadsheet_id: str, api_key: str) -> dict[str, pd.DataFrame]:
#    """Fetch all sheets in a spreadsheet and return {sheet_title: DataFrame}."""
#    titles = get_sheet_titles(spreadsheet_id, api_key)
#    if not titles:
#        return {}
#    values_by_title = fetch_values_batch(spreadsheet_id, api_key, titles)
#    return {title: values_to_dataframe(values_by_title.get(title, [])) for title in titles}

# Load data

In [254]:
# # fetch all data from Google Sheets into DataFrames per sheet
# # Build DataFrames per sheet for each spreadsheet ID found in .env
# PROJECTS_DFS = fetch_spreadsheet_as_dataframes(PROJECTS_SHEET_ID, GOOGLE_SHEETS_API_KEY) if PROJECTS_SHEET_ID else {}
# ACTIVES_DFS = fetch_spreadsheet_as_dataframes(ACTIVES_SHEET_ID, GOOGLE_SHEETS_API_KEY) if ACTIVES_SHEET_ID else {}
# FALL_ATTENDANCE_DFS = fetch_spreadsheet_as_dataframes(FALL_ATTENDANCE_SHEET_ID, GOOGLE_SHEETS_API_KEY) if FALL_ATTENDANCE_SHEET_ID else {}
# Load data from CSV URLs exported from Google Sheets
def _read_sheet_csv(env_var):
    """Read the CSV whose URL is stored in the environment variable `env_var`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check the
            failure surfaces inside pandas as an unrelated-looking error about
            an invalid file path.
    """
    url = os.getenv(env_var)
    if not url:
        raise RuntimeError(f"Environment variable {env_var} is not set; cannot load sheet CSV")
    return pd.read_csv(url)


projects_df = _read_sheet_csv('PROJECTS_SHEET_URL')
actives_df = _read_sheet_csv('ACTIVES_SHEET_URL')
attendance_df = _read_sheet_csv('ATTENDANCE_URL')

In [255]:
# Normalise the actives sign-up sheet in place.
year_label_to_number = {'1st': 1, '2nd': 2, '3rd': 3, '4th': 4, '5th': 5}
active_question_column = 'Are you planning to be an active member this quarter? (All actives have to pay dues)'

actives_df['Timestamp'] = pd.to_datetime(actives_df['Timestamp'])
actives_df['Name'] = actives_df['Name'].str.lower()

# Known year labels become integers; any other value is kept as it was.
year_raw = actives_df['Year']
actives_df['Year'] = year_raw.map(year_label_to_number).fillna(year_raw)

# The answer is read by position (fourth column) and stored as a boolean:
# True only for an exact 'Yes'.
actives_df[active_question_column] = actives_df.iloc[:, 3] == 'Yes'

In [256]:
#actives_df.dtypes

In [257]:
# Blank out missing statuses while the sheet's original 'Status' header is still in place,
# then relabel all eight columns positionally with snake_case names.
status_filled = projects_df['Status'].fillna("")
projects_df['Status'] = status_filled
projects_df = projects_df.set_axis(
    [
        'quarter',
        'company',
        'point_of_contact',
        'project_manager',
        'associates',
        'nda',
        'status',
        'description',
    ],
    axis=1,
)

In [258]:
# NDA flag: True only for an exact "Yes"; blanks and any other text become False.
projects_df['nda'] = projects_df['nda'] == "Yes"

In [259]:
#projects_df.head()

In [260]:
# Build the company dimension table and swap company names in projects_df for integer ids.
# Ids follow first-appearance order of the names in projects_df.
# NOTE: not idempotent -- re-running after the names were replaced would treat the ids as names.
unique_companies = projects_df['company'].unique()
companies_range = range(len(unique_companies))
company_inds = dict(zip(unique_companies, companies_range))
companies_df = pd.DataFrame({"company_id": companies_range, "name": unique_companies})

# Series.map is a plain dictionary lookup. Series.replace with a str -> int mapping
# goes through pandas' deprecated silent-downcasting path and emits a FutureWarning.
projects_df['company'] = projects_df['company'].map(company_inds)

  projects_df['company'] = projects_df['company'].replace(company_inds)


# company

In [261]:
companies_df.head()

Unnamed: 0,company_id,name
0,0,CARI Health
1,1,UCSD Craft Center
2,2,MD Revolution
3,3,Atmo Biosciences
4,4,Empirical


# project

In [262]:
#projects_df.head()

In [263]:
# Empty status text becomes the '-' placeholder used for "no value" throughout this notebook.
projects_df['status'] = projects_df['status'].replace('', '-')

# Boolean outcome flags derived from the exact status text
# (the original cell assigned 'donated' twice with the same expression).
projects_df['donated'] = projects_df['status'] == 'Donated'
projects_df['dnf'] = projects_df['status'] == 'Did not finish'

# Missing free-text cells get the same '-' placeholder so later string handling never sees NaN.
projects_df['point_of_contact'] = projects_df['point_of_contact'].fillna('-')
projects_df['associates'] = projects_df['associates'].fillna('-')

In [264]:
#projects_df.head()

# quarter

In [265]:
# Canonicalise quarter codes: upper-case, then expand a one-letter season prefix
# followed by exactly two digits (S23 -> SP23, F23 -> FA23, W24 -> WI24).
quarter_codes = projects_df['quarter'].str.upper()
for season_pattern, season_expanded in (
    (r'\bS(\d{2})\b', r'SP\1'),
    (r'\bF(\d{2})\b', r'FA\1'),
    (r'\bW(\d{2})\b', r'WI\1'),
):
    quarter_codes = quarter_codes.str.replace(season_pattern, season_expanded, regex=True)
projects_df['quarter'] = quarter_codes

# One row per distinct quarter code, in first-appearance order.
quarter_df = pd.DataFrame({"quarter_id": projects_df['quarter'].unique()})
#quarter_df.head()

In [266]:
# projects_df['associates']

# member

In [267]:
# Local roster export; the member-building cell below reads its 'x' column as the
# member name and its 'Year' column as the class year.
coffee = pd.read_csv('coffee.csv')

In [268]:
# Only the 'name' column of this frame is used below, as the list of active names.
# NOTE(review): the same file is later re-read into `members`, replacing the frame
# built from coffee.csv -- confirm which source is authoritative.
df = pd.read_csv('members.csv')

In [269]:
# Build the member table from the coffee roster: one row per roster entry,
# member_id taken from the row index, placeholders for role/email.
members = pd.DataFrame(columns=['member_id', 'name', 'year', 'status', 'role', 'email'])
members['name'] = coffee['x']
members['year'] = coffee['Year']
members['status'] = 0
members['role'] = '-'
members['email'] = '-'
members['member_id'] = members.index

# Normalised active names. Missing values are dropped (NaN has no .lower()) and blank
# names are skipped: '' is a substring of every string and would flag every member.
active_names = [
    active_name
    for active_name in df['name'].dropna().astype(str).str.lower().str.strip()
    if active_name
]
active_first_names = {active_name.split()[0] for active_name in active_names}

# A member is active (status = 1) when some active name is contained in the member's
# name, or when the first names are identical.
# NOTE(review): the first-name rule also flags different people who share a first name.
member_names_clean = members['name'].fillna('').astype(str).str.lower().str.strip()
is_active = [
    bool(member_name)
    and (
        member_name.split()[0] in active_first_names
        or any(active_name in member_name for active_name in active_names)
    )
    for member_name in member_names_clean
]
members.loc[is_active, 'status'] = 1

#members.head()

In [270]:
# Load emails and match them to members by name.
# Only the 'email' column of this file is read by the matching loop below.
emails_df = pd.read_csv('emails.csv')

def extract_name_parts_from_email(email):
    """Guess (first, last) name fragments from the local part of an email address.

    The text before '@' is lower-cased and broken on '.', '_' and '-'. Only
    purely alphabetic tokens longer than one character count as name tokens.

    Returns:
        (first, last): `first` is the first name token, or, when there is none,
        the raw local part up to its first separator. `last` is the final name
        token when at least two exist, otherwise None.
        (None, None) for a missing or empty email.
    """
    if pd.isna(email) or not email:
        return None, None

    local_part = str(email).split('@')[0].lower()

    # Turn every separator into a space so one whitespace split yields the tokens.
    tokens = local_part.translate(str.maketrans('._-', '   ')).split()
    name_tokens = [token for token in tokens if len(token) > 1 and token.isalpha()]

    if name_tokens:
        first_part = name_tokens[0]
    else:
        # Nothing name-like: fall back to whatever precedes the first separator.
        first_part = local_part
        for separator in '._-':
            first_part = first_part.split(separator)[0]

    last_part = name_tokens[-1] if len(name_tokens) > 1 else None
    return first_part, last_part

# Match emails to members
# Each address is parsed once up front; the original re-parsed every email for every member.
email_candidates = []
for email in emails_df['email']:
    if pd.isna(email):
        continue
    email_first, email_last = extract_name_parts_from_email(email)
    email_candidates.append((email, str(email).split('@')[0].lower(), email_first, email_last))

# Iterate by index label so the .loc lookups below stay correct even if `members`
# does not carry a default 0..n-1 index.
for idx, member_name in members['name'].items():
    if members.loc[idx, 'email'] != '-':  # Skip if already has email
        continue

    member_name_lower = str(member_name).lower().strip()
    member_parts = member_name_lower.split()
    member_first_name = member_parts[0] if member_parts else ''
    member_last_name = member_parts[-1] if len(member_parts) > 1 else ''
    member_full_name = ' '.join(member_parts)

    best_match = None
    best_match_score = 0

    # Score every candidate email; the first one reaching the highest score wins.
    for email, email_username, email_first, email_last in email_candidates:
        score = 0

        # Exact match on extracted name parts
        if email_first and member_first_name and email_first == member_first_name:
            score += 2
            if email_last and member_last_name and email_last == member_last_name:
                score += 3  # Strong match with both first and last
            elif len(member_parts) == 1:
                score += 1  # Only first name, but member only has one name

        # Check if email username contains member's name parts
        if member_first_name and member_first_name in email_username:
            score += 1
        if member_last_name and member_last_name in email_username:
            score += 2

        # Check if the email's name parts appear in the member's full name (reverse)
        if email_first and email_first in member_full_name:
            score += 1
        if email_last and email_last in member_full_name:
            score += 1

        # Strictly-greater comparison keeps the earliest email on ties
        if score > best_match_score:
            best_match_score = score
            best_match = email

    # Only assign email if we have a reasonable match (score >= 2).
    # NOTE(review): nothing prevents the same email being assigned to several members.
    if best_match and best_match_score >= 2:
        members.loc[idx, 'email'] = best_match

#members.head()


In [271]:
# Replaces the `members` frame built in the cells above with the contents of members.csv.
# NOTE(review): the status/email matching computed above is discarded here unless
# members.csv was exported from it -- confirm the intended source of truth.
members = pd.read_csv('members.csv')
#members.head()

# assignment

In [272]:
# Placeholder assignment table: one row per member with a '-' project id.
# NOTE(review): nothing later in this notebook reads `assignments`; the table that
# is migrated to the database is `assig_df`.
assignments = pd.DataFrame(columns=['member_id', 'project_id'])
assignments['member_id'] = members['member_id']
assignments['project_id'] = '-'
#assignments.head()

# gbm

In [273]:
import datetime

# The three FA25 general body meetings, keyed 0..2 in date order.
dates = [datetime.datetime(2025, 10, day_of_month) for day_of_month in (2, 22, 29)]
quarters = ['FA25'] * len(dates)
gbm = pd.DataFrame({
    'gbm_id': range(len(dates)),
    'quarter_id': quarters,
    'date': dates,
})
gbm.head()

Unnamed: 0,gbm_id,quarter_id,date
0,0,FA25,2025-10-02
1,1,FA25,2025-10-22
2,2,FA25,2025-10-29


# attendance

In [274]:
# will take a bit to clean
# Placeholder attendance frame.
# NOTE(review): `attendance` is rebuilt from scratch by the "Build attendance table"
# cell further down, so nothing assigned here reaches the database.
attendance = pd.DataFrame(columns=['member_id', 'gbm_id', 'status'])
attendance['member_id'] = members['member_id']
# Series assignment aligns on index, so only rows whose index matches a gbm row get a gbm_id.
attendance['gbm_id'] = gbm['gbm_id']
attendance['status'] = '-'
attendance.head()

# Unseeded random True/False per row -- results differ on every run.
attendance['status'] = np.random.choice([True, False], len(attendance))

#attendance

# enrollment

In [275]:
# Enrol every member whose status is 1 in the FA25 quarter.
# The rows keep the members' original index labels; a later cell resets the index.
active_member_ids = members.loc[members['status'] == 1, 'member_id']
enrollment = pd.DataFrame({'member_id': active_member_ids, 'quarter_id': 'FA25'})
enrollment.head()

Unnamed: 0,member_id,quarter_id
0,0,FA25
9,9,FA25
10,10,FA25
11,11,FA25
12,12,FA25


# assignment & project (2)

In [276]:
#projects_df.head()

In [277]:
#projects_df['associates']

In [278]:
# Parallel accumulator lists filled by match_project_assignments: entry i of each
# list together forms one (member_id, project_id) assignment.
assig_arr = {"member_id": [], "project_id": []}

In [279]:
def match_project_assignments(project_id, names_string):
    """Record an assignment for every listed associate that is a known member.

    Args:
        project_id: identifier stored alongside each matched member.
        names_string: newline-separated associate names. Anything that is not
            a string (e.g. NaN from an empty cell) is ignored instead of
            raising AttributeError.

    Side effects:
        Appends to the module-level `assig_arr` lists. Names are compared to
        `members['name']` by exact equality; names with no match are skipped,
        and when several members share a name the first one is used.
    """
    if not isinstance(names_string, str):
        return
    for raw_name in names_string.split("\n"):
        name = raw_name.strip()
        if not name:
            continue
        # Single lookup per name (the original filtered `members` twice).
        matching_ids = members.loc[members['name'] == name, 'member_id']
        if not matching_ids.empty:
            assig_arr['member_id'].append(matching_ids.iloc[0])
            assig_arr['project_id'].append(project_id)
    return

In [280]:
# Feed each project's associates cell to the matcher. The DataFrame index label is
# used as the project id here (the explicit project_id column is added afterwards).
for project_index, associate_names in projects_df['associates'].items():
    match_project_assignments(project_index, associate_names)

In [281]:
# Materialise the accumulated (member_id, project_id) pairs as the assignment table.
assig_df = pd.DataFrame(assig_arr)
#assig_df

In [282]:
# Give every project a sequential id (0..n-1 in row order) and rename the two
# foreign-key columns to the names used by the database schema.
projects_df = (
    projects_df
    .assign(project_id=range(len(projects_df)))
    .rename(columns={'quarter': 'quarter_id', 'company': 'company_id'})
)

In [283]:
# Drop the raw text columns that have been converted into flags/assignments,
# then put the remaining columns into their final order.
project_column_order = [
    'project_id',
    'quarter_id',
    'company_id',
    'point_of_contact',
    'project_manager',
    'nda',
    'donated',
    'dnf',
    'description',
]
projects_df = projects_df.drop(columns=['associates', 'status'])[project_column_order]

In [284]:
#### Add project_manager column to assig_df ####

# Create a mapping of project_id to list of project manager member_ids
project_manager_map = {}

for project_id, pm_string in zip(projects_df['project_id'], projects_df['project_manager']):
    # Convert to text before using string methods so a non-string cell cannot raise.
    pm_text = '' if pd.isna(pm_string) else str(pm_string).strip()
    if pm_text in ('', '-'):
        project_manager_map[project_id] = []
        continue

    # Multiple managers are separated by ' & ' or ' and '; normalise to ' & ' and split.
    names = [name.strip() for name in pm_text.replace(' and ', ' & ').split(' & ')]

    manager_ids = []
    for name in names:
        # Exact name match against the members table; unknown names are skipped.
        matching_ids = members.loc[members['name'] == name, 'member_id']
        if matching_ids.empty:
            continue
        manager_id = matching_ids.iloc[0]
        # A name listed twice must not yield two identical assignments.
        if manager_id not in manager_ids:
            manager_ids.append(manager_id)

    project_manager_map[project_id] = manager_ids

# Add project managers to assig_df if they don't already have an assignment for that project.
# A set of known pairs replaces the per-manager DataFrame scan and also guards against
# adding the same (member, project) pair twice within this pass.
known_pairs = set(zip(assig_df['member_id'], assig_df['project_id']))
new_assignments = []
for project_id, manager_ids in project_manager_map.items():
    for manager_id in manager_ids:
        pair = (manager_id, project_id)
        if pair in known_pairs:
            continue
        known_pairs.add(pair)
        new_assignments.append({'member_id': manager_id, 'project_id': project_id})

# Add new assignments for project managers
if new_assignments:
    new_assig_df = pd.DataFrame(new_assignments)
    assig_df = pd.concat([assig_df, new_assig_df], ignore_index=True)
    print(f"Added {len(new_assignments)} new assignment(s) for project managers")

# Add project_manager column to assig_df
def is_project_manager(row):
    """Return True when the row's member is one of the managers mapped to the row's project.

    Reads the module-level `project_manager_map`; a project missing from the map
    has no managers, so the result is False.
    """
    managers_for_project = project_manager_map.get(row['project_id'], ())
    return row['member_id'] in managers_for_project

assig_df['project_manager'] = assig_df.apply(is_project_manager, axis=1)

# Display results
total_assignments = len(assig_df)
manager_assignments = assig_df['project_manager'].sum()
print(f"Total assignments: {total_assignments}")
print(f"Project manager assignments: {manager_assignments}")
#assig_df.head(20)


Added 23 new assignment(s) for project managers
Total assignments: 89
Project manager assignments: 23


In [285]:
# The manager names are now represented by the boolean `project_manager` column of
# assig_df, so the free-text column is removed from the project table.
projects_df = projects_df.drop(columns = ['project_manager'])

In [286]:
# Remove the role column loaded from members.csv.
# NOTE(review): a later cell adds `role` back as an all-NaN column; not idempotent --
# re-running this cell alone raises KeyError once the column is gone.
members = members.drop(columns = ['role'])

## Build contact and attendance tables

#### Projects ####

In [287]:
#projects_df.head

In [288]:
#### Parse point_of_contact to extract all contacts ####

import re

# Parse one contact entry written as "Name <email@domain.com>"
def parse_contact(contact_str):
    """Split a 'Name <email@domain.com>' string into its name and email.

    The input is converted to text and trimmed first. The email is returned
    lower-cased; the name is returned trimmed with its original casing.

    Returns:
        (name, email) on success, or (None, None) when the text is empty,
        is the '-' / 'nan' placeholder, or does not follow the
        'Name <email>' shape (a bare email address does not qualify).
    """
    text = str(contact_str).strip()
    if text in ('', '-', 'nan'):
        return None, None

    # Match pattern: Name <email>
    parsed = re.match(r'^(.+?)\s*<([^>]+)>$', text)
    if parsed is None:
        return None, None

    raw_name, raw_email = parsed.groups()
    return raw_name.strip(), raw_email.strip().lower()

# Collect all unique contacts and project-contact relationships
all_contacts = {}  # email -> name
project_contact_pairs = []  # (project_id, email) tuples, each pair at most once
seen_pairs = set()

for project_id, poc_str in zip(projects_df['project_id'], projects_df['point_of_contact']):
    if pd.isna(poc_str) or poc_str == '-' or str(poc_str).strip() == '':
        continue

    # One contact per line; parse_contact returns (None, None) for blank or malformed lines.
    for contact in str(poc_str).split('\n'):
        name, email = parse_contact(contact)
        if not (name and email):
            continue

        # Store contact info (the most recently seen name wins if an email repeats)
        all_contacts[email] = name

        # Record the relationship once: (project_id, email) is the primary key of the
        # project_contacts table, so a repeated pair would make the insert fail.
        pair = (project_id, email)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            project_contact_pairs.append(pair)

print(f"Found {len(all_contacts)} unique contacts")
print(f"Found {len(project_contact_pairs)} project-contact relationships")


Found 21 unique contacts
Found 36 project-contact relationships


In [289]:
#### Create contacts table ####

# Explicit column names keep the frame well-formed when no contacts were found;
# without them an empty input has no 'email' column and set_index raises KeyError.
contacts_df = pd.DataFrame(list(all_contacts.items()), columns=['email', 'name'])

# Set email as index (sorted alphabetically)
contacts_df = contacts_df.set_index('email').sort_index()

print(f"Created contacts table with {len(contacts_df)} rows")
#contacts_df.head(10)


Created contacts table with 21 rows


In [290]:
#### Create project_contacts junction table ####

# One row per (project_id, contact_email) pair, indexed and sorted by that composite key.
project_contacts_df = (
    pd.DataFrame(project_contact_pairs, columns=['project_id', 'contact_email'])
    .set_index(['project_id', 'contact_email'])
    .sort_index()
)

junction_row_count = len(project_contacts_df)
print(f"Created project_contacts table with {junction_row_count} rows")
#project_contacts_df.head(10)


Created project_contacts table with 36 rows


In [291]:
#### Remove point_of_contact from projects_df ####

# The contact text now lives in contacts_df / project_contacts_df.
projects_df = projects_df.drop('point_of_contact', axis=1)

print("Removed point_of_contact column from projects_df")
remaining_column_count = projects_df.shape[1]
print(f"Projects table now has {remaining_column_count} columns")
#projects_df.head()


Removed point_of_contact column from projects_df
Projects table now has 7 columns


In [292]:
# Turn the 'email' index back into a regular column; the contacts insert cell reads row['email'].
contacts_df = contacts_df.reset_index()


In [293]:
# Keep missing gbm ids as NaN and pass every other value through int().
# NOTE(review): `attendance` is rebuilt from scratch in the next cell, so this cast
# has no lasting effect.
attendance['gbm_id'] = attendance['gbm_id'].apply(lambda x: np.nan if pd.isna(x) else int(x))

In [294]:
# ==========================================
# Build attendance table (using Timestamp day, no mutation of `members`)
# ==========================================

import pandas as pd
import numpy as np

### 1. Prepare helper copies for names (do NOT modify `members`)
member_names = pd.DataFrame({
    'member_id': members['member_id'],
    'name_clean': members['name'].str.strip().str.lower()
})
member_names['first_name'] = member_names['name_clean'].str.split().str[0]

### 2. Normalize Timestamp and names in attendance_df (adds helper columns to attendance_df)
attendance_df['Timestamp'] = pd.to_datetime(attendance_df['Timestamp'], errors='coerce')
attendance_df['date_only'] = attendance_df['Timestamp'].dt.normalize()

attendance_df['name_clean'] = attendance_df['Name'].str.strip().str.lower()
attendance_df['first_name'] = attendance_df['name_clean'].str.split().str[0]

### 3. Map attendance_df → member_id (full name → fallback first name)
# Series.map with a Series mapper needs a unique index, so only the first member per
# name is kept; rows with a missing name are excluded so NaN never acts as a key.
full_name_map = (
    member_names.dropna(subset=['name_clean'])
                .drop_duplicates('name_clean')
                .set_index('name_clean')['member_id']
)
attendance_df['member_id'] = attendance_df['name_clean'].map(full_name_map)

# NOTE(review): the first-name fallback picks the first member with that first name,
# which can credit the wrong person when first names are shared.
missing = attendance_df['member_id'].isna()
first_name_map = (
    member_names.dropna(subset=['first_name'])
                .drop_duplicates('first_name')
                .set_index('first_name')['member_id']
)
attendance_df.loc[missing, 'member_id'] = attendance_df.loc[missing, 'first_name'].map(first_name_map)

### 4. Map each sign-in day to a gbm_id (no mutation of `gbm`)
gbm_dates = pd.DataFrame({
    'gbm_id': gbm['gbm_id'],
    'date_only': pd.to_datetime(gbm['date'], errors='coerce').dt.normalize()
})

# Same uniqueness requirement as above: if two meetings share a day, the first one wins.
date_map = (
    gbm_dates.dropna(subset=['date_only'])
             .drop_duplicates('date_only')
             .set_index('date_only')['gbm_id']
)
attendance_df['gbm_id'] = attendance_df['date_only'].map(date_map)

### 5. Distinct (member, meeting) pairs that were actually observed
# The mapped ids are float because of the NaN gaps; cast back to integers after
# dropping the gaps so the merge keys have the same dtype as the grid below.
observed = (
    attendance_df[['member_id', 'gbm_id']]
    .dropna()
    .drop_duplicates()
    .astype('int64')
)
observed['status'] = True

### 6. Full member × gbm grid, marked True where a sign-in was observed
attendance = (
    members[['member_id']]
    .merge(gbm[['gbm_id']], how='cross')
    .merge(observed, on=['member_id', 'gbm_id'], how='left')
)

# Grid rows without a sign-in come back as NaN. notna() converts straight to bool and
# avoids the fillna(False) object-downcasting FutureWarning.
attendance['status'] = attendance['status'].notna()
attendance = attendance[['member_id', 'gbm_id', 'status']]

#attendance.head()


  attendance['status'] = attendance['status_obs'].fillna(False).astype(bool)


In [295]:
# enrollment still carries the index labels of the filtered members rows; renumber from 0.
enrollment = enrollment.reset_index(drop=True)
# Move the (project_id, contact_email) index levels back into regular columns.
project_contacts_df = project_contacts_df.reset_index()


In [296]:
# Convert the '-' placeholder into a real missing value.
members['email'] = members['email'].mask(members['email'] == '-')

# Role is unknown for everyone at this point: add it as an all-NaN column.
# (The original built a one-element Series from `np.nan * len(members)` and relied on
# index alignment to fill the remaining rows with NaN.)
members['role'] = np.nan

In [297]:
# Match the column name used by the project_contacts table in the schema below (`email`).
project_contacts_df = project_contacts_df.rename(columns = {'contact_email': 'email'})

## All the Dataframes

#### Projects

In [298]:
projects_df.head()

Unnamed: 0,project_id,quarter_id,company_id,nda,donated,dnf,description
0,0,SP23,0,False,False,False,The project focused on two primary objectives:...
1,1,FA23,1,True,False,False,TCG identified strategies to enhance client en...
2,2,FA23,2,False,False,False,The project focused on completing a competitiv...
3,3,FA23,3,True,False,False,This project focused on conducting market rese...
4,4,WI24,4,True,False,False,Created a list of companies aligning with Empi...


#### Members ####

In [299]:
members.head()

Unnamed: 0,member_id,name,year,status,email,role
0,0,Aastha Shah,3rd,1,,
1,1,Aathi Muthu,3rd,0,,
2,2,Aatish Mandalapu,4th,0,,
3,3,Aditi Parthan,3rd,0,aparthan@ucsd.edu,
4,4,Advay Pradhan,2nd,0,adpradhan@ucsd.edu,


#### Assignments ####

In [300]:
assig_df.head()

Unnamed: 0,member_id,project_id,project_manager
0,3,18,False
1,33,18,False
2,63,18,False
3,36,19,False
4,24,19,False


#### Companies ####

In [301]:
companies_df.head()

Unnamed: 0,company_id,name
0,0,CARI Health
1,1,UCSD Craft Center
2,2,MD Revolution
3,3,Atmo Biosciences
4,4,Empirical


#### Attendance ####

In [302]:
attendance.head()

Unnamed: 0,member_id,gbm_id,status
0,0,0,False
1,0,1,False
2,0,2,True
3,1,0,False
4,1,1,False


#### GBM ####

In [303]:
gbm.head()

Unnamed: 0,gbm_id,quarter_id,date
0,0,FA25,2025-10-02
1,1,FA25,2025-10-22
2,2,FA25,2025-10-29


#### Enrollment ####

In [304]:
enrollment.head()

Unnamed: 0,member_id,quarter_id
0,0,FA25
1,9,FA25
2,10,FA25
3,11,FA25
4,12,FA25


#### Quarter ####

In [305]:
quarter_df.head()

Unnamed: 0,quarter_id
0,SP23
1,FA23
2,WI24
3,SP24
4,FA24


#### Point of Contact

In [306]:
project_contacts_df.head()

Unnamed: 0,project_id,email
0,7,daniel.hoffman@braincorp.com
1,8,angela@cordial.io
2,9,luyin@persperiontech.com
3,11,taskin@anume.com.au
4,12,pmgodoy@ucsd.edu


#### Contacts

In [307]:
contacts_df.head()

Unnamed: 0,email,name
0,alejandra.chaidez@sdcounty.ca.gov,Alejandra Chaidez
1,alexphan@ucsd.edu,Alex Phan
2,angela@cordial.io,Angela Wang
3,ank016@ucsd.edu,Anchit Kumar
4,brandon.lewis@sdcounty.ca.gov,Brandon Lewis


# Database Migration

In [308]:
import psycopg2

DATABASE_URL = os.getenv('DATABASE_URL')

# Connectivity smoke test only. The next cell opens the connection that the migration
# uses and rebinds `conn`, so the probe connection is closed here instead of leaking.
try:
    conn = psycopg2.connect(DATABASE_URL)
except Exception as e:
    print("Error:", e)
else:
    print("Connected!")
    conn.close()


Error: connection to server at "aws-1-us-west-1.pooler.supabase.com" (3.101.5.153), port 5432 failed: FATAL:  password authentication failed for user "postgres"
connection to server at "aws-1-us-west-1.pooler.supabase.com" (3.101.5.153), port 5432 failed: FATAL:  password authentication failed for user "postgres"



In [309]:
import psycopg2
from psycopg2.extras import execute_values

# Get database params
DATABASE_URL = os.getenv('DATABASE_URL')

# Connect to the database
try:
    conn = psycopg2.connect(DATABASE_URL)
    print("Connection successful!")

    # Create a cursor to execute SQL queries
    cur = conn.cursor()

except Exception as e:
    print(f"Failed to connect: {e}")
    # Every later cell needs `conn` and `cur`. Re-raise so a Run All stops here with
    # the real cause instead of failing downstream with NameError / closed-cursor errors.
    raise

Failed to connect: connection to server at "aws-1-us-west-1.pooler.supabase.com" (3.101.5.153), port 5432 failed: FATAL:  password authentication failed for user "postgres"
connection to server at "aws-1-us-west-1.pooler.supabase.com" (3.101.5.153), port 5432 failed: FATAL:  password authentication failed for user "postgres"



In [310]:
# Drop existing tables and create new ones
# NOTE(review): destructive -- every listed table is dropped (with CASCADE) before being
# recreated, so any data already in the database is lost on each run.
drop_tables_sql = """
DROP TABLE IF EXISTS attendance CASCADE;
DROP TABLE IF EXISTS enrollment CASCADE;
DROP TABLE IF EXISTS assignment CASCADE;
DROP TABLE IF EXISTS project_contacts CASCADE;
DROP TABLE IF EXISTS gbm CASCADE;
DROP TABLE IF EXISTS project CASCADE;
DROP TABLE IF EXISTS member CASCADE;
DROP TABLE IF EXISTS quarter CASCADE;
DROP TABLE IF EXISTS company CASCADE;
DROP TABLE IF EXISTS contacts CASCADE;
"""

# Schema definition. Tables are created parents-first so that every REFERENCES clause
# points at a table that already exists.
create_tables_sql = """
-- Company table
CREATE TABLE IF NOT EXISTS company (
    company_id INTEGER PRIMARY KEY,
    name TEXT NOT NULL
);

-- Quarter table
CREATE TABLE IF NOT EXISTS quarter (
    quarter_id TEXT PRIMARY KEY
);

-- Member table
CREATE TABLE IF NOT EXISTS member (
    member_id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    year TEXT,
    status BOOLEAN NOT NULL,
    role TEXT,
    email TEXT
);

-- Project table
CREATE TABLE IF NOT EXISTS project (
    project_id INTEGER PRIMARY KEY,
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    company_id INTEGER NOT NULL REFERENCES company(company_id),
    nda BOOLEAN NOT NULL,
    donated BOOLEAN NOT NULL,
    dnf BOOLEAN NOT NULL,
    description TEXT
);

-- Assignment table (many-to-many between member and project)
CREATE TABLE IF NOT EXISTS assignment (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    project_id INTEGER NOT NULL REFERENCES project(project_id),
    project_manager BOOLEAN NOT NULL,
    PRIMARY KEY (member_id, project_id)
);

-- GBM (General Body Meeting) table
CREATE TABLE IF NOT EXISTS gbm (
    gbm_id INTEGER PRIMARY KEY,
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    date TIMESTAMP NOT NULL
);

-- Attendance table
CREATE TABLE IF NOT EXISTS attendance (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    gbm_id INTEGER NOT NULL REFERENCES gbm(gbm_id),
    status BOOLEAN NOT NULL,
    PRIMARY KEY (member_id, gbm_id)
);

-- Enrollment table (which members are enrolled in which quarters)
CREATE TABLE IF NOT EXISTS enrollment (
    member_id INTEGER NOT NULL REFERENCES member(member_id),
    quarter_id TEXT NOT NULL REFERENCES quarter(quarter_id),
    PRIMARY KEY (member_id, quarter_id)
);

-- Contacts (List of all contact emails and names)
CREATE TABLE IF NOT EXISTS contacts (
    email TEXT PRIMARY KEY,
    name TEXT NOT NULL
);

-- Project contacts (which contacts for which projects)
CREATE TABLE IF NOT EXISTS project_contacts (
    project_id INTEGER NOT NULL REFERENCES project(project_id),
    email TEXT NOT NULL REFERENCES contacts(email),
    PRIMARY KEY (project_id, email)
);

"""

# Requires `cur` and `conn` from the connection cell above; if that cell failed,
# `cur` is undefined and this raises NameError.
cur.execute(drop_tables_sql)
cur.execute(create_tables_sql)
# Drops and creates are committed together.
conn.commit()


NameError: name 'cur' is not defined

In [None]:
# Insert companies: one (company_id, name) tuple per row of companies_df
companies_data = [
    (int(company_id), str(company_name))
    for company_id, company_name in zip(companies_df['company_id'], companies_df['name'])
]
execute_values(cur, "INSERT INTO company (company_id, name) VALUES %s", companies_data)
conn.commit()

InterfaceError: cursor already closed

In [None]:
# Insert quarters: single-column rows, so each value is wrapped in a one-element tuple
quarters_data = [(str(quarter_id),) for quarter_id in quarter_df['quarter_id']]
execute_values(cur, "INSERT INTO quarter (quarter_id) VALUES %s", quarters_data)
conn.commit()

InterfaceError: cursor already closed

In [None]:
# Insert members
# Nullable text columns (year, role, email) are sent as None -> SQL NULL when missing.
# Calling str() on a missing value would store the literal text 'nan' instead.
members_data = [
    (
        int(row['member_id']),
        str(row['name']),
        str(row['year']) if pd.notna(row['year']) else None,
        bool(row['status']),
        str(row['role']) if pd.notna(row['role']) else None,
        str(row['email']) if pd.notna(row['email']) else None,
    )
    for _, row in members.iterrows()
]
execute_values(cur,
    "INSERT INTO member (member_id, name, year, status, role, email) VALUES %s",
    members_data)
conn.commit()


InterfaceError: cursor already closed

In [None]:
# Insert projects
# description is nullable in the schema: a missing value is sent as None -> SQL NULL
# rather than the literal text 'nan' that str() would produce.
projects_data = [
    (
        int(row['project_id']),
        str(row['quarter_id']),
        int(row['company_id']),
        bool(row['nda']),
        bool(row['donated']),
        bool(row['dnf']),
        str(row['description']) if pd.notna(row['description']) else None,
    )
    for _, row in projects_df.iterrows()
]
execute_values(cur,
    "INSERT INTO project (project_id, quarter_id, company_id, nda, donated, dnf, description) VALUES %s",
    projects_data)
conn.commit()

InterfaceError: cursor already closed

In [None]:
# Insert assignments; rows that collide with an existing key are skipped
# by ON CONFLICT DO NOTHING.
if len(assig_df) == 0:
    print("No assignments to insert")
else:
    assignments_data = [
        (int(member_id), int(project_id), bool(is_manager))
        for member_id, project_id, is_manager in zip(
            assig_df['member_id'], assig_df['project_id'], assig_df['project_manager']
        )
    ]
    execute_values(
        cur,
        "INSERT INTO assignment (member_id, project_id, project_manager) VALUES %s ON CONFLICT DO NOTHING",
        assignments_data,
    )
    conn.commit()
    print(f"Inserted {len(assignments_data)} assignments")


InterfaceError: cursor already closed

In [None]:
# Insert GBMs: the meeting date is passed through as the timestamp object held in the frame
gbm_data = [
    (int(gbm_id), str(quarter_id), meeting_date)
    for gbm_id, quarter_id, meeting_date in zip(gbm['gbm_id'], gbm['quarter_id'], gbm['date'])
]
execute_values(cur, "INSERT INTO gbm (gbm_id, quarter_id, date) VALUES %s", gbm_data)
conn.commit()


InterfaceError: cursor already closed

In [None]:
# Insert attendance (for ALL members): one row per member × meeting
attendance_data = [
    (int(member_id), int(gbm_id), bool(attended))
    for member_id, gbm_id, attended in zip(
        attendance['member_id'], attendance['gbm_id'], attendance['status']
    )
]
execute_values(cur, "INSERT INTO attendance (member_id, gbm_id, status) VALUES %s", attendance_data)
conn.commit()


InterfaceError: cursor already closed

In [None]:
# Insert enrollments: (member_id, quarter_id) pairs
enrollment_data = [
    (int(member_id), str(quarter_id))
    for member_id, quarter_id in zip(enrollment['member_id'], enrollment['quarter_id'])
]
execute_values(cur, "INSERT INTO enrollment (member_id, quarter_id) VALUES %s", enrollment_data)
conn.commit()


InterfaceError: cursor already closed

In [None]:
# Insert contacts (from contacts_df); these rows must exist before project_contacts,
# whose email column references contacts(email)
contacts_data = [
    (str(contact_email), str(contact_name))
    for contact_email, contact_name in zip(contacts_df['email'], contacts_df['name'])
]
execute_values(cur, "INSERT INTO contacts (email, name) VALUES %s", contacts_data)
conn.commit()

InterfaceError: cursor already closed

In [None]:
# Insert project_contacts_df: (project_id, email) junction rows
project_contacts_data = [
    (int(project_id), str(contact_email))
    for project_id, contact_email in zip(
        project_contacts_df['project_id'], project_contacts_df['email']
    )
]
execute_values(
    cur,
    "INSERT INTO project_contacts (project_id, email) VALUES %s",
    project_contacts_data,
)
conn.commit()

In [None]:
# Verify data and show summary
print("Database Summary:")

# Table names come from this fixed list only, so interpolating them into the SQL text is safe.
tables = ['company', 'quarter', 'member', 'project', 'assignment', 'gbm', 'attendance', 'enrollment', 'contacts', 'project_contacts']
try:
    for table in tables:
        cur.execute(f"SELECT COUNT(*) FROM {table}")
        count = cur.fetchone()[0]
        print(f"{table.ljust(15)}: {count} rows")
finally:
    # Close connection even when one of the count queries fails
    cur.close()
    conn.close()


Database Summary:
company        : 28 rows
quarter        : 9 rows
member         : 68 rows
project        : 39 rows
assignment     : 89 rows
gbm            : 3 rows
attendance     : 204 rows
enrollment     : 26 rows
contacts       : 21 rows
project_contacts: 36 rows
