In [73]:
#environment setup
import pandas as pd
import requests
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env from the project root (parent of data_cleaning)
load_dotenv(Path.cwd().resolve().parent / ".env")
GOOGLE_SHEETS_API_KEY = os.getenv('GOOGLE_SHEETS_API_KEY')
PROJECTS_SHEET_ID = os.getenv('PROJECTS_SHEET_ID')
ACTIVES_SHEET_ID = os.getenv('ACTIVES_SHEET_ID')
FALL_ATTENDANCE_SHEET_ID = os.getenv('FALL_ATTENDANCE_SHEET_ID')
BASE_URL = "https://sheets.googleapis.com/v4/spreadsheets"

In [74]:
def get_sheet_titles(spreadsheet_id: str, api_key: str) -> list[str]:
    """Return all sheet/tab titles in the spreadsheet."""
    resp = requests.get(
        f"{BASE_URL}/{spreadsheet_id}",
        params={
            "fields": "sheets(properties(title))",
            "key": api_key,
        },
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()
    return [s["properties"]["title"] for s in data.get("sheets", [])]

In [75]:
def fetch_values_batch(spreadsheet_id: str, api_key: str, sheet_titles: list[str]) -> dict[str, list[list]]:
    """Batch fetch values for provided sheet titles.
    Returns mapping of title -> 2D list of cell values (including header row).
    """
    if not sheet_titles:
        return {}
    # Multiple 'ranges' params are supported by the API
    params = [("key", api_key), ("valueRenderOption", "UNFORMATTED_VALUE"), ("dateTimeRenderOption", "FORMATTED_STRING")]
    params.extend(("ranges", title) for title in sheet_titles)
    resp = requests.get(
        f"{BASE_URL}/{spreadsheet_id}/values:batchGet",
        params=params,
        timeout=60,
    )
    resp.raise_for_status()
    payload = resp.json()

    values_by_title: dict[str, list[list]] = {}
    for vr, title in zip(payload.get("valueRanges", []), sheet_titles):
        values_by_title[title] = vr.get("values", [])
    return values_by_title

In [76]:
def values_to_dataframe(values: list[list]) -> pd.DataFrame:
    """Convert a 2D list from Sheets API to a DataFrame, using first row as header.
    Pads short rows so all rows match header length.
    """
    if not values:
        return pd.DataFrame()
    header = [str(h) for h in values[0]]
    rows = values[1:] if len(values) > 1 else []
    normalized_rows = [row + [""] * (len(header) - len(row)) for row in rows]
    return pd.DataFrame(normalized_rows, columns=header)

In [77]:
def fetch_spreadsheet_as_dataframes(spreadsheet_id: str, api_key: str) -> dict[str, pd.DataFrame]:
    """Fetch all sheets in a spreadsheet and return {sheet_title: DataFrame}."""
    titles = get_sheet_titles(spreadsheet_id, api_key)
    if not titles:
        return {}
    values_by_title = fetch_values_batch(spreadsheet_id, api_key, titles)
    return {title: values_to_dataframe(values_by_title.get(title, [])) for title in titles}

In [78]:
# # fetch all data from Google Sheets into DataFrames per sheet
# # Build DataFrames per sheet for each spreadsheet ID found in .env
# PROJECTS_DFS = fetch_spreadsheet_as_dataframes(PROJECTS_SHEET_ID, GOOGLE_SHEETS_API_KEY) if PROJECTS_SHEET_ID else {}
# ACTIVES_DFS = fetch_spreadsheet_as_dataframes(ACTIVES_SHEET_ID, GOOGLE_SHEETS_API_KEY) if ACTIVES_SHEET_ID else {}
# FALL_ATTENDANCE_DFS = fetch_spreadsheet_as_dataframes(FALL_ATTENDANCE_SHEET_ID, GOOGLE_SHEETS_API_KEY) if FALL_ATTENDANCE_SHEET_ID else {}

In [79]:
projects_df = pd.read_csv(os.getenv('PROJECTS_SHEET_URL'))
actives_df = pd.read_csv(os.getenv('ACTIVES_SHEET_URL'))
attendance_df = pd.read_csv(os.getenv('ATTENDANCE_URL'))

In [80]:
actives_df.head()

Unnamed: 0,Timestamp,Name,Year,Are you planning to be an active member this quarter? (All actives have to pay dues)
0,10/2/2025 19:37:26,Spencer,3rd,Yes
1,10/2/2025 19:37:27,Jayden Patel,2nd,Yes
2,10/2/2025 19:39:01,Mattie freaking dao,2nd,Yes
3,10/2/2025 19:42:19,aastha,3rd,Yes
4,10/2/2025 19:42:52,Mihir Joshi,4th,Yes


In [81]:
actives_df['Timestamp'] = pd.to_datetime(actives_df['Timestamp'])
actives_df['Name'] = actives_df[:1,].str.lower()
actives_df['Year'] = actives_df['Year'].map({'1st': 1, '2nd': 2, '3rd': 3, '4th': 4, '5th': 5}).fillna(actives_df['Year'])
actives_df['Are you planning to be an active member this quarter? (All actives have to pay dues)'] = actives_df.iloc[:, 3].apply(lambda x: True if x == 'Yes' else False)

InvalidIndexError: (slice(None, 1, None),)

In [70]:
actives_df.dtypes

Timestamp                                                                               datetime64[ns]
Name                                                                                            object
Year                                                                                           float64
Are you planning to be an active member this quarter? (All actives have to pay dues)              bool
dtype: object

In [None]:
def clean_df(df):
    