## Data Extraction and Transformation from Meeting Minute PDFs

This notebook extracts motion language from meeting-minute PDFs to find each instance of club or organization funding from UCSB's Associated Students Finance and Business Committee. The process discards irrelevant motions, normalizes non-uniform organization names, and loads the details into a single CSV for use in Power BI.

The source PDFs are the UCSB AS F&B meeting minutes, publicly available at [AS F&B Committee Minutes](https://asfb.as.ucsb.edu/minutes2018-2019/).

In [1]:
import re
import logging
from pathlib import Path

import pandas as pd
import pdfplumber
import rapidfuzz

In [2]:
# Ignore non-critical warnings from pdfminer through pdfplumber
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Load the canonical organization names, one per line, with surrounding
# whitespace removed. Blank lines are skipped so that an empty string never
# becomes a fuzzy-matching candidate.
with open("orgs-ucsb.txt", "r") as all_orgs_text:
    all_orgs = [name for name in map(str.strip, all_orgs_text) if name]

In [3]:
def text_from_pdf(pdf_path):
    """Return the text of a PDF from the first "action items" page onward.

    Motions to fund are only found after the action items header, so every
    page before the first page containing "action items" (case-insensitive)
    is skipped. The page containing the header is kept in full, as is every
    page after it.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of the collected pages joined with newlines, or an empty
        string when no page contains "action items".
    """
    collected_pages = []
    collecting = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer may come back as None rather than
            # "" depending on the pdfplumber version; normalise it so the
            # .lower() call below cannot raise.
            text = page.extract_text() or ""

            if not collecting and "action items" in text.lower():
                collecting = True

            if collecting:
                collected_pages.append(text)

    # Join with a newline so the last word on one page is not fused to the
    # first word on the next (which could hide an "action: passed" marker
    # or corrupt an organization name that spans a page break).
    return "\n".join(collected_pages)
        

def find_motions(text):
    """Return the body of every passed motion found in *text*.

    The text is flattened (newlines become spaces) and lowercased, then each
    span between a "motion language:" header and the following
    "action: passed" marker is captured.

    Args:
        text: Raw text extracted from the meeting minutes.

    Returns:
        A list of lowercased motion bodies, in document order. Surrounding
        whitespace inside each captured span is left untouched.
    """
    # The tempered dot `(?:(?!motion language:).)` stops a match from running
    # across a second "motion language:" header. Without it, a motion that
    # did not pass would be glued onto the next motion that did, and its
    # text would be returned in place of the passed one.
    # `\s*` tolerates irregular spacing after "action:" left by PDF extraction.
    pattern = r"motion language:((?:(?!motion language:).)*?)action:\s*passed"
    flattened = text.replace("\n", " ").lower()

    return re.findall(pattern, flattened, flags=re.DOTALL)
    

def find_motion_details(motions, source_file):
    """Parse funding motions into normalized organization/amount records.

    For each motion, the text between "motion to" and the first dollar
    amount is taken as the raw organization name, cleaned of filler words,
    and fuzzy-matched against the module-level ``all_orgs`` list. Only the
    first name/amount pair in a motion is used.

    Motions are skipped when they contain no "motion to ... $amount" phrase
    or when they contain "affirm", "forward", "table" or "allocate"
    (case-insensitive substring match).

    Args:
        motions: Iterable of motion strings (e.g. from ``find_motions``).
        source_file: Label stored in each record's 'Source File' field.

    Returns:
        A list of dicts with the keys 'Club/Org Name', 'Non-normalized Name',
        'Normalization Accuracy', 'Funding' and 'Source File'. 'Funding' is
        the amount exactly as written in the motion (a string, possibly with
        thousands separators), prefixed with '-' for motions to strike.
    """
    detail_pattern = r"motion to\s+(.*?)\s*\$\s*([\d,]+(?:\.\d{2})?)"
    # Words that confuse fuzzy matching
    noise_pattern = r"\b( ucsb|fully fund|partially fund|strike| at|motion|fund| to| of|requesting| for)\b"
    skip_keywords = ("affirm", "forward", "table", "allocate")

    all_motion_details = []
    # Built on first use and reused for every motion; nothing in this
    # function modifies all_orgs, so lowercasing it once is sufficient.
    lowered_orgs = None

    for motion in motions:
        # Lowercase once and use the same string for every check, so the
        # keyword filter is as case-insensitive as the regex/strike checks.
        lowered_motion = motion.lower()

        if any(keyword in lowered_motion for keyword in skip_keywords):
            continue

        details = re.findall(detail_pattern, lowered_motion)
        if not details:
            continue

        raw_org_name, amount = details[0]

        cleaner_org_name = re.sub(noise_pattern, "", raw_org_name,
                                  flags=re.IGNORECASE).strip()

        # A motion to strike means we will want to undo an existing funding motion
        if "strike" in lowered_motion:
            amount = "-" + amount

        if lowered_orgs is None:
            lowered_orgs = [org.lower() for org in all_orgs]

        # Match against the lowercased names, then use the returned index to
        # report the organization with its original capitalisation.
        _, accuracy, org_index = rapidfuzz.process.extract(
            cleaner_org_name,
            lowered_orgs,
            scorer=rapidfuzz.fuzz.ratio,
            limit=1,
        )[0]

        all_motion_details.append({'Club/Org Name': all_orgs[org_index],
                                   'Non-normalized Name': cleaner_org_name,
                                   'Normalization Accuracy': accuracy,
                                   'Funding': amount,
                                   'Source File': source_file,
                                   })

    return all_motion_details

In [4]:
# Folder of pdfs, from UCSB AS F&B Meeting Minutes, publicly available, see above
pdf_folder = "meeting-mins-pdfs"

# Sorted so the output order is reproducible; the order of a raw directory
# listing is not guaranteed.
for file in sorted(Path(pdf_folder).iterdir()):

    # Skip sub-directories and stray non-PDF files that pdfplumber cannot open
    if not file.is_file() or file.suffix.lower() != ".pdf":
        continue

    # Use only the file name (not the full path) so the label is the same on
    # every OS, then drop the fixed title so that what remains is the leading
    # part of the name (judging by the suffix removed here, names appear to
    # look like "<date> Finance Committee Meeting Minutes.pdf").
    date_of_motion = file.name.replace("Finance Committee Meeting Minutes.pdf", "").strip()

    text = text_from_pdf(str(file))
    motions = find_motions(text)
    details = find_motion_details(motions, date_of_motion)

    print(details)
    

[{'Club/Org Name': 'Girl Up', 'Non-normalized Name': 'girl up', 'Normalization Accuracy': 100.0, 'Funding': '900', 'Source File': '01.06.2025'}, {'Club/Org Name': 'Girl Up', 'Non-normalized Name': 'girl up', 'Normalization Accuracy': 100.0, 'Funding': '550', 'Source File': '01.06.2025'}, {'Club/Org Name': 'Model United Nations', 'Non-normalized Name': 'model united nations', 'Normalization Accuracy': 100.0, 'Funding': '2,255', 'Source File': '01.06.2025'}, {'Club/Org Name': 'Model United Nations', 'Non-normalized Name': 'model united nations', 'Normalization Accuracy': 100.0, 'Funding': '5,44.24', 'Source File': '01.06.2025'}, {'Club/Org Name': 'Staff', 'Non-normalized Name': 'msa', 'Normalization Accuracy': 50.0, 'Funding': '265', 'Source File': '01.06.2025'}, {'Club/Org Name': "UCSBreakin'", 'Non-normalized Name': 'ucsbreakin’', 'Normalization Accuracy': 90.9090909090909, 'Funding': '1,000', 'Source File': '01.06.2025'}, {'Club/Org Name': 'Association for Computing Machinery', 'Non-n