### Applicants Sample

In [1]:
applicant1 = {
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4
        },
    "alumni": 1,
    "first": 0,
    "alumni_school_names": ["Georgia Tech", "NYU", "University of Minnesota", "U Georgia"],
    "residency": "domestic",
    "state": "MN",
    "country": "",
    "sat": 1520,
    "act": 0,
    "gpa": 3.7,
    "volunteering_hours": 150,
    "work_months": 0
}

In [2]:
applicant2 = {
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1520,
    "act": 0,
    "gpa": 3.7,
    "volunteering_hours": 150,
    "work_months": 0
}

In [3]:
applicant3 = {
    "high_school_completion": 1,
    "general_college_requirement":         
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1480,
    "act": 0,
    "gpa": 3.9,
    "volunteering_hours": 150,
    "work_months": 3
}

In [4]:
applicant4 = {
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1300,
    "act": 0,
    "gpa": 2.8,
    "volunteering_hours": 150,
    "work_months": 0
}

### Crawling

In [5]:
import os
import pdfplumber
import re

import pandas as pd

import openpyxl

In [6]:
input_folder = 'pdf'

gatech_file_path = os.path.join(input_folder, 'gatech.pdf')
umn_file_path = os.path.join(input_folder, 'umn.pdf')
nyu_file_path = os.path.join(input_folder, 'nyu.pdf')
uga_file_path = os.path.join(input_folder, 'uga.pdf')

In [7]:
# Remove None and empty string from row
def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [8]:
# Extract only numbers from the cell
def clean_number(cell):
    cell = re.sub(r'[^\d.]', '', str(cell)) # Remove all non-numeric characters
    try:
        return float(cell)
    except:
        return 0

In [9]:
# Extract text from a section like "C{#}." up to before "C{#+1}."

def extract_section_by_label(file_path, section_label):
    next_label = "c" + str(int(section_label[1:]) + 1)

    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.lower()

    # Match exact line starting with "c{#}."
    # ^ = matches the beginning of a line
    start_match = re.search(rf"({section_label.lower()}[\. ])", full_text)
    if not start_match:
        print(f"Section {section_label.upper()} not found.")
        return
    start_idx = start_match.start()

    # Look for the start of the next section
    next_match = re.search(rf"({next_label}[\. ])", full_text[start_idx:])
    end_idx = start_idx + next_match.start() if next_match else None # If we don't find the next label, go to the end of the document

    section_text = full_text[start_idx:end_idx]

    return section_text

##### Residency

In [10]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0},
        'admitted': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in-state': -1, 'out-of-state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in-state'] = idx
                    elif 'out-' in col:
                        col_idx['out-of-state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in-state'] != -1 and col_idx['in-state'] < len(row):
                        result[target]['in-state'] += clean_number(row[col_idx['in-state']])
                    if col_idx['out-of-state'] != -1 and col_idx['out-of-state'] < len(row):
                        result[target]['out-of-state'] += clean_number(row[col_idx['out-of-state']])
                    if col_idx['international'] != -1 and col_idx['international'] < len(row):
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1 and col_idx['others'] < len(row):
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [11]:
extract_residency_data(gatech_file_path)

{'applicants': {'in-state': 10674.0,
  'out-of-state': 38320.0,
  'international': 10795.0,
  'others': 0.0},
 'admitted': {'in-state': 3536.0,
  'out-of-state': 3992.0,
  'international': 885.0,
  'others': 0.0}}

In [12]:
extract_residency_data(umn_file_path)

{'applicants': {'in-state': 13982.0,
  'out-of-state': 19461.0,
  'international': 1256.0,
  'others': 0},
 'admitted': {'in-state': 10498.0,
  'out-of-state': 15400.0,
  'international': 982.0,
  'others': 0}}

In [13]:
extract_residency_data(nyu_file_path)

{'applicants': {'in-state': 0,
  'out-of-state': 0.0,
  'international': 0,
  'others': 0},
 'admitted': {'in-state': 0,
  'out-of-state': 0.0,
  'international': 0,
  'others': 0}}

In [14]:
extract_residency_data(uga_file_path)

{'applicants': {'in-state': 18210.0,
  'out-of-state': 23867.0,
  'international': 1338.0,
  'others': 1.0},
 'admitted': {'in-state': 9149.0,
  'out-of-state': 6709.0,
  'international': 289.0,
  'others': 1.0}}

##### High School Completion Requirement

In [15]:
def extract_highschool_requirement(file_path):

    result = {
        'high school diploma required': 0,
        'GED accepted': 0
    }

    # Define exact phrases expected in each case
    diploma_and_ged = "high school diploma is required and ged is accepted"
    diploma_only = "high school diploma is required and ged is not accepted"
    no_diploma_needed = "high school diploma or equivalent is not required"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C3 section using helper function
    section_text = extract_section_by_label(file_path, "C3")
    if not section_text:
        return result  # Return default if section not found

    section_text = section_text.lower()
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if diploma_and_ged in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
            found_checked_option = True
            break
        elif diploma_only in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
            found_checked_option = True
            break
        elif no_diploma_needed in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 0
            result['GED accepted'] = 0
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if diploma_and_ged in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
        elif diploma_only in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
        elif no_diploma_needed in section_text:
            result['high school diploma required'] = 0
            result['GED accepted'] = 0

    return result

In [16]:
extract_highschool_requirement(gatech_file_path)

{'high school diploma required': 1, 'GED accepted': 0}

In [17]:
extract_highschool_requirement(umn_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [18]:
extract_highschool_requirement(nyu_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [19]:
extract_highschool_requirement(uga_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

##### State

In [20]:
def extract_state(file_path):
    state_name_to_abbr = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
        'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
        'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
        'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
        'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
        'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
        'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
        'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
        'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
        'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
        'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
        'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
    }

    section_text = extract_section_by_label(file_path, 'A1')
    if not section_text:
        return {"State": "Unknown"}

    lines = section_text.lower().splitlines()[:20]

    for line in lines:
        if 'state' in line:
            # Full state name
            for state, abbr in state_name_to_abbr.items():
                if state.lower() in line:
                    return {"State": abbr}
            # Abbreviation state name 
            for abbr in state_name_to_abbr.values():
                if f' {abbr.lower()} ' in f' {line} ':
                    return {"State": abbr}

    return {"State": "Unknown"}

In [21]:
extract_state(gatech_file_path)

{'State': 'GA'}

In [22]:
extract_state(umn_file_path)

{'State': 'MN'}

In [23]:
extract_state(nyu_file_path)

{'State': 'NY'}

In [24]:
extract_state(uga_file_path)

{'State': 'GA'}

##### School Name

In [25]:
def extract_school_name(file_path):
    text = extract_section_by_label(file_path, "A1")
    lines = text.lower().splitlines()
    for line in lines:
        # Handle both "name of college or university" and "name of college/university" formats
        match = re.match(r"\s*name of college\s*(?:or|/)\s*university[:\s]+(.+)", line, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

In [26]:
extract_school_name(gatech_file_path)

'georgia institute of technology'

In [27]:
extract_school_name(umn_file_path)

'university of minnesota – twin cities campus'

In [28]:
extract_school_name(nyu_file_path)

'new york university'

In [29]:
extract_school_name(uga_file_path)

'university of georgia'

##### General College-Preparatory Program Requirement

In [30]:
def extract_college_prep_requirement(file_path):
    
    result = {
        'general college-preparatory program': 
            {
                'required': 0,
                'recommended': 0,
                'neither required or recommended': 0
            }
        }

    # Define key phrases to match
    required = "require"
    recommended = "recommend"
    neither = "neither require nor recommend"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C4 section using helper function
    section_text = extract_section_by_label(file_path, 'C4')
    if not section_text:
        return result # Return default if section not found
    section_text = section_text.lower()
    
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        
        # Skip the question sectence
        if line.startswith('c4'):
            continue
        
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if neither in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['neither required or recommended'] = 1
            found_checked_option = True
            break
        elif required in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['required'] = 1
            found_checked_option = True
            break
        elif recommended in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['recommended'] = 1
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if "neither require nor recommend" in section_text:
            result['general college-preparatory program']['neither required or recommended'] = 1
        elif "recommend" in section_text:
            result['general college-preparatory program']['recommended'] = 1
        elif "require" in section_text:
            result['general college-preparatory program']['required'] = 1

    return result

In [31]:
extract_college_prep_requirement(gatech_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [32]:
extract_college_prep_requirement(umn_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [33]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [34]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [35]:
def extract_general_college_subjects(file_path):
    import pdfplumber

    result = {
        "english": 0,
        "math": 0,
        "science": 0,
        "scienceLab": 0,
        "language": 0,
        "social": 0,
        "arts": 0
    }

    # Keywords that commonly appear in the C5 subject distribution table
    subject_keywords = ["english", "math", "science"]

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if not table or len(table[0]) < 2:
                    continue  # Skip empty or invalid tables

                # Convert header cells to lowercase for comparison
                headers = [cell.lower() if cell else "" for cell in table[0]]

                # Check if table looks like a C5 table based on header content
                header_match = "distribution of high" in headers[0] or \
                               ("units required" in headers[1] if len(headers) > 1 else False)

                # nyu - Check if at least 3 subject-related keywords appear in row labels
                match_count = 0
                for row in table:
                    if not row or len(row) < 1:
                        continue
                    label = row[0].lower().strip() if row[0] else ""
                    if any(keyword in label for keyword in subject_keywords):
                        match_count += 1
                keyword_match = match_count >= 3

                # Skip this table if neither condition is satisfied
                if not (header_match or keyword_match):
                    continue

                # Parse the table rows: skip first row if headers are matched
                for row in table[1:] if header_match else table:
                    row = clean_row([cell.lower().strip() if cell else "" for cell in row])
                    if not row or len(row) < 1:
                        continue

                    label = row[0]
                    value = row[1] if len(row) > 1 else ""

                    # Match labels to subjects and clean the numerical value
                    if "english" in label:
                        result["english"] = clean_number(value)
                    elif "math" in label:
                        result["math"] = clean_number(value)
                    elif label == "science":
                        result["science"] = clean_number(value)
                    elif "lab" in label:
                        result["scienceLab"] = clean_number(value)
                    elif "language" in label:
                        result["language"] = clean_number(value)
                    elif "social studies" in label:
                        result["social"] = clean_number(value)
                    elif "arts" in label:
                        result["arts"] = clean_number(value)

                # Return immediately after finding and parsing the correct C5 table
                return result

    return result  # Return default if no matching table was found


In [36]:
extract_general_college_subjects(gatech_file_path)

{'english': 4.0,
 'math': 4.0,
 'science': 4.0,
 'scienceLab': 2.0,
 'language': 2.0,
 'social': 3.0,
 'arts': 0}

In [37]:
extract_general_college_subjects(umn_file_path)

{'english': 4.0,
 'math': 4.0,
 'science': 3.0,
 'scienceLab': 1.0,
 'language': 2.0,
 'social': 3.0,
 'arts': 1.0}

In [38]:
extract_general_college_subjects(nyu_file_path)

{'english': 4.0,
 'math': 3.0,
 'science': 3.0,
 'scienceLab': 3.0,
 'language': 3.0,
 'social': 3.0,
 'arts': 0}

In [39]:
extract_general_college_subjects(uga_file_path)

{'english': 4.0,
 'math': 4.0,
 'science': 4.0,
 'scienceLab': 2.0,
 'language': 2.0,
 'social': 3.0,
 'arts': 0}

### Preprocessing

In [40]:
def calculate_acceptance_rates(data):
    applicants = data['applicants']
    admitted = data['admitted']
    rates = {}
    for category in applicants:
        applied = applicants[category]
        accepted = admitted.get(category, 0)
        if applied > 0:
            rates[category] = accepted / applied
        else:
            rates[category] = None 
    return rates

In [41]:
calculate_acceptance_rates(extract_residency_data(gatech_file_path))

{'in-state': 0.3312722503278996,
 'out-of-state': 0.10417536534446764,
 'international': 0.08198239925891616,
 'others': None}

In [42]:
calculate_acceptance_rates(extract_residency_data(umn_file_path))

{'in-state': 0.7508224860534973,
 'out-of-state': 0.7913262422280458,
 'international': 0.7818471337579618,
 'others': None}

In [43]:
calculate_acceptance_rates(extract_residency_data(nyu_file_path))

{'in-state': None, 'out-of-state': None, 'international': None, 'others': None}

In [44]:
calculate_acceptance_rates(extract_residency_data(uga_file_path))

{'in-state': 0.5024162548050521,
 'out-of-state': 0.2810994259856706,
 'international': 0.21599402092675635,
 'others': 1.0}

##### SAT or ACT

In [45]:
def is_marked(cell, marks=['x', '☑', '☒', '✓', '✔', '4']):
    if not cell:
        return False
    cell_str = str(cell).strip().lower()
    return any(mark in cell_str for mark in marks)

In [46]:
def classify_requirement(text):
    text = text.lower().replace('\n', ' ')
    if "required to be considered" in text:
        return "Required"
    elif "required for some" in text:
        return "Required for some"
    elif "recommended" in text:
        return "Recommended"
    elif "not required" in text:
        return "Optional"
    elif "not considered" in text:
        return "Not considered"
    return "Unknown"

# Nyu case
def extract_requirement_from_text_lines(text_lines, keyword):
    for i, line in enumerate(text_lines):
        if keyword.lower() in line.lower():
            if i + 1 < len(text_lines):
                next_line = text_lines[i + 1].strip()
                return classify_requirement(next_line)
    return "Unknown"

# Nyu case
def extract_from_text(pdf):
    labels = {
        "SAT or ACT": "SAT and/or ACT",
        "ACT Only": "ACT Only",
        "SAT Only": "SAT Only"
    }
    result = {f"{label} Requirement": "Unknown" for label in labels}

    full_text = ""
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            full_text += "\n" + text

    lines = full_text.split("\n")

    for label, keyword in labels.items():
        result[f"{label} Requirement"] = extract_requirement_from_text_lines(lines, keyword)

    return result

# Gatech, Umn case
def extract_from_table(pdf):
    labels = ["SAT or ACT", "ACT Only", "SAT Only"]
    result = {f"{label} Requirement": "Unknown" for label in labels}

    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            if not table or len(table) < 2:
                continue

            table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
            if "sat or act" not in table_str:
                continue

            headers = [str(cell).strip().lower() if cell else "" for cell in table[0]]

            for row in table[1:]:
                row_cells = [str(cell).strip().lower() if cell else "" for cell in row]
                for label in labels:
                    if label.lower() in row_cells[0]:
                        for i in range(1, len(row)):
                            if i < len(headers) and is_marked(row[i]):
                                category = classify_requirement(headers[i])
                                if category:
                                    result[f"{label} Requirement"] = category
                                    break
    return result

def extract_sat_act_required(file_path):
    with pdfplumber.open(file_path) as pdf:
        table_result = extract_from_table(pdf)
        if all(value == "Unknown" for value in table_result.values()):
            return extract_from_text(pdf)
        return table_result

In [47]:
extract_sat_act_required(gatech_file_path)

{'SAT or ACT Requirement': 'Required',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [48]:
extract_sat_act_required(umn_file_path)

{'SAT or ACT Requirement': 'Optional',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [49]:
extract_sat_act_required(nyu_file_path)

{'SAT or ACT Requirement': 'Recommended',
 'ACT Only Requirement': 'Optional',
 'SAT Only Requirement': 'Optional'}

In [50]:
extract_sat_act_required(uga_file_path)

{'SAT or ACT Requirement': 'Unknown',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [51]:
def extract_sat_act_scores(file_path):
    regex_map = {
        "SAT Composite": r"sat.*composite",
        "SAT EBRW": r"sat.*(evidence|ebrw|writing)",
        "SAT Math": r"sat.*math",
        "ACT Composite": r"act.*composite",
        "ACT Math": r"act.*math",
        "ACT English": r"act.*english",
        "ACT Writing": r"act.*writing",
        "ACT Science": r"act.*science",
        "ACT Reading": r"act.*reading"
    }

    score_data = {label: {'25th': None, '50th': None, '75th': None} for label in regex_map}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            text = page.extract_text() or ""

            act_writing_present = "act writing" in text.lower()

            for table in tables:
                 for row in table:
                    cleaned_row = [str(cell).lower().strip() if cell else "" for cell in row]
                    row_label = cleaned_row[0]

                    matched_label = None
                    for label, pattern in regex_map.items():
                        if re.search(pattern, row_label):
                            matched_label = label
                            break

                    if not matched_label and re.fullmatch(r"writing", row_label):
                        matched_label = "SAT EBRW"

                    if matched_label:
                        numeric_values = [cell for cell in cleaned_row[1:] if cell.replace('.', '', 1).isdigit()]
                        for i, key in enumerate(['25th', '50th', '75th']):
                            if i < len(numeric_values):
                                score_data[matched_label][key] = numeric_values[i]

    return score_data

In [52]:
extract_sat_act_scores(gatech_file_path)

{'SAT Composite': {'25th': '1370', '50th': '1460', '75th': '1530'},
 'SAT EBRW': {'25th': '680', '50th': '720', '75th': '750'},
 'SAT Math': {'25th': '690', '50th': '760', '75th': '790'},
 'ACT Composite': {'25th': '30', '50th': '33', '75th': '34'},
 'ACT Math': {'25th': '29', '50th': '32', '75th': '35'},
 'ACT English': {'25th': '31', '50th': '34', '75th': '35'},
 'ACT Writing': {'25th': '8', '50th': '8', '75th': '9'},
 'ACT Science': {'25th': '29', '50th': '33', '75th': '35'},
 'ACT Reading': {'25th': '31', '50th': '34', '75th': '35'}}

In [53]:
extract_sat_act_scores(umn_file_path)

{'SAT Composite': {'25th': '1328', '50th': '1400', '75th': '1460'},
 'SAT EBRW': {'25th': '640', '50th': '680', '75th': '720'},
 'SAT Math': {'25th': '670', '50th': '720', '75th': '760'},
 'ACT Composite': {'25th': '27', '50th': '29', '75th': '31'},
 'ACT Math': {'25th': '25', '50th': '28', '75th': '31'},
 'ACT English': {'25th': '25', '50th': '28', '75th': '33'},
 'ACT Writing': {'25th': '7', '50th': '8', '75th': '9'},
 'ACT Science': {'25th': '25', '50th': '28', '75th': '32'},
 'ACT Reading': {'25th': '27', '50th': '31', '75th': '34'}}

In [54]:
extract_sat_act_scores(nyu_file_path)

{'SAT Composite': {'25th': None, '50th': None, '75th': None},
 'SAT EBRW': {'25th': None, '50th': None, '75th': None},
 'SAT Math': {'25th': None, '50th': None, '75th': None},
 'ACT Composite': {'25th': None, '50th': None, '75th': None},
 'ACT Math': {'25th': None, '50th': None, '75th': None},
 'ACT English': {'25th': None, '50th': None, '75th': None},
 'ACT Writing': {'25th': None, '50th': None, '75th': None},
 'ACT Science': {'25th': None, '50th': None, '75th': None},
 'ACT Reading': {'25th': None, '50th': None, '75th': None}}

In [55]:
extract_sat_act_scores(uga_file_path)

{'SAT Composite': {'25th': '1230', '50th': '1320', '75th': '1410'},
 'SAT EBRW': {'25th': '620', '50th': '670', '75th': '710'},
 'SAT Math': {'25th': '600', '50th': '660', '75th': '710'},
 'ACT Composite': {'25th': '27', '50th': '7', '75th': '3'},
 'ACT Math': {'25th': '25', '50th': '28', '75th': '31'},
 'ACT English': {'25th': '26', '50th': '31', '75th': '34'},
 'ACT Writing': {'25th': None, '50th': None, '75th': None},
 'ACT Science': {'25th': '25', '50th': '29', '75th': '32'},
 'ACT Reading': {'25th': '28', '50th': '32', '75th': '34'}}

##### Importance

In [56]:
def extract_relative_importance(file_path):
    importance_levels = ["Very Important", "Important", "Considered", "Not Considered"]

    academic_factors = [
        "Rigor of secondary school record",
        "Class rank",
        "Academic Grade Point Average (GPA)",
        "Recommendations",
        "Standardized test scores",
        "Application essay"
    ]

    nonacademic_factors = [
        "Interview",
        "Extracurricular activities",
        "Talent/ability",
        "Character/personal qualities",
        "First generation",
        "Alumni/ae relation",
        "Geographical residence",
        "State residency",
        "Religious affiliation/commitment",
        "Racial/ethnic status",
        "Volunteer work",
        "Work experience",
        "Level of applicant’s interest"
    ]

    result = {
        "Academic": {},
        "Nonacademic": {}
    }

    with pdfplumber.open(file_path) as pdf:
        # STEP 1: Gatech style (table-based)
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if not table or len(table) < 2:
                    continue
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not all(level.lower() in table_str for level in importance_levels):
                    continue
                headers = [str(cell).strip() if cell else "" for cell in table[0]]
                for row in table[1:]:
                    row = [str(cell).strip() if cell else "" for cell in row]
                    if not row or row[0].lower() in ["academic", "nonacademic"]:
                        continue
                    factor = row[0]
                    for i in range(1, min(len(row), len(headers))):
                        if is_marked(row[i]):
                            importance = headers[i].strip()
                            section = "Academic" if factor in academic_factors else "Nonacademic"
                            result[section][factor] = importance
                            break

    if result["Academic"] or result["Nonacademic"]:
        return result
    
    # STEP 2: NYU style fallback (text-based)
    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += "\n" + text

    def find_importance_by_text(factor):
        pattern = factor.lower()
        idx = full_text.lower().find(pattern)
        if idx == -1:
            return None
        after = full_text[idx + len(pattern):].strip()
        for level in importance_levels:
            if after.lower().startswith(level.lower()):
                return level
        return None

    for factor in academic_factors:
        importance = find_importance_by_text(factor)
        if importance:
            result["Academic"][factor] = importance

    for factor in nonacademic_factors:
        importance = find_importance_by_text(factor)
        if importance:
            result["Nonacademic"][factor] = importance

    return result

In [57]:
extract_section_by_label(umn_file_path, "C7")

'c7. relative importance of each of the following academic and nonacademic factors in your first-\ntime, first-year, degree-seeking general (not including programs with specific criteria) admission\ndecisions.\nnot\nfactors very important important considered\nconsidered\nacademic\nrigor of secondary school\n☒ ☐ ☐ ☐\nrecord\nclass rank ☒ ☐ ☐ ☐\nacademic gpa ☒ ☐ ☐ ☐\nstandardized test scores ☐ ☐ ☒ ☐\napplication essay ☐ ☐ ☒ ☐\nrecommendation ☐ ☐ ☒ ☐\nnonacademic\ninterview ☐ ☐ ☐ ☒\nextracurricular activities ☐ ☐ ☒ ☐\ntalent/ability ☐ ☐ ☒ ☐\ncharacter/personal qualities ☐ ☐ ☒ ☐\nfirst generation ☐ ☐ ☒ ☐\nalumni/ae relation ☐ ☐ ☐ ☒\ngeographical residence ☐ ☐ ☒ ☐\nstate residency ☐ ☐ ☒ ☐\nreligious\n☐ ☐ ☐ ☒\naffiliation/commitment\nvolunteer work ☐ ☐ ☒ ☐\nwork experience ☐ ☐ ☒ ☐\nlevel of applicant’s interest ☐ ☐ ☐ ☒\nplease provide additional information if the importance of any specific academic or nonacademic\nfactors differ by academic program.\n⇒ click or tap here to enter text.commo

In [58]:
extract_section_by_label(uga_file_path, "C7")

'c7. relative importance of each of the following academic and nonacademic factors in your first-time, first-\nyear, degree-seeking general (not including programs with specific criteria) admission decisions.\nnot\nvery important important considered\nconsidered\nacademic\nrigor of secondary school record 4\n☐ ☐ ☐ ☐\nclass rank 4\n☐ ☐ ☐ ☐\nacademic gpa 4\n☐ ☐ ☐ ☐\nstandardized test scores 4\n☐ ☐ ☐ ☐\napplication essay 4\n☐ ☐ ☐ ☐\nrecommendation 4\n☐ ☐ ☐ ☐\nnonacademic\ninterview 4\n☐ ☐ ☐ ☐\nextracurricular activities 4\n☐ ☐ ☐ ☐\ntalent/ability 4\n☐ ☐ ☐ ☐\ncharacter/personal qualities 4\n☐ ☐ ☐ ☐\nfirst generation 4\n☐ ☐ ☐ ☐\nalumni/ae relation 4\n☐ ☐ ☐ ☐\ngeographical residence 4\n☐ ☐ ☐ ☐\nstate residency 4\n☐ ☐ ☐ ☐\nreligious affiliation/commitment 4\n☐ ☐ ☐ ☐\nvolunteer work 4\n☐ ☐ ☐ ☐\nwork experience 4\n☐ ☐ ☐ ☐\nlevel of applicant’s interest 4\n☐ ☐ ☐ ☐common data set 2023-2024\nplease provide additional information if the importance of any specific academic or nonacademic\nfactors di

In [59]:
extract_relative_importance(gatech_file_path)

{'Academic': {'Rigor of secondary school record': 'Very Important',
  'Class rank': 'Not Considered',
  'Standardized test scores': 'Considered'},
 'Nonacademic': {'Academic GPA': 'Very Important',
  'Application Essay': 'Important',
  'Recommendation(s)': 'Considered',
  'Interview': 'Not Considered',
  'Extracurricular activities': 'Important',
  'Talent/ability': 'Considered',
  'Character/personal qualities': 'Very Important',
  'First generation': 'Considered',
  'Alumni/ae relation': 'Not Considered',
  'Geographical residence': 'Considered',
  'State residency': 'Very Important',
  'Religious affiliation/commitment': 'Not Considered',
  'Volunteer work': 'Considered',
  'Work experience': 'Considered',
  'Level of applicant’s interest': 'Not Considered'}}

In [60]:
extract_relative_importance(umn_file_path)

{'Academic': {}, 'Nonacademic': {}}

In [61]:
extract_relative_importance(nyu_file_path)

{'Academic': {'Rigor of secondary school record': 'Very Important',
  'Class rank': 'Not Considered',
  'Academic Grade Point Average (GPA)': 'Very Important',
  'Recommendations': 'Very Important',
  'Standardized test scores': 'Important',
  'Application essay': 'Very Important'},
 'Nonacademic': {'Interview': 'Not Considered',
  'Talent/ability': 'Considered',
  'Character/personal qualities': 'Very Important',
  'First generation': 'Considered',
  'Alumni/ae relation': 'Not Considered',
  'Geographical residence': 'Considered',
  'State residency': 'Not Considered',
  'Volunteer work': 'Considered',
  'Work experience': 'Considered'}}

In [62]:
extract_relative_importance(uga_file_path)

{'Academic': {}, 'Nonacademic': {}}

### CDS

In [69]:
def build_cds_data(file_path):
    
    def safe_int(score): 
        try: return int(score)
        except: return 0
    
    residency_data = extract_residency_data(file_path)
    residency_acceptance = calculate_acceptance_rates(residency_data)
    
    highschool_requirement = extract_highschool_requirement(file_path)
    
    college_prep_requirement = extract_college_prep_requirement(file_path)
    college_prep_subjects = extract_general_college_subjects(file_path)
    
    sat_act_requirement = extract_sat_act_required(file_path)
    sat_act_scores = extract_sat_act_scores(file_path)
    sat_composite = sat_act_scores.get("SAT Composite")
    act_composite = sat_act_scores.get("ACT Composite")
    
    relative_importance = extract_relative_importance(file_path)
    
    cds_data = {
        "school_name": extract_school_name(file_path),
        "state": extract_state(file_path),
        "high_school_completion_required": bool(highschool_requirement["high school diploma required"]),
        "general_college_preparatory_required": bool(college_prep_requirement["general college-preparatory program"]["required"]),
        "general_college_subjects": college_prep_subjects, 
        "sat_act_required": {
            "sat_or_act": sat_act_requirement.get("SAT or ACT Requirement", "").lower() == "required",
            "sat only": sat_act_requirement.get("SAT Only Requirement", "").lower() == "required",
            "act only": sat_act_requirement.get("ACT Only Requirement", "").lower() == "required"
        },
        "alumni_importance": relative_importance["Nonacademic"].get("Alumni/ae relation", "Unknown"),
        "first_generation_importance": relative_importance["Nonacademic"].get("First generation", "Unknown"),
        "residency_importance": relative_importance["Nonacademic"].get("State residency", "Unknown"),
        "sat_act_importance": relative_importance["Academic"].get("Standardized test scores", "Unknown"),
        "gpa_importance": relative_importance["Academic"].get("Academic Grade Point Average (GPA)", "Unknown"),
        "residency_acceptance": residency_acceptance,
        "sat_scores": {
            "25th": safe_int(sat_composite.get("25th")),
            "50th": safe_int(sat_composite.get("50th")),
            "75th": safe_int(sat_composite.get("75th"))
        },
        "act scores": {
            "25th": safe_int(act_composite.get("25th")),
            "50th": safe_int(act_composite.get("50th")),
            "75th": safe_int(act_composite.get("75th"))
        },
        "volunteer_work": relative_importance["Nonacademic"].get("Volunteer work", "Unknown"),
        "work_experience": relative_importance["Nonacademic"].get("Work experience", "Unknown")
    }

    return cds_data

In [70]:
build_cds_data(gatech_file_path)

{'school_name': 'georgia institute of technology',
 'state': {'State': 'GA'},
 'high_school_completion_required': True,
 'general_college_preparatory_required': True,
 'general_college_subjects': {'english': 4.0,
  'math': 4.0,
  'science': 4.0,
  'scienceLab': 2.0,
  'language': 2.0,
  'social': 3.0,
  'arts': 0},
 'sat_act_required': {'sat_or_act': True,
  'sat only': False,
  'act only': False},
 'alumni_importance': 'Not Considered',
 'first_generation_importance': 'Considered',
 'residency_importance': 'Very Important',
 'sat_act_importance': 'Considered',
 'gpa_importance': 'Unknown',
 'residency_acceptance': {'in-state': 0.3312722503278996,
  'out-of-state': 0.10417536534446764,
  'international': 0.08198239925891616,
  'others': None},
 'sat_scores': {'25th': 1370, '50th': 1460, '75th': 1530},
 'act scores': {'25th': 30, '50th': 33, '75th': 34},
 'volunteer_work': 'Considered',
 'work_experience': 'Considered'}

In [71]:
def save_cds_data_to_excel(input_folder, output_folder):
    # Iterate over all files in the specified folder
    for file in os.listdir(input_folder):
        if file.endswith(".pdf"): # process only pdf files
            file_path = os.path.join(input_folder, file)
            try:
                cds_data = build_cds_data(file_path)

                school_name = cds_data.get("school_name") or os.path.splitext(file)[0] # try to get school name; use file name if missing
                safe_name = school_name.replace("/", "_").replace("\\", "_")

                df = pd.json_normalize(cds_data, sep='.') # convert nested dictionary to a flat dataframe

                output_path = os.path.join(output_folder, f"{safe_name}.xlsx")
                df.to_excel(output_path, index=False)

                print(f"Saved: {output_path}")

            except Exception as e:
                print(f"Failed to process {file}: {e}")


In [72]:
save_cds_data_to_excel("pdf", "output_cds")

Saved: output_cds\georgia institute of technology.xlsx
Saved: output_cds\new york university.xlsx
Saved: output_cds\university of georgia.xlsx
Saved: output_cds\university of minnesota – twin cities campus.xlsx


In [89]:
gatech_cds = {
    "school_name": "Georgia Tech",
    "state": "GA",
    "high_school_completion_required": True,
    "general_college_preparatory_required": True,
    "general_college_subjects":
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 0
        },
    "sat_act_required":{
        "sat_or_act": True,
        "sat only": False,
        "act only": False
    },
    "alumni_importance": "Not Considered",
    "first_generation_importance": "Considered",
    "residency_importance": "Very Important",
    "sat_act_importance": "Considered",
    "gpa_importance": "Very Important",
    "residency_acceptance": {
        "international": 0.082,
        "in-state": 0.3313,
        "out-of-state": 0.1042
    },
    "sat_scores": {
        "25th": 1400,
        "50th": 1500,
        "75th": 1560
    },
    "act_score": {
        "25th": 30,
        "50th": 33,
        "75th": 34   
    },
    "volunteer_work": "Considered",
    "work_experience": "Not Considered"
}

In [90]:
umn_cds = {
    "school_name": "University of Minnesota",
    "state": "MN",
    "high_school_completion_required": True,
    "general_college_preparatory_required": True,
    "general_college_subjects":
        {
            "english": 4,
            "math": 4,
            "science": 3,
            "scienceLab": 1,
            "language": 2,
            "social": 3,
            "arts": 1
        },
    "sat_act_required":{
        "sat_or_act": True,
        "sat only": False,
        "act only": False
    },
    "alumni_importance": "Not Considered",
    "first_generation_importance": "Considered",
    "residency_importance": "Considered",
    "sat_act_importance": "Considered",
    "gpa_importance": "Very Important",
    "residency_acceptance": {
        "international": 0.7818,
        "in-state": 0.7508,
        "out-of-state": 0.7913
    },
    "sat_scores": {
        "25th": 1328,
        "50th": 1400,
        "75th": 1460
    },
    "act_scores": {
        "25th": 27,
        "50th": 29,
        "75th": 31
    },
    "volunteer_work": "Considered",
    "work_experience": "Considered"
}

### Modeling

##### Importance Related

In [91]:
import pandas as pd
from collections import Counter

# Example list of CDS importance data for multiple schools (gatech, umn, nyu, uga)
cds_importance_data = [
    {
        "gpa_importance": "Very Important",
        "sat_act_importance": "Considered",
        "residency_importance": "Very Important",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered" 
    },
    {
        "gpa_importance": "Very Important",
        "sat_act_importance": "Considered",
        "residency_importance": "Considered",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    },
    {
        "gpa_importance": "Very Important",
        "sat_act_importance": "Important",
        "residency_importance": "Not Considered",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    },
        {
        "gpa_importance": "Very Important",
        "sat_act_importance": "Important",
        "residency_importance": "Not Considered",
        "alumni_importance": "Considered", # mock data
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    }
]

# 1. Count importance levels per category
importance_levels = ["Very Important", "Important", "Considered", "Not Considered"]
categories = ["gpa_importance", "sat_act_importance", "residency_importance", "alumni_importance", "volunteer_work"]
importance_counts = {cat: Counter() for cat in categories}

for record in cds_importance_data:
    for cat in categories:
        importance_counts[cat][record[cat]] += 1

# 2. Calculate average weights
weight_map = {
    "Very Important": 1.0,
    "Important": 0.9,
    "Considered": 0.8,
    "Not Considered": 0.0
}

average_weights = {}
for cat in categories:
    total = sum(importance_counts[cat].values())
    if total == 0:
        average_weights[cat] = 0
        continue
    weighted_sum = sum(weight_map[level] * count for level, count in importance_counts[cat].items())
    average_weights[cat] = round(weighted_sum / total, 4)

average_weights


{'gpa_importance': 1.0,
 'sat_act_importance': 0.85,
 'residency_importance': 0.45,
 'alumni_importance': 0.2,
 'volunteer_work': 0.8}

In [92]:
# Convert average weights into max point allocation (total = 100 points)
def normalize_max_points_from_avg_weights(average_weights, total_points=100):
    total_weight = sum(average_weights.values())
    max_points_by_cat = {
        key.replace('_importance', ''): round((w / total_weight) * total_points, 2)
        for key, w in average_weights.items()
    }
    return max_points_by_cat

In [93]:
max_points_by_cat = normalize_max_points_from_avg_weights(average_weights)
max_points_by_cat

{'gpa': 30.3,
 'sat_act': 25.76,
 'residency': 13.64,
 'alumni': 6.06,
 'volunteer_work': 24.24}

----------------

##### Total Score

In [94]:
def calculate_total_max_score(school_info, max_points_by_cat, weight_map):
    total = 0
    category_to_importance_key = {
        'gpa': 'gpa_importance',
        'sat_act': 'sat_act_importance',
        'residency': 'residency_importance',
        'alumni': 'alumni_importance',
        'volunteer_work': 'volunteer_work',
        'work_experience': 'work_experience'
    }
    
    for category, max_point in max_points_by_cat.items():
        importance_key = category_to_importance_key.get(category)
        if importance_key and importance_key in school_info:
            importance = school_info[importance_key]
            weight = weight_map.get(importance, 0)
            total += max_point * weight
        else:
            continue
    return total

In [95]:
weight_map = {
    "Very Important": 1.0,
    "Important": 0.9,
    "Considered": 0.8,
    "Not Considered": 0.0
}

In [96]:
calculate_total_max_score(gatech_cds, max_points_by_cat, weight_map)

83.94

In [97]:
calculate_total_max_score(umn_cds, max_points_by_cat, weight_map)

81.212

------------

##### Compatibility Score

In [98]:
def calculate_compatibility_score(applicant, school_requirements, max_points_by_cat):
    
    # 1. Check essential requirments
    if school_requirements["high_school_completion_required"]:
        if not applicant["high_school_completion"]:
            return {"score": 0, "details": {"reason": "High school completion requirement not met"}}
    
    if school_requirements["general_college_preparatory_required"]:
        required_subjects = school_requirements.get("general_college_subjects", {})
        applicant_subjects = applicant.get("general_college_requirement", {})
        
        for subject, required_count in required_subjects.items():
            applicant_count = applicant_subjects.get(subject, 0)
            if applicant_count < required_count:
                return {"score": 0, "details": {"reason": "Subject requirement not met"}}
    
    # Check standardized test requirement
    sat_required = school_requirements["sat_act_required"].get("sat only", False)
    act_required = school_requirements["sat_act_required"].get("act only", False)
    either_required = school_requirements["sat_act_required"].get("sat or act", False)

    sat = applicant.get("sat", 0)
    act = applicant.get("act", 0)

    if sat_required and sat == 0:
        return {"score": 0, "details": {"reason": "SAT score required"}}
    if act_required and act == 0:
        return {"score": 0, "details": {"reason": "ACT score required"}}
    if either_required and sat == 0 and act == 0:
        return {"score": 0, "details": {"reason": "SAT or ACT score required"}}


    # 2. Define importance weight mapping
    weight_map = {
        "Very Important": 1.0,
        "Important": 0.9,
        "Considered": 0.8,
        "Not Considered": 0.0
    }

    total_score = 0
    details = {}

    # 3. Score for alumni relation
    alumni_max = max_points_by_cat['alumni']
    school_name = school_requirements['school_name'].lower()
    alumni_score = 0
    if applicant['alumni']:
        alumni_schools = [name.lower() for name in applicant.get("alumni_school_names", [])]
        if school_name in alumni_schools:
            alumni_score = alumni_max
    alumni_weighted = alumni_score * weight_map[school_requirements['alumni_importance']]
    total_score += alumni_weighted
    details["alumni"] = round(alumni_weighted, 2)

    # 4. Score for first-generation status
    #first_max = max_points_by_cat['first_generation']
    #first_score = first_max if applicant['first'] else 0
    #first_weighted = first_score * weight_map[school_requirements['first_generation_importance']]
    #total_score += first_weighted
    #details["first_generation"] = round(first_weighted, 2)

    # 5. Score based on residency acceptance rate
    residency_max = max_points_by_cat['residency']
    if applicant['residency'] == 'international':
        residency_category = 'international'
    else:
        if applicant['state'] == school_requirements['state']:
            residency_category = 'in-state'
        else:
            residency_category = 'out-of-state'
    residency_score = school_requirements['residency_acceptance'][residency_category] * residency_max
    residency_weighted = residency_score * weight_map[school_requirements['residency_importance']]
    total_score += residency_weighted
    details["residency"] = round(residency_weighted, 2)
    

    # 6. Score based on SAT range position
    sat = applicant['sat']
    act = applicant['act']
    sat_act_max = max_points_by_cat['sat_act']
    
    if sat > 0:
        sat_q25 = school_requirements['sat_scores']['25th']
        sat_q75 = school_requirements['sat_scores']['75th']

        if sat < sat_q25:
            sat_score = 0
        elif sat > sat_q75:
            sat_score = sat_act_max
        else:
            sat_score = ((sat - sat_q25) / (sat_q75 - sat_q25)) * sat_act_max
            sat_score = min(sat_act_max, max(0, sat_score))
        sat_weighted = sat_score * weight_map[school_requirements['sat_act_importance']]
        total_score += sat_weighted
        details["sat"] = round(sat_weighted, 2)
    
    elif act > 0:
        act_q25 = school_requirements['act scores']['25th']
        act_q75 = school_requirements['act scores']['75th']
        if act < act_q25:
            act_score = 0
        elif act > act_q75:
            act_score = sat_act_max
        else:
            act_score = ((act - act_q25) / (act_q75 - act_q25)) * sat_act_max
            act_score = min(sat_act_max, max(0, act_score))
        act_weighted = act_score * weight_map[school_requirements['sat_act_importance']]
        total_score += act_weighted
        details["act"] = round(act_weighted, 2)

    # 7. Score based on GPA
    gpa_max = max_points_by_cat['gpa']
    gpa_score = (applicant['gpa'] / 4.0) * gpa_max
    gpa_weighted = gpa_score * weight_map[school_requirements['gpa_importance']]
    total_score += gpa_weighted
    details["gpa"] = round(gpa_weighted, 2)
    
    # 8. Score for volunteer work
    volunteer_max = max_points_by_cat['volunteer_work']
    volunteer = applicant['volunteering_hours']
    if volunteer < 50:
        volunteer_score = 0
    elif volunteer >= 200:
        volunteer_score = volunteer_max
    else:
        volunteer_score = volunteer / 200 * volunteer_max
    
    # 9. Score for work experience
    #work_max = max_points_by_cat['work_experience']
    #work = applicant['work_months']
    
    # 10. Calculate school-specific total max score
    total_max_score = calculate_total_max_score(school_requirements, max_points_by_cat, weight_map)

    # 11. Normalize total_score by total_max_score
    if total_max_score > 0:
        normalized_score = (total_score / total_max_score) * 100
    else:
        normalized_score = 0
    
    return {
        "score": round(normalized_score, 2), 
        "details": details
    }

In [99]:
calculate_compatibility_score(applicant1, gatech_cds, max_points_by_cat)

{'score': 53.5,
 'details': {'alumni': 0.0, 'residency': 1.42, 'sat': 15.46, 'gpa': 28.03}}

In [100]:
calculate_compatibility_score(applicant1, umn_cds, max_points_by_cat)

{'score': 69.98,
 'details': {'alumni': 0.0, 'residency': 8.19, 'sat': 20.61, 'gpa': 28.03}}

In [101]:
calculate_compatibility_score(applicant2, gatech_cds, max_points_by_cat)

{'score': 53.14,
 'details': {'alumni': 0.0, 'residency': 1.12, 'sat': 15.46, 'gpa': 28.03}}

In [102]:
calculate_compatibility_score(applicant2, umn_cds, max_points_by_cat)

{'score': 70.39,
 'details': {'alumni': 0.0, 'residency': 8.53, 'sat': 20.61, 'gpa': 28.03}}

In [103]:
calculate_compatibility_score(applicant3, gatech_cds, max_points_by_cat)

{'score': 48.8,
 'details': {'alumni': 0.0, 'residency': 1.12, 'sat': 10.3, 'gpa': 29.54}}

In [104]:
calculate_compatibility_score(applicant3, umn_cds, max_points_by_cat)

{'score': 72.26,
 'details': {'alumni': 0.0, 'residency': 8.53, 'sat': 20.61, 'gpa': 29.54}}

In [105]:
calculate_compatibility_score(applicant4, gatech_cds, max_points_by_cat)

{'score': 26.6,
 'details': {'alumni': 0.0, 'residency': 1.12, 'sat': 0.0, 'gpa': 21.21}}

In [106]:
calculate_compatibility_score(applicant4, umn_cds, max_points_by_cat)

{'score': 36.62,
 'details': {'alumni': 0.0, 'residency': 8.53, 'sat': 0.0, 'gpa': 21.21}}