### Applicants Sample

In [51]:
applicant1 = {
    "userId" : "4b34dfc4-fb21-4a9b-886a-126e1428a744",
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4,
            "history": 0,
            "electives": 0,
            "cs": 0
        },
    "alumni": 1,
    "first": 0,
    "alumni_school_names": ["Georgia Tech", "NYU", "University of Minnesota", "U Georgia"],
    "residency": "domestic",
    "state": "MN",
    "country": "",
    "sat": 1520,
    "act": 0,
    "gpa": 3.7,
    "volunteering_hours": 150,
    "english_test_type": "", 
    "english_test_score": 0
}

In [3]:
applicant2 = {
    "userId" : "4b34dfc4-fb21-4a9b-886a-126e1428a743",
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4,
            "history": 0,
            "electives": 0,
            "cs": 0
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1520,
    "act": 0,
    "gpa": 3.7,
    "volunteering_hours": 150,
    "english_test_type": "TOEFL", 
    "english_test_score": 110
}

In [52]:
applicant3 = {
    "userId" : "4b34dfc4-fb21-4a9b-886a-126e1428a742",
    "high_school_completion": 0,
    "general_college_requirement":         
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4,
            "history": 0,
            "electives": 0,
            "cs": 0
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1480,
    "act": 0,
    "gpa": 3.9,
    "volunteering_hours": 150,
    "english_test_type": "TOEFL", 
    "english_test_score": 100
}

In [5]:
applicant4 = {
    "userId" : "4b34dfc4-fb21-4a9b-886a-126e1428a741",
    "high_school_completion": 1,
    "general_college_requirement": 
        {
            "english": 4,
            "math": 4,
            "science": 4,
            "scienceLab": 2,
            "language": 2,
            "social": 3,
            "arts": 4,
            "history": 0,
            "electives": 0,
            "cs": 0
        },
    "alumni": 0,
    "first": 0,
    "alumni_school_names": [],
    "residency": "international",
    "state": "",
    "country": "Korea, Republic of",
    "sat": 1300,
    "act": 0,
    "gpa": 2.8,
    "volunteering_hours": 150,
    "english_test_type": "TOEFL", 
    "english_test_score": 100
}

### Crawling

In [1]:
import os
import pdfplumber
import re

import pandas as pd

import openpyxl

In [7]:
gatech_file_path = os.path.join('pdf', 'gatech_2425.pdf')
cwru_file_path = os.path.join('pdf_test2', 'cwru_2425.pdf')

In [2]:
# Remove None and empty string from row
def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [3]:
# Extract only numbers from the cell
def clean_number(cell):
    cell = re.sub(r'[^\d.]', '', str(cell)) # Remove all non-numeric characters
    try:
        return float(cell)
    except:
        return 0

In [4]:
# Extract text from a section like "C{#}." up to before "C{#+1}."

def extract_section_by_label(file_path, section_label):
    next_label = "c" + str(int(section_label[1:]) + 1)

    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.lower()

    # Match exact line starting with "c{#}."
    # ^ = matches the beginning of a line
    start_match = re.search(rf"({section_label.lower()}[\. ])", full_text)
    if not start_match:
        print(f"Section {section_label.upper()} not found.")
        return
    start_idx = start_match.start()

    # Look for the start of the next section
    next_match = re.search(rf"({next_label}[\. ])", full_text[start_idx:])
    end_idx = start_idx + next_match.start() if next_match else None # If we don't find the next label, go to the end of the document

    section_text = full_text[start_idx:end_idx]

    return section_text

##### Residency

In [5]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0},
        'admitted': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in-state': -1, 'out-of-state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in-state'] = idx
                    elif 'out-' in col:
                        col_idx['out-of-state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in-state'] != -1 and col_idx['in-state'] < len(row):
                        result[target]['in-state'] += clean_number(row[col_idx['in-state']])
                    if col_idx['out-of-state'] != -1 and col_idx['out-of-state'] < len(row):
                        result[target]['out-of-state'] += clean_number(row[col_idx['out-of-state']])
                    if col_idx['international'] != -1 and col_idx['international'] < len(row):
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1 and col_idx['others'] < len(row):
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [11]:
extract_residency_data(gatech_file_path)

{'applicants': {'in-state': 10674.0,
  'out-of-state': 38320.0,
  'international': 10795.0,
  'others': 0.0},
 'admitted': {'in-state': 3536.0,
  'out-of-state': 3992.0,
  'international': 885.0,
  'others': 0.0}}

##### High School Completion Requirement

In [6]:
def extract_highschool_requirement(file_path):

    result = {
        'high school diploma required': 0,
        'GED accepted': 0
    }

    # Define exact phrases expected in each case
    diploma_and_ged = "high school diploma is required and ged is accepted"
    diploma_only = "high school diploma is required and ged is not accepted"
    no_diploma_needed = "high school diploma or equivalent is not required"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C3 section using helper function
    section_text = extract_section_by_label(file_path, "C3")
    if not section_text:
        return result  # Return default if section not found

    section_text = section_text.lower()
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if diploma_and_ged in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
            found_checked_option = True
            break
        elif diploma_only in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
            found_checked_option = True
            break
        elif no_diploma_needed in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 0
            result['GED accepted'] = 0
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if diploma_and_ged in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
        elif diploma_only in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
        elif no_diploma_needed in section_text:
            result['high school diploma required'] = 0
            result['GED accepted'] = 0

    return result

In [16]:
extract_highschool_requirement(gatech_file_path)

{'high school diploma required': 1, 'GED accepted': 0}

##### State

In [7]:
def extract_state(file_path):
    state_name_to_abbr = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
        'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
        'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
        'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
        'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
        'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
        'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
        'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
        'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
        'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
        'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
        'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
    }

    section_text = extract_section_by_label(file_path, 'A1')
    if not section_text:
        return {"State": "Unknown"}

    lines = section_text.lower().splitlines()[:20]

    for line in lines:
        if 'state' in line:
            # Full state name
            for state, abbr in state_name_to_abbr.items():
                if state.lower() in line:
                    return {"State": abbr}
            # Abbreviation state name 
            for abbr in state_name_to_abbr.values():
                if f' {abbr.lower()} ' in f' {line} ':
                    return {"State": abbr}

    return {"State": "Unknown"}

In [21]:
extract_state(gatech_file_path)

{'State': 'GA'}

##### School Name

In [8]:
def extract_school_name(file_path):
    text = extract_section_by_label(file_path, "A1")
    lines = text.lower().splitlines()
    for line in lines:
        # Handle both "name of college or university" and "name of college/university" formats
        match = re.match(r"\s*name of college\s*(?:or|/)\s*university[:\s]+(.+)", line, re.IGNORECASE)
        if match:
            return match.group(1).strip()
    return None

In [26]:
extract_school_name(gatech_file_path)

'georgia institute of technology'

##### General College-Preparatory Program Requirement

In [9]:
def extract_college_prep_requirement(file_path):
    
    result = {
        'general college-preparatory program': 
            {
                'required': 0,
                'recommended': 0,
                'neither required or recommended': 0
            }
        }

    # Define key phrases to match
    required = "require"
    recommended = "recommend"
    neither = "neither require nor recommend"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C4 section using helper function
    section_text = extract_section_by_label(file_path, 'C4')
    if not section_text:
        return result # Return default if section not found
    section_text = section_text.lower()
    
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        
        # Skip the question sectence
        if line.startswith('c4'):
            continue
        
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if neither in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['neither required or recommended'] = 1
            found_checked_option = True
            break
        elif required in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['required'] = 1
            found_checked_option = True
            break
        elif recommended in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['recommended'] = 1
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if "neither require nor recommend" in section_text:
            result['general college-preparatory program']['neither required or recommended'] = 1
        elif "recommend" in section_text:
            result['general college-preparatory program']['recommended'] = 1
        elif "require" in section_text:
            result['general college-preparatory program']['required'] = 1

    return result

In [31]:
extract_college_prep_requirement(gatech_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [10]:
def extract_general_college_subjects(file_path):
    import pdfplumber

    result = {
        "english": 0,
        "math": 0,
        "science": 0,
        "scienceLab": 0,
        "language": 0,
        "social": 0,
        "history": 0,
        "electives": 0,
        "cs": 0,
        "arts": 0
    }

    # Keywords that commonly appear in the C5 subject distribution table
    subject_keywords = ["english", "math", "science"]

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if not table or len(table[0]) < 2:
                    continue  # Skip empty or invalid tables

                # Convert header cells to lowercase for comparison
                headers = [cell.lower() if cell else "" for cell in table[0]]

                # Check if table looks like a C5 table based on header content
                header_match = "distribution of high" in headers[0] or \
                               ("units required" in headers[1] if len(headers) > 1 else False)

                # nyu - Check if at least 3 subject-related keywords appear in row labels
                match_count = 0
                for row in table:
                    if not row or len(row) < 1:
                        continue
                    label = row[0].lower().strip() if row[0] else ""
                    if any(keyword in label for keyword in subject_keywords):
                        match_count += 1
                keyword_match = match_count >= 3

                # Skip this table if neither condition is satisfied
                if not (header_match or keyword_match):
                    continue

                # Parse the table rows: skip first row if headers are matched
                for row in table[1:] if header_match else table:
                    row = clean_row([cell.lower().strip() if cell else "" for cell in row])
                    if not row or len(row) < 1:
                        continue

                    label = row[0]
                    value = row[1] if len(row) > 1 else ""

                    # Match labels to subjects and clean the numerical value
                    if "english" in label:
                        result["english"] = clean_number(value)
                    elif "math" in label:
                        result["math"] = clean_number(value)
                    elif label == "science":
                        result["science"] = clean_number(value)
                    elif "lab" in label:
                        result["scienceLab"] = clean_number(value)
                    elif "language" in label:
                        result["language"] = clean_number(value)
                    elif "social studies" in label:
                        result["social"] = clean_number(value)
                    elif "history" in label:
                        result["history"] = clean_number(value)
                    elif "academic electives" in label:
                        result["electives"] = clean_number(value)
                    elif "computer science" in label:
                        result["cs"] = clean_number(value)
                    elif "arts" in label:
                        result["arts"] = clean_number(value)

                # Return immediately after finding and parsing the correct C5 table
                return result

    return result  # Return default if no matching table was found


In [18]:
extract_general_college_subjects(gatech_file_path)

{'english': 4.0,
 'math': 4.0,
 'science': 4.0,
 'scienceLab': 2.0,
 'language': 2.0,
 'social': 3.0,
 'history': 0,
 'electives': 0,
 'cs': 0,
 'arts': 0}

### Preprocessing

In [11]:
def calculate_acceptance_rates(data):
    applicants = data['applicants']
    admitted = data['admitted']
    rates = {}
    for category in applicants:
        applied = applicants[category]
        accepted = admitted.get(category, 0)
        if applied > 0:
            rates[category] = accepted / applied
        else:
            rates[category] = None 
    return rates

In [41]:
calculate_acceptance_rates(extract_residency_data(gatech_file_path))

{'in-state': 0.3312722503278996,
 'out-of-state': 0.10417536534446764,
 'international': 0.08198239925891616,
 'others': None}

##### SAT or ACT

In [12]:
def is_marked(cell, marks=['x', '☑', '☒', '✓', '✔', '4']):
    if not cell:
        return False
    cell_str = str(cell).strip().lower()
    return any(mark in cell_str for mark in marks)

In [13]:
def classify_requirement(text):
    text = text.lower().replace('\n', ' ')
    if "required to be considered" in text:
        return "Required"
    elif "required for some" in text:
        return "Required for some"
    elif "recommended" in text:
        return "Recommended"
    elif "not required" in text:
        return "Optional"
    elif "not considered" in text:
        return "Not considered"
    return "Unknown"

# Nyu case
def extract_requirement_from_text_lines(text_lines, keyword):
    for i, line in enumerate(text_lines):
        if keyword.lower() in line.lower():
            if i + 1 < len(text_lines):
                next_line = text_lines[i + 1].strip()
                return classify_requirement(next_line)
    return "Unknown"

# Nyu case
def extract_from_text(pdf):
    labels = {
        "SAT or ACT": "SAT and/or ACT",
        "ACT Only": "ACT Only",
        "SAT Only": "SAT Only"
    }
    result = {f"{label} Requirement": "Unknown" for label in labels}

    full_text = ""
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            full_text += "\n" + text

    lines = full_text.split("\n")

    for label, keyword in labels.items():
        result[f"{label} Requirement"] = extract_requirement_from_text_lines(lines, keyword)

    return result

# Gatech, Umn case
def extract_from_table(pdf):
    labels = ["SAT or ACT", "ACT Only", "SAT Only"]
    result = {f"{label} Requirement": "Unknown" for label in labels}

    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            if not table or len(table) < 2:
                continue

            table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
            if "sat or act" not in table_str:
                continue

            headers = [str(cell).strip().lower() if cell else "" for cell in table[0]]

            for row in table[1:]:
                row_cells = [str(cell).strip().lower() if cell else "" for cell in row]
                for label in labels:
                    if label.lower() in row_cells[0]:
                        for i in range(1, len(row)):
                            if i < len(headers) and is_marked(row[i]):
                                category = classify_requirement(headers[i])
                                if category:
                                    result[f"{label} Requirement"] = category
                                    break
    return result

def extract_sat_act_required(file_path):
    with pdfplumber.open(file_path) as pdf:
        table_result = extract_from_table(pdf)
        if all(value == "Unknown" for value in table_result.values()):
            return extract_from_text(pdf)
        return table_result

In [47]:
extract_sat_act_required(gatech_file_path)

{'SAT or ACT Requirement': 'Required',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [14]:
def extract_sat_act_scores(file_path):
    regex_map = {
        "SAT Composite": r"sat.*composite",
        "SAT EBRW": r"sat.*(evidence|ebrw|writing)",
        "SAT Math": r"sat.*math",
        "ACT Composite": r"act.*composite",
        "ACT Math": r"act.*math",
        "ACT English": r"act.*english",
        "ACT Writing": r"act.*writing",
        "ACT Science": r"act.*science",
        "ACT Reading": r"act.*reading"
    }

    score_data = {label: {'25th': None, '50th': None, '75th': None} for label in regex_map}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            text = page.extract_text() or ""

            act_writing_present = "act writing" in text.lower()

            for table in tables:
                 for row in table:
                    cleaned_row = [str(cell).lower().strip() if cell else "" for cell in row]
                    row_label = cleaned_row[0]

                    matched_label = None
                    for label, pattern in regex_map.items():
                        if re.search(pattern, row_label):
                            matched_label = label
                            break

                    if not matched_label and re.fullmatch(r"writing", row_label):
                        matched_label = "SAT EBRW"

                    if matched_label:
                        numeric_values = [cell for cell in cleaned_row[1:] if cell.replace('.', '', 1).isdigit()]
                        for i, key in enumerate(['25th', '50th', '75th']):
                            if i < len(numeric_values):
                                score_data[matched_label][key] = numeric_values[i]

    return score_data

In [52]:
extract_sat_act_scores(gatech_file_path)

{'SAT Composite': {'25th': '1370', '50th': '1460', '75th': '1530'},
 'SAT EBRW': {'25th': '680', '50th': '720', '75th': '750'},
 'SAT Math': {'25th': '690', '50th': '760', '75th': '790'},
 'ACT Composite': {'25th': '30', '50th': '33', '75th': '34'},
 'ACT Math': {'25th': '29', '50th': '32', '75th': '35'},
 'ACT English': {'25th': '31', '50th': '34', '75th': '35'},
 'ACT Writing': {'25th': '8', '50th': '8', '75th': '9'},
 'ACT Science': {'25th': '29', '50th': '33', '75th': '35'},
 'ACT Reading': {'25th': '31', '50th': '34', '75th': '35'}}

##### Importance

In [15]:
def extract_relative_importance(file_path):
    importance_levels = ["very important", "important", "considered", "not considered"]
    marks = ['x', '☒', '✓', '✔', '4']

    all_factors = [
        "rigor of secondary school record",
        "class rank",
        "academic grade point average (gpa)",
        "recommendations",
        "standardized test scores",
        "application essay",
        "interview",
        "extracurricular activities",
        "talent/ability",
        "character/personal qualities",
        "first generation",
        "alumni/ae relation",
        "geographical residence",
        "state residency",
        "religious affiliation/commitment",
        "volunteer work",
        "work experience",
        "level of applicant’s interest"
    ]

    alias_map = {
        "academic gpa": "academic grade point average (gpa)",
        "recommendation": "recommendations",
        "recommendation(s)": "recommendations",
        "level of applicant's interest": "level of applicant’s interest"
    }

    def normalize_factor(name):
        key = name.lower().replace("’", "'").strip()
        return alias_map.get(key, key)

    result = {factor: "unknown" for factor in all_factors}
    found_table = False

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if not table or len(table) < 2:
                    continue

                headers = [str(cell).strip().lower() if cell else "" for cell in table[0]]
                if not all(level in headers for level in importance_levels):
                    continue

                found_table = True

                for row in table[1:]:
                    row = [str(cell).strip().replace('\n', ' ').lower() if cell else "" for cell in row]
                    if not row or row[0] in ["academic", "nonacademic"]:
                        continue

                    raw_factor = row[0]
                    factor = normalize_factor(raw_factor)
                    importance = "unknown"
                    for i in range(1, min(len(row), len(headers))):
                        if any(mark in row[i] for mark in marks):
                            importance = headers[i]
                            break

                    if factor in result:
                        result[factor] = importance

            if found_table:
                break

    if not found_table:
        section_text = extract_section_by_label(file_path, "C7")
        if not section_text:
            return result

        lines = section_text.lower().splitlines()[:35]
        importance_map = {0: "very important", 1: "important", 2: "considered", 3: "not considered"}

        for line in lines:
            line = line.replace("'", "’")
            tokens = line.split()
            mark_index = None
            for idx, token in enumerate(tokens[-4:]):
                if token in marks:
                    mark_index = idx
                    break
            if mark_index is not None:
                factor_tokens = tokens[:-4]
                factor = normalize_factor(" ".join(factor_tokens))
                importance = importance_map.get(mark_index, "unknown")
                if factor in result:
                    result[factor] = importance
            else:
                for factor in all_factors:
                    factor_key = normalize_factor(factor)
                    if factor_key in line:
                        for level in sorted(importance_levels, key=lambda x: -len(x)):
                            if level in line:
                                result[factor_key] = level
                                break

    return result

In [17]:
extract_relative_importance(gatech_file_path)

{'rigor of secondary school record': 'very important',
 'class rank': 'not considered',
 'academic grade point average (gpa)': 'very important',
 'recommendations': 'considered',
 'standardized test scores': 'considered',
 'application essay': 'important',
 'interview': 'not considered',
 'extracurricular activities': 'important',
 'talent/ability': 'considered',
 'character/personal qualities': 'very important',
 'first generation': 'considered',
 'alumni/ae relation': 'not considered',
 'geographical residence': 'considered',
 'state residency': 'very important',
 'religious affiliation/commitment': 'not considered',
 'volunteer work': 'considered',
 'work experience': 'considered',
 'level of applicant’s interest': 'not considered'}

### CDS

In [16]:
def build_cds_data(file_path):
    
    def safe_int(score): 
        try: return int(score)
        except: return 0
    
    residency_data = extract_residency_data(file_path)
    residency_acceptance = calculate_acceptance_rates(residency_data)
    
    highschool_requirement = extract_highschool_requirement(file_path)
    
    college_prep_requirement = extract_college_prep_requirement(file_path)
    college_prep_subjects = extract_general_college_subjects(file_path)
    
    sat_act_requirement = extract_sat_act_required(file_path)
    sat_act_scores = extract_sat_act_scores(file_path)
    sat_composite = sat_act_scores.get("SAT Composite")
    act_composite = sat_act_scores.get("ACT Composite")
    
    relative_importance = extract_relative_importance(file_path)
    
    cds_data = {
        "school_name": extract_school_name(file_path),
        "state": extract_state(file_path),
        "high_school_completion_required": bool(highschool_requirement["high school diploma required"]),
        "general_college_preparatory_required": bool(college_prep_requirement["general college-preparatory program"]["required"]),
        "general_college_subjects": college_prep_subjects, 
        "sat_act_required": {
            "sat_or_act": sat_act_requirement.get("SAT or ACT Requirement", "").lower() == "required",
            "sat only": sat_act_requirement.get("SAT Only Requirement", "").lower() == "required",
            "act only": sat_act_requirement.get("ACT Only Requirement", "").lower() == "required"
        },
        
        "rigor_of_secondary_school_record": relative_importance.get("rigor of secondary school record", "Unknown"),
        "class_rank": relative_importance.get("class rank", "Unknown"),
        "recommendations": relative_importance.get("recommendations", "Unknown"),
        "standardized_test_scores": relative_importance.get("standardized test scores", "Unknown"),
        "application_essay": relative_importance.get("application essay", "Unknown"),
        "interview": relative_importance.get("interview", "Unknown"),
        "extracurricular_activities": relative_importance.get("extracurricular activities", "Unknown"),
        "talent_ability": relative_importance.get("talent/ability", "Unknown"),
        "character_personal_qualities": relative_importance.get("character/personal qualities", "Unknown"),
        "geographical_residence": relative_importance.get("geographical residence", "Unknown"),
        "religious_affiliation_commitment": relative_importance.get("religious affiliation/commitment", "Unknown"),
        "work_experience": relative_importance.get("work experience", "Unknown"),
        "level_of_applicant_interest": relative_importance.get("level of applicant’s interest", "Unknown"),

        "alumni_importance": relative_importance.get("alumni/ae relation", "Unknown"),
        "first_generation_importance": relative_importance.get("first generation", "Unknown"),
        "residency_importance": relative_importance.get("state residency", "Unknown"),
        "sat_act_importance": relative_importance.get("standardized test scores", "Unknown"),
        "gpa_importance": relative_importance.get("academic grade point average (gpa)", "Unknown"),
        "volunteer_work_importance": relative_importance.get("volunteer work", "Unknown"),

        "residency_acceptance": residency_acceptance,
        "sat_scores": {
            "25th": safe_int(sat_composite.get("25th")),
            "50th": safe_int(sat_composite.get("50th")),
            "75th": safe_int(sat_composite.get("75th"))
        },
        "act scores": {
            "25th": safe_int(act_composite.get("25th")),
            "50th": safe_int(act_composite.get("50th")),
            "75th": safe_int(act_composite.get("75th"))
        },
        
    }

    return cds_data

In [23]:
build_cds_data(gatech_file_path)

{'school_name': 'georgia institute of technology',
 'state': {'State': 'GA'},
 'high_school_completion_required': True,
 'general_college_preparatory_required': True,
 'general_college_subjects': {'english': 4.0,
  'math': 4.0,
  'science': 4.0,
  'scienceLab': 2.0,
  'language': 2.0,
  'social': 3.0,
  'history': 0,
  'electives': 0,
  'cs': 0,
  'arts': 0},
 'sat_act_required': {'sat_or_act': True,
  'sat only': False,
  'act only': False},
 'rigor_of_secondary_school_record': 'very important',
 'class_rank': 'not considered',
 'recommendations': 'considered',
 'standardized_test_scores': 'considered',
 'application_essay': 'important',
 'interview': 'not considered',
 'extracurricular_activities': 'important',
 'talent_ability': 'considered',
 'character_personal_qualities': 'very important',
 'geographical_residence': 'considered',
 'religious_affiliation_commitment': 'not considered',
 'work_experience': 'considered',
 'level_of_applicant_interest': 'not considered',
 'alumni_impo

In [110]:
build_cds_data(cwru_file_path)

{'school_name': 'case western reserve university',
 'state': {'State': 'OH'},
 'high_school_completion_required': True,
 'general_college_preparatory_required': False,
 'general_college_subjects': {'english': 4.0,
  'math': 3.0,
  'science': 3.0,
  'scienceLab': 2.0,
  'language': 2.0,
  'social': 2.0,
  'history': 0,
  'electives': 0,
  'cs': 0,
  'arts': 0},
 'sat_act_required': {'sat_or_act': False,
  'sat only': False,
  'act only': False},
 'rigor_of_secondary_school_record': 'very important',
 'class_rank': 'very important',
 'recommendations': 'unknown',
 'standardized_test_scores': 'considered',
 'application_essay': 'important',
 'interview': 'unknown',
 'extracurricular_activities': 'unknown',
 'talent_ability': 'unknown',
 'character_personal_qualities': 'unknown',
 'geographical_residence': 'unknown',
 'religious_affiliation_commitment': 'unknown',
 'work_experience': 'unknown',
 'level_of_applicant_interest': 'unknown',
 'alumni_importance': 'unknown',
 'first_generation_i

In [17]:
def save_cds_data_to_excel(input_folder, output_folder):
    # Iterate over all files in the specified folder
    for file in os.listdir(input_folder):
        if file.endswith(".pdf"): # process only pdf files
            file_path = os.path.join(input_folder, file)
            try:
                cds_data = build_cds_data(file_path)

                school_name = cds_data.get("school_name") or os.path.splitext(file)[0] # try to get school name; use file name if missing
                safe_name = school_name.replace("/", "_").replace("\\", "_")

                df = pd.json_normalize(cds_data, sep='.') # convert nested dictionary to a flat dataframe

                output_path = os.path.join(output_folder, f"{safe_name}.xlsx")
                df.to_excel(output_path, index=False)

                print(f"Saved: {output_path}")

            except Exception as e:
                print(f"Failed to process {file}: {e}")


In [19]:
# save_cds_data_to_excel("pdf4", "output_cds4")

In [112]:
# save_cds_data_to_excel("pdf", "output_cds")

In [113]:
# save_cds_data_to_excel("pdf2", "output_cds2")

In [114]:
# save_cds_data_to_excel("pdf3", "output_cds3")

In [62]:
# save_cds_data_to_excel("pdf_test", "output_cds_test")
# save_cds_data_to_excel("pdf_test2", "output_cds_test2")

### Modeling

In [63]:
import os
import pandas as pd
from typing import Dict

def load_cds_from_folder(folder_path: str) -> Dict[str, Dict]:
    cds_dict = {}

    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx'):
            # Normalize school key (e.g., "Georgia Tech.xlsx" → "georgia_tech")
            school_key = filename.replace('.xlsx', '').lower().replace(' ', '_')
            
            # Read Excel file
            file_path = os.path.join(folder_path, filename)
            df = pd.read_excel(file_path, header=None)
            keys = df.iloc[0].values
            values = df.iloc[1].values
            school_dict = dict(zip(keys, values))
            
            cds_dict[school_key] = school_dict

    return cds_dict

In [64]:
# Data not ready yet – using only the Georgia Tech Excel file from the test folder for now.
cds_dict = load_cds_from_folder("output_cds_test")
cds_dict

{'georgia_institute_of_technology': {'school_name': 'georgia institute of technology',
  'high_school_completion_required': True,
  'general_college_preparatory_required': True,
  'rigor_of_secondary_school_record': 'very important',
  'class_rank': 'not considered',
  'recommendations': 'considered',
  'standardized_test_scores': 'considered',
  'application_essay': 'important',
  'interview': 'not considered',
  'extracurricular_activities': 'important',
  'talent_ability': 'considered',
  'character_personal_qualities': 'very important',
  'geographical_residence': 'considered',
  'religious_affiliation_commitment': 'not considered',
  'work_experience': 'considered',
  'level_of_applicant_interest': 'not considered',
  'alumni_importance': 'not considered',
  'first_generation_importance': 'considered',
  'residency_importance': 'very important',
  'sat_act_importance': 'considered',
  'gpa_importance': 'very important',
  'volunteer_work_importance': 'considered',
  'state.State':

##### Importance Related

In [32]:
# config.py

# Keys to extract from CDS files for importance evaluation
importance_keys = [
    "gpa_importance",
    "sat_act_importance",
    "residency_importance",
    "alumni_importance",
    "volunteer_work_importance"
]

# Weight mapping for importance levels
weight_map = {
    "very important": 1.0,
    "important": 0.9,
    "considered": 0.8,
    "not considered": 0.0
}

# List of allowed importance levels
importance_levels = list(weight_map.keys())

# Mapping of scoring categories to CDS importance keys
category_to_importance_key = {
    'gpa': 'gpa_importance',
    'standardized_test': 'sat_act_importance',
    'residency': 'residency_importance',
    'alumni': 'alumni_importance',
    'volunteer_work': 'volunteer_work_importance'
}

# Mapping school name to school id
school_list = [
  { "id": 1, "name": "Harvard University"},
  { "id": 2, "name": "Stanford University"},
  { "id": 3, "name": "Massachusetts Institute of Technology (MIT)"},
  { "id": 4, "name": "California Institute of Technology (Caltech)"},
  { "id": 5, "name": "University of Chicago"},
  { "id": 6, "name": "Princeton University"},
  { "id": 7, "name": "Yale University"},
  { "id": 8, "name": "Columbia University"},
  { "id": 9, "name": "University of Pennsylvania"},
  { "id": 10, "name": "Johns Hopkins University"},
  { "id": 11, "name": "Northwestern University"},
  { "id": 12, "name": "Duke University"},
  { "id": 13, "name": "Dartmouth College"},
  { "id": 14, "name": "Brown University"},
  { "id": 15, "name": "University of California, Berkeley"},
  { "id": 16, "name": "Cornell University"},
  { "id": 17, "name": "Rice University"},
  { "id": 18, "name": "Vanderbilt University"},
  { "id": 19, "name": "University of California, Los Angeles (UCLA)"},
  { "id": 20, "name": "University of Michigan, Ann Arbor"},
  { "id": 21, "name": "University of Southern California (USC)"},
  { "id": 22, "name": "Carnegie Mellon University"},
  { "id": 23, "name": "University of Virginia"},
  { "id": 24, "name": "New York University"},
  { "id": 25, "name": "University of North Carolina at Chapel Hill"},
  { "id": 26, "name": "Wake Forest University"},
  { "id": 27, "name": "University of California, San Diego"},
  { "id": 28, "name": "Tufts University"},
  { "id": 29, "name": "University of Rochester"},
  { "id": 30, "name": "Boston College"},
  { "id": 31, "name": "Georgia Institute of Technology"},
  { "id": 32, "name": "University of California, Davis"},
  { "id": 33, "name": "Brandeis University"},
  { "id": 34, "name": "University of Wisconsin-Madison"},
  { "id": 35, "name": "Case Western Reserve University"},
  { "id": 36, "name": "University of Texas at Austin"},
  { "id": 37, "name": "University of Florida"},
  { "id": 38, "name": "Northeastern University"},
  { "id": 39, "name": "University of Miami"},
  { "id": 40, "name": "Ohio State University"},
  { "id": 41, "name": "University of Maryland, College Park"},
  { "id": 42, "name": "Pepperdine University"},
  { "id": 43, "name": "University of Pittsburgh"},
  { "id": 44, "name": "Purdue University"},
  { "id": 45, "name": "University of Georgia"},
  { "id": 46, "name": "University of Minnesota Twin Cities"},
  { "id": 47, "name": "Baylor University"},
  { "id": 48, "name": "Texas A&M University"},
  { "id": 49, "name": "Rutgers University"},
  { "id": 50, "name": "University of Connecticut"},
  { "id": 51, "name": "University of Delaware"},
  { "id": 52, "name": "University of Massachusetts Amherst"},
  { "id": 53, "name": "University of Denver"},
  { "id": 54, "name": "Indiana University Bloomington"},
  { "id": 55, "name": "University of Colorado Boulder"},
  { "id": 56, "name": "Florida State University"},
  { "id": 57, "name": "Michigan State University"},
  { "id": 58, "name": "University of Iowa"},
  { "id": 59, "name": "University of Oregon"},
  { "id": 60, "name": "University of Kansas"},
  { "id": 61, "name": "Clemson University"},
  { "id": 62, "name": "University of Arizona"},
  { "id": 63, "name": "Iowa State University"},
  { "id": 64, "name": "University of Alabama"},
  { "id": 65, "name": "Oregon State University"},
  { "id": 66, "name": "Colorado State University"},
  { "id": 67, "name": "Temple University"},
  { "id": 68, "name": "University of Missouri"},
  { "id": 69, "name": "Kansas State University"},
  { "id": 70, "name": "University of Oklahoma"},
  { "id": 71, "name": "University of Arkansas"},
  { "id": 72, "name": "Louisiana State University"},
  { "id": 73, "name": "University of New Mexico"},
  { "id": 74, "name": "University of Mississippi"},
  { "id": 75, "name": "West Virginia University"},
  { "id": 76, "name": "University of Kentucky"},
  { "id": 77, "name": "University of Nebraska"},
  { "id": 78, "name": "University of Vermont"},
  { "id": 79, "name": "University of Tennessee"},
  { "id": 80, "name": "University of South Carolina"},
  { "id": 81, "name": "University of Idaho"},
  { "id": 82, "name": "University of North Dakota"},
  { "id": 83, "name": "University of Maine"},
  { "id": 84, "name": "University of Wyoming"},
  { "id": 85, "name": "University of Alaska Fairbanks"},
  { "id": 86, "name": "Montana State University"},
  { "id": 87, "name": "University of Montana"},
  { "id": 88, "name": "University of Louisiana at Lafayette"},
  { "id": 89, "name": "New Mexico State University"},
  { "id": 90, "name": "University of Central Florida"},
  { "id": 91, "name": "Auburn University"},
  { "id": 92, "name": "University of Nevada, Reno"},
  { "id": 93, "name": "University of Nevada, Las Vegas"},
  { "id": 94, "name": "University of Memphis"},
  { "id": 95, "name": "University of Houston"},
  { "id": 96, "name": "University of Toledo"},
  { "id": 97, "name": "University of Akron"},
  { "id": 98, "name": "University of Cincinnati"},
  { "id": 99, "name": "University of Buffalo (SUNY)"},
  { "id": 100, "name": "University of Illinois Chicago"}
]

In [33]:
from collections import Counter
from typing import Dict, List

def compute_average_importance_weights(cds_dict: Dict[str, Dict]) -> Dict[str, float]:
    
    # Count importance levels across schools
    importance_counts = {key: Counter() for key in importance_keys}
    for school_data in cds_dict.values():
        for key in importance_keys:
            value = school_data.get(key, "Not Considered")  # Default fallback
            if value in importance_levels:
                importance_counts[key][value] += 1

    # Calculate weighted average for each category
    average_weights = {}
    for key in importance_keys:
        total = sum(importance_counts[key].values())
        if total == 0:
            average_weights[key] = 0
            continue
        weighted_sum = sum(weight_map[level] * count for level, count in importance_counts[key].items())
        average_weights[key] = round(weighted_sum / total, 4)

    return average_weights

In [65]:
average_weights = compute_average_importance_weights(cds_dict)
average_weights

{'gpa_importance': 1.0,
 'sat_act_importance': 0.8,
 'residency_importance': 0.9,
 'alumni_importance': 0.0,
 'volunteer_work_importance': 0.8}

In [66]:
# 3. Convert average weights into a 100-point scale
def normalize_max_points_from_avg_weights(average_weights: Dict[str, float], total_points=100) -> Dict[str, float]:
    total_weight = sum(average_weights.values())
    return {
        key: round((w / total_weight) * total_points, 2)
        for key, w in average_weights.items()
    }

In [67]:
max_points_by_cat = normalize_max_points_from_avg_weights(average_weights)
max_points_by_cat

{'gpa_importance': 28.57,
 'sat_act_importance': 22.86,
 'residency_importance': 25.71,
 'alumni_importance': 0.0,
 'volunteer_work_importance': 22.86}

----------------

##### Total Score

In [68]:
# 4. Calculate school-specific total maximum score based on weight map
def calculate_total_max_score(school_info: Dict, max_points_by_cat: Dict[str, float]) -> float:
    total = 0
    for category, max_point in max_points_by_cat.items():
        if category in school_info:
            importance = school_info[category]
            weight = weight_map.get(importance, 0)
            total += max_point * weight
    return total

In [69]:
calculate_total_max_score(cds_dict['georgia_institute_of_technology'], max_points_by_cat)

90.85600000000001

In [71]:
calculate_total_max_score(cds_dict['university_of_minnesota_–_twin_cities_campus'], max_points_by_cat)

85.714

------------

##### Compatibility Score

In [72]:
def get_school_id_by_name(school_name: str) -> int:
    """
    Match lowercase school name to ID from school_list.
    """
    for school in school_list:
        if school["name"].lower() == school_name.lower():
            return school["id"]
    return -1  # fallback if not found

In [73]:
def unflatten_dict(flat_dict: Dict[str, any]) -> Dict:
    nested = {}
    for compound_key, value in flat_dict.items():
        parts = compound_key.split(".")
        d = nested
        for part in parts[:-1]:
            if part not in d:
                d[part] = {}
            d = d[part]
        d[parts[-1]] = value
    return nested

In [74]:
cds_dict['georgia_institute_of_technology']

{'school_name': 'georgia institute of technology',
 'high_school_completion_required': True,
 'general_college_preparatory_required': True,
 'rigor_of_secondary_school_record': 'very important',
 'class_rank': 'not considered',
 'recommendations': 'considered',
 'standardized_test_scores': 'considered',
 'application_essay': 'important',
 'interview': 'not considered',
 'extracurricular_activities': 'important',
 'talent_ability': 'considered',
 'character_personal_qualities': 'very important',
 'geographical_residence': 'considered',
 'religious_affiliation_commitment': 'not considered',
 'work_experience': 'considered',
 'level_of_applicant_interest': 'not considered',
 'alumni_importance': 'not considered',
 'first_generation_importance': 'considered',
 'residency_importance': 'very important',
 'sat_act_importance': 'considered',
 'gpa_importance': 'very important',
 'volunteer_work_importance': 'considered',
 'state.State': 'GA',
 'general_college_subjects.english': 4,
 'general_c

In [75]:
unflatten_dict(cds_dict['georgia_institute_of_technology'])

{'school_name': 'georgia institute of technology',
 'high_school_completion_required': True,
 'general_college_preparatory_required': True,
 'rigor_of_secondary_school_record': 'very important',
 'class_rank': 'not considered',
 'recommendations': 'considered',
 'standardized_test_scores': 'considered',
 'application_essay': 'important',
 'interview': 'not considered',
 'extracurricular_activities': 'important',
 'talent_ability': 'considered',
 'character_personal_qualities': 'very important',
 'geographical_residence': 'considered',
 'religious_affiliation_commitment': 'not considered',
 'work_experience': 'considered',
 'level_of_applicant_interest': 'not considered',
 'alumni_importance': 'not considered',
 'first_generation_importance': 'considered',
 'residency_importance': 'very important',
 'sat_act_importance': 'considered',
 'gpa_importance': 'very important',
 'volunteer_work_importance': 'considered',
 'state': {'State': 'GA'},
 'general_college_subjects': {'english': 4,
  

In [76]:
# Main function to calculate applicant compatibility score with a school
def calculate_compatibility_score(applicant: Dict, school_cds: Dict, max_points_by_cat: Dict[str, float]) -> Dict:
    
    school_cds = unflatten_dict(school_cds)
    
    # 1. Check essential requirments
    if school_cds["high_school_completion_required"]:
        if not applicant["high_school_completion"]:
            return {"score": 0, "details": {"reason": "High school completion requirement not met"}}
    
    if school_cds["general_college_preparatory_required"]:
        required_subjects = school_cds.get("general_college_subjects", {})
        applicant_subjects = applicant.get("general_college_requirement", {})
        
        for subject, required_count in required_subjects.items():
            applicant_count = applicant_subjects.get(subject, 0)
            if applicant_count < required_count:
                return {"score": 0, "details": {"reason": "Subject requirement not met"}}
    
    # 2. Check standardized test requirement
    sat_required = school_cds["sat_act_required"].get("sat only", False)
    act_required = school_cds["sat_act_required"].get("act only", False)
    either_required = school_cds["sat_act_required"].get("sat_or_act", False)

    sat = applicant.get("sat", 0)
    act = applicant.get("act", 0)

    if sat_required and sat == 0:
        return {"score": 0, "details": {"reason": "SAT score required"}}
    if act_required and act == 0:
        return {"score": 0, "details": {"reason": "ACT score required"}}
    if either_required and sat == 0 and act == 0:
        return {"score": 0, "details": {"reason": "SAT or ACT score required"}}

    ### ------------
    total_score = 0
    details = {}

    # 3. Score for alumni relation
    alumni_max = max_points_by_cat['alumni_importance']
    school_name = school_cds['school_name'].lower()
    alumni_score = 0
    if applicant['alumni']:
        alumni_schools = [name.lower() for name in applicant.get("alumni_school_names", [])]
        if school_name in alumni_schools:
            alumni_score = alumni_max
    alumni_weighted = alumni_score * weight_map[school_cds['alumni_importance']]
    total_score += alumni_weighted
    details["alumni"] = round(alumni_weighted, 2)

    # 4. Score for first-generation status
    #first_max = max_points_by_cat['first_generation']
    #first_score = first_max if applicant['first'] else 0
    #first_weighted = first_score * weight_map[school_cds['first_generation_importance']]
    #total_score += first_weighted
    #details["first_generation"] = round(first_weighted, 2)

    # 5. Score based on residency acceptance rate
    residency_max = max_points_by_cat['residency_importance']
    if applicant['residency'] == 'international':
        residency_category = 'international'
    else:
        if applicant['state'] == school_cds['state']:
            residency_category = 'in-state'
        else:
            residency_category = 'out-of-state'
    residency_score = school_cds['residency_acceptance'][residency_category] * residency_max
    residency_weighted = residency_score * weight_map[school_cds['residency_importance']]
    total_score += residency_weighted
    details["residency"] = round(residency_weighted, 2)

    # 6. Score based on SAT range position
    sat = applicant['sat']
    act = applicant['act']
    sat_act_max = max_points_by_cat['sat_act_importance']
    
    if sat > 0:
        sat_q25 = school_cds['sat_scores']['25th']
        sat_q75 = school_cds['sat_scores']['75th']

        if sat < sat_q25:
            sat_score = 0
        elif sat > sat_q75:
            sat_score = sat_act_max
        else:
            sat_score = ((sat - sat_q25) / (sat_q75 - sat_q25)) * sat_act_max
            sat_score = min(sat_act_max, max(0, sat_score))
        sat_weighted = sat_score * weight_map[school_cds['sat_act_importance']]
        total_score += sat_weighted
        details["standardized_test"] = round(sat_weighted, 2)
    
    elif act > 0:
        act_q25 = school_cds['act scores']['25th']
        act_q75 = school_cds['act scores']['75th']
        if act < act_q25:
            act_score = 0
        elif act > act_q75:
            act_score = sat_act_max
        else:
            act_score = ((act - act_q25) / (act_q75 - act_q25)) * sat_act_max
            act_score = min(sat_act_max, max(0, act_score))
        act_weighted = act_score * weight_map[school_cds['sat_act_importance']]
        total_score += act_weighted
        details["standardized_test"] = round(act_weighted, 2)

    # 7. Score based on GPA
    gpa_max = max_points_by_cat['gpa_importance']
    gpa_score = (applicant['gpa'] / 4.0) * gpa_max
    gpa_weighted = gpa_score * weight_map[school_cds['gpa_importance']]
    total_score += gpa_weighted
    details["GPA"] = round(gpa_weighted, 2)
    
    # 8. Score for volunteer work
    volunteer_max = max_points_by_cat['volunteer_work_importance']
    volunteer = applicant['volunteering_hours']
    if volunteer < 50:
        volunteer_score = 0
    elif volunteer >= 200:
        volunteer_score = volunteer_max
    else:
        volunteer_score = volunteer / 200 * volunteer_max
    details["volunteer_work"] = round(volunteer_score, 2)
    
    # 9. Score for work experience
    
    # 10. Calculate school-specific total max score
    total_max_score = calculate_total_max_score(school_cds, max_points_by_cat)

    # 11. Normalize total_score by total_max_score
    if total_max_score > 0:
        normalized_score = (total_score / total_max_score) * 100
    else:
        normalized_score = 0
    
    return {
        "applicant": applicant["userId"],
        "school_id": get_school_id_by_name(school_cds["school_name"]),
        "score": round(normalized_score, 2), 
        "details": details
    }

In [77]:
calculate_compatibility_score(applicant1, cds_dict["georgia_institute_of_technology"], max_points_by_cat)

{'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a744',
 'school_id': 31,
 'score': 50.91,
 'details': {'alumni': 0.0,
  'residency': 2.68,
  'standardized_test': 17.14,
  'GPA': 26.43,
  'volunteer_work': 17.14}}

In [78]:
calculate_compatibility_score(applicant2, cds_dict["georgia_institute_of_technology"], max_points_by_cat)

{'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a743',
 'school_id': 31,
 'score': 50.28,
 'details': {'alumni': 0.0,
  'residency': 2.11,
  'standardized_test': 17.14,
  'GPA': 26.43,
  'volunteer_work': 17.14}}

In [79]:
calculate_compatibility_score(applicant3, cds_dict["georgia_institute_of_technology"], max_points_by_cat)

{'score': 0,
 'details': {'reason': 'High school completion requirement not met'}}

In [80]:
calculate_compatibility_score(applicant4, cds_dict["georgia_institute_of_technology"], max_points_by_cat)

{'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a741',
 'school_id': 31,
 'score': 24.33,
 'details': {'alumni': 0.0,
  'residency': 2.11,
  'standardized_test': 0.0,
  'GPA': 20.0,
  'volunteer_work': 17.14}}

In [81]:
calculate_compatibility_score(applicant2, cds_dict["university_of_minnesota_–_twin_cities_campus"], max_points_by_cat)

{'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a743',
 'school_id': -1,
 'score': 70.93,
 'details': {'alumni': 0.0,
  'residency': 16.08,
  'standardized_test': 18.29,
  'GPA': 26.43,
  'volunteer_work': 17.14}}

In [82]:
calculate_compatibility_score(applicant4, cds_dict["university_of_minnesota_–_twin_cities_campus"], max_points_by_cat)

{'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a741',
 'school_id': -1,
 'score': 42.09,
 'details': {'alumni': 0.0,
  'residency': 16.08,
  'standardized_test': 0.0,
  'GPA': 20.0,
  'volunteer_work': 17.14}}

------------------

### Final Structure

In [92]:
def generate_school_info(applicant: dict, school: str, folder: str, max_points: dict) -> dict:
    
    cds = load_cds_from_folder(folder)
    matching = calculate_compatibility_score(applicant, cds[school], max_points)

    return {
        "id": matching["school_id"],
        "name": cds[school].get('name', None),
        "rankings": {
            "niche": {
                "total_rank": 0,
                "majors_rank": [
                    {
                        "field": "",
                        "rank": 0,
                        "total": 0
                    },
                    {
                        "field": "",
                        "rank": 0,
                        "total": 0
                    }
                ]
            },
            "us": 0,
            "qs": 0
        },
        "location": {
            "state": cds[school].get('state.State', None),
            "coordinates": {
                "latitude": 0,
                "longitude": 0
            }
        },
        "essays": {
            "common_app": "",
            "supplementary": {
                "0": "",
                "1": "",
            }
        },
        "importance_table": {
            "Rigor of Secondary School Record": cds[school].get('rigor_of_secondary_school_record', None),
            "GPA": cds[school].get('gpa_importance', None),
            "Standardized Test Scores": cds[school].get('sat_act_importance', None),
            "Essays": cds[school].get('application_essay', None),
            "Interview": cds[school].get('interview', None),
            "Extracurricular": cds[school].get('extracurricular_activities', None),
            "Talent/Ability": cds[school].get('talent_ability', None),
            "Character/Personal Qualities": cds[school].get('character_personal_qualities', None),
            "First Generation": cds[school].get('first_generation_importance', None),
            "Alumni Relations": cds[school].get('alumni_importance', None),
            "Geographical Residence": cds[school].get('geographical_residence', None),
            "State Residency": cds[school].get('residency_importance', None)
        },
        "matching_score": {
            "applicant": matching["applicant"],
            "school_id": matching["school_id"],
            "score": round(matching["score"], 2),
            "details": matching["details"],
            "max_score": {
                "alumni": round(max_points_by_cat.get("alumni_importance", 0.0), 2),
                "residency": round(max_points_by_cat.get("residency_importance", 0.0), 2),
                "standardized_test": round(max_points_by_cat.get("sat_act_importance", 0.0), 2),
                "GPA": round(max_points_by_cat.get("gpa_importance", 0.0), 2),
                "volunteer_work": round(max_points_by_cat.get("volunteer_work_importance", 0.0), 2)
            }
        }
    }


In [93]:
generate_school_info(applicant1, 'georgia_institute_of_technology', 'output_cds_test', max_points_by_cat)

{'id': 31,
 'name': None,
 'rankings': {'niche': {'total_rank': 0,
   'majors_rank': [{'field': '', 'rank': 0, 'total': 0},
    {'field': '', 'rank': 0, 'total': 0}]},
  'us': 0,
  'qs': 0},
 'location': {'state': 'GA', 'coordinates': {'latitude': 0, 'longitude': 0}},
 'essays': {'common_app': '', 'supplementary': {'0': '', '1': ''}},
 'importance_table': {'Rigor of Secondary School Record': 'very important',
  'GPA': 'very important',
  'Standardized Test Scores': 'considered',
  'Essays': 'important',
  'Interview': 'not considered',
  'Extracurricular': 'important',
  'Talent/Ability': 'considered',
  'Character/Personal Qualities': 'very important',
  'First Generation': 'considered',
  'Alumni Relations': 'not considered',
  'Geographical Residence': 'considered',
  'State Residency': 'very important'},
 'matching_score': {'applicant': '4b34dfc4-fb21-4a9b-886a-126e1428a744',
  'school_id': 31,
  'score': 50.91,
  'details': {'alumni': 0.0,
   'residency': 2.68,
   'standardized_te