### Applicants Sample

In [174]:
applicant1 = {
    "high_school_completion": 1,
    "general_college_requirement": 1,
    "alumni": 0,
    "first": 0,
    "residency": "international",
    "sat": 1520,
    "gpa": 3.7,
    "volunteering_hours": 150,
    "work_months": 0
}

In [175]:
applicant2 = {
    "high_school_completion": 1,
    "general_college_requirement": 1,
    "alumni": 0,
    "first": 0,
    "residency": "international",
    "sat": 1480,
    "gpa": 3.9,
    "volunteering_hours": 150,
    "work_months": 3
}

In [176]:
applicant3 = {
    "high_school_completion": 1,
    "general_college_requirement": 1,
    "alumni": 0,
    "first": 0,
    "residency": "international",
    "sat": 1300,
    "gpa": 2.8,
    "volunteering_hours": 150,
    "work_months": 0
}

### Crawling

In [177]:
import os
import pdfplumber
import re

In [178]:
input_folder = 'pdf'

gatech_file_path = os.path.join(input_folder, 'gatech.pdf')
umn_file_path = os.path.join(input_folder, 'umn.pdf')

In [179]:
# Remove None and empty string from row
def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [180]:
# Extract only numbers from the cell
def clean_number(cell):
    cell = re.sub(r'[^\d]', '', str(cell)) # Remove all non-numeric characters
    return int(cell) if cell else 0

In [181]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0},
        'admitted': {'in-state': 0, 'out-of-state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in-state': -1, 'out-of-state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in-state'] = idx
                    elif 'out-' in col:
                        col_idx['out-of-state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in-state'] != -1 and col_idx['in-state'] < len(row):
                        result[target]['in-state'] += clean_number(row[col_idx['in-state']])
                    if col_idx['out-of-state'] != -1 and col_idx['out-of-state'] < len(row):
                        result[target]['out-of-state'] += clean_number(row[col_idx['out-of-state']])
                    if col_idx['international'] != -1 and col_idx['international'] < len(row):
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1 and col_idx['others'] < len(row):
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [182]:
gatech_residency = extract_residency_data(gatech_file_path)
gatech_residency

{'applicants': {'in-state': 10674,
  'out-of-state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in-state': 3536,
  'out-of-state': 3992,
  'international': 885,
  'others': 0}}

In [183]:
umn_residency = extract_residency_data(umn_file_path)
umn_residency

{'applicants': {'in-state': 13982,
  'out-of-state': 19461,
  'international': 1256,
  'others': 0},
 'admitted': {'in-state': 10498,
  'out-of-state': 15400,
  'international': 982,
  'others': 0}}

### Preprocessing

In [184]:
def calculate_acceptance_rates(data):
    applicants = data['applicants']
    admitted = data['admitted']
    rates = {}
    for category in applicants:
        applied = applicants[category]
        accepted = admitted.get(category, 0)
        if applied > 0:
            rates[category] = accepted / applied
        else:
            rates[category] = None 
    return rates

In [185]:
calculate_acceptance_rates(gatech_residency)

{'in-state': 0.3312722503278996,
 'out-of-state': 0.10417536534446764,
 'international': 0.08198239925891616,
 'others': None}

In [186]:
calculate_acceptance_rates(umn_residency)

{'in-state': 0.7508224860534973,
 'out-of-state': 0.7913262422280458,
 'international': 0.7818471337579618,
 'others': None}

### Modeling

In [187]:
gatech_cds = {
    "high_school_completion_required": True,
    "general_college_preparatory_required": True,
    "sat_act_required": True,
    "alumni_importance": "Not Considered",
    "first_generation_importance": "Considered",
    "residency_importance": "Very Important",
    "sat_importance": "Considered",
    "gpa_importance": "Very Important",
    "residency_acceptance": {
        "international": 0.082,
        "in-state": 0.3313,
        "out-of-state": 0.1042
    },
    "sat_scores": {
        "25th": 1400,
        "50th": 1500,
        "75th": 1560
    },
    "volunteer_work": "Considered",
    "work_experience": "Not Considered"
}

In [188]:
umn_cds = {
    "high_school_completion_required": True,
    "general_college_preparatory_required": True,
    "sat_act_required": True,
    "alumni_importance": "Not Considered",
    "first_generation_importance": "Considered",
    "residency_importance": "Considered",
    "sat_importance": "Considered",
    "gpa_importance": "Very Important",
    "residency_acceptance": {
        "international": 0.7818,
        "in-state": 0.7508,
        "out-of-state": 0.7913
    },
    "sat_scores": {
        "25th": 1328,
        "50th": 1400,
        "75th": 1460
    },
    "volunteer_work": "Considered",
    "work_experience": "Considered"
}

----------------

In [189]:
import pandas as pd
from collections import Counter

# Example list of CDS importance data for multiple schools (gatech, umn, nyu, uga)
cds_importance_data = [
    {
        "gpa_importance": "Very Important",
        "sat_importance": "Considered",
        "residency_importance": "Very Important",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered" 
    },
    {
        "gpa_importance": "Very Important",
        "sat_importance": "Considered",
        "residency_importance": "Considered",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    },
    {
        "gpa_importance": "Very Important",
        "sat_importance": "Important",
        "residency_importance": "Not Considered",
        "alumni_importance": "Not Considered",
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    },
        {
        "gpa_importance": "Very Important",
        "sat_importance": "Important",
        "residency_importance": "Not Considered",
        "alumni_importance": "Considered", # mock data
        "volunteer_work": "Considered",
        #"work_experience": "Considered"
    }
]

# 1. Count importance levels per category
importance_levels = ["Very Important", "Important", "Considered", "Not Considered"]
categories = ["gpa_importance", "sat_importance", "residency_importance", "alumni_importance", "volunteer_work"]
importance_counts = {cat: Counter() for cat in categories}

for record in cds_importance_data:
    for cat in categories:
        importance_counts[cat][record[cat]] += 1

# 2. Calculate average weights
weight_map = {
    "Very Important": 1.0,
    "Important": 0.9,
    "Considered": 0.8,
    "Not Considered": 0.0
}

average_weights = {}
for cat in categories:
    total = sum(importance_counts[cat].values())
    if total == 0:
        average_weights[cat] = 0
        continue
    weighted_sum = sum(weight_map[level] * count for level, count in importance_counts[cat].items())
    average_weights[cat] = round(weighted_sum / total, 4)

average_weights


{'gpa_importance': 1.0,
 'sat_importance': 0.85,
 'residency_importance': 0.45,
 'alumni_importance': 0.2,
 'volunteer_work': 0.8}

In [190]:
# Convert average weights into max point allocation (total = 100 points)
def normalize_max_points_from_avg_weights(average_weights, total_points=100):
    total_weight = sum(average_weights.values())
    max_points_by_cat = {
        key.replace('_importance', ''): round((w / total_weight) * total_points, 2)
        for key, w in average_weights.items()
    }
    return max_points_by_cat

In [191]:
max_points_by_cat = normalize_max_points_from_avg_weights(average_weights)
max_points_by_cat

{'gpa': 30.3,
 'sat': 25.76,
 'residency': 13.64,
 'alumni': 6.06,
 'volunteer_work': 24.24}

----------------

In [192]:
def calculate_total_max_score(school_info, max_points_by_cat, weight_map):
    total = 0
    category_to_importance_key = {
        'gpa': 'gpa_importance',
        'sat': 'sat_importance',
        'residency': 'residency_importance',
        'alumni': 'alumni_importance',
        'volunteer_work': 'volunteer_work',
        'work_experience': 'work_experience'
    }
    
    for category, max_point in max_points_by_cat.items():
        importance_key = category_to_importance_key.get(category)
        if importance_key and importance_key in school_info:
            importance = school_info[importance_key]
            weight = weight_map.get(importance, 0)
            total += max_point * weight
        else:
            continue
    return total

In [193]:
weight_map = {
    "Very Important": 1.0,
    "Important": 0.9,
    "Considered": 0.8,
    "Not Considered": 0.0
}

In [194]:
calculate_total_max_score(gatech_cds, max_points_by_cat, weight_map)

83.94

In [195]:
calculate_total_max_score(umn_cds, max_points_by_cat, weight_map)

81.212

------------

In [196]:
def calculate_compatibility_score(applicant, school_requirements, max_points_by_cat):
    
    # 1. Check essential requirments
    if not (school_requirements['high_school_completion_required'] and
            school_requirements['general_college_preparatory_required'] and
            school_requirements['sat_act_required']):
        return {"score": 0, "details": {"reason": "Essential requirement not met"}}

    # 2. Define importance weight mapping
    weight_map = {
        "Very Important": 1.0,
        "Important": 0.9,
        "Considered": 0.8,
        "Not Considered": 0.0
    }

    total_score = 0
    details = {}

    # 3. Score for alumni relation
    alumni_max = max_points_by_cat['alumni']
    alumni_score = alumni_max if applicant['alumni'] else 0
    alumni_weighted = alumni_score * weight_map[school_requirements['alumni_importance']]
    total_score += alumni_weighted
    details["alumni"] = round(alumni_weighted, 2)

    # 4. Score for first-generation status
    #first_max = max_points_by_cat['first_generation']
    #first_score = first_max if applicant['first'] else 0
    #first_weighted = first_score * weight_map[school_requirements['first_generation_importance']]
    #total_score += first_weighted
    #details["first_generation"] = round(first_weighted, 2)

    # 5. Score based on residency acceptance rate
    residency_max = max_points_by_cat['residency']
    residency = applicant['residency']
    residency_score = school_requirements['residency_acceptance'][residency] * residency_max
    residency_weighted = residency_score * weight_map[school_requirements['residency_importance']]
    total_score += residency_weighted
    details["residency"] = round(residency_weighted, 2)
    

    # 6. Score based on SAT range position
    sat_max = max_points_by_cat['sat']
    sat = applicant['sat']
    sat_q25 = school_requirements['sat_scores']['25th']
    sat_q50 = school_requirements['sat_scores']['50th']
    sat_q75 = school_requirements['sat_scores']['75th']

    if sat < sat_q25:
        sat_score = 0
    elif sat > sat_q75:
        sat_score = sat_max
    else:
        sat_score = ((sat - sat_q25) / (sat_q75 - sat_q25)) * sat_max
        sat_score = min(sat_max, max(0, sat_score))
    sat_weighted = sat_score * weight_map[school_requirements['sat_importance']]
    total_score += sat_weighted
    details["sat"] = round(sat_weighted, 2)

    # 7. Score based on GPA
    gpa_max = max_points_by_cat['gpa']
    gpa_score = (applicant['gpa'] / 4.0) * gpa_max
    gpa_weighted = gpa_score * weight_map[school_requirements['gpa_importance']]
    total_score += gpa_weighted
    details["gpa"] = round(gpa_weighted, 2)
    
    # 8. Score for volunteer work
    volunteer_max = max_points_by_cat['volunteer_work']
    volunteer = applicant['volunteering_hours']
    if volunteer < 50:
        volunteer_score = 0
    elif volunteer >= 200:
        volunteer_score = volunteer_max
    else:
        volunteer_score = volunteer / 200 * volunteer_max
    
    # 9. Score for work experience
    #work_max = max_points_by_cat['work_experience']
    #work = applicant['work_months']
    
    # 10. Calculate school-specific total max score
    total_max_score = calculate_total_max_score(school_requirements, max_points_by_cat, weight_map)

    # 11. Normalize total_score by total_max_score
    if total_max_score > 0:
        normalized_score = (total_score / total_max_score) * 100
    else:
        normalized_score = 0
    
    return {
        "score": round(normalized_score, 2), 
        "details": details
    }

In [197]:
calculate_compatibility_score(applicant1, gatech_cds, max_points_by_cat)

{'score': 53.14,
 'details': {'alumni': 0.0, 'residency': 1.12, 'sat': 15.46, 'gpa': 28.03}}

In [198]:
calculate_compatibility_score(applicant1, umn_cds, max_points_by_cat)

{'score': 70.39,
 'details': {'alumni': 0.0, 'residency': 8.53, 'sat': 20.61, 'gpa': 28.03}}

In [199]:
calculate_compatibility_score(applicant3, gatech_cds, max_points_by_cat)

{'score': 26.6,
 'details': {'alumni': 0.0, 'residency': 1.12, 'sat': 0.0, 'gpa': 21.21}}

In [200]:
calculate_compatibility_score(applicant3, umn_cds, max_points_by_cat)

{'score': 36.62,
 'details': {'alumni': 0.0, 'residency': 8.53, 'sat': 0.0, 'gpa': 21.21}}