### Applicants Sample

In [1]:
# Sample applicant 1: international, SAT 1520, GPA 3.7.
# The 1/0 values are presumably yes/no flags -- the scoring functions only
# test `alumni` and `first` for truthiness.
applicant1 = dict(
    high_school_completion=1,
    general_college_requirement=1,
    alumni=0,
    first=0,
    residency="international",
    sat=1520,
    gpa=3.7,
)

In [2]:
# Sample applicant 2: international, SAT 1480, GPA 3.9.
# The 1/0 values are presumably yes/no flags -- the scoring functions only
# test `alumni` and `first` for truthiness.
applicant2 = dict(
    high_school_completion=1,
    general_college_requirement=1,
    alumni=0,
    first=0,
    residency="international",
    sat=1480,
    gpa=3.9,
)

In [3]:
# Sample applicant 3: international, SAT 1300, GPA 2.8.
# The 1/0 values are presumably yes/no flags -- the scoring functions only
# test `alumni` and `first` for truthiness.
applicant3 = dict(
    high_school_completion=1,
    general_college_requirement=1,
    alumni=0,
    first=0,
    residency="international",
    sat=1300,
    gpa=2.8,
)

### Crawling

In [13]:
import os
import pdfplumber
import re

In [14]:
# Folder containing the source PDFs (a relative path).
input_folder = 'pdf'

# Path of the PDF that extract_residency_data() is run on later in the notebook.
gatech_file_path = os.path.join(input_folder, 'gatech.pdf')

In [15]:
# Drop blank cells (None or '') from a table row
def clean_row(row):
    """Return a new list holding the cells of ``row`` that are neither ``None`` nor ``''``.

    Every other value is kept as-is, including ``0`` and whitespace-only
    strings. The relative order of the surviving cells is preserved.
    """
    blanks = (None, '')
    kept = []
    for cell in row:
        if cell in blanks:
            continue
        kept.append(cell)
    return kept

In [16]:
# Keep only the digit characters of a cell and read them as one integer
def clean_number(cell):
    """Convert ``cell`` to an int built from its digit characters.

    The cell is stringified, every digit is collected in order and the digits
    are concatenated, so ``'10,674'`` becomes ``10674``. Signs and decimal
    points are discarded as well (``'-7'`` -> ``7``, ``'12.5%'`` -> ``125``).
    A cell with no digits at all (including ``None``) yields ``0``.
    """
    digits = ''.join(re.findall(r'\d', str(cell)))
    if not digits:
        return 0
    return int(digits)

In [19]:
def extract_residency_data(file_path):
    """Sum applicant and admitted counts per residency group from a PDF's tables.

    Every table on every page is inspected. A table is used only when its
    lower-cased text contains at least one of 'in-state' / 'out-of-state' /
    'international' AND contains both 'first-time' and 'first-year'.

    Within a used table, the first row is treated as the header: a header cell
    containing 'in-', 'out-', 'inter' or 'unk' (checked in that order) marks
    the column for 'in-state', 'out-of-state', 'international' or 'others'
    respectively. Each later row whose text contains 'applied' feeds the
    'applicants' totals; otherwise a row containing 'admitted' feeds the
    'admitted' totals; all other rows are ignored.

    Args:
        file_path: Path handed to ``pdfplumber.open``.

    Returns:
        ``{'applicants': {...}, 'admitted': {...}}`` where each inner dict maps
        'in-state', 'out-of-state', 'international' and 'others' to an int
        total (0 when nothing was found).
    """
    categories = ('in-state', 'out-of-state', 'international', 'others')
    result = {
        'applicants': dict.fromkeys(categories, 0),
        'admitted': dict.fromkeys(categories, 0),
    }

    # Keywords that decide whether a table is relevant at all
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    # (header fragment, category) pairs; the first fragment found in a header cell wins
    header_markers = (
        ('in-', 'in-state'),
        ('out-', 'out-of-state'),
        ('inter', 'international'),
        ('unk', 'others'),
    )
    # (row keyword, result bucket) pairs; 'applied' takes precedence over 'admitted'
    row_markers = (
        ('applied', 'applicants'),
        ('admitted', 'admitted'),
    )

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():

                # Skip tables that are not about first-year residency counts
                table_text = ' '.join(
                    str(cell).lower() for row in table for cell in row if cell
                )
                mentions_residency = any(word in table_text for word in residency_keywords)
                mentions_first_year = all(word in table_text for word in first_keywords)
                if not (mentions_residency and mentions_first_year):
                    continue

                # Locate each residency column in the (blank-stripped) header row
                headings = clean_row([str(cell or '').lower() for cell in table[0]])
                column_of = dict.fromkeys(categories, -1)
                for position, heading in enumerate(headings):
                    for fragment, category in header_markers:
                        if fragment in heading:
                            column_of[category] = position
                            break

                # NOTE(review): header and data rows are blank-stripped independently,
                # so the positions above are only valid if both lose the same cells --
                # confirm against the PDFs being parsed.
                for raw_row in table[1:]:  # skip header
                    cells = clean_row(raw_row)
                    row_text = ' '.join(str(cell or '').lower() for cell in cells)

                    # Decide whether the row reports applicants or admitted students
                    target = next(
                        (bucket for word, bucket in row_markers if word in row_text),
                        None,
                    )
                    if target is None:
                        continue  # Skip if not relevant

                    # Add the count of every residency column this row actually has
                    for category, position in column_of.items():
                        if position != -1 and position < len(cells):
                            result[target][category] += clean_number(cells[position])

    return result


In [23]:
# Parse the gatech PDF into per-residency applicant/admitted counts.
# The bare name on the last line displays the result as the cell output.
gatech_residency = extract_residency_data(gatech_file_path)
gatech_residency

{'applicants': {'in-state': 10674,
  'out-of-state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in-state': 3536,
  'out-of-state': 3992,
  'international': 885,
  'others': 0}}

### Preprocessing

In [25]:
def convert_residency_data_to_stats(residency_data):
    """Turn raw applicant/admitted counts into acceptance-rate statistics.

    Args:
        residency_data: Dict with 'applicants' and 'admitted' entries, each a
            dict of counts that includes the keys 'in-state', 'out-of-state'
            and 'international'.

    Returns:
        Dict with:
            'residency_acceptance': admitted / applicants for each of the three
                residency groups, rounded to 4 places (0.0 when a group has no
                applicants).
            'overall_acceptance_rate': total admitted / total applicants over
                ALL keys present in the count dicts, rounded to 4 places
                (0 when there are no applicants).
            'overall_acceptance_std': root-mean-square distance of the three
                rounded group rates from the unrounded overall rate, rounded
                to 4 places.
    """
    applied = residency_data['applicants']
    accepted = residency_data['admitted']
    groups = ('in-state', 'out-of-state', 'international')

    # Per-group acceptance rate; groups without applicants get 0.0
    residency_acceptance = {
        group: round(accepted[group] / applied[group], 4) if applied[group] > 0 else 0.0
        for group in groups
    }

    # Overall rate uses every key in the count dicts, not just the three groups
    applied_total = sum(applied.values())
    accepted_total = sum(accepted.values())
    if applied_total > 0:
        overall = accepted_total / applied_total
    else:
        overall = 0

    # The spread is centred on the overall (applicant-weighted) rate,
    # not on the plain average of the three group rates.
    group_rates = list(residency_acceptance.values())
    squared_gaps = [(rate - overall) ** 2 for rate in group_rates]
    spread = (sum(squared_gaps) / len(group_rates)) ** 0.5

    return {
        "residency_acceptance": residency_acceptance,
        "overall_acceptance_rate": round(overall, 4),
        "overall_acceptance_std": round(spread, 4)
    }

In [26]:
# Derive per-residency and overall acceptance statistics from the extracted counts (shown as cell output).
convert_residency_data_to_stats(gatech_residency)

{'residency_acceptance': {'in-state': 0.3313,
  'out-of-state': 0.1042,
  'international': 0.082},
 'overall_acceptance_rate': 0.1407,
 'overall_acceptance_std': 0.1171}

### Modeling

In [27]:
# School profile consumed by the calculate_compatibility_score* functions.
# The *_importance labels must be keys of the scorers' weight_map
# ("Very Important", "Important", "Considered", "Not Considered").
gatech_cds = dict(
    # Essential requirements
    high_school_completion_required=True,
    general_college_preparatory_required=True,
    sat_act_required=True,
    # Importance label per scoring category
    alumni_importance="Not Considered",
    first_generation_importance="Considered",
    residency_importance="Very Important",
    sat_importance="Considered",
    gpa_importance="Very Important",
    # Same figures as the convert_residency_data_to_stats(gatech_residency) output
    residency_acceptance={
        "international": 0.082,
        "in-state": 0.3313,
        "out-of-state": 0.1042,
    },
    overall_acceptance_rate=0.1407,
    overall_acceptance_std=0.1171,
    # SAT percentile cut-offs used to place an applicant's score
    sat_scores={
        "25th": 1400,
        "50th": 1500,
        "75th": 1560,
    },
)

In [28]:
def calculate_compatibility_score1(applicant, school_requirements):
    """Score an applicant against a school profile, up to 20 raw points per category.

    Five categories are scored (alumni, first generation, residency, SAT, GPA).
    Each raw score is multiplied by the weight of the school's importance label
    for that category, and the weighted scores are summed.

    Args:
        applicant: Dict with 'alumni', 'first', 'residency' and 'gpa', plus
            'sat' and the flags 'high_school_completion' and
            'general_college_requirement'. A missing flag counts as not
            satisfied. A missing 'sat' earns 0 SAT points.
        school_requirements: Dict with the three '*_required' booleans, the
            five '*_importance' labels, 'residency_acceptance' (rate per
            residency), 'overall_acceptance_rate', 'overall_acceptance_std'
            and 'sat_scores' ('25th' / '50th' / '75th').

    Returns:
        ``{"score": float, "details": {category: weighted points}}``, all
        rounded to 2 places. If the school requires something the applicant
        does not have, ``{"score": 0, "details": {"reason": ...}}``.

    Raises:
        KeyError: if a required key is absent, the applicant's residency is
            not in 'residency_acceptance', or an importance label is unknown.
    """
    # 1. Check essential requirements: a requirement only disqualifies when the
    #    school demands it AND the applicant does not satisfy it.
    essential_fields = (
        ('high_school_completion_required', 'high_school_completion'),
        ('general_college_preparatory_required', 'general_college_requirement'),
    )
    # Only 'sat' is consulted for the test-score requirement; the applicant
    # records in this notebook carry no ACT field.
    sat = applicant.get('sat')
    requirement_unmet = any(
        school_requirements[required] and not applicant.get(field)
        for required, field in essential_fields
    ) or (school_requirements['sat_act_required'] and sat is None)
    if requirement_unmet:
        return {"score": 0, "details": {"reason": "Essential requirement not met"}}

    # 2. Define importance weight mapping
    weight_map = {
        "Very Important": 1.0,
        "Important": 0.9,
        "Considered": 0.8,
        "Not Considered": 0.0
    }

    total_score = 0
    details = {}

    # 3. Score for alumni relation
    alumni_score = 20 if applicant['alumni'] else 0
    alumni_weighted = alumni_score * weight_map[school_requirements['alumni_importance']]
    total_score += alumni_weighted
    details["alumni"] = round(alumni_weighted, 2)

    # 4. Score for first-generation status
    first_score = 20 if applicant['first'] else 0
    first_weighted = first_score * weight_map[school_requirements['first_generation_importance']]
    total_score += first_weighted
    details["first_generation"] = round(first_weighted, 2)

    # 5. Score based on residency acceptance rate: the z-score of the applicant's
    #    residency rate is mapped linearly from [-2, 2] onto [0, 20] and clamped.
    residency = applicant['residency']
    residency_rate = school_requirements['residency_acceptance'][residency]
    overall_rate = school_requirements['overall_acceptance_rate']
    std_dev = school_requirements['overall_acceptance_std']
    if std_dev > 0:
        z = (residency_rate - overall_rate) / std_dev
        residency_score = min(20, max(0, ((z + 2) / 4) * 20))
    else:
        residency_score = 0
    residency_weighted = residency_score * weight_map[school_requirements['residency_importance']]
    total_score += residency_weighted
    details["residency"] = round(residency_weighted, 2)

    # 6. Score based on SAT range position (5 / 10 / 15 / 20 by quartile band)
    sat_q25 = school_requirements['sat_scores']['25th']
    sat_q50 = school_requirements['sat_scores']['50th']
    sat_q75 = school_requirements['sat_scores']['75th']

    if sat is None:
        # Only reachable when the school does not require a test score
        sat_score = 0
    elif sat < sat_q25:
        sat_score = 5
    elif sat < sat_q50:
        sat_score = 10
    elif sat < sat_q75:
        sat_score = 15
    else:
        sat_score = 20
    sat_weighted = sat_score * weight_map[school_requirements['sat_importance']]
    total_score += sat_weighted
    details["sat"] = round(sat_weighted, 2)

    # 7. Score based on GPA: 5 points per GPA point, clamped to the same
    #    0-20 range as every other category (a GPA above 4.0 must not exceed it).
    gpa_score = min(20, max(0, applicant['gpa'] * 5))
    gpa_weighted = gpa_score * weight_map[school_requirements['gpa_importance']]
    total_score += gpa_weighted
    details["gpa"] = round(gpa_weighted, 2)

    return {
        "score": round(total_score, 2),
        "details": details
    }


In [29]:
# Score applicant1 with the fixed 20-points-per-category scheme (shown as cell output).
calculate_compatibility_score1(applicant1, gatech_cds)

{'score': 37.99,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 7.49,
  'sat': 12.0,
  'gpa': 18.5}}

In [30]:
# Score applicant2 with the fixed 20-points-per-category scheme (shown as cell output).
calculate_compatibility_score1(applicant2, gatech_cds)

{'score': 34.99,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 7.49,
  'sat': 8.0,
  'gpa': 19.5}}

In [31]:
# Score applicant3 with the fixed 20-points-per-category scheme (shown as cell output).
calculate_compatibility_score1(applicant3, gatech_cds)

{'score': 25.49,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 7.49,
  'sat': 4.0,
  'gpa': 14.0}}

----------------

In [32]:
import pandas as pd
from collections import Counter

# Mock CDS importance ratings, one dict per school (illustrative data only)
cds_importance_data = [
    {
        "gpa_importance": "Very Important",
        "sat_importance": "Important",
        "residency_importance": "Considered",
        "first_generation_importance": "Considered",
        "alumni_importance": "Not Considered"
    },
    {
        "gpa_importance": "Very Important",
        "sat_importance": "Very Important",
        "residency_importance": "Important",
        "first_generation_importance": "Considered",
        "alumni_importance": "Considered"
    },
    {
        "gpa_importance": "Important",
        "sat_importance": "Very Important",
        "residency_importance": "Considered",
        "first_generation_importance": "Important",
        "alumni_importance": "Not Considered"
    }
]

# 1. Tally how often each importance label occurs, per category
importance_levels = ["Very Important", "Important", "Considered", "Not Considered"]
categories = ["gpa_importance", "sat_importance", "residency_importance", "first_generation_importance", "alumni_importance"]
importance_counts = {
    cat: Counter(record[cat] for record in cds_importance_data)
    for cat in categories
}

# 2. Average the numeric weight of the labels, per category
#    (weights listed in the same order as importance_levels)
weight_map = dict(zip(importance_levels, (1.0, 0.9, 0.8, 0.0)))

average_weights = {}
for cat, level_counts in importance_counts.items():
    n_ratings = sum(level_counts.values())
    if n_ratings == 0:
        # No ratings recorded for this category
        average_weights[cat] = 0
    else:
        weighted_total = sum(weight_map[level] * count for level, count in level_counts.items())
        average_weights[cat] = round(weighted_total / n_ratings, 4)

average_weights


{'gpa_importance': 0.9667,
 'sat_importance': 0.9667,
 'residency_importance': 0.8333,
 'first_generation_importance': 0.8333,
 'alumni_importance': 0.2667}

In [33]:
# Convert average weights into max point allocation (total = 100 points)
def normalize_max_points_from_avg_weights(average_weights, total_points=100):
    """Split ``total_points`` across categories in proportion to their average weight.

    Args:
        average_weights: Dict mapping '<category>_importance' to a numeric weight.
        total_points: Number of points to distribute (default 100).

    Returns:
        Dict mapping each category name (the key with '_importance' removed)
        to its share of ``total_points``, rounded to 2 places. Because each
        share is rounded separately, the shares may not add up to exactly
        ``total_points``. When the weights sum to 0 there is no proportion to
        compute, so every category is allotted 0.0 points.
    """
    total_weight = sum(average_weights.values())
    max_points_by_cat = {}
    for key, weight in average_weights.items():
        # Guard against dividing by zero when no category carries any weight
        share = (weight / total_weight) if total_weight else 0.0
        max_points_by_cat[key.replace('_importance', '')] = round(share * total_points, 2)
    return max_points_by_cat

In [34]:
# Turn the averaged importance weights into a per-category point budget (displayed as cell output).
max_points_by_cat = normalize_max_points_from_avg_weights(average_weights)
max_points_by_cat

{'gpa': 25.0,
 'sat': 25.0,
 'residency': 21.55,
 'first_generation': 21.55,
 'alumni': 6.9}

In [35]:
def calculate_compatibility_score2(applicant, school_requirements, max_points_by_cat):
    """Score an applicant against a school profile using per-category point budgets.

    Works like a five-category weighted sum (alumni, first generation,
    residency, SAT, GPA). Each category's raw score ranges from 0 to its entry
    in ``max_points_by_cat``, and is then multiplied by the weight of the
    school's importance label for that category.

    Args:
        applicant: Dict with 'alumni', 'first', 'residency' and 'gpa', plus
            'sat' and the flags 'high_school_completion' and
            'general_college_requirement'. A missing flag counts as not
            satisfied. A missing 'sat' earns 0 SAT points.
        school_requirements: Dict with the three '*_required' booleans, the
            five '*_importance' labels, 'residency_acceptance' (rate per
            residency), 'overall_acceptance_rate', 'overall_acceptance_std'
            and 'sat_scores' ('25th' / '50th' / '75th').
        max_points_by_cat: Dict with the maximum raw points for 'alumni',
            'first_generation', 'residency', 'sat' and 'gpa'.

    Returns:
        ``{"score": float, "details": {category: weighted points}}``, all
        rounded to 2 places. If the school requires something the applicant
        does not have, ``{"score": 0, "details": {"reason": ...}}``.

    Raises:
        KeyError: if a required key is absent, the applicant's residency is
            not in 'residency_acceptance', or an importance label is unknown.
    """
    # 1. Check essential requirements: a requirement only disqualifies when the
    #    school demands it AND the applicant does not satisfy it.
    essential_fields = (
        ('high_school_completion_required', 'high_school_completion'),
        ('general_college_preparatory_required', 'general_college_requirement'),
    )
    # Only 'sat' is consulted for the test-score requirement; the applicant
    # records in this notebook carry no ACT field.
    sat = applicant.get('sat')
    requirement_unmet = any(
        school_requirements[required] and not applicant.get(field)
        for required, field in essential_fields
    ) or (school_requirements['sat_act_required'] and sat is None)
    if requirement_unmet:
        return {"score": 0, "details": {"reason": "Essential requirement not met"}}

    # 2. Define importance weight mapping
    weight_map = {
        "Very Important": 1.0,
        "Important": 0.9,
        "Considered": 0.8,
        "Not Considered": 0.0
    }

    total_score = 0
    details = {}

    # 3. Score for alumni relation
    alumni_max = max_points_by_cat['alumni']
    alumni_score = alumni_max if applicant['alumni'] else 0
    alumni_weighted = alumni_score * weight_map[school_requirements['alumni_importance']]
    total_score += alumni_weighted
    details["alumni"] = round(alumni_weighted, 2)

    # 4. Score for first-generation status
    first_max = max_points_by_cat['first_generation']
    first_score = first_max if applicant['first'] else 0
    first_weighted = first_score * weight_map[school_requirements['first_generation_importance']]
    total_score += first_weighted
    details["first_generation"] = round(first_weighted, 2)

    # 5. Score based on residency acceptance rate: the z-score of the applicant's
    #    residency rate is mapped linearly from [-2, 2] onto [0, 1], clamped,
    #    then scaled to the residency budget.
    residency_max = max_points_by_cat['residency']
    residency = applicant['residency']
    residency_rate = school_requirements['residency_acceptance'][residency]
    overall_rate = school_requirements['overall_acceptance_rate']
    std_dev = school_requirements['overall_acceptance_std']
    if std_dev > 0:
        z = (residency_rate - overall_rate) / std_dev
        normalized = min(1, max(0, (z + 2) / 4))
        residency_score = normalized * residency_max
    else:
        residency_score = 0
    residency_weighted = residency_score * weight_map[school_requirements['residency_importance']]
    total_score += residency_weighted
    details["residency"] = round(residency_weighted, 2)

    # 6. Score based on SAT range position: 0 below the 25th percentile, the full
    #    budget at or above the 75th, linear in between.
    sat_max = max_points_by_cat['sat']
    sat_q25 = school_requirements['sat_scores']['25th']
    sat_q75 = school_requirements['sat_scores']['75th']

    if sat is None:
        # Only reachable when the school does not require a test score
        sat_score = 0
    elif sat < sat_q25:
        sat_score = 0
    elif sat >= sat_q75:
        # '>=' gives the same value as interpolating at the 75th percentile and
        # keeps the division below from seeing a zero-width range.
        sat_score = sat_max
    else:
        sat_score = ((sat - sat_q25) / (sat_q75 - sat_q25)) * sat_max
        sat_score = min(sat_max, max(0, sat_score))
    sat_weighted = sat_score * weight_map[school_requirements['sat_importance']]
    total_score += sat_weighted
    details["sat"] = round(sat_weighted, 2)

    # 7. Score based on GPA as a fraction of 4.0, clamped to the GPA budget so a
    #    GPA above 4.0 cannot exceed the category maximum.
    gpa_max = max_points_by_cat['gpa']
    gpa_score = min(gpa_max, max(0, (applicant['gpa'] / 4.0) * gpa_max))
    gpa_weighted = gpa_score * weight_map[school_requirements['gpa_importance']]
    total_score += gpa_weighted
    details["gpa"] = round(gpa_weighted, 2)

    return {
        "score": round(total_score, 2),
        "details": details
    }


In [36]:
# Score applicant1 with the weight-derived per-category point budgets (shown as cell output).
calculate_compatibility_score2(applicant1, gatech_cds, max_points_by_cat)

{'score': 46.2,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 8.07,
  'sat': 15.0,
  'gpa': 23.12}}

In [37]:
# Score applicant2 with the weight-derived per-category point budgets (shown as cell output).
calculate_compatibility_score2(applicant2, gatech_cds, max_points_by_cat)

{'score': 42.45,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 8.07,
  'sat': 10.0,
  'gpa': 24.38}}

In [38]:
# Score applicant3 with the weight-derived per-category point budgets (shown as cell output).
calculate_compatibility_score2(applicant3, gatech_cds, max_points_by_cat)

{'score': 25.57,
 'details': {'alumni': 0.0,
  'first_generation': 0.0,
  'residency': 8.07,
  'sat': 0.0,
  'gpa': 17.5}}