In [1]:
import os # Used for creating folders and reading files
import re # Used for regular expressions
import pdfplumber # Used for extracting text from PDF files
import pandas as pd

In [2]:
# Set input and output folder paths

input_folder = 'pdf'

gatech_file_path = os.path.join(input_folder, 'gatech.pdf')
umn_file_path = os.path.join(input_folder, 'umn.pdf')
nyu_file_path = os.path.join(input_folder, 'nyu.pdf')
uga_file_path = os.path.join(input_folder, 'uga.pdf')

In [3]:
# Remove None and empty string from row

def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [4]:
# Extract only numbers from the cell

def clean_number(cell):
    cell = re.sub(r'[^\d]', '', str(cell)) # Remove all non-numeric characters
    return int(cell) if cell else 0

In [5]:
# Code to extract text from a specific page for verification

def extract_text_from_page(file_path, page_number):
    with pdfplumber.open(file_path) as pdf:
        # Check if the given page number is valid
        if page_number < 1 or page_number > len(pdf.pages):
            print(f"Invalid page number. This PDF has {len(pdf.pages)} pages.")
            return

        # Adjust for 0-based indexing in pdfplumber
        page = pdf.pages[page_number - 1]
        text = page.extract_text()

        # Print the extracted text if available
        if text:
            print(f"Text from Page {page_number}:\n")
            print(text)
        else:
            print(f"No text found on page {page_number}.")

In [6]:
# Extract text from a section like "C{#}." up to before "C{#+1}."

def extract_section_by_label(file_path, section_label):
    next_label = "c" + str(int(section_label[1:]) + 1)

    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.lower()

    # Match exact line starting with "c{#}."
    # ^ = matches the beginning of a line
    start_match = re.search(rf"({section_label.lower()}[\. ])", full_text)
    if not start_match:
        print(f"Section {section_label.upper()} not found.")
        return
    start_idx = start_match.start()

    # Look for the start of the next section
    next_match = re.search(rf"({next_label}[\. ])", full_text[start_idx:])
    end_idx = start_idx + next_match.start() if next_match else None # If we don't find the next label, go to the end of the document

    section_text = full_text[start_idx:end_idx]

    return section_text

### C1. Applications (Gender, Residency)

##### Gender

In [7]:
def extract_gender_data(file_path):
    
    result = {
        'applicants': {'men': 0, 'women': 0, 'others': 0},
        'admitted': {'men': 0, 'women': 0, 'others': 0}
    }
    
    # Define keywords to identify gender-related and first-year-related tables
    gender_keywords = ['men', 'women', 'another gender', 'unknown gender']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            
            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in gender_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Loop through data rows
                for row in table:
                    row = clean_row(row) # Clean up null and empty cells
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                    # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add gender-specific count from the last column
                    if 'men' in row_str and 'women' not in row_str:
                        result[target]['men'] += clean_number(row[-1])
                    elif 'women' in row_str:
                        result[target]['women'] += clean_number(row[-1])
                    elif 'another gender' in row_str or 'unknown gender' in row_str:
                        result[target]['others'] += clean_number(row[-1])
                            
    return result

In [8]:
extract_gender_data(gatech_file_path)

{'applicants': {'men': 40022, 'women': 19765, 'others': 2},
 'admitted': {'men': 4634, 'women': 3779, 'others': 0}}

In [9]:
extract_gender_data(umn_file_path)

{'applicants': {'men': 18901, 'women': 20862, 'others': 0},
 'admitted': {'men': 14427, 'women': 16218, 'others': 0}}

In [10]:
extract_gender_data(nyu_file_path) # ^^

{'applicants': {'men': 0, 'women': 0, 'others': 0},
 'admitted': {'men': 0, 'women': 0, 'others': 0}}

In [11]:
extract_gender_data(uga_file_path)

{'applicants': {'men': 18211, 'women': 25191, 'others': 14},
 'admitted': {'men': 6260, 'women': 9882, 'others': 6}}

##### Residency

In [14]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0},
        'admitted': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in_state': -1, 'out_of_state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in_state'] = idx
                    elif 'out-' in col:
                        col_idx['out_of_state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in_state'] != -1 and col_idx['in_state'] < len(row):
                        result[target]['in_state'] += clean_number(row[col_idx['in_state']])
                    if col_idx['out_of_state'] != -1 and col_idx['out_of_state'] < len(row):
                        result[target]['out_of_state'] += clean_number(row[col_idx['out_of_state']])
                    if col_idx['international'] != -1 and col_idx['international'] < len(row):
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1 and col_idx['others'] < len(row):
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [15]:
extract_residency_data(gatech_file_path)

{'applicants': {'in_state': 10674,
  'out_of_state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in_state': 3536,
  'out_of_state': 3992,
  'international': 885,
  'others': 0}}

In [16]:
extract_residency_data(umn_file_path)

{'applicants': {'in_state': 13982,
  'out_of_state': 19461,
  'international': 1256,
  'others': 0},
 'admitted': {'in_state': 10498,
  'out_of_state': 15400,
  'international': 982,
  'others': 0}}

In [24]:
extract_residency_data(nyu_file_path)

{'applicants': {'in_state': 0,
  'out_of_state': 0,
  'international': 0,
  'others': 0},
 'admitted': {'in_state': 0,
  'out_of_state': 0,
  'international': 0,
  'others': 0}}

In [17]:
extract_residency_data(uga_file_path)

{'applicants': {'in_state': 18210,
  'out_of_state': 23867,
  'international': 1338,
  'others': 1},
 'admitted': {'in_state': 9149,
  'out_of_state': 6709,
  'international': 289,
  'others': 1}}

##### Applicant Summary

In [18]:
def extract_applicant_summary(file_path):
    gender_data = extract_gender_data(file_path)
    residency_data = extract_residency_data(file_path)

    result = {
        'applicants': {
            'gender': gender_data.get('applicants', {}),
            'residency': residency_data.get('applicants', {})
        },
        'admitted': {
            'gender': gender_data.get('admitted', {}),
            'residency': residency_data.get('admitted', {})
        }
    }

    return result


In [19]:
extract_applicant_summary(gatech_file_path)

{'applicants': {'gender': {'men': 40022, 'women': 19765, 'others': 2},
  'residency': {'in_state': 10674,
   'out_of_state': 38320,
   'international': 10795,
   'others': 0}},
 'admitted': {'gender': {'men': 4634, 'women': 3779, 'others': 0},
  'residency': {'in_state': 3536,
   'out_of_state': 3992,
   'international': 885,
   'others': 0}}}

In [20]:
extract_applicant_summary(umn_file_path)

{'applicants': {'gender': {'men': 18901, 'women': 20862, 'others': 0},
  'residency': {'in_state': 13982,
   'out_of_state': 19461,
   'international': 1256,
   'others': 0}},
 'admitted': {'gender': {'men': 14427, 'women': 16218, 'others': 0},
  'residency': {'in_state': 10498,
   'out_of_state': 15400,
   'international': 982,
   'others': 0}}}

In [23]:
extract_applicant_summary(nyu_file_path)

{'applicants': {'gender': {'men': 0, 'women': 0, 'others': 0},
  'residency': {'in_state': 0,
   'out_of_state': 0,
   'international': 0,
   'others': 0}},
 'admitted': {'gender': {'men': 0, 'women': 0, 'others': 0},
  'residency': {'in_state': 0,
   'out_of_state': 0,
   'international': 0,
   'others': 0}}}

In [21]:
extract_applicant_summary(uga_file_path)

{'applicants': {'gender': {'men': 18211, 'women': 25191, 'others': 14},
  'residency': {'in_state': 18210,
   'out_of_state': 23867,
   'international': 1338,
   'others': 1}},
 'admitted': {'gender': {'men': 6260, 'women': 9882, 'others': 6},
  'residency': {'in_state': 9149,
   'out_of_state': 6709,
   'international': 289,
   'others': 1}}}

### C3. High School Completetion requirement

In [22]:
def extract_highschool_requirement(file_path):

    result = {
        'high school diploma required': 0,
        'GED accepted': 0
    }

    # Define exact phrases expected in each case
    diploma_and_ged = "high school diploma is required and ged is accepted"
    diploma_only = "high school diploma is required and ged is not accepted"
    no_diploma_needed = "high school diploma or equivalent is not required"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C3 section using helper function
    section_text = extract_section_by_label(file_path, "C3")
    if not section_text:
        return result  # Return default if section not found

    section_text = section_text.lower()
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if diploma_and_ged in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
            found_checked_option = True
            break
        elif diploma_only in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
            found_checked_option = True
            break
        elif no_diploma_needed in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 0
            result['GED accepted'] = 0
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if diploma_and_ged in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
        elif diploma_only in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
        elif no_diploma_needed in section_text:
            result['high school diploma required'] = 0
            result['GED accepted'] = 0

    return result

In [25]:
extract_highschool_requirement(gatech_file_path)

{'high school diploma required': 1, 'GED accepted': 0}

In [26]:
extract_highschool_requirement(umn_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [27]:
extract_highschool_requirement(nyu_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [28]:
extract_highschool_requirement(uga_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [None]:
text = extract_section_by_label(uga_file_path, 'c3')
for line in text.split('\n'):
    print(line)

c3. high school completion requirement
check the appropriate box to identify your high school completion requirement for degree-seeking entering
students:
4 high school diploma is required and ged is accepted
☐
high school diploma is required and ged is not accepted
☐
high school diploma or equivalent is not required
☐



### C4. General College-Preparatory Program Requirement

In [113]:
def extract_college_prep_requirement(file_path):
    
    result = {
        'general college-preparatory program': 
            {
                'required': 0,
                'recommended': 0,
                'neither required or recommended': 0
            }
        }

    # Define key phrases to match
    required = "require"
    recommended = "recommend"
    neither = "neither require nor recommend"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C4 section using helper function
    section_text = extract_section_by_label(file_path, 'C4')
    if not section_text:
        return result # Return default if section not found
    section_text = section_text.lower()
    
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        
        # Skip the question sectence
        if line.startswith('c4'):
            continue
        
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if neither in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['neither required or recommended'] = 1
            found_checked_option = True
            break
        elif required in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['required'] = 1
            found_checked_option = True
            break
        elif recommended in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['recommended'] = 1
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if "neither require nor recommend" in section_text:
            result['general college-preparatory program']['neither required or recommended'] = 1
        elif "recommend" in section_text:
            result['general college-preparatory program']['recommended'] = 1
        elif "require" in section_text:
            result['general college-preparatory program']['required'] = 1

    return result

In [114]:
extract_college_prep_requirement(gatech_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [115]:
extract_college_prep_requirement(umn_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [116]:
text = extract_section_by_label(umn_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-
seeking students?
☐ require:
☒ recommend
☐ neither require nor recommendcommon data set 2023-2024



In [117]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [118]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [119]:
text = extract_section_by_label(uga_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-seeking
students?
4 require
☐
recommend
☐
neither require nor recommend
☐



### Summarize

In [33]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [34]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [35]:
text = extract_section_by_label(uga_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-seeking
students?
4 require
☐
recommend
☐
neither require nor recommend
☐



### Summarize

In [36]:
# Summarize extracted data into a JSON-ready dictionary

def summarize_results(filename, gender_data, residency_data, highschool_data, prep_data):
    
    def calc_ratio(group_dict):
        total = sum(group_dict.values()) or 1
        return {k: round(v / total * 100) for k, v in group_dict.items()}

    return {
        "filename": filename,
        "summary": {
            "gender_ratio": {
                "applicants": calc_ratio(gender_data.get('applicants', {})),
                "admitted": calc_ratio(gender_data.get('admitted', {}))
            },
            "residency_ratio": {
                "applicants": calc_ratio(residency_data.get('applicants', {})),
                "admitted": calc_ratio(residency_data.get('admitted', {}))
            },
            "hs_diploma_required": bool(highschool_data.get('high school diploma required', 0)),
            "ged_accepted": bool(highschool_data.get('GED accepted', 0)),
            "college_prep": (
                "required" if prep_data['general college-preparatory program']['required']
                else "recommended" if prep_data['general college-preparatory program']['recommended']
                else "neither"
            )
        }
    }

In [37]:
summarize_results(gatech_file_path, extract_gender_data(gatech_file_path), extract_residency_data(gatech_file_path), extract_highschool_requirement(gatech_file_path), extract_college_prep_requirement(gatech_file_path))

{'filename': 'pdf\\gatech.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 67,
    'women': 33,
    'others': 0},
   'admitted': {'men': 55, 'women': 45, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 18,
    'out_of_state': 64,
    'international': 18,
    'others': 0},
   'admitted': {'in_state': 42,
    'out_of_state': 47,
    'international': 11,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': False,
  'college_prep': 'required'}}

In [38]:
summarize_results(umn_file_path, extract_gender_data(umn_file_path), extract_residency_data(umn_file_path), extract_highschool_requirement(umn_file_path), extract_college_prep_requirement(umn_file_path))

{'filename': 'pdf\\umn.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 48,
    'women': 52,
    'others': 0},
   'admitted': {'men': 47, 'women': 53, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 40,
    'out_of_state': 56,
    'international': 4,
    'others': 0},
   'admitted': {'in_state': 39,
    'out_of_state': 57,
    'international': 4,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'recommended'}}

In [39]:
summarize_results(nyu_file_path, extract_gender_data(nyu_file_path), extract_residency_data(nyu_file_path), extract_highschool_requirement(nyu_file_path), extract_college_prep_requirement(nyu_file_path))

{'filename': 'pdf\\nyu.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 0,
    'women': 0,
    'others': 0},
   'admitted': {'men': 0, 'women': 0, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 0,
    'out_of_state': 0,
    'international': 0,
    'others': 0},
   'admitted': {'in_state': 0,
    'out_of_state': 0,
    'international': 0,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'recommended'}}

In [40]:
summarize_results(uga_file_path, extract_gender_data(uga_file_path), extract_residency_data(uga_file_path), extract_highschool_requirement(uga_file_path), extract_college_prep_requirement(uga_file_path))

{'filename': 'pdf\\uga.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 42,
    'women': 58,
    'others': 0},
   'admitted': {'men': 39, 'women': 61, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 42,
    'out_of_state': 55,
    'international': 3,
    'others': 0},
   'admitted': {'in_state': 57,
    'out_of_state': 42,
    'international': 2,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'required'}}

C5. Relative importance of each of the following academic and nonacademic factors

In [120]:
def extract_admission_factors(file_path, mode="table"):
    import pdfplumber

    academic_factors = [
        "rigor of secondary school record", "class rank",
        "academic gpa", "standardized test scores",
        "application essay", "recommendation"
    ]
    nonacademic_factors = [
        "interview", "extracurricular activities", "talent/ability",
        "character/personal qualities", "first generation",
        "alumni/ae relation", "geographical residence",
        "state residency", "religious affiliation/commitment",
        "volunteer work", "work experience", "level of applicant’s interest"
    ]

    result = {
        "academic": {factor: None for factor in academic_factors},
        "nonacademic": {factor: None for factor in nonacademic_factors}
    }

    marker_values = ["x", "✔", "✓", "☑", "☒", "■", "▣", "✗", "4"]

    if mode == "table":
        importance_by_index = {
            1: "very important",
            2: "important",
            3: "considered",
            4: "not considered"
        }

        def normalize_header_row(header_row):
            combined = []
            skip = False
            for i in range(len(header_row)):
                if skip:
                    skip = False
                    continue
                word = header_row[i]
                next_word = header_row[i + 1] if i + 1 < len(header_row) else ""

                if word == "very" and next_word == "important":
                    combined.append("very important")
                    skip = True
                elif word == "not" and next_word == "considered":
                    combined.append("not considered")
                    skip = True
                else:
                    combined.append(word)
            return combined

        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                tables = page.extract_tables()
                for table in tables:
                    if not table or len(table[0]) < 3:
                        continue

                    raw_header_row = [str(cell or '').lower().strip() for cell in table[0]]
                    header_row = normalize_header_row(raw_header_row)

                    start_idx = 1 if header_row[0] in ["academic", "nonacademic"] else 0

                    col_to_level = {
                        idx: importance_by_index[idx]
                        for idx in importance_by_index
                        if idx < len(header_row)
                    }

                    if len(col_to_level) < 2:
                        continue

                    for row in table[1:]:
                        row = [str(cell or '').lower().strip() for cell in row]
                        if len(row) < 2:
                            continue

                        factor_label = row[0]
                        if factor_label in ["academic", "nonacademic"]:
                            continue

                        if factor_label in academic_factors:
                            category = "academic"
                        elif factor_label in nonacademic_factors:
                            category = "nonacademic"
                        else:
                            continue

                        for j in range(1, len(row)):
                            cell = row[j]
                            if cell in marker_values and j in col_to_level:
                                result[category][factor_label] = col_to_level[j]
                                break
                            elif cell in col_to_level.values():
                                result[category][factor_label] = cell
                                break

    elif mode == "text":
        importance_by_index = {
            0: "very important",
            1: "important",
            2: "considered",
            3: "not considered"
        }

        def extract_section_by_label(file_path, section_label):
            section_label = section_label.lower().strip()
            capture = False
            extracted_lines = []

            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if not text:
                        continue
                    lines = text.split('\n')
                    for line in lines:
                        line_clean = line.strip().lower()
                        if line_clean.startswith(section_label):
                            capture = True
                        elif capture and line_clean.startswith('c8'):
                            capture = False
                            break
                        if capture:
                            extracted_lines.append(line)
            return extracted_lines

        lines = extract_section_by_label(file_path, 'C7')
        if not lines:
            return result

        lines = [line.strip().lower() for line in lines if line.strip()]
        i = 0
        while i < len(lines):
            line = lines[i]

            if any(mark in line for mark in marker_values):
                marks = line.split()
                for j, mark in enumerate(marks):
                    if mark in marker_values and j in importance_by_index:
                        importance = importance_by_index[j]

                        factor_candidates = []

                        if i >= 1:
                            factor_candidates.append(lines[i - 1])
                        if i >= 2:
                            factor_candidates.append(lines[i - 2] + " " + lines[i - 1])
                        if i + 1 < len(lines):
                            factor_candidates.append(lines[i - 1] + " " + lines[i + 1])
                        # ✅ 현재 줄에서 마킹 앞쪽 텍스트도 후보로 포함
                        factor_candidates.append(lines[i].split(mark)[0].strip())

                        for candidate in factor_candidates:
                            candidate = candidate.strip()
                            if candidate in academic_factors:
                                result["academic"][candidate] = importance
                                break
                            elif candidate in nonacademic_factors:
                                result["nonacademic"][candidate] = importance
                                break
                i += 2
            else:
                i += 1

    return result


In [89]:
extract_admission_factors_data(gatech_file_path)

{'academic': {'rigor of secondary school record': 'very important',
  'class rank': 'not considered',
  'academic gpa': 'very important',
  'standardized test scores': 'considered',
  'application essay': 'important',
  'recommendation': None},
 'nonacademic': {'interview': 'not considered',
  'extracurricular activities': 'important',
  'talent/ability': 'considered',
  'character/personal qualities': 'very important',
  'first generation': 'considered',
  'alumni/ae relation': 'not considered',
  'geographical residence': 'considered',
  'state residency': 'very important',
  'religious affiliation/commitment': 'not considered',
  'volunteer work': 'considered',
  'work experience': 'considered',
  'level of applicant’s interest': 'not considered'}}

In [121]:
extract_admission_factors_data(umn_file_path)

{'academic': {'rigor of secondary school record': None,
  'class rank': None,
  'academic gpa': None,
  'standardized test scores': None,
  'application essay': None,
  'recommendation': None},
 'nonacademic': {'interview': None,
  'extracurricular activities': None,
  'talent/ability': None,
  'character/personal qualities': None,
  'first generation': None,
  'alumni/ae relation': None,
  'geographical residence': None,
  'state residency': None,
  'religious affiliation/commitment': None,
  'volunteer work': None,
  'work experience': None,
  'level of applicant’s interest': None}}

In [87]:
extract_admission_factors_data(nyu_file_path)

{'academic': {'rigor of secondary school record': None,
  'class rank': None,
  'academic gpa': None,
  'standardized test scores': None,
  'application essay': None,
  'recommendation(s)': None},
 'nonacademic': {'interview': None,
  'extracurricular activities': None,
  'talent/ability': None,
  'character/personal qualities': None,
  'first generation': None,
  'alumni/ae relation': None,
  'geographical residence': None,
  'state residency': None,
  'religious affiliation/commitment': None,
  'volunteer work': None,
  'work experience': None,
  'level of applicant’s interest': None}}

In [86]:
extract_admission_factors_data(uga_file_path)

{'academic': {'rigor of secondary school record': None,
  'class rank': None,
  'academic gpa': None,
  'standardized test scores': None,
  'application essay': None,
  'recommendation(s)': None},
 'nonacademic': {'interview': None,
  'extracurricular activities': None,
  'talent/ability': None,
  'character/personal qualities': None,
  'first generation': None,
  'alumni/ae relation': None,
  'geographical residence': None,
  'state residency': None,
  'religious affiliation/commitment': None,
  'volunteer work': None,
  'work experience': None,
  'level of applicant’s interest': None}}

C6. SAT or ACT

In [167]:
import re

def extract_sat_act_scores(file_path):
    regex_map = {
        "SAT Composite": r"sat.*composite",
        "SAT EBRW": r"sat.*(evidence|ebrw|writing)",
        "SAT Math": r"sat.*math",
        "ACT Composite": r"act.*composite",
        "ACT Math": r"act.*math",
        "ACT English": r"act.*english",
        "ACT Writing": r"act.*writing",
        "ACT Science": r"act.*science",
        "ACT Reading": r"act.*reading"
    }

    score_data = {label: {'25th': None, '50th': None, '75th': None} for label in regex_map}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            text = page.extract_text() or ""

            act_writing_present = "act writing" in text.lower()

            for table in tables:
                for row in table:
                    if not row or len(row) < 2:
                        continue

                    row_text = [str(cell).lower().strip() if cell else "" for cell in row]
                    label_candidate = row_text[0]
                    label_matched = None

                    for label, pattern in regex_map.items():
                        if re.search(pattern, label_candidate):
                            label_matched = label
                            break

                    if not label_matched and re.fullmatch(r"writing", label_candidate):
                        label_matched = "SAT EBRW" if act_writing_present else None

                    if not label_matched:
                        continue

                    numeric_values = [v for v in row_text[1:] if v.replace('.', '', 1).isdigit()]
                    for j, percentile in enumerate(['25th', '50th', '75th']):
                        if j < len(numeric_values):
                            score_data[label_matched][percentile] = numeric_values[j]

            fallback_labels = []
            for line in text.split("\n"):
                line_lower = line.lower()
                for label, pattern in regex_map.items():
                    if re.search(pattern, line_lower):
                        if label not in fallback_labels:
                            fallback_labels.append(label)

            for table in tables:
                numeric_rows = [
                    [str(cell).strip() for cell in row if cell and str(cell).strip().replace('.', '', 1).isdigit()]
                    for row in table if row
                ]
                numeric_rows = [r for r in numeric_rows if len(r) >= 2]

                if len(numeric_rows) >= 3 and len(fallback_labels) >= 3:
                    for i in range(min(len(fallback_labels), len(numeric_rows))):
                        values = numeric_rows[i]
                        for j, percentile in enumerate(['25th', '50th', '75th']):
                            if j < len(values):
                                score_data[fallback_labels[i]][percentile] = values[j]
                    break

    return score_data


In [168]:
import pandas as pd

def display_sat_act_scores_as_table(score_data):
    df = pd.DataFrame(score_data).T
    df.columns = ['25th', '50th', '75th']
    df = df.fillna("No data")
    print(df)

In [150]:
gatech_scores = extract_sat_act_scores(gatech_file_path)
display_sat_act_scores_as_table(gatech_scores)

               25th  50th  75th
SAT Composite  1370  1460  1530
SAT EBRW        680   720   750
SAT Math        690   760   790
ACT Composite    30    33    34
ACT Math         29    32    35
ACT English      31    34    35
ACT Writing       8     8     9
ACT Science      29    33    35
ACT Reading      31    34    35


In [157]:
umn_scores = extract_sat_act_scores(umn_file_path)
display_sat_act_scores_as_table(umn_scores)

               25th  50th  75th
SAT Composite  1328  1400  1460
SAT EBRW        640   680   720
SAT Math        670   720   760
ACT Composite    27    29    31
ACT Math         25    28    31
ACT English      25    28    33
ACT Writing       7     8     9
ACT Science      25    28    32
ACT Reading      27    31    34


In [169]:
nyu_scores = extract_sat_act_scores(nyu_file_path)
display_sat_act_scores_as_table(nyu_scores)

                  25th     50th     75th
SAT Composite     1480     1550  No data
SAT EBRW           720      750      770
SAT Math           760      780      800
ACT Composite       33       34       35
ACT Math       No data  No data  No data
ACT English    No data  No data  No data
ACT Writing    No data  No data  No data
ACT Science    No data  No data  No data
ACT Reading    No data  No data  No data


In [None]:
uga_scores = extract_sat_act_scores(uga_file_path)
display_sat_act_scores_as_table(uga_scores)

C7. GPA

In [55]:
import re
import pandas as pd
import pdfplumber

def extract_gpa_data(file_path):
    gpa_data = []
    average_gpa = None
    gpa_lines = []

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            lines = text.split('\n')

            for line in lines:
                lower_line = line.lower().strip()

                if "average high school gpa" in lower_line:
                    match = re.search(r'gpa[:\s]+([0-9.]+)', lower_line)
                    if match:
                        average_gpa = float(match.group(1))

                if lower_line.startswith("percent who had gpa") or "total" in lower_line:
                    gpa_lines.append(line.strip())

    for line in gpa_lines:
        match = re.match(r"(Percent.*?)(\s+\d+\.?\d*%)?$", line.strip(), flags=re.IGNORECASE)
        if match:
            label = match.group(1).strip()
            percent = match.group(2).strip() if match.group(2) else "No data"
            gpa_data.append((label, "No data", "No data", percent))

    # DataFrame 변환
    gpa_df = pd.DataFrame(
        gpa_data,
        columns=[
            "Score Range",
            "Percent (Submitted)",
            "Percent (Not Submitted)",
            "Percent (All Students)",
        ]
    )

    return gpa_df, average_gpa


In [27]:
gatech_gpa_df, gatech_avg_gpa = extract_gpa_data(gatech_file_path)
print(gatech_gpa_df.to_string(index=False))
print("Gatech Avg GPA : " , gatech_avg_gpa)

                              Score Range Percent (Submitted) Percent (Not Submitted) Percent (All Students)
               Percent who had GPA of 4.0              92.54%                                         92.54%
Percent who had GPA between 3.75 and 3.99               5.26%                                          5.26%
Percent who had GPA between 3.50 and 3.74               1.13%                                          1.13%
Percent who had GPA between 3.25 and 3.49               0.33%                                          0.33%
Percent who had GPA between 3.00 and 3.24               0.25%                                          0.25%
Percent who had GPA between 2.50 and 2.99               0.43%                                          0.43%
 Percent who had GPA between 2.0 and 2.49               0.06%                                          0.06%
 Percent who had GPA between 1.0 and 1.99               0.00%                                          0.00%
            Percent

In [28]:
umn_gpa_df, umn_avg_gpa = extract_gpa_data(umn_file_path)
print(umn_gpa_df.to_string(index=False))
print("UMN Avg GPA : " , umn_avg_gpa)

Empty DataFrame
Columns: [Score Range, Percent (Submitted), Percent (Not Submitted), Percent (All Students)]
Index: []
UMN Avg GPA :  None


In [56]:
nyu_gpa_df, nyu_avg_gpa = extract_gpa_data(nyu_file_path)
print(nyu_gpa_df.to_string(index=False))
print("NYU Avg GPA : " , nyu_avg_gpa)

                                                                    Score Range Percent (Submitted) Percent (Not Submitted) Percent (All Students)
     Percent of total first-time, first-year students who submitted high school             No data                 No data                No data
                                                     Percent who had GPA of 4.0             No data                 No data                  23.1%
                                      Percent who had GPA between 3.75 and 3.99             No data                 No data                  47.6%
                                      Percent who had GPA between 3.50 and 3.74             No data                 No data                  22.9%
                                      Percent who had GPA between 3.25 and 3.49             No data                 No data                   5.2%
                                      Percent who had GPA between 3.00 and 3.24             No data                 No

In [30]:
uga_gpa_df, uga_avg_gpa = extract_gpa_data(uga_file_path)
print(uga_gpa_df.to_string(index=False))
print("UGA Avg GPA : " , uga_avg_gpa)

                              Score Range Percent (Submitted) Percent (Not Submitted) Percent (All Students)
               Percent who had GPA of 4.0               81.87                                          81.87
Percent who had GPA between 3.75 and 3.99               15.56                                          15.56
Percent who had GPA between 3.50 and 3.74                1.76                                           1.76
Percent who had GPA between 3.25 and 3.49                0.33                                           0.33
Percent who had GPA between 3.00 and 3.24                0.20                                           0.20
Percent who had GPA between 2.50 and 2.99                0.20                                           0.20
 Percent who had GPA between 2.0 and 2.49                0.08                                           0.08
 Percent who had GPA between 1.0 and 1.99                0.00                                           0.00
            Percent

C8. Early Decision

In [17]:
def extract_early_decision(file_path):
    section_text = ""

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text and "C21" in text:
                section_text += text.lower()

    if not section_text:
        return "Unknown"

    if "☒ no" in section_text or "☑ no" in section_text:
        return "No"
    if "☒ yes" in section_text or "☑ yes" in section_text:
        return "Yes"

    if "yes" in section_text and "no" in section_text and "x" in section_text:
        yes_index = section_text.find("yes")
        no_index = section_text.find("no")
        x_index = section_text.find("x")

        if abs(x_index - no_index) < abs(x_index - yes_index):
            return "No"
        else:
            return "Yes"

    if "yes" in section_text and "no" not in section_text:
        return "Yes"

    return "Unknown"


In [18]:
extract_early_decision(gatech_file_path)

'No'

In [19]:
extract_early_decision(umn_file_path)

'No'

In [20]:
extract_early_decision(nyu_file_path)

'Yes'

In [21]:
extract_early_decision(uga_file_path)

'No'