In [1]:
import os # Used for creating folders and reading files
import re # Used for regular expressions
import pdfplumber # Used for extracting text from PDF files

In [3]:
# Set input and output folder paths

input_folder = 'pdf'

gatech_file_path = os.path.join(input_folder, 'gatech.pdf')
umn_file_path = os.path.join(input_folder, 'umn.pdf')
nyu_file_path = os.path.join(input_folder, 'nyu.pdf')
uga_file_path = os.path.join(input_folder, 'uga.pdf')

In [4]:
# Remove None and empty string from row

def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [5]:
# Extract only numbers from the cell

def clean_number(cell):
    cell = re.sub(r'[^\d]', '', str(cell)) # Remove all non-numeric characters
    return int(cell) if cell else 0

In [9]:
# Extract only floats from the cell

def clean_float(cell):
    cell = re.sub(r'[^\d.]', '', str(cell))
    return float(cell) if cell else 0.0

In [11]:
# Code to extract text from a specific page for verification

def extract_text_from_page(file_path, page_number):
    with pdfplumber.open(file_path) as pdf:
        # Check if the given page number is valid
        if page_number < 1 or page_number > len(pdf.pages):
            print(f"Invalid page number. This PDF has {len(pdf.pages)} pages.")
            return

        # Adjust for 0-based indexing in pdfplumber
        page = pdf.pages[page_number - 1]
        text = page.extract_text()

        # Print the extracted text if available
        if text:
            print(f"Text from Page {page_number}:\n")
            print(text)
        else:
            print(f"No text found on page {page_number}.")

In [13]:
# Extract text from a section like "C{#}." up to before "C{#+1}."

def extract_section_by_label(file_path, section_label):
    next_label = "c" + str(int(section_label[1:]) + 1)

    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.lower()

    # Match exact line starting with "c{#}."
    # ^ = matches the beginning of a line
    start_match = re.search(rf"({section_label.lower()}[\. ])", full_text)
    if not start_match:
        print(f"Section {section_label.upper()} not found.")
        return
    start_idx = start_match.start()

    # Look for the start of the next section
    next_match = re.search(rf"({next_label}[\. ])", full_text[start_idx:])
    end_idx = start_idx + next_match.start() if next_match else None # If we don't find the next label, go to the end of the document

    section_text = full_text[start_idx:end_idx]

    return section_text

### C1. Applications (Gender, Residency)

##### Gender

In [7]:
def extract_gender_data(file_path):
    
    result = {
        'applicants': {'men': 0, 'women': 0, 'others': 0},
        'admitted': {'men': 0, 'women': 0, 'others': 0}
    }
    
    # Define keywords to identify gender-related and first-year-related tables
    gender_keywords = ['men', 'women', 'another gender', 'unknown gender']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            
            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in gender_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Loop through data rows
                for row in table:
                    row = clean_row(row) # Clean up null and empty cells
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                    # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add gender-specific count from the last column
                    if 'men' in row_str and 'women' not in row_str:
                        result[target]['men'] += clean_number(row[-1])
                    elif 'women' in row_str:
                        result[target]['women'] += clean_number(row[-1])
                    elif 'another gender' in row_str or 'unknown gender' in row_str:
                        result[target]['others'] += clean_number(row[-1])
                            
    return result

In [8]:
extract_gender_data(gatech_file_path)

{'applicants': {'men': 40022, 'women': 19765, 'others': 2},
 'admitted': {'men': 4634, 'women': 3779, 'others': 0}}

In [9]:
extract_gender_data(umn_file_path)

{'applicants': {'men': 18901, 'women': 20862, 'others': 0},
 'admitted': {'men': 14427, 'women': 16218, 'others': 0}}

In [10]:
extract_gender_data(nyu_file_path) # ^^

{'applicants': {'men': 0, 'women': 0, 'others': 0},
 'admitted': {'men': 0, 'women': 0, 'others': 0}}

In [11]:
extract_gender_data(uga_file_path)

{'applicants': {'men': 18211, 'women': 25191, 'others': 14},
 'admitted': {'men': 6260, 'women': 9882, 'others': 6}}

##### Residency

In [14]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0},
        'admitted': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in_state': -1, 'out_of_state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in_state'] = idx
                    elif 'out-' in col:
                        col_idx['out_of_state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in_state'] != -1 and col_idx['in_state'] < len(row):
                        result[target]['in_state'] += clean_number(row[col_idx['in_state']])
                    if col_idx['out_of_state'] != -1 and col_idx['out_of_state'] < len(row):
                        result[target]['out_of_state'] += clean_number(row[col_idx['out_of_state']])
                    if col_idx['international'] != -1 and col_idx['international'] < len(row):
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1 and col_idx['others'] < len(row):
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [15]:
extract_residency_data(gatech_file_path)

{'applicants': {'in_state': 10674,
  'out_of_state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in_state': 3536,
  'out_of_state': 3992,
  'international': 885,
  'others': 0}}

In [16]:
extract_residency_data(umn_file_path)

{'applicants': {'in_state': 13982,
  'out_of_state': 19461,
  'international': 1256,
  'others': 0},
 'admitted': {'in_state': 10498,
  'out_of_state': 15400,
  'international': 982,
  'others': 0}}

In [24]:
extract_residency_data(nyu_file_path)

{'applicants': {'in_state': 0,
  'out_of_state': 0,
  'international': 0,
  'others': 0},
 'admitted': {'in_state': 0,
  'out_of_state': 0,
  'international': 0,
  'others': 0}}

In [17]:
extract_residency_data(uga_file_path)

{'applicants': {'in_state': 18210,
  'out_of_state': 23867,
  'international': 1338,
  'others': 1},
 'admitted': {'in_state': 9149,
  'out_of_state': 6709,
  'international': 289,
  'others': 1}}

##### Applicant Summary

In [18]:
def extract_applicant_summary(file_path):
    gender_data = extract_gender_data(file_path)
    residency_data = extract_residency_data(file_path)

    result = {
        'applicants': {
            'gender': gender_data.get('applicants', {}),
            'residency': residency_data.get('applicants', {})
        },
        'admitted': {
            'gender': gender_data.get('admitted', {}),
            'residency': residency_data.get('admitted', {})
        }
    }

    return result


In [19]:
extract_applicant_summary(gatech_file_path)

{'applicants': {'gender': {'men': 40022, 'women': 19765, 'others': 2},
  'residency': {'in_state': 10674,
   'out_of_state': 38320,
   'international': 10795,
   'others': 0}},
 'admitted': {'gender': {'men': 4634, 'women': 3779, 'others': 0},
  'residency': {'in_state': 3536,
   'out_of_state': 3992,
   'international': 885,
   'others': 0}}}

In [20]:
extract_applicant_summary(umn_file_path)

{'applicants': {'gender': {'men': 18901, 'women': 20862, 'others': 0},
  'residency': {'in_state': 13982,
   'out_of_state': 19461,
   'international': 1256,
   'others': 0}},
 'admitted': {'gender': {'men': 14427, 'women': 16218, 'others': 0},
  'residency': {'in_state': 10498,
   'out_of_state': 15400,
   'international': 982,
   'others': 0}}}

In [23]:
extract_applicant_summary(nyu_file_path)

{'applicants': {'gender': {'men': 0, 'women': 0, 'others': 0},
  'residency': {'in_state': 0,
   'out_of_state': 0,
   'international': 0,
   'others': 0}},
 'admitted': {'gender': {'men': 0, 'women': 0, 'others': 0},
  'residency': {'in_state': 0,
   'out_of_state': 0,
   'international': 0,
   'others': 0}}}

In [21]:
extract_applicant_summary(uga_file_path)

{'applicants': {'gender': {'men': 18211, 'women': 25191, 'others': 14},
  'residency': {'in_state': 18210,
   'out_of_state': 23867,
   'international': 1338,
   'others': 1}},
 'admitted': {'gender': {'men': 6260, 'women': 9882, 'others': 6},
  'residency': {'in_state': 9149,
   'out_of_state': 6709,
   'international': 289,
   'others': 1}}}

### C3. High School Completetion requirement

In [22]:
def extract_highschool_requirement(file_path):

    result = {
        'high school diploma required': 0,
        'GED accepted': 0
    }

    # Define exact phrases expected in each case
    diploma_and_ged = "high school diploma is required and ged is accepted"
    diploma_only = "high school diploma is required and ged is not accepted"
    no_diploma_needed = "high school diploma or equivalent is not required"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C3 section using helper function
    section_text = extract_section_by_label(file_path, "C3")
    if not section_text:
        return result  # Return default if section not found

    section_text = section_text.lower()
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if diploma_and_ged in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
            found_checked_option = True
            break
        elif diploma_only in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
            found_checked_option = True
            break
        elif no_diploma_needed in line and any(mark in line for mark in marks):
            result['high school diploma required'] = 0
            result['GED accepted'] = 0
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if diploma_and_ged in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
        elif diploma_only in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
        elif no_diploma_needed in section_text:
            result['high school diploma required'] = 0
            result['GED accepted'] = 0

    return result

In [25]:
extract_highschool_requirement(gatech_file_path)

{'high school diploma required': 1, 'GED accepted': 0}

In [26]:
extract_highschool_requirement(umn_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [27]:
extract_highschool_requirement(nyu_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [28]:
extract_highschool_requirement(uga_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [None]:
text = extract_section_by_label(uga_file_path, 'c3')
for line in text.split('\n'):
    print(line)

c3. high school completion requirement
check the appropriate box to identify your high school completion requirement for degree-seeking entering
students:
4 high school diploma is required and ged is accepted
☐
high school diploma is required and ged is not accepted
☐
high school diploma or equivalent is not required
☐



### C4. General College-Preparatory Program Requirement

In [39]:
def extract_college_prep_requirement(file_path):
    
    result = {
        'general college-preparatory program': 
            {
                'required': 0,
                'recommended': 0,
                'neither required or recommended': 0
            }
        }

    # Define key phrases to match
    required = "require"
    recommended = "recommend"
    neither = "neither require nor recommend"
    
    # Define marks
    marks = ['x', '☒', '✓', '✔', '4']

    # Extract only the C4 section using helper function
    section_text = extract_section_by_label(file_path, 'C4')
    if not section_text:
        return result # Return default if section not found
    section_text = section_text.lower()
    
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        
        # Skip the question sectence
        if line.startswith('c4'):
            continue
        
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if neither in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['neither required or recommended'] = 1
            found_checked_option = True
            break
        elif required in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['required'] = 1
            found_checked_option = True
            break
        elif recommended in line and any(mark in line for mark in marks):
            result['general college-preparatory program']['recommended'] = 1
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if "neither require nor recommend" in section_text:
            result['general college-preparatory program']['neither required or recommended'] = 1
        elif "recommend" in section_text:
            result['general college-preparatory program']['recommended'] = 1
        elif "require" in section_text:
            result['general college-preparatory program']['required'] = 1

    return result

In [114]:
extract_college_prep_requirement(gatech_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [115]:
extract_college_prep_requirement(umn_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [116]:
text = extract_section_by_label(umn_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-
seeking students?
☐ require:
☒ recommend
☐ neither require nor recommendcommon data set 2023-2024



In [117]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [118]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [119]:
text = extract_section_by_label(uga_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-seeking
students?
4 require
☐
recommend
☐
neither require nor recommend
☐



### Summarize

In [33]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 0,
  'recommended': 1,
  'neither required or recommended': 0}}

In [34]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [35]:
text = extract_section_by_label(uga_file_path, 'c4')
for line in text.split('\n'):
    print(line)

c4. does your institution require or recommend a general college-preparatory program for degree-seeking
students?
4 require
☐
recommend
☐
neither require nor recommend
☐



### Summarize

In [36]:
# Summarize extracted data into a JSON-ready dictionary

def summarize_results(filename, gender_data, residency_data, highschool_data, prep_data):
    
    def calc_ratio(group_dict):
        total = sum(group_dict.values()) or 1
        return {k: round(v / total * 100) for k, v in group_dict.items()}

    return {
        "filename": filename,
        "summary": {
            "gender_ratio": {
                "applicants": calc_ratio(gender_data.get('applicants', {})),
                "admitted": calc_ratio(gender_data.get('admitted', {}))
            },
            "residency_ratio": {
                "applicants": calc_ratio(residency_data.get('applicants', {})),
                "admitted": calc_ratio(residency_data.get('admitted', {}))
            },
            "hs_diploma_required": bool(highschool_data.get('high school diploma required', 0)),
            "ged_accepted": bool(highschool_data.get('GED accepted', 0)),
            "college_prep": (
                "required" if prep_data['general college-preparatory program']['required']
                else "recommended" if prep_data['general college-preparatory program']['recommended']
                else "neither"
            )
        }
    }

In [37]:
summarize_results(gatech_file_path, extract_gender_data(gatech_file_path), extract_residency_data(gatech_file_path), extract_highschool_requirement(gatech_file_path), extract_college_prep_requirement(gatech_file_path))

{'filename': 'pdf\\gatech.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 67,
    'women': 33,
    'others': 0},
   'admitted': {'men': 55, 'women': 45, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 18,
    'out_of_state': 64,
    'international': 18,
    'others': 0},
   'admitted': {'in_state': 42,
    'out_of_state': 47,
    'international': 11,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': False,
  'college_prep': 'required'}}

In [38]:
summarize_results(umn_file_path, extract_gender_data(umn_file_path), extract_residency_data(umn_file_path), extract_highschool_requirement(umn_file_path), extract_college_prep_requirement(umn_file_path))

{'filename': 'pdf\\umn.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 48,
    'women': 52,
    'others': 0},
   'admitted': {'men': 47, 'women': 53, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 40,
    'out_of_state': 56,
    'international': 4,
    'others': 0},
   'admitted': {'in_state': 39,
    'out_of_state': 57,
    'international': 4,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'recommended'}}

In [39]:
summarize_results(nyu_file_path, extract_gender_data(nyu_file_path), extract_residency_data(nyu_file_path), extract_highschool_requirement(nyu_file_path), extract_college_prep_requirement(nyu_file_path))

{'filename': 'pdf\\nyu.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 0,
    'women': 0,
    'others': 0},
   'admitted': {'men': 0, 'women': 0, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 0,
    'out_of_state': 0,
    'international': 0,
    'others': 0},
   'admitted': {'in_state': 0,
    'out_of_state': 0,
    'international': 0,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'recommended'}}

In [40]:
summarize_results(uga_file_path, extract_gender_data(uga_file_path), extract_residency_data(uga_file_path), extract_highschool_requirement(uga_file_path), extract_college_prep_requirement(uga_file_path))

{'filename': 'pdf\\uga.pdf',
 'summary': {'gender_ratio': {'applicants': {'men': 42,
    'women': 58,
    'others': 0},
   'admitted': {'men': 39, 'women': 61, 'others': 0}},
  'residency_ratio': {'applicants': {'in_state': 42,
    'out_of_state': 55,
    'international': 3,
    'others': 0},
   'admitted': {'in_state': 57,
    'out_of_state': 42,
    'international': 2,
    'others': 0}},
  'hs_diploma_required': True,
  'ged_accepted': True,
  'college_prep': 'required'}}

C5. Relative importance of each of the following academic and nonacademic factors

In [71]:
def is_marked(cell, marks=['x', '☑', '☒', '✓', '✔', '4']):
    if not cell:
        return False
    cell_str = str(cell).strip().lower()
    return any(mark in cell_str for mark in marks)

def extract_relative_importance(file_path):
    importance_levels = ["Very Important", "Important", "Considered", "Not Considered"]

    academic_factors = [
        "Rigor of secondary school record",
        "Class rank",
        "Academic Grade Point Average (GPA)",
        "Recommendations",
        "Standardized test scores",
        "Application essay"
    ]

    nonacademic_factors = [
        "Interview",
        "Extracurricular activities",
        "Talent/ability",
        "Character/personal qualities",
        "First generation",
        "Alumni/ae relation",
        "Geographical residence",
        "State residency",
        "Religious affiliation/commitment",
        "Racial/ethnic status",
        "Volunteer work",
        "Work experience",
        "Level of applicant’s interest"
    ]

    result = {
        "Academic": {},
        "Nonacademic": {}
    }

    with pdfplumber.open(file_path) as pdf:
        # STEP 1: Gatech style (table-based)
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                if not table or len(table) < 2:
                    continue
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not all(level.lower() in table_str for level in importance_levels):
                    continue
                headers = [str(cell).strip() if cell else "" for cell in table[0]]
                for row in table[1:]:
                    row = [str(cell).strip() if cell else "" for cell in row]
                    if not row or row[0].lower() in ["academic", "nonacademic"]:
                        continue
                    factor = row[0]
                    for i in range(1, min(len(row), len(headers))):
                        if is_marked(row[i]):
                            importance = headers[i].strip()
                            section = "Academic" if factor in academic_factors else "Nonacademic"
                            result[section][factor] = importance
                            break

    if result["Academic"] or result["Nonacademic"]:
        return result

    # STEP 2: NYU style fallback (text-based)
    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += "\n" + text

    def find_importance_by_text(factor):
        pattern = factor.lower()
        idx = full_text.lower().find(pattern)
        if idx == -1:
            return None
        after = full_text[idx + len(pattern):].strip()
        for level in importance_levels:
            if after.lower().startswith(level.lower()):
                return level
        return None

    for factor in academic_factors:
        importance = find_importance_by_text(factor)
        if importance:
            result["Academic"][factor] = importance

    for factor in nonacademic_factors:
        importance = find_importance_by_text(factor)
        if importance:
            result["Nonacademic"][factor] = importance

    return result

In [33]:
extract_relative_importance(gatech_file_path)

{'Rigor of secondary school record': 'Very Important',
 'Class rank': 'Not Considered',
 'Academic GPA': 'Very Important',
 'Standardized test scores': 'Considered',
 'Application Essay': 'Important',
 'Recommendation(s)': 'Considered',
 'Interview': 'Not Considered',
 'Extracurricular activities': 'Important',
 'Talent/ability': 'Considered',
 'Character/personal qualities': 'Very Important',
 'First generation': 'Considered',
 'Alumni/ae relation': 'Not Considered',
 'Geographical residence': 'Considered',
 'State residency': 'Very Important',
 'Religious affiliation/commitment': 'Not Considered',
 'Volunteer work': 'Considered',
 'Work experience': 'Considered',
 'Level of applicant’s interest': 'Not Considered'}

In [79]:
# Manually enter data
# extract_relative_importance(umn_file_path)

In [73]:
extract_relative_importance(nyu_file_path)

{'Academic': {'Rigor of secondary school record': 'Very Important',
  'Class rank': 'Not Considered',
  'Academic Grade Point Average (GPA)': 'Very Important',
  'Recommendations': 'Very Important',
  'Standardized test scores': 'Important',
  'Application essay': 'Very Important'},
 'Nonacademic': {'Interview': 'Not Considered',
  'Talent/ability': 'Considered',
  'Character/personal qualities': 'Very Important',
  'First generation': 'Considered',
  'Alumni/ae relation': 'Not Considered',
  'Geographical residence': 'Considered',
  'State residency': 'Not Considered',
  'Volunteer work': 'Considered',
  'Work experience': 'Considered'}}

In [75]:
# Manually enter data
# extract_relative_importance(uga_file_path)

{'Academic': {}, 'Nonacademic': {}}

C6. SAT or ACT

In [19]:
def extract_sat_act_scores(file_path):
    regex_map = {
        "SAT Composite": r"sat.*composite",
        "SAT EBRW": r"sat.*(evidence|ebrw|writing)",
        "SAT Math": r"sat.*math",
        "ACT Composite": r"act.*composite",
        "ACT Math": r"act.*math",
        "ACT English": r"act.*english",
        "ACT Writing": r"act.*writing",
        "ACT Science": r"act.*science",
        "ACT Reading": r"act.*reading"
    }

    score_data = {label: {'25th': None, '50th': None, '75th': None} for label in regex_map}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            text = page.extract_text() or ""

            act_writing_present = "act writing" in text.lower()

            for table in tables:
                 for row in table:
                    cleaned_row = [str(cell).lower().strip() if cell else "" for cell in row]
                    row_label = cleaned_row[0]

                    matched_label = None
                    for label, pattern in regex_map.items():
                        if re.search(pattern, row_label):
                            matched_label = label
                            break

                    if not matched_label and re.fullmatch(r"writing", row_label):
                        matched_label = "SAT EBRW"

                    if matched_label:
                        numeric_values = [cell for cell in cleaned_row[1:] if cell.replace('.', '', 1).isdigit()]
                        for i, key in enumerate(['25th', '50th', '75th']):
                            if i < len(numeric_values):
                                score_data[matched_label][key] = numeric_values[i]

    return score_data

In [21]:
extract_sat_act_scores(gatech_file_path)

{'SAT Composite': {'25th': '1370', '50th': '1460', '75th': '1530'},
 'SAT EBRW': {'25th': '680', '50th': '720', '75th': '750'},
 'SAT Math': {'25th': '690', '50th': '760', '75th': '790'},
 'ACT Composite': {'25th': '30', '50th': '33', '75th': '34'},
 'ACT Math': {'25th': '29', '50th': '32', '75th': '35'},
 'ACT English': {'25th': '31', '50th': '34', '75th': '35'},
 'ACT Writing': {'25th': '8', '50th': '8', '75th': '9'},
 'ACT Science': {'25th': '29', '50th': '33', '75th': '35'},
 'ACT Reading': {'25th': '31', '50th': '34', '75th': '35'}}

In [23]:
extract_sat_act_scores(umn_file_path)

{'SAT Composite': {'25th': '1328', '50th': '1400', '75th': '1460'},
 'SAT EBRW': {'25th': '640', '50th': '680', '75th': '720'},
 'SAT Math': {'25th': '670', '50th': '720', '75th': '760'},
 'ACT Composite': {'25th': '27', '50th': '29', '75th': '31'},
 'ACT Math': {'25th': '25', '50th': '28', '75th': '31'},
 'ACT English': {'25th': '25', '50th': '28', '75th': '33'},
 'ACT Writing': {'25th': '7', '50th': '8', '75th': '9'},
 'ACT Science': {'25th': '25', '50th': '28', '75th': '32'},
 'ACT Reading': {'25th': '27', '50th': '31', '75th': '34'}}

In [24]:
extract_sat_act_scores(nyu_file_path)

{'SAT Composite': {'25th': None, '50th': None, '75th': None},
 'SAT EBRW': {'25th': None, '50th': None, '75th': None},
 'SAT Math': {'25th': None, '50th': None, '75th': None},
 'ACT Composite': {'25th': None, '50th': None, '75th': None},
 'ACT Math': {'25th': None, '50th': None, '75th': None},
 'ACT English': {'25th': None, '50th': None, '75th': None},
 'ACT Writing': {'25th': None, '50th': None, '75th': None},
 'ACT Science': {'25th': None, '50th': None, '75th': None},
 'ACT Reading': {'25th': None, '50th': None, '75th': None}}

In [25]:
extract_sat_act_scores(uga_file_path)

{'SAT Composite': {'25th': '1230', '50th': '1320', '75th': '1410'},
 'SAT EBRW': {'25th': '620', '50th': '670', '75th': '710'},
 'SAT Math': {'25th': '600', '50th': '660', '75th': '710'},
 'ACT Composite': {'25th': '27', '50th': '7', '75th': '3'},
 'ACT Math': {'25th': '25', '50th': '28', '75th': '31'},
 'ACT English': {'25th': '26', '50th': '31', '75th': '34'},
 'ACT Writing': {'25th': None, '50th': None, '75th': None},
 'ACT Science': {'25th': '25', '50th': '29', '75th': '32'},
 'ACT Reading': {'25th': '28', '50th': '32', '75th': '34'}}

C7. GPA

In [105]:
def extract_gpa_data(file_path):
    gpa_keys = [
        "4.0", "3.75-3.99", "3.50-3.74", "3.25-3.49", "3.00-3.24",
        "2.50-2.99", "2.00-2.49", "1.00-1.99", "<1.0"
    ]
    gpa_result = {
        "GPA Ranges": {key: None for key in gpa_keys},
        "Average GPA": None
    }
    
    gpa_values = []

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            # Check if table is relevant
            for table in tables:
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if "percent who had gpa" not in table_str:
                    continue  
                # Extract GPA Ranges
                for row in table:
                    if not row or len(row) < 2:
                        continue
                    row_text = ' '.join(str(cell or '').lower() for cell in row)
                    if "percent who had gpa" in row_text:
                        # Check only the rightmost column
                        rightmost = row[-1]
                        if rightmost:
                            match = re.search(r'\d{1,3}\.\d{1,2}', str(rightmost))
                        if match:
                            gpa_values.append(clean_float(match.group(0)))

            # Extract Average GPA
            text = page.extract_text()
            if text:
                lines = text.split('\n')
                for i, line in enumerate(lines):
                    if "average high school gpa" in line.lower():
            # Search next line if splited into more than a line
                        match = re.search(r'(\d\.\d{1,2})', line)
                        if not match and i + 1 < len(lines):
                            match = re.search(r'(\d\.\d{1,2})', lines[i + 1])
                        if match:
                            gpa_result["Average GPA"] = clean_float(match.group(1))
                            break
    # Mapping
    for key, value in zip(gpa_keys, gpa_values):
        gpa_result["GPA Ranges"][key] = value

    return gpa_result

In [103]:
extract_gpa_data(gatech_file_path)

{'GPA Ranges': {'4.0': 92.54,
  '3.75-3.99': 5.26,
  '3.50-3.74': 1.13,
  '3.25-3.49': 0.33,
  '3.00-3.24': 0.25,
  '2.50-2.99': 0.43,
  '2.00-2.49': 0.06,
  '1.00-1.99': 0.0,
  '<1.0': 0.0},
 'Average GPA': 4.14}

In [107]:
extract_gpa_data(umn_file_path)

{'GPA Ranges': {'4.0': None,
  '3.75-3.99': None,
  '3.50-3.74': None,
  '3.25-3.49': None,
  '3.00-3.24': None,
  '2.50-2.99': None,
  '2.00-2.49': None,
  '1.00-1.99': None,
  '<1.0': None},
 'Average GPA': None}

In [92]:
# Manually enter GPA Ranges - Not in a table form
extract_gpa_data(nyu_file_path)

{'GPA Ranges': {'4.0': None,
  '3.75-3.99': None,
  '3.50-3.74': None,
  '3.25-3.49': None,
  '3.00-3.24': None,
  '2.50-2.99': None,
  '2.00-2.49': None,
  '1.00-1.99': None,
  '<1.0': None},
 'Average GPA': 3.81}

In [93]:
extract_gpa_data(uga_file_path)

{'GPA Ranges': {'4.0': 81.87,
  '3.75-3.99': 15.56,
  '3.50-3.74': 1.76,
  '3.25-3.49': 0.33,
  '3.00-3.24': 0.2,
  '2.50-2.99': 0.2,
  '2.00-2.49': 0.08,
  '1.00-1.99': 0.0,
  '<1.0': 0.0},
 'Average GPA': 4.14}

C8. Early Decision

In [107]:
def extract_section_containing(file_path, keyword):
    section_text = ""

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text and keyword.lower() in text.lower():
                section_text += text.lower() + "\n"
    return section_text.strip()

# Return "Yes" or "No" based on presence of marks
def get_marked_choice(section_text):
    if "☒ no" in section_text or "☑ no" in section_text:
        return "No"
    if "☒ yes" in section_text or "☑ yes" in section_text:
        return "Yes"
    return None

# If 'yes', 'no', and 'x' exist in the section, infer the answer based on proximity of 'x' to either 'yes' or 'no'.
def infer_choice_from_x(section_text):
    if "yes" in section_text and "no" in section_text and "x" in section_text:
        yes_index = section_text.find("yes")
        no_index = section_text.find("no")
        x_index = section_text.find("x")

        if abs(x_index - no_index) < abs(x_index - yes_index):
            return "No"
        else:
            return "Yes"
    return None

def extract_early_decision(file_path):
    result = {"Early Decision": None}
    section_text = extract_section_containing(file_path, "C21")

    if not section_text:
        result["Early Decision"] = "Unknown"
        return result

    # Step 1: Check for marked options
    marked = get_marked_choice(section_text)
    if marked:
        result["Early Decision"] = marked
        return result

   # Step 2: Infer using proximity of 'x' to 'yes' or 'no'
    inferred = infer_choice_from_x(section_text)
    if inferred:
        result["Early Decision"] = inferred
        return result

    # Step 3: If only 'yes' exists, assume "Yes"
    if "yes" in section_text and "no" not in section_text:
        result["Early Decision"] = "Yes"
        return result

    # Step 4: No clear answer found
    result["Early Decision"] = "Unknown"
    return result

In [109]:
extract_early_decision(gatech_file_path)

{'Early Decision': 'No'}

In [110]:
extract_early_decision(umn_file_path)

{'Early Decision': 'No'}

In [111]:
extract_early_decision(nyu_file_path)

{'Early Decision': 'Yes'}

In [102]:
extract_early_decision(uga_file_path)

{'Early Decision': 'No'}

C9. Superscore

C10. SAT/ACT Requirement

In [153]:
def classify_requirement(text):
    text = text.lower().replace('\n', ' ')
    if "required to be considered" in text:
        return "Required"
    elif "required for some" in text:
        return "Required for some"
    elif "recommended" in text:
        return "Recommended"
    elif "not required" in text:
        return "Optional"
    elif "not considered" in text:
        return "Not considered"
    return "Unknown"

# Nyu case
def extract_requirement_from_text_lines(text_lines, keyword):
    for i, line in enumerate(text_lines):
        if keyword.lower() in line.lower():
            if i + 1 < len(text_lines):
                next_line = text_lines[i + 1].strip()
                return classify_requirement(next_line)
    return "Unknown"

# Nyu case
def extract_from_text(pdf):
    labels = {
        "SAT or ACT": "SAT and/or ACT",
        "ACT Only": "ACT Only",
        "SAT Only": "SAT Only"
    }
    result = {f"{label} Requirement": "Unknown" for label in labels}

    full_text = ""
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            full_text += "\n" + text

    lines = full_text.split("\n")

    for label, keyword in labels.items():
        result[f"{label} Requirement"] = extract_requirement_from_text_lines(lines, keyword)

    return result

# Gatech, Umn case
def extract_from_table(pdf):
    labels = ["SAT or ACT", "ACT Only", "SAT Only"]
    result = {f"{label} Requirement": "Unknown" for label in labels}

    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            if not table or len(table) < 2:
                continue

            table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
            if "sat or act" not in table_str:
                continue

            headers = [str(cell).strip().lower() if cell else "" for cell in table[0]]

            for row in table[1:]:
                row_cells = [str(cell).strip().lower() if cell else "" for cell in row]
                for label in labels:
                    if label.lower() in row_cells[0]:
                        for i in range(1, len(row)):
                            if i < len(headers) and is_marked(row[i]):
                                category = classify_requirement(headers[i])
                                if category:
                                    result[f"{label} Requirement"] = category
                                    break
    return result

def extract_sat_act_required(file_path):
    with pdfplumber.open(file_path) as pdf:
        table_result = extract_from_table(pdf)
        if all(value == "Unknown" for value in table_result.values()):
            return extract_from_text(pdf)
        return table_result

In [155]:
extract_sat_act_required(gatech_file_path)

{'SAT or ACT Requirement': 'Required',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [156]:
extract_sat_act_required(umn_file_path)

{'SAT or ACT Requirement': 'Optional',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}

In [157]:
extract_sat_act_required(nyu_file_path)

{'SAT or ACT Requirement': 'Recommended',
 'ACT Only Requirement': 'Optional',
 'SAT Only Requirement': 'Optional'}

In [158]:
# Manually enter SAT/ACT requirements - Not in a table form
extract_sat_act_required(uga_file_path)

{'SAT or ACT Requirement': 'Unknown',
 'ACT Only Requirement': 'Unknown',
 'SAT Only Requirement': 'Unknown'}