In [11]:
import os # Used for creating folders and reading files
import re # Used for regular expressions
import pdfplumber # Used for extracting text from PDF files
import pandas as pd

In [12]:
# Set input and output folder paths

input_folder = 'pdf'
output_folder = 'output'

gatech_file_path = os.path.join(input_folder, 'gatech.pdf')
umn_file_path = os.path.join(input_folder, 'umn.pdf')
nyu_file_path = os.path.join(input_folder, 'nyu.pdf')
uga_file_path = os.path.join(input_folder, 'uga.pdf')

In [13]:
# Remove None and empty string from row

def clean_row(row):
    return [cell for cell in row if cell not in [None, '']]

In [14]:
# Extract only numbers from the cell

def clean_number(cell):
    cell = re.sub(r'[^\d]', '', str(cell)) # Remove all non-numeric characters
    return int(cell) if cell else 0

In [15]:
# Code to extract text from a specific page for verification

def extract_text_from_page(file_path, page_number):
    with pdfplumber.open(file_path) as pdf:
        # Check if the given page number is valid
        if page_number < 1 or page_number > len(pdf.pages):
            print(f"Invalid page number. This PDF has {len(pdf.pages)} pages.")
            return

        # Adjust for 0-based indexing in pdfplumber
        page = pdf.pages[page_number - 1]
        text = page.extract_text()

        # Print the extracted text if available
        if text:
            print(f"Text from Page {page_number}:\n")
            print(text)
        else:
            print(f"No text found on page {page_number}.")

In [16]:
# Extract text from a section like "C{#}." up to before "C{#+1}."

def extract_section_by_label(file_path, section_label):
    next_label = "c" + str(int(section_label[1:]) + 1)

    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.lower()

    # Match exact line starting with "c{#}."
    # ^ = matches the beginning of a line
    start_match = re.search(rf"({section_label.lower()}[\. ])", full_text)
    if not start_match:
        print(f"Section {section_label.upper()} not found.")
        return
    start_idx = start_match.start()

    # Look for the start of the next section
    next_match = re.search(rf"({next_label}[\. ])", full_text[start_idx:])
    end_idx = start_idx + next_match.start() if next_match else None # If we don't find the next label, go to the end of the document

    section_text = full_text[start_idx:end_idx]

    return section_text

### C1. Applications (Gender, Residency)

##### Gender

In [17]:
def extract_gender_data(file_path):
    
    result = {
        'applicants': {'men': 0, 'women': 0, 'others': 0},
        'admitted': {'men': 0, 'women': 0, 'others': 0}
    }
    
    # Define keywords to identify gender-related and first-year-related tables
    gender_keywords = ['men', 'women', 'another gender', 'unknown gender']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            
            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in gender_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Loop through data rows
                for row in table:
                    row = clean_row(row) # Clean up null and empty cells
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                    # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add gender-specific count from the last column
                    if 'men' in row_str and 'women' not in row_str:
                        result[target]['men'] += clean_number(row[-1])
                    elif 'women' in row_str:
                        result[target]['women'] += clean_number(row[-1])
                    elif 'another gender' in row_str or 'unknown gender' in row_str:
                        result[target]['others'] += clean_number(row[-1])
                            
    return result

In [18]:
extract_gender_data(gatech_file_path)

{'applicants': {'men': 40022, 'women': 19765, 'others': 2},
 'admitted': {'men': 4634, 'women': 3779, 'others': 0}}

In [19]:
extract_gender_data(umn_file_path)

{'applicants': {'men': 18901, 'women': 20862, 'others': 0},
 'admitted': {'men': 14427, 'women': 16218, 'others': 0}}

In [20]:
extract_gender_data(nyu_file_path) # ^^

{'applicants': {'men': 0, 'women': 0, 'others': 0},
 'admitted': {'men': 0, 'women': 0, 'others': 0}}

In [21]:
extract_gender_data(uga_file_path)

{'applicants': {'men': 18211, 'women': 25191, 'others': 14},
 'admitted': {'men': 6260, 'women': 9882, 'others': 6}}

##### Residency

In [22]:
def extract_residency_data(file_path):
    
    result = {
        'applicants': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0},
        'admitted': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0}
    }

    # Define keywords to identify residency-related and first-year-related tables
    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                
                # Check if this table is relevant
                table_str = ' '.join(str(cell).lower() for row in table for cell in row if cell)
                if not (any(keyword in table_str for keyword in residency_keywords) and all(keyword in table_str for keyword in first_keywords)):
                    continue
                
                # Extract header row
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Find index of each column
                col_idx = {'in_state': -1, 'out_of_state': -1, 'international': -1, 'others': -1}
                for idx, col in enumerate(header_row):
                    if 'in-' in col:
                        col_idx['in_state'] = idx
                    elif 'out-' in col:
                        col_idx['out_of_state'] = idx
                    elif 'inter' in col:
                        col_idx['international'] = idx
                    elif 'unk' in col:
                        col_idx['others'] = idx

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '').lower() for cell in row)

                   # Identify whether this row is about applicants or admitted students
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue # Skip if not relevant
                    
                    # Add residency-specific count
                    if col_idx['in_state'] != -1:
                        result[target]['in_state'] += clean_number(row[col_idx['in_state']])
                    if col_idx['out_of_state'] != -1:
                        result[target]['out_of_state'] += clean_number(row[col_idx['out_of_state']])
                    if col_idx['international'] != -1:
                        result[target]['international'] += clean_number(row[col_idx['international']])
                    if col_idx['others'] != -1:
                        result[target]['others'] += clean_number(row[col_idx['others']])

    return result


In [23]:
extract_residency_data(gatech_file_path)

{'applicants': {'in_state': 10674,
  'out_of_state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in_state': 3536,
  'out_of_state': 3992,
  'international': 885,
  'others': 0}}

In [24]:
extract_residency_data(umn_file_path)

{'applicants': {'in_state': 13982,
  'out_of_state': 19461,
  'international': 1256,
  'others': 0},
 'admitted': {'in_state': 10498,
  'out_of_state': 15400,
  'international': 982,
  'others': 0}}

In [25]:
extract_residency_data(uga_file_path)

{'applicants': {'in_state': 18210,
  'out_of_state': 23867,
  'international': 1338,
  'others': 1},
 'admitted': {'in_state': 9149,
  'out_of_state': 6709,
  'international': 289,
  'others': 1}}

##### Applicant Summary

In [26]:
def extract_applicant_summary(file_path):
    gender_data = extract_gender_data(file_path)
    residency_data = extract_residency_data(file_path)

    result = {
        'applicants': {
            'gender': gender_data.get('applicants', {}),
            'residency': residency_data.get('applicants', {})
        },
        'admitted': {
            'gender': gender_data.get('admitted', {}),
            'residency': residency_data.get('admitted', {})
        }
    }

    return result


In [40]:
extract_applicant_summary(gatech_file_path)

{'applicants': {'gender': {'men': 40022, 'women': 19765, 'others': 2},
  'residency': {'in_state': 10674,
   'out_of_state': 38320,
   'international': 10795,
   'others': 0}},
 'admitted': {'gender': {'men': 4634, 'women': 3779, 'others': 0},
  'residency': {'in_state': 3536,
   'out_of_state': 3992,
   'international': 885,
   'others': 0}}}

In [41]:
extract_applicant_summary(umn_file_path)

{'applicants': {'gender': {'men': 18901, 'women': 20862, 'others': 0},
  'residency': {'in_state': 13982,
   'out_of_state': 19461,
   'international': 1256,
   'others': 0}},
 'admitted': {'gender': {'men': 14427, 'women': 16218, 'others': 0},
  'residency': {'in_state': 10498,
   'out_of_state': 15400,
   'international': 982,
   'others': 0}}}

In [42]:
extract_applicant_summary(uga_file_path)

{'applicants': {'gender': {'men': 18211, 'women': 25191, 'others': 14},
  'residency': {'in_state': 18210,
   'out_of_state': 23867,
   'international': 1338,
   'others': 1}},
 'admitted': {'gender': {'men': 6260, 'women': 9882, 'others': 6},
  'residency': {'in_state': 9149,
   'out_of_state': 6709,
   'international': 289,
   'others': 1}}}

### C3. High School Completetion requirement

In [27]:
def extract_highschool_requirement(file_path):

    result = {
        'high school diploma required': 0,
        'GED accepted': 0
    }

    # Define exact phrases expected in each case
    diploma_and_ged = "high school diploma is required and ged is accepted"
    diploma_only = "high school diploma is required and ged is not accepted"
    no_diploma_needed = "high school diploma or equivalent is not required"

    # Extract only the C3 section using helper function
    section_text = extract_section_by_label(file_path, "C3")
    if not section_text:
        return result  # Return default if section not found

    section_text = section_text.lower()
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if diploma_and_ged in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
            found_checked_option = True
            break
        elif diploma_only in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
            found_checked_option = True
            break
        elif no_diploma_needed in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['high school diploma required'] = 0
            result['GED accepted'] = 0
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if diploma_and_ged in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 1
        elif diploma_only in section_text:
            result['high school diploma required'] = 1
            result['GED accepted'] = 0
        elif no_diploma_needed in section_text:
            result['high school diploma required'] = 0
            result['GED accepted'] = 0

    return result

In [28]:
extract_highschool_requirement(gatech_file_path)

{'high school diploma required': 1, 'GED accepted': 0}

In [29]:
extract_highschool_requirement(umn_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [30]:
extract_highschool_requirement(nyu_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [31]:
extract_highschool_requirement(uga_file_path)

{'high school diploma required': 1, 'GED accepted': 1}

In [32]:
extract_section_by_label(uga_file_path, 'c3')

'c3. high school completion requirement\ncheck the appropriate box to identify your high school completion requirement for degree-seeking entering\nstudents:\n4 high school diploma is required and ged is accepted\n☐\nhigh school diploma is required and ged is not accepted\n☐\nhigh school diploma or equivalent is not required\n☐\n'

### C4. General College-Preparatory Program Requirement

In [33]:
def extract_college_prep_requirement(file_path):
    
    result = {
        'general college-preparatory program': 
            {
                'required': 0,
                'recommended': 0,
                'neither required or recommended': 0
            }
        }

    # Define key phrases to match
    required = "require"
    recommended = "recommend"
    neither = "neither require nor recommend"

    # Extract only the C4 section using helper function
    section_text = extract_section_by_label(file_path, 'C4')
    if not section_text:
        return result # Return default if section not found
    section_text = section_text.lower()
    
    found_checked_option = False # Will be set to True if a checkbox is detected

    # Go through each line to find checked options
    for line in section_text.split('\n'):
        # '4' is included because some pdf files incorrectly render checkmarks as '4'
        if neither in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['general college-preparatory program']['neither required or recommended'] = 1
            found_checked_option = True
            break
        elif required in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['general college-preparatory program']['required'] = 1
            found_checked_option = True
            break
        elif recommended in line and any(mark in line for mark in ['x', '☒', '✓', '✔', '4']):
            result['general college-preparatory program']['recommended'] = 1
            found_checked_option = True
            break

    # If no checkboxes were found, use a fallback approach based on keyword presence
    if not found_checked_option:
        if "neither require nor recommend" in section_text:
            result['general college-preparatory program']['neither required or recommended'] = 1
        elif "recommend" in section_text:
            result['general college-preparatory program']['recommended'] = 1
        elif "require" in section_text:
            result['general college-preparatory program']['required'] = 1

    return result

In [34]:
extract_college_prep_requirement(gatech_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [35]:
extract_college_prep_requirement(umn_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [36]:
extract_college_prep_requirement(nyu_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [37]:
extract_college_prep_requirement(uga_file_path)

{'general college-preparatory program': {'required': 1,
  'recommended': 0,
  'neither required or recommended': 0}}

In [38]:
extract_section_by_label(uga_file_path, 'c4')

'c4. does your institution require or recommend a general college-preparatory program for degree-seeking\nstudents?\n4 require\n☐\nrecommend\n☐\nneither require nor recommend\n☐\n'

### Automation

In [39]:
# input 폴더 안에 있는 모든 pdf 파일 가져와서 처리
for filename in os.listdir(input_folder):
    if filename.endswith('.pdf'):  # pdf 파일만 처리
        file_path = os.path.join(input_folder, filename)  # 경로 생성
        data = extract_data_from_pdf(file_path)           # pdf 1개 처리 결과
        print(data)

NameError: name 'extract_data_from_pdf' is not defined