In [4]:
import os # Used for creating folders and reading files
import re # Used for regular expressions
import pdfplumber # Used for extracting text from PDF files
import pandas as pd

In [5]:
# Folder paths used by the notebook (both are relative paths).

input_folder = 'pdf'  # folder the cells below read PDF files from
output_folder = 'output'  # NOTE(review): not referenced in the visible cells; presumably where results get written, confirm

In [6]:
# Remove None and empty string from row

def clean_row(row):
    """Return the cells of ``row`` in their original order, without blanks.

    A cell counts as blank when it is ``None`` or the empty string ``''``.
    Everything else (including whitespace-only strings and ``0``) is kept.
    The input is not modified; a new list is returned.
    """
    kept = []
    for cell in row:
        if cell in (None, ''):
            continue
        kept.append(cell)
    return kept

In [7]:
# Extract only numbers from the cell

def clean_number(cell):
    """Convert a table cell to an int built from its digit characters only.

    The cell is turned into a string, every digit character is collected in
    order, and the digits are concatenated. Separators such as ',' or spaces
    disappear ('1,234' -> 1234), and so do signs and decimal points
    ('12.5' -> 125). A cell with no digits at all (including ``None``)
    yields 0.
    """
    digits = ''.join(re.findall(r'\d', str(cell)))
    if not digits:
        return 0
    return int(digits)

### Gender

In [None]:
def extract_gender_data(file_path):
    """Sum applicant and admitted counts by gender from the tables of a PDF.

    Every table on every page is read with pdfplumber. A table is used only
    when its lower-cased text contains at least one gender keyword and both
    'first-time' and 'first-year'. Inside such a table, only rows whose text
    contains 'applied' or 'admitted' contribute to the totals.

    Args:
        file_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A dict with the keys 'applicants' and 'admitted'; each maps
        'men', 'women' and 'others' to an int total.
    """
    categories = ('men', 'women', 'others')
    totals = {group: dict.fromkeys(categories, 0) for group in ('applicants', 'admitted')}

    gender_markers = ('men', 'women', 'another gender', 'unknown gender')
    cohort_markers = ('first-time', 'first-year')
    # 'applied' is tested first, so a row mentioning both words counts as applicants
    row_markers = (('applied', 'applicants'), ('admitted', 'admitted'))

    with pdfplumber.open(file_path) as document:
        for page in document.pages:
            for table in page.extract_tables():
                # Flatten the whole table to one lower-cased string for the keyword test
                table_text = ' '.join(
                    ' '.join(str(cell or '') for cell in line).lower()
                    for line in table
                )
                has_gender = any(marker in table_text for marker in gender_markers)
                has_cohort = all(marker in table_text for marker in cohort_markers)
                if not has_gender or not has_cohort:
                    continue  # not a first-time/first-year gender table

                for raw_row in table:
                    cells = clean_row(raw_row)
                    text = ' '.join(str(cell or '') for cell in cells).lower()

                    group = next((name for marker, name in row_markers if marker in text), None)
                    if group is None:
                        continue
                    counts = totals[group]

                    if len(cells) > 2:
                        # Column-based layout: the first three non-empty cells are
                        # assumed to be men, women and others, in that order
                        for category, cell in zip(categories, cells):
                            counts[category] += clean_number(cell)
                        continue

                    # Row-based layout: the count is the last non-empty cell.
                    # 'women' is tested first because it contains 'men' as a substring.
                    if 'women' in text:
                        counts['women'] += clean_number(cells[-1])
                    elif 'men' in text:
                        counts['men'] += clean_number(cells[-1])
                    elif 'another gender' in text or 'unknown gender' in text:
                        counts['others'] += clean_number(cells[-1])

    return totals

In [203]:
# Run the gender extraction on gatech.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'gatech.pdf')
data = extract_gender_data(file_path)
data

{'applicants': {'men': 40022, 'women': 19765, 'others': 2},
 'admitted': {'men': 4634, 'women': 3779, 'others': 0}}

In [204]:
# Run the gender extraction on umn.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'umn.pdf')
data = extract_gender_data(file_path)
data

{'applicants': {'men': 18901, 'women': 20862, 'others': 0},
 'admitted': {'men': 14427, 'women': 16218, 'others': 0}}

In [205]:
# Run the gender extraction on uga.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'uga.pdf')
data = extract_gender_data(file_path)
data

{'applicants': {'men': 18211, 'women': 25191, 'others': 14},
 'admitted': {'men': 6260, 'women': 9882, 'others': 6}}

### Residency

In [None]:
def extract_residency_data(file_path):
    """Sum applicant and admitted counts by residency from the tables of a PDF.

    Every table on every page is read with pdfplumber. A table is used only
    when its lower-cased text contains at least one residency keyword and both
    'first-time' and 'first-year'. The first row of such a table is treated as
    the header: after blank cells are removed, the position of each residency
    label in it is recorded. Each later row whose text contains 'applied' or
    'admitted' is then cleaned the same way and read at those positions.

    A position that lies beyond the end of a cleaned data row is skipped for
    that row instead of raising ``IndexError`` (this happens when a data row
    has fewer non-empty cells than the header).

    Args:
        file_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A dict with the keys 'applicants' and 'admitted'; each maps
        'in_state', 'out_of_state', 'international' and 'others' to an int total.
    """
    result = {'applicants': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0},
              'admitted': {'in_state': 0, 'out_of_state': 0, 'international': 0, 'others': 0}}

    residency_keywords = ['in-state', 'out-of-state', 'international']
    first_keywords = ['first-time', 'first-year']
    # Header substring -> result key, tested in this order; the first match wins for a cell
    header_markers = (('in-', 'in_state'), ('out-', 'out_of_state'),
                      ('inter', 'international'), ('unk', 'others'))

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():

                # Flatten the whole table to one lower-cased string for the keyword test
                table_str = ' '.join(' '.join(str(cell or '') for cell in row).lower() for row in table)

                # Skip non-residency table
                if not (any(keyword in table_str for keyword in residency_keywords)
                        and all(keyword in table_str for keyword in first_keywords)):
                    continue

                # Header row with blank cells removed, so positions refer to non-empty cells only
                header_row = clean_row([str(cell or '').lower() for cell in table[0]])

                # Position of each residency column that the header actually contains;
                # if a label occurs more than once, the last occurrence is kept
                col_idx = {}
                for idx, col in enumerate(header_row):
                    for marker, category in header_markers:
                        if marker in col:
                            col_idx[category] = idx
                            break

                # Loop through data rows
                for row in table[1:]:  # skip header
                    row = clean_row(row)
                    row_str = ' '.join(str(cell or '') for cell in row).lower()

                    # Determine target group ('applied' is tested before 'admitted')
                    if 'applied' in row_str:
                        target = 'applicants'
                    elif 'admitted' in row_str:
                        target = 'admitted'
                    else:
                        continue

                    # Extract numbers, ignoring columns this row is too short to contain
                    for category, idx in col_idx.items():
                        if idx < len(row):
                            result[target][category] += clean_number(row[idx])

    return result


In [41]:
# Run the residency extraction on gatech.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'gatech.pdf')
data = extract_residency_data(file_path)
data

{'applicants': {'in_state': 10674,
  'out_of_state': 38320,
  'international': 10795,
  'others': 0},
 'admitted': {'in_state': 3536,
  'out_of_state': 3992,
  'international': 885,
  'others': 0}}

In [42]:
# Run the residency extraction on umn.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'umn.pdf')
data = extract_residency_data(file_path)
data

{'applicants': {'in_state': 13982,
  'out_of_state': 19461,
  'international': 1256,
  'others': 0},
 'admitted': {'in_state': 10498,
  'out_of_state': 15400,
  'international': 982,
  'others': 0}}

In [44]:
# Run the residency extraction on uga.pdf inside input_folder.
# The bare `data` on the last line makes the returned dict the cell's displayed output.
file_path = os.path.join(input_folder, 'uga.pdf')
data = extract_residency_data(file_path)
data

{'applicants': {'in_state': 18210,
  'out_of_state': 23867,
  'international': 1338,
  'others': 1},
 'admitted': {'in_state': 9149,
  'out_of_state': 6709,
  'international': 289,
  'others': 1}}

In [26]:
# Fetch and process every PDF file found in the input folder.
# NOTE(review): extract_data_from_pdf is not defined in any visible cell; confirm it exists
# (or switch to the extractors defined above) before running this cell on a fresh kernel.
for filename in os.listdir(input_folder):
    if filename.endswith('.pdf'):  # process PDF files only
        file_path = os.path.join(input_folder, filename)  # build the full path
        data = extract_data_from_pdf(file_path)           # result of processing one PDF
        print(data)

TypeError: sequence item 2: expected str instance, NoneType found