In [None]:
import os # 폴더 만들고 파일 읽어올 때 사용
import re # 정규표현식
import pdfplumber # pdf에서 텍스트 추출
import pandas as pd

In [10]:
# Folder names: PDFs are read from the first, the second is the output folder.
input_folder, output_folder = 'pdf', 'output'

In [41]:
def clean_number(text):
    """Convert a numeric table cell such as ``'1,234'`` to an ``int``.

    Commas and surrounding whitespace are removed before the conversion.

    Args:
        text: Cell value, normally a string. ``None`` and other non-string
            values are accepted and treated as "no number".

    Returns:
        The parsed integer, or 0 when ``text`` cannot be parsed as a plain
        integer.
    """
    try:
        return int(text.replace(',', '').strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the cell is not a str (for example None).
        # ValueError: the cleaned text is not an integer.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0

In [42]:
def extract_number(pattern, text):
    """Return the first match of ``pattern`` in ``text`` as an integer.

    The search is case-insensitive. Commas are removed from the first item
    returned by ``re.findall`` before it is converted with ``int``.

    Args:
        pattern: Regular expression to search for.
        text: Text to search in.

    Returns:
        The first match as an ``int``, or 0 when nothing matches.
    """
    hits = re.findall(pattern, text, re.IGNORECASE)
    if not hits:
        return 0
    first_hit = hits[0]
    return int(first_hit.replace(',', ''))

In [55]:
def find_number(row):
    """Return the number in the right-most cell of ``row`` that has a digit.

    Cells are scanned from last to first. The first non-empty cell whose text
    contains a digit is handed to ``clean_number`` and that result is
    returned.

    Args:
        row: Sequence of table cells (strings or ``None``).

    Returns:
        The value ``clean_number`` produces for the selected cell, or 0 when
        no cell contains a digit.
    """
    cells_with_digits = (
        cell for cell in reversed(row)
        if cell and re.search(r'\d', str(cell))
    )
    # Selected cells are always truthy, so None can only mean "none found".
    candidate = next(cells_with_digits, None)
    if candidate is None:
        return 0
    return clean_number(candidate)

In [81]:
def extract_gender_data(file_path):
    """Sum applicant and admitted counts by gender from the tables of a PDF.

    Every table row on every page is joined into one lower-cased string.
    Rows containing 'who applied' count toward '지원자' (applicants) and rows
    containing 'who were admitted' toward '합격자' (admitted); other rows are
    ignored. The gender is chosen from the words in the row and the count is
    taken with ``find_number``.

    Args:
        file_path: Path of the PDF to open with pdfplumber.

    Returns:
        A list of two dicts, applicants first and admitted second, each with
        the keys '구분', '남', '여' and '기타'.
    """
    groups = ('지원자', '합격자')
    totals = {group: {'남': 0, '여': 0, '기타': 0} for group in groups}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    line = ' '.join(str(cell or '') for cell in row).lower()

                    if 'who applied' in line:
                        group = '지원자'
                    elif 'who were admitted' in line:
                        group = '합격자'
                    else:
                        continue

                    # 'men' is a substring of 'women', so test 'women' first;
                    # the 'men' branch then only sees rows without 'women'.
                    if 'women' in line:
                        gender = '여'
                    elif 'men' in line:
                        gender = '남'
                    elif 'another gender' in line or 'unknown gender' in line:
                        gender = '기타'
                    else:
                        continue

                    totals[group][gender] += find_number(row)

    return [{'구분': group, **totals[group]} for group in groups]


In [78]:
def extract_gender_data(file_path):
    """Sum applicant and admitted counts by gender from the tables of a PDF.

    Two table layouts are handled:

    * Column layout (called "NYU style" in the original comments): a header
      row names the gender columns, and each 'who applied' /
      'who were admitted' row holds one count per gender column.
    * Row layout ("Gatech / UMN style"): each row names one gender and the
      count is taken from the row with ``find_number``.

    'men' and 'women' are matched as whole words. 'men' is a substring of
    'women', so a plain ``in`` test cannot tell a men cell from a women cell
    and would treat every row that mentions women as a header row.

    Args:
        file_path: Path of the PDF to open with pdfplumber.

    Returns:
        A list of two dicts, '지원자' (applicants) first and '합격자'
        (admitted) second, each with the keys '구분', '남', '여' and '기타'.
    """
    men_re = re.compile(r'\bmen\b')
    women_re = re.compile(r'\bwomen\b')

    def mentions_other(text):
        return 'another gender' in text or 'unknown gender' in text

    result = {'지원자': {'남': 0, '여': 0, '기타': 0},
              '합격자': {'남': 0, '여': 0, '기타': 0}}

    # Column index of each gender, filled in when a header row is seen.
    # NOTE(review): as in the original, this mapping is kept across tables
    # and pages rather than being reset per table - confirm that is intended.
    gender_cols = {}  # {'남': idx, '여': idx, '기타': idx}

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    row_str = ' '.join(str(cell or '') for cell in row).lower()
                    has_men = men_re.search(row_str) is not None
                    has_women = women_re.search(row_str) is not None

                    # Header row of the column layout: names both genders.
                    if has_men and has_women:
                        for idx, cell in enumerate(row):
                            cell_text = str(cell or '').lower()
                            if men_re.search(cell_text):
                                gender_cols['남'] = idx
                            if women_re.search(cell_text):
                                gender_cols['여'] = idx
                            if mentions_other(cell_text):
                                gender_cols['기타'] = idx
                        continue  # a header row carries no counts

                    # Applicants vs. admitted; every other row is ignored.
                    if 'who applied' in row_str:
                        target = '지원자'
                    elif 'who were admitted' in row_str:
                        target = '합격자'
                    else:
                        continue

                    if gender_cols and max(gender_cols.values()) < len(row):
                        # Column layout: read only the columns that a header
                        # actually identified. A gender without a known
                        # column contributes nothing (instead of reading the
                        # last cell of the row).
                        for gender, idx in gender_cols.items():
                            result[target][gender] += clean_number(row[idx])
                    else:
                        # Row layout: has_men and has_women cannot both be
                        # true here, that case was consumed as a header.
                        if has_men:
                            result[target]['남'] += find_number(row)
                        if has_women:
                            result[target]['여'] += find_number(row)
                        if mentions_other(row_str):
                            result[target]['기타'] += find_number(row)

    return [{'구분': '지원자', '남': result['지원자']['남'], '여': result['지원자']['여'], '기타': result['지원자']['기타']},
            {'구분': '합격자', '남': result['합격자']['남'], '여': result['합격자']['여'], '기타': result['합격자']['기타']}]


In [82]:
# Run the extraction on gatech.pdf from the input folder.
file_path = os.path.join(input_folder, 'gatech.pdf')
data = extract_gender_data(file_path)

In [83]:
# Show the rows extracted from gatech.pdf as the cell output.
data

[{'구분': '지원자', '남': 40022, '여': 19765, '기타': 2},
 {'구분': '합격자', '남': 4634, '여': 3779, '기타': 0}]

In [84]:
# Run the extraction on umn.pdf from the input folder.
file_path = os.path.join(input_folder, 'umn.pdf')
data = extract_gender_data(file_path)

In [85]:
# Show the rows extracted from umn.pdf as the cell output.
data

[{'구분': '지원자', '남': 18901, '여': 20862, '기타': 0},
 {'구분': '합격자', '남': 14427, '여': 16218, '기타': 0}]

In [76]:
# Run the extraction on nyu.pdf from the input folder.
# NOTE(review): the output recorded for this run is all zeros. This cell has
# execution count 76, lower than the counts on the extract_gender_data cells
# in this file (78 and 81), so it presumably ran against an earlier
# definition - re-run to confirm.
file_path = os.path.join(input_folder, 'nyu.pdf')
data = extract_gender_data(file_path)

In [77]:
# Show the rows extracted from nyu.pdf as the cell output.
data

[{'구분': '지원자', '남': 0, '여': 0, '기타': 0},
 {'구분': '합격자', '남': 0, '여': 0, '기타': 0}]

In [26]:
# Process every PDF file found in the input folder and print each result.
for filename in os.listdir(input_folder):
    if not filename.endswith('.pdf'):
        continue  # only PDF files are processed
    file_path = os.path.join(input_folder, filename)  # build the full path
    data = extract_data_from_pdf(file_path)  # result for this one PDF
    print(data)

TypeError: sequence item 2: expected str instance, NoneType found