In [None]:
# Install the third-party libraries used by this notebook
!pip install python-docx pymupdf konlpy pandas openpyxl reportlab

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━

In [25]:
import xml.etree.ElementTree as ET
import zipfile
from collections import defaultdict

import docx
import fitz
import pandas as pd
from google.colab import files
from konlpy.tag import Okt
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Shared KoNLPy Okt tagger instance; extract_proper_nouns (okt.pos) and
# extract_keywords_tfidf (okt.nouns) read this module-level name.
okt = Okt()

In [None]:
def _hwpx_text_runs(root):
    """Return the stripped, non-empty text of every ``t`` element under ``root``.

    Tags are compared by local name so namespaced tags such as
    ``{...paragraph}t`` match as well as a bare ``t``.
    """
    runs = []
    for elem in root.iter():
        tag = elem.tag
        if not isinstance(tag, str) or tag.rsplit("}", 1)[-1] != "t":
            continue
        # itertext() also picks up text that follows inline child markup.
        text = "".join(elem.itertext()).strip()
        if text:
            runs.append(text)
    return runs


def _hwpx_section_order(member_name):
    """Sort key that orders section members numerically (section2 before section10)."""
    digits = "".join(ch for ch in member_name if ch.isdigit())
    return (int(digits) if digits else 0, member_name)


def extract_hwpx(hwpx_file):
    """Extract the text runs of an HWPX document.

    An HWPX file is normally a ZIP package whose body text is stored in
    ``Contents/section*.xml`` inside namespaced ``t`` elements, so the
    archive is opened and each section is parsed in numeric order.  A path
    that is not a ZIP archive is parsed directly as a single XML file, which
    keeps the previous plain-XML behaviour working.

    Args:
        hwpx_file: Path to the ``.hwpx`` (or plain XML) file.

    Returns:
        A list of stripped, non-empty text strings in document order.  On any
        failure a single-element list holding the error message is returned
        instead of raising.
    """
    try:
        if zipfile.is_zipfile(hwpx_file):
            texts = []
            with zipfile.ZipFile(hwpx_file) as archive:
                section_names = sorted(
                    (
                        name
                        for name in archive.namelist()
                        if name.lower().startswith("contents/section")
                        and name.lower().endswith(".xml")
                    ),
                    key=_hwpx_section_order,
                )
                for name in section_names:
                    with archive.open(name) as section:
                        texts.extend(_hwpx_text_runs(ET.parse(section).getroot()))
            return texts

        return _hwpx_text_runs(ET.parse(hwpx_file).getroot())
    except Exception as e:
        return [f"오류 발생: {e}"]


def extract_docx(docx_file):
    """Collect the non-blank text of a Word document.

    Body paragraphs are gathered first, followed by the text of every table
    cell (table by table, row by row).  Each entry is whitespace-stripped
    and blank entries are skipped.

    Args:
        docx_file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        A list of text strings, or a single-element list holding the error
        message if anything goes wrong while reading the document.
    """
    try:
        document = docx.Document(docx_file)

        # Body paragraphs
        body_lines = [
            stripped
            for stripped in (para.text.strip() for para in document.paragraphs)
            if stripped
        ]

        # Text inside tables
        cell_lines = [
            stripped
            for table in document.tables
            for row in table.rows
            for stripped in (cell.text.strip() for cell in row.cells)
            if stripped
        ]

        return body_lines + cell_lines
    except Exception as e:
        return [f"오류 발생: {e}"]


def extract_pdf(pdf_file):
    """Extract the non-blank text lines of a PDF, page by page.

    The document is opened as a context manager so the underlying file
    handle is released even when a page fails to parse; previously the
    document was never closed.

    Args:
        pdf_file: Path passed to ``fitz.open``.

    Returns:
        A list of whitespace-stripped, non-empty lines in page order, or a
        single-element list holding the error message on failure.
    """
    try:
        texts = []
        with fitz.open(pdf_file) as doc:
            for page in doc:
                for line in page.get_text("text").split("\n"):
                    stripped = line.strip()
                    if stripped:
                        texts.append(stripped)
        return texts
    except Exception as e:
        return [f"오류 발생: {e}"]

In [29]:
def extract_proper_nouns(text_list):
    """Return the distinct multi-character nouns found in ``text_list``.

    Every text is tagged with the module-level ``okt`` tagger; tokens tagged
    ``"Noun"`` that are longer than one character are kept.  Note that this
    keeps every such noun, not only proper nouns.

    Args:
        text_list: Iterable of text strings.

    Returns:
        A sorted list of unique nouns.
    """
    found = {
        word
        for text in text_list
        for word, tag in okt.pos(text)
        if tag == "Noun" and len(word) > 1
    }
    return sorted(found)


def extract_keywords_tfidf(text_list, top_k=10):
    """Rank the nouns of ``text_list`` by mean TF-IDF score.

    Each non-blank text becomes one document made of its nouns (via the
    module-level ``okt`` tagger); the TF-IDF scores are averaged over all
    documents and the ``top_k`` highest-scoring terms are returned.

    Args:
        text_list: List of text strings (for example, lines of a document).
        top_k: Maximum number of keywords to return.

    Returns:
        A list of keywords, highest score first.  An empty list is returned
        when there is nothing to rank (no input text, or no usable nouns)
        instead of letting ``TfidfVectorizer`` raise on an empty vocabulary.
    """
    documents = [" ".join(okt.nouns(text)) for text in text_list if text.strip()]

    # Nothing to vectorize: every document is empty (or there are none).
    # Empty documents are otherwise kept so scores match the previous behaviour.
    if not any(documents):
        return []

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError as err:
        # The default tokenizer drops one-character tokens, so the vocabulary
        # can still end up empty even though some nouns were found.
        if "empty vocabulary" in str(err):
            return []
        raise

    scores = tfidf_matrix.mean(axis=0).A1
    word_scores = dict(zip(vectorizer.get_feature_names_out(), scores))
    top_keywords = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return [word for word, score in top_keywords]

In [43]:
# Upload a file and extract its text
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled; fail with a clear message instead of
    # an IndexError on the empty key list.
    raise ValueError("업로드된 파일이 없습니다.")

file_name = next(iter(uploaded))

# When a file with the same name already exists, Colab saves the new upload
# under a de-duplicated name (e.g. "name (1).pdf") while the returned dict
# appears to stay keyed by the original name -- so opening `file_name` would
# read the older copy.  Writing the uploaded bytes back to `file_name`
# guarantees the file on disk is the one that was just uploaded.
with open(file_name, "wb") as uploaded_fh:
    uploaded_fh.write(uploaded[file_name])

lowered_name = file_name.lower()
if lowered_name.endswith(".hwpx"):
    extracted_text = extract_hwpx(file_name)
elif lowered_name.endswith(".docx"):
    extracted_text = extract_docx(file_name)
elif lowered_name.endswith(".pdf"):
    extracted_text = extract_pdf(file_name)
else:
    # Report the problem instead of treating the message as document text,
    # which would turn the words of the message into "keywords".
    print("지원되지 않는 파일 형식입니다.")
    extracted_text = []

# NOTE: despite its name, `proper_nouns` holds the top TF-IDF keywords; the
# name is kept because later cells read it.
proper_nouns = (
    extract_keywords_tfidf(extracted_text, top_k=15) if extracted_text else []
)

Saving 6 Perceptron.pdf to 6 Perceptron (1).pdf


In [44]:
# 고유명사 추출 및 출력
# Initial consonants (choseong) in Unicode Hangul-syllable order.
_CHOSUNG_LIST = (
    "ㄱ", "ㄲ", "ㄴ", "ㄷ", "ㄸ", "ㄹ", "ㅁ", "ㅂ", "ㅃ", "ㅅ",
    "ㅆ", "ㅇ", "ㅈ", "ㅉ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
)

# Each initial consonant spans 21 vowels x 28 finals = 588 code points.
_SYLLABLES_PER_CHOSUNG = 588


def get_initial_consonant(word):
    """Return the initial consonant (choseong) of ``word``'s first character.

    Args:
        word: The word to classify.

    Returns:
        One of the 19 Hangul initial consonants when the first character is
        a precomposed Hangul syllable ("가".."힣"); otherwise "기타".  An
        empty string also yields "기타" instead of raising IndexError.
    """
    if not word:
        return "기타"

    first = word[0]
    if "가" <= first <= "힣":
        return _CHOSUNG_LIST[(ord(first) - ord("가")) // _SYLLABLES_PER_CHOSUNG]
    return "기타"


# Bucket each keyword under the initial consonant of its first character.
grouped = defaultdict(list)
for noun in proper_nouns:
    grouped[get_initial_consonant(noun)].append(noun)

# Convert to a DataFrame: one row per initial, keywords sorted and comma-joined.
joined_by_initial = {
    initial: ", ".join(sorted(words)) for initial, words in grouped.items()
}
grouped_df = pd.DataFrame({"Proper Nouns": joined_by_initial}).sort_index()

# Show the result
print(grouped_df)

  Proper Nouns
ㄱ   가중, 공간, 구조
ㄴ           노드
ㄷ           다중
ㅁ     메커니즘, 목적
ㅂ           변환
ㅅ           선형
ㅇ           은닉
ㅊ           차원
ㅌ           특징
ㅍ         퍼셉트론
ㅎ       학습, 함수


In [None]:
# Save the grouped keywords in the formats the user chooses.
#
# Fixes over the previous version:
#   * Excel/CSV now include the initial consonant as an "Initial" column;
#     writing with index=False used to drop it, leaving unlabeled rows.
#   * The PDF loop now iterates over the rows; iterating the DataFrame itself
#     yields column labels, so only the text "Proper Nouns" was written.
excel_choice = input("엑셀 파일로 저장하시겠습니까? (y/n): ").strip().lower()
if excel_choice == "y":
    df = grouped_df.rename_axis("Initial").reset_index()
    df.to_excel("grouped_df.xlsx", index=False)
    print("✔ grouped_df.xlsx 파일로 저장되었습니다.")

csv_choice = input("CSV 파일로 저장하시겠습니까? (y/n): ").strip().lower()
if csv_choice == "y":
    df = grouped_df.rename_axis("Initial").reset_index()
    # utf-8-sig writes a BOM (helps spreadsheet apps detect UTF-8 Korean text).
    df.to_csv("grouped_df.csv", index=False, encoding="utf-8-sig")
    print("✔ grouped_df.csv 파일로 저장되었습니다.")

pdf_choice = input("PDF 파일로 저장하시겠습니까? (y/n): ").strip().lower()
if pdf_choice == "y":
    # The font file must already exist at this path; it is registered so the
    # Korean text can be drawn with it.
    pdfmetrics.registerFont(TTFont("NanumGothic", "/content/NanumGothic.ttf"))
    c = canvas.Canvas("grouped_df.pdf", pagesize=A4)
    textobject = c.beginText(40, 800)
    textobject.setFont("NanumGothic", 12)

    # One line per initial consonant: "<initial>: <comma-joined keywords>".
    for initial, nouns in grouped_df["Proper Nouns"].items():
        textobject.textLine(f"{initial}: {nouns}")

    c.drawText(textobject)
    c.save()
    print("✔ grouped_df.pdf 파일로 저장되었습니다.")

엑셀 파일로 저장하시겠습니까? (y/n): n
CSV 파일로 저장하시겠습니까? (y/n): y
✔ grouped_df.csv 파일로 저장되었습니다.
PDF 파일로 저장하시겠습니까? (y/n): n
