<a href="https://colab.research.google.com/github/infocz-lucy/colab-test/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# PDF 처리 및 OCR
!pip install pdfplumber pdf2image pytesseract

# DOCX 처리
!pip install python-docx

# 텍스트 전처리
!pip install konlpy nltk spacy transformers

# RDF 변환
!pip install rdflib

# Neo4j Python 드라이버
!pip install neo4j

# OpenAI API
!pip install openai

# 한국어 형태소 분석기 Mecab 설치 (Google Colab 전용)
!apt-get update
!apt-get install g++ openjdk-8-jdk
!pip3 install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

# mecab-python의 버전 오류로 인해 아래 패키지를 설치하면 코랩에서 Mecab을 사용가능
!pip install mecab-python3

# SpaCy 영어 모델 다운로드
!python -m spacy download en_core_web_sm


In [None]:
# NOTE(review): transformers is already listed in the setup cell's
# "pip install konlpy nltk spacy transformers" line, so this cell appears
# to be redundant - confirm before removing.
!pip install transformers

In [36]:
# 필요한 라이브러리 임포트
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from docx import Document
from konlpy.tag import Mecab
import re
from nltk.corpus import stopwords
import nltk
import spacy

# Load the Korean morphological analyzer and the English NLP model
mecab = Mecab()
nlp_en = spacy.load("en_core_web_sm")
# Download the NLTK stopword corpus before it is read on the next line
nltk.download("stopwords")
stopwords_en = set(stopwords.words("english"))

# Example PDF and DOCX file paths
pdf_path = "/content/sample_data/raw2.pdf"
docx_path = "example.docx"

# PDF text extraction with OCR fallback
def extract_text_from_pdf(pdf_path, ocr_dpi=500, ocr_lang="kor+eng"):
    """Return the text of a PDF, falling back to OCR when no text is extracted.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_dpi: Resolution passed to ``convert_from_path`` when the OCR
            fallback renders the pages to images (default 500).
        ocr_lang: Language string passed to ``pytesseract.image_to_string``
            (default ``"kor+eng"``).

    Returns:
        The extracted text with leading/trailing whitespace stripped. Pages
        are separated by a newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against pages for which extract_text() gives nothing
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last line of one page is not glued onto the
    # first line of the next page (e.g. "...한다.Article 4").
    text = "\n".join(page_texts).strip()

    # Strip before testing: a text layer made only of whitespace is treated
    # the same as no text layer (image-only PDF), so OCR is used.
    if not text:
        images = convert_from_path(pdf_path, dpi=ocr_dpi)
        text = "\n".join(
            pytesseract.image_to_string(img, lang=ocr_lang) for img in images
        ).strip()
    return text

# DOCX text extraction
def extract_text_from_docx(docx_path):
    """Return the text of every paragraph in a DOCX file, one paragraph per line.

    Args:
        docx_path: Path handed to ``Document`` to open the file.

    Returns:
        The ``text`` of each entry in ``doc.paragraphs``, joined with ``"\\n"``.
    """
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

# Text preprocessing (Korean)
def preprocess_korean(text):
    """Tokenize the Korean part of ``text`` into morphemes.

    Every character that is neither a Hangul syllable (가-힣) nor whitespace
    is deleted, and the remainder is passed to the module-level ``mecab``
    analyzer.

    Returns:
        Whatever ``mecab.morphs`` returns for the filtered text.
    """
    non_hangul = re.compile(r"[^가-힣\s]")
    hangul_only = non_hangul.sub("", text)
    return mecab.morphs(hangul_only)

# Text preprocessing (English)
def preprocess_english(text):
    """Lower-case ``text``, keep only ASCII letters/whitespace, drop stopwords.

    Returns:
        A list of the whitespace-separated words that are not members of the
        module-level ``stopwords_en`` set, in their original order.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept = []
    for candidate in letters_only.split():
        if candidate in stopwords_en:
            continue
        kept.append(candidate)
    return kept

# Entity and relation extraction
def extract_entities_relations(text):
    """Run the module-level ``nlp_en`` model on ``text`` and summarize the result.

    Returns:
        A ``(entities, relations)`` tuple where ``entities`` is a list of
        ``(ent.text, ent.label_)`` pairs taken from ``doc.ents`` and
        ``relations`` is the list of ``lemma_`` values of every token whose
        ``pos_`` is ``"VERB"``.
    """
    parsed = nlp_en(text)

    entities = []
    for span in parsed.ents:
        entities.append((span.text, span.label_))

    relations = []
    for tok in parsed:
        if tok.pos_ == "VERB":
            relations.append(tok.lemma_)

    return entities, relations

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Example using a pretrained Korean transformer model (the checkpoint id below
# is a KoELECTRA model, not KoBERT).
# NOTE(review): the checkpoint name ends in "finetuned-korquad", and the
# transformers warning captured in this cell's output reports that
# classifier.bias / classifier.weight are newly initialized when it is loaded
# for token classification. The LABEL_0 / LABEL_1 tags it emits therefore
# presumably come from an untrained head - consider a checkpoint fine-tuned
# for NER.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
model = AutoModelForTokenClassification.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

def extract_entities(text):
    """Tag ``text`` with the module-level ``nlp`` pipeline.

    Returns:
        A list of ``(word, entity)`` tuples, one per item produced by the
        pipeline, in the order the pipeline produced them.
    """
    tagged = []
    for item in nlp(text):
        tagged.append((item['word'], item['entity']))
    return tagged

# Run data extraction: read the example PDF into a single string
text_pdf = extract_text_from_pdf(pdf_path)

# Example usage: tag the extracted text, then print the tags followed by the
# full extracted text
entities = extract_entities(text_pdf)
print(entities)
print(text_pdf)

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-finetuned-korquad and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('10', 'LABEL_0'), (',', 'LABEL_0'), ('O', 'LABEL_0'), ('##ct', 'LABEL_0'), ('.', 'LABEL_0'), ('29', 'LABEL_0'), (',', 'LABEL_0'), ('1987', 'LABEL_0'), ('제', 'LABEL_0'), ('##1', 'LABEL_0'), ('##장', 'LABEL_0'), ('총', 'LABEL_1'), ('##강', 'LABEL_1'), ('제', 'LABEL_0'), ('##1', 'LABEL_0'), ('##조', 'LABEL_0'), ('①', 'LABEL_0'), ('##대', 'LABEL_0'), ('##한', 'LABEL_0'), ('##민국', 'LABEL_0'), ('##은', 'LABEL_0'), ('민주', 'LABEL_0'), ('##공화국', 'LABEL_0'), ('##이다', 'LABEL_0'), ('.', 'LABEL_0'), ('②', 'LABEL_0'), ('##대', 'LABEL_0'), ('##한', 'LABEL_0'), ('##민국', 'LABEL_0'), ('##의', 'LABEL_0'), ('주권', 'LABEL_0'), ('##은', 'LABEL_0'), ('국민', 'LABEL_0'), ('##에', 'LABEL_0'), ('##게', 'LABEL_0'), ('있', 'LABEL_0'), ('##고', 'LABEL_0'), (',', 'LABEL_0'), ('모든', 'LABEL_0'), ('권력', 'LABEL_0'), ('##은', 'LABEL_0'), ('국민', 'LABEL_0'), ('##으로', 'LABEL_0'), ('##부터', 'LABEL_0'), ('나온다', 'LABEL_0'), ('.', 'LABEL_0'), ('제', 'LABEL_0'), ('##2', 'LABEL_0'), ('##조', 'LABEL_0'), ('①', 'LABEL_0'), ('##대', 'LABEL_0'), ('##한', 

In [32]:
# Run data extraction and preprocessing
# (re-extracts text_pdf from pdf_path, which the transformers cell also does)
text_pdf = extract_text_from_pdf(pdf_path)
# text_docx = extract_text_from_docx(docx_path)

tokens_pdf_ko = preprocess_korean(text_pdf)
tokens_pdf_en = preprocess_english(text_pdf)

# tokens_docx_ko = preprocess_korean(text_docx)
# tokens_docx_en = preprocess_english(text_docx)

# Extract entities and relations
# (rebinds `entities`, a name the transformers cell also assigns)
entities, relations = extract_entities_relations(text_pdf)


print('text-pdf: ', text_pdf)
# print("PDF 한국어 토큰:", tokens_pdf_ko)
# print("PDF 영어 토큰:", tokens_pdf_en)
# print("Extracted Entities:", entities)
# print("Extracted Relations:", relations)

# print("DOCX 한국어 토큰:", tokens_docx_ko)
# print("DOCX 영어 토큰:", tokens_docx_en)

text-pdf:  CONSTITUTION OF THE REPUBLIC OF KOREA
Wholly Amended by Constitution No. 10, Oct. 29, 1987
CHAPTER I GENERAL PROVISIONS
제1장 총강
Article 1
(1) The Republic of Korea shall be a democratic republic.
(2) The sovereignty of the Republic of Korea shall reside in the people, and all state authority shall
emanate from the people.
제1조
①대한민국은 민주공화국이다.
②대한민국의 주권은 국민에게 있고, 모든 권력은 국민으로부터 나온다.
Article 2
(1) Nationality in the Republic of Korea shall be prescribed by Act.
(2) It shall be the duty of the State to protect citizens residing abroad as prescribed by Act.
제2조
①대한민국의 국민이 되는 요건은 법률로 정한다.
②국가는 법률이 정하는 바에 의하여 재외국민을 보호할 의무를 진다.
Article 3
The territory of the Republic of Korea shall consist of the Korean peninsula and its adjacent islands.
제3조
대한민국의 영토는 한반도와 그 부속도서로 한다.Article 4
The Republic of Korea shall seek unification and shall formulate and carry out a policy of peaceful
unification based on the basic free and democratic order.
제4조
대한민국은 통일을 지향하며, 자유민주적 기본질서에 입각한 평화적 통일 정책을 수립하고 