In [None]:
# 셀 1: 의존성 설치 및 환경 확인
%pip install numpy sentence-transformers torch scikit-learn pandas

# VEC 문장 분할 및 의미 기반 원문 정렬 (최신 요구사항 반영, \p{Han} 사용)
import unicodedata
import torch
import numpy as np
import pandas as pd
import regex as re
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# 0. Load the SBERT model and its tokenizer
_model = None
_tokenizer = None


def get_model_and_tokenizer():
    """Return the ``(model, tokenizer)`` pair, loading it on the first call.

    The first call builds the 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
    SentenceTransformer, moves it to CUDA when ``torch.cuda.is_available()``
    is true (CPU otherwise), and stores both the model and its ``tokenizer``
    attribute in the module-level ``_model`` / ``_tokenizer`` variables.
    Later calls return the stored pair without loading anything.
    """
    global _model, _tokenizer
    # Cache hit: nothing to load.
    if _model is not None:
        return _model, _tokenizer
    _model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
    _model = _model.to('cuda' if torch.cuda.is_available() else 'cpu')
    _tokenizer = _model.tokenizer
    return _model, _tokenizer

In [None]:
# Cell 1: Imports & Environment Setup
import re
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast
import sentencepiece as spm
import FlagEmbedding

# File locations. The defaults are the original workstation paths; set the
# SPLIT_P_INPUT / SPLIT_P_OUTPUT environment variables to run the notebook on
# another machine without editing this cell.
import os

input_path = os.environ.get(
    "SPLIT_P_INPUT",
    "C:/Users/junto/Downloads/head-repo/SP/split_p/input_p.xlsx",
)
output_path = os.environ.get(
    "SPLIT_P_OUTPUT",
    "C:/Users/junto/Downloads/head-repo/SP/split_p/output_p.xlsx",
)

# (Optional) environment check
print("NumPy", np.__version__)
print("Pandas", pd.__version__)

ModuleNotFoundError: No module named 'BGEEmbedder'

In [None]:
# Cell 2: Tokenizer Initialization
# Load the pretrained 'bert-base-multilingual-cased' fast tokenizer by name.
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
# Create a SentencePiece processor and load its model. 'spm.model' is a
# relative path, so it is presumably resolved against the kernel's working
# directory -- confirm the file is present there before running.
sp = spm.SentencePieceProcessor()
sp.load('spm.model')  # path to the SentencePiece model file

def tokenize(text, method='bert'):
    """Tokenize ``text`` with the selected backend.

    Args:
        text: The string to tokenize.
        method: ``'bert'`` uses ``bert_tokenizer.tokenize``; ``'spm'`` uses
            ``sp.encode`` with ``out_type=str``.

    Returns:
        Whatever the selected tokenizer call returns for ``text``.

    Raises:
        ValueError: If ``method`` is neither ``'bert'`` nor ``'spm'``.
    """
    if method == 'spm':
        return sp.encode(text, out_type=str)
    if method != 'bert':
        raise ValueError(f"Unknown tokenization method: {method}")
    return bert_tokenizer.tokenize(text)

In [None]:
# Cell 3: Load Source & Target Texts from Excel
# Assumes input_p.xlsx has '원문' (source text) and '번역문' (translation) columns.
df_input = pd.read_excel(input_path)
# Join every row into one space-separated string per side. Empty cells are
# dropped first so they do not end up in the text as the literal "nan".
src_full = df_input['원문'].dropna().astype(str).str.cat(sep=' ')
tgt_full = df_input['번역문'].dropna().astype(str).str.cat(sep=' ')

In [None]:
# Cell 4: Sentence Splitting (기존 기준 유지)
def split_sentences(text, max_len=150, min_han=3):
    """Split ``text`` into units and fold short fragments into their neighbours.

    Processing steps:

    1. Split on whitespace that follows one of the sentence-final marks
       ``。？！.!?`` (the whitespace itself is consumed).
    2. Cut any piece longer than ``max_len`` characters into consecutive
       ``max_len``-character slices.
    3. A piece containing at most ``min_han`` characters in the range
       U+4E00-U+9FFF is not emitted on its own. It is concatenated, without a
       separator, onto the previously emitted unit.

    Short pieces that appear *before* the first emitted unit are prepended to
    that unit, and when no piece exceeds ``min_han`` the accumulated text is
    returned as a single unit. Earlier behaviour discarded the text in both
    situations, so an input with few ideographs produced an empty list.

    NOTE(review): because of rule 3, text with few ideographs (for example a
    Korean translation) collapses into one unit -- confirm this is acceptable
    for the target side of the alignment.

    Args:
        text: Input string.
        max_len: Maximum length of a piece before it is hard-sliced.
        min_han: Pieces with this many ideographs or fewer are merged.

    Returns:
        List of stripped unit strings; empty when ``text`` is empty or
        whitespace only.
    """
    pieces = []
    for sent in re.split(r'(?<=[。？！.!?])\s+', text):
        if len(sent) > max_len:
            pieces.extend(sent[i:i + max_len] for i in range(0, len(sent), max_len))
        else:
            pieces.append(sent)

    merged, pending = [], ''
    for piece in pieces:
        if len(re.findall(r'[\u4E00-\u9FFF]', piece)) <= min_han:
            # Too short to stand alone: hold it until it can be attached.
            pending += piece
            continue
        if merged:
            merged[-1] += pending
            merged.append(piece)
        else:
            # No earlier unit exists, so leading fragments go in front of
            # this one instead of being dropped.
            merged.append(pending + piece)
        pending = ''

    if pending.strip():
        if merged:
            merged[-1] += pending
        else:
            # Nothing qualified as a unit; keep the text rather than lose it.
            merged.append(pending)
    return [unit.strip() for unit in merged]

In [None]:
# Cell 5: Generate Target Units (Tokenized)
# Split the target text into units, then build a parallel list in which each
# unit is replaced by its 'bert' tokens joined with single spaces.
tgt_sents = split_sentences(tgt_full)
tgt_units = list(map(
    lambda sentence: ' '.join(tokenize(sentence, method='bert')),
    tgt_sents,
))

In [None]:
# Cell 6: Initial Source Chunking
def chunk_src(src_text, max_chars=200):
    """Group the units of ``src_text`` into chunks of about ``max_chars`` characters.

    The text is first split with ``split_sentences``. Units are concatenated
    (without a separator) into a buffer; when adding the next unit would push
    the buffer past ``max_chars``, the buffer is emitted as a chunk and the
    unit starts a new buffer. A single unit longer than ``max_chars`` becomes
    a chunk of its own.

    The buffer is only flushed when it is non-empty, so an over-long first
    unit no longer produces a leading empty-string chunk.

    Args:
        src_text: Source text to chunk.
        max_chars: Soft upper bound on the buffer length before it is flushed.

    Returns:
        List of stripped chunk strings.
    """
    chunks, buf = [], ''
    for sent in split_sentences(src_text):
        if buf and len(buf) + len(sent) > max_chars:
            chunks.append(buf.strip())
            buf = sent
        else:
            buf += sent
    if buf:
        chunks.append(buf.strip())
    return chunks

src_chunks = chunk_src(src_full)

In [None]:
# Cell 7: Embedding with BGE-M3
# `BGEEmbedder` is not defined or imported anywhere in this notebook, so the
# BGE-M3 model is loaded through the FlagEmbedding package imported in the
# setup cell. encode() returns a dict; 'dense_vecs' holds one dense embedding
# row per input string.
embedder = FlagEmbedding.BGEM3FlagModel('BAAI/bge-m3')
src_embs = embedder.encode(src_chunks)['dense_vecs']
tgt_embs = embedder.encode(tgt_units)['dense_vecs']

In [None]:
# Cell 8: Reverse Alignment (Greedy Matching)
# For every target embedding, take its inner product with all source
# embeddings and keep the index of the largest one. Each entry is
# (target index, best source index).
assignments = [
    (tgt_idx, np.inner([tgt_vec], src_embs)[0].argmax())
    for tgt_idx, tgt_vec in enumerate(tgt_embs)
]

In [None]:
# Cell 9: Split & Collate Results from assignments
from tokenizer import split_src_meaning_units  # uses the pipeline-bge module
# One record per assignment: the target index, the target sentence text, and
# the result of split_src_meaning_units on the matched source chunk.
records = [
    {
        'tgt_id': tgt_i,
        'tgt_text': tgt_sents[tgt_i],
        'src_segments': split_src_meaning_units(src_chunks[src_j]),
    }
    for tgt_i, src_j in assignments
]
df_output = pd.DataFrame(records)
df_output.head()

In [None]:
# Cell 10: Save Output to Excel
# Write the result table to the configured Excel path without the index
# column, then report where it went.
# NOTE(review): 'src_segments' holds whatever split_src_meaning_units returns;
# if that is a list, the Excel writer may refuse the cell value -- confirm.
df_output.to_excel(excel_writer=output_path, index=False)
print(f"Output saved to {output_path}")