In [38]:
!nvidia-smi

Mon Mar 31 14:06:58 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:19:00.0 Off |                  N/A |
| 97%   63C    P2             254W / 350W |  15193MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off | 00000000:1A:00.0 Off |  

In [2]:
# 셀 1: 필수 모듈 임포트 및 다운로드
import os



In [3]:
# 셀 2: 인자 설정
args = {
    'input_path': '../data/xsumfaith_org.json',
    
    'granularity': '3G',
    'cuda_device': '2',
    'weight_rouge': 0.3,
    'weight_bert': 0.7,
    'doc_label': 'document',
    'summary_label': 'claim',
    'label_label': 'label',
    'score_column': 'FIZZ_score',
    'model_name': 'orca2'
}

# CUDA 디바이스 환경 설정
os.environ["CUDA_VISIBLE_DEVICES"] = args['cuda_device']


In [4]:
import json
from tqdm import tqdm
import nltk
from rouge_score import rouge_scorer
from bert_score import BERTScorer
from atomic_fact_decomposition import AtomicFactDecomposer
from atomic_fact_filtering import AtomicFactFilterer
from atomic_fact_scoring import AtomicFactScorer

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# 셀 3: main 함수 정의
def main(args):  # not discarding selected document sentences
    with open(args['input_path'], "r", encoding="utf-8") as f:
        data = json.load(f)

    decomposer = AtomicFactDecomposer(model_name=args['model_name'])
    filterer = AtomicFactFilterer()
    scorer = AtomicFactScorer(granularity=args['granularity'])
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

    output_data = []
    # 이후 루프에서 매번 append
    for entry in tqdm(data, desc="Processing", mininterval=0.01):
        doc = entry[args['doc_label']]
        summary = entry[args['summary_label']]
        label = entry[args['label_label']]

        atomic_facts = decomposer.atomic_facts_decompose(summary)
        filtered_atomic_facts = filterer.atomic_facts_filtering(summary, atomic_facts)
        
        summary_sentences = decomposer.split_sentences(filtered_atomic_facts)
        doc_sentences = decomposer.split_sentences(doc)

        # info 딕셔너리 구성
        info = {
            "summary_before": summary,
            "summary_sentences": summary_sentences,
            "doc": []
        }

        selected_sentences = set()
        for fact in summary_sentences:
            best_score = 0
            best_sentence = None

            for doc_sentence in doc_sentences:
                rouge_score = rouge_scorer_obj.score(fact, doc_sentence)['rougeL'].fmeasure
                P, R, F1 = bert_scorer.score([doc_sentence], [fact])
                bert_score = F1.item()
                combined_score = (args['weight_rouge'] * rouge_score) + (args['weight_bert'] * bert_score)

                if combined_score > best_score:
                    best_score = combined_score
                    best_sentence = doc_sentence

            if best_sentence:
                selected_sentences.add(best_sentence)

        modified_sentences = []
        for sentence in doc_sentences:
            modified_sentences.append((sentence, "original", None))
            if sentence in selected_sentences:
                atomic_facts = decomposer.atomic_facts_decompose(sentence)
                filtered_facts = filterer.atomic_facts_filtering(sentence, atomic_facts)
                fact_sentences = decomposer.split_sentences(filtered_facts)

                info["doc"].append({
                    "before": sentence,
                    "after": fact_sentences
                })

                for fact_sentence in fact_sentences:
                    modified_sentences.append((fact_sentence, "fact", sentence))

        # 매 루프마다 JSONL 형식으로 저장
        with open("sdinfo.json", "a", encoding="utf-8") as fout:
            fout.write(json.dumps(info, ensure_ascii=False, indent=4) + ",\n\n")


In [6]:
# 셀 4: 실행
with open("sdinfo.json", "w", encoding="utf-8") as f:
    pass
main(args)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 1/1250 [00:13<4:43:52, 13.64s/it]Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 2/1250 [00:18<2:57:57,  8.56s/it]Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.

KeyboardInterrupt: 

In [None]:
import json

def main(args):  # not discarding selected document sentences
    with open(args['input_path'], "r", encoding="utf-8") as f:
        data = json.load(f)

    decomposer = AtomicFactDecomposer(model_name=args['model_name'])
    filterer = AtomicFactFilterer()
    scorer = AtomicFactScorer(granularity=args['granularity'])
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

    # output 파일 open
    with open("sdinfo.json", "w", encoding="utf-8") as fout:
        for entry in tqdm(data, desc="Processing", mininterval=0.01):
            doc = entry[args['doc_label']]
            summary = entry[args['summary_label']]
            label = entry[args['label_label']]

            atomic_facts = decomposer.atomic_facts_decompose(summary)
            filtered_atomic_facts = filterer.atomic_facts_filtering(summary, atomic_facts)

            summary_sentences = decomposer.split_sentences(filtered_atomic_facts)
            doc_sentences = decomposer.split_sentences(doc)

            # 보기 좋게 줄바꿈 포함하여 저장
            info = {
                "summary before": summary,
                "summary_sentences": summary_sentences,
                "doc before": doc,
                "doc_sentences": doc_sentences,
                "label": label
            }
            fout.write(json.dumps(info, ensure_ascii=False, indent=4) + ",\n\n")  # 쉼표로 구분, 공백 줄 추가


In [None]:
# 셀 4: 실행

with open("sdinfo.json", "w", encoding="utf-8") as f:
    pass
main(args)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.51it/s]
Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 1/1250 [00:08<2:55:58,  8.45s/it]Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 2/1250 [00:09<1:27:56,  4.23s/it]Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 3/1250 [00:11<1:01:00,  2.94s/it]Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 4/1250 [00:12<45:50,  2.21s/it]  Keyword arguments {'add_special_tokens': False} not recognized.
Processing:   0%|          | 5/1250 [00:13<35:37,  1.72s/it]Keyword

KeyboardInterrupt: 