In [1]:
import torch
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
from datasets import load_dataset
import nltk
from rouge_score import rouge_scorer

# NLTK Sentence Tokenizer 다운로드
nltk.download('punkt')

# 모델 및 토크나이저 로드
MODEL_NAME = "google/bert_uncased_L-4_H-256_A-4"  # MiniBERT 모델
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(MODEL_NAME, MODEL_NAME)

# 1️⃣ 뉴스 요약 데이터 로드
dataset = load_dataset("cnn_dailymail", "3.0.0")

# 2️⃣ 데이터 전처리 함수 정의
def preprocess_function(examples):
    inputs = tokenizer(examples["article"], padding="max_length", truncation=True, max_length=256)
    targets = tokenizer(examples["highlights"], padding="max_length", truncation=True, max_length=128)
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }

# 3️⃣ 데이터 변환
train_dataset = dataset["train"].map(preprocess_function, batched=True)
val_dataset = dataset["validation"].map(preprocess_function, batched=True)

2025-02-05 21:27:47.933979: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /Users/hanhokim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertLMHeadModel were not initialized from the model checkpoint at google/bert_uncased_L-4_H-256_A-4 and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.

In [2]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def compute_metrics(p):
    predictions, labels = p
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(labels, predictions)
    return {key: value.fmeasure for key, value in scores.items()}
model_name = "google-t5/t5-small"
token_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
token_tokenizer = AutoTokenizer.from_pretrained(model_name)
model.config.decoder_start_token_id = token_tokenizer.pad_token_id
model.config.pad_token_id = token_tokenizer.pad_token_id

In [5]:
# CustomTrainer 클래스 정의
class CustomTrainer(Trainer):
    def training_step(self, model, inputs, return_loss=True):
        model.train()
        
        # 입력 데이터 준비
        input_ids = inputs["input_ids"]
        labels = inputs["labels"]
        
        # 모델 출력
        outputs = model(input_ids, labels=labels)
        logits = outputs.logits
        
        # 크기 출력
        #print(f"logits.shape: {logits.shape}")  # 예: (batch_size, seq_len, num_labels)
        #print(f"labels.shape: {labels.shape}")  # 예: (batch_size, seq_len)
        
        # 손실 계산 (CrossEntropyLoss 사용)
        loss_fct = torch.nn.CrossEntropyLoss()
        
        # logits을 view할 때 차원 조정 필요
        batch_size, seq_len, num_labels = logits.shape
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        
        if return_loss:
            return loss

# 하이퍼파라미터 설정
training_args = TrainingArguments(
    output_dir="./miniBERT_news_model",  # 출력 디렉토리
    num_train_epochs=3,                  # 학습 에폭 수
    per_device_train_batch_size=8,      # 학습 배치 크기
    per_device_eval_batch_size=8,       # 평가 배치 크기
    gradient_accumulation_steps=1,
    evaluation_strategy="epoch",         # 에폭 끝날 때마다 평가
    save_strategy="epoch",               # 에폭마다 모델 저장
    load_best_model_at_end=True,          # 최고의 모델 저장 여부
    use_cpu=True
)

# CustomTrainer 사용
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

# 모델 저장
model.save_pretrained("./miniBERT_news_model")
tokenizer.save_pretrained("./miniBERT_news_model")

print("✅ 뉴스 데이터로 학습 완료!")

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Epoch,Training Loss,Validation Loss


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [3]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 287113
})

In [29]:
# 또한, 데이터셋의 feature 정보 확인
print(train_dataset.features)
print(val_dataset.features)

{'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
{'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [30]:

# 각 데이터 샘플의 길이 확인
print(len(train_dataset[0]['input_ids']))  # 첫 번째 입력 시퀀스 길이
print(len(train_dataset[0]['labels']))    # 첫 번째 출력 시퀀스 길이

256
128


In [1]:
import torch
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import nltk

# 1. 데이터셋 로드
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_dataset = dataset['train']
validation_dataset = dataset['validation']

# 2. 토크나이저 및 모델 로드
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

# 3. 데이터 전처리 함수 (최대 길이 감소)
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    targets = examples["highlights"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length')  # 최대 입력 길이 줄임: 1024 -> 512
    labels = tokenizer(targets, max_length=64, truncation=True, padding='max_length')   # 최대 출력 길이 줄임: 128 -> 64
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 4. 데이터 전처리 적용
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True)

2025-02-06 20:36:19.476967: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

In [16]:
# 5. 학습 설정 (배치 사이즈, epoch 수 감소)
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_cnn_dailymail_reduced_hw", # 모델 저장 경로 변경
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 배치 사이즈 1 유지
    per_device_eval_batch_size=8,  # 배치 사이즈 1 유지
    gradient_accumulation_steps=2, # Gradient Accumulation 활성화 (예시: 2로 설정)
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=3,
    num_train_epochs=3, # epoch 수 줄임: 3 -> 2
    predict_with_generate=True,
    fp16=True, # fp16은 False로 유지
)

In [17]:
# 6. Trainer 객체 생성 및 학습
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [18]:
trainer.train() # 모델 학습 시작

RuntimeError: MPS backend out of memory (MPS allocated: 5.59 GB, other allocations: 1.03 GB, max allowed: 6.80 GB). Tried to allocate 196.34 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [27]:
import os
# 저장할 파일 경로 설정 (원하는 경로로 변경 가능)
train_dataset_path = "./data/tokenized_cnn_dailymail_train"
validation_dataset_path = "./data/tokenized_cnn_dailymail_validation"

# tokenized_train_dataset 저장
tokenized_train_dataset.save_to_disk(train_dataset_path)
print(f"Tokenized train dataset 저장 완료: {train_dataset_path}")

# tokenized_validation_dataset 저장
tokenized_validation_dataset.save_to_disk(validation_dataset_path)
print(f"Tokenized validation dataset 저장 완료: {validation_dataset_path}")

Saving the dataset (0/4 shards):   0%|          | 0/287113 [00:00<?, ? examples/s]

Tokenized train dataset 저장 완료: ./data/tokenized_cnn_dailymail_train


Saving the dataset (0/1 shards):   0%|          | 0/13368 [00:00<?, ? examples/s]

Tokenized validation dataset 저장 완료: ./data/tokenized_cnn_dailymail_validation


In [25]:
!cd colabtools
!python setup.py install

python: can't open file '/Users/hanhokim/Documents/limbus-identity-model/setup.py': [Errno 2] No such file or directory


In [35]:
from transformers import BartTokenizer, BartForConditionalGeneration

# 1. 모델 및 토크나이저 로드
model_name = "facebook/bart-large-cnn" # Hugging Face Hub의 BART 모델 이름
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# 2. 요약할 뉴스 기사 텍스트 입력 (예시)
article_text = """
"['- Units with SP: change their Panic Type to """"Ruin"""".', '(Ruin: When this unit enters Low Morale state, gain 3 Sinking at every Turn End and lose 1 Clash Power. When this unit Panics, gain 5 Sinking and 3 Sinking Count at every Turn End and lose 2 Clash Power)', '- Units without SP: -10% chance to flip Heads', '- Reduced by 1 at Turn End', '- This effect is not cumulative; inflicting it will replace the current value with the inflicted value.', '※ Apply the """"Units without SP"""" conditional to certain enemies that have SP but can\'t change their Panic Types.']"
"""

# 3. 텍스트 토큰화 및 모델 입력 형태로 변환
inputs = tokenizer.encode("summarize: " + article_text, return_tensors="pt") # "summarize: " prefix 추가 (BART 모델에 맞게)

# 4. 요약문 생성
summary_ids = model.generate(inputs, max_length=120, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True) # 요약문 생성 옵션 설정

# 5. 토큰화된 요약문 디코딩 및 출력
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("요약 결과:")
print(summary_text)

요약 결과:
summarize: "['- Units with SP: change their Panic Type to Ruin.', '(Ruin: When this unit enters Low Morale state, gain 3 Sinking at every Turn End and lose 1 Clash Power)', '- Units without SP: -10% chance to flip Heads'


In [36]:
from transformers import pipeline

# 1. summarization pipeline 생성 (FLAN-T5-large 모델 사용)
summarizer = pipeline("summarization", model="google/flan-t5-large")

# 2. 요약할 게임 스킬 설명 데이터
skill_description = """
['- Units with SP: change their Panic Type to """"Ruin"""".', '(Ruin: When this unit enters Low Morale state, gain 3 Sinking at every Turn End and lose 1 Clash Power. When this unit Panics, gain 5 Sinking and 3 Sinking Count at every Turn End and lose 2 Clash Power)', '- Units without SP: -10% chance to flip Heads', '- Reduced by 1 at Turn End', '- This effect is not cumulative; inflicting it will replace the current value with the inflicted value.', '※ Apply the """"Units without SP"""" conditional to certain enemies that have SP but can\'t change their Panic Types.']
"""

# 3. 제로샷 요약 실행
summary = summarizer(skill_description, 
                     max_length=150, # 최대 요약 길이 (토큰 수 기준, 필요에 따라 조절)
                     min_length=30)  # 최소 요약 길이 (토큰 수 기준, 필요에 따라 조절)

# 4. 요약 결과 출력
print("원본 스킬 설명:")
print(skill_description)
print("\n제로샷 요약 결과:")
print(summary[0]['summary_text'])

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 5.98 GB, other allocations: 828.23 MB, max allowed: 6.80 GB). Tried to allocate 11.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [1]:
import pandas as pd
from transformers import pipeline
from collections import defaultdict

# BART 모델 로드
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# CSV 파일 불러오기
df = pd.read_csv("./csv/compressed_condition_skill_data.csv")

# 스킬 코드별로 효과 합치기
def summarize_skill_data(df):
    # Identity_ID와 스킬 코드별로 모든 효과를 합침
    skill_summary = defaultdict(lambda: defaultdict(str))

    for _, row in df.iterrows():
        identity_id = row['Identity_ID']
        skill_code = row['Skill_Code']
        full_text = f"Uptie Level: {row['Uptie_Level']}, Condition: {row['Condition']}, Effect: {row['Processed_Effect']}"
        skill_summary[identity_id][skill_code] += full_text + " "  # 효과 합침

    # 합쳐진 텍스트에 대해 요약
    summarized_skills = []
    for identity_id, skills in skill_summary.items():
        for skill_code, combined_effect in skills.items():
            summary = summarizer(combined_effect, max_length=100, min_length=10, do_sample=False)
            summarized_effect = summary[0]['summary_text']
            summarized_skills.append({
                'Identity_ID': identity_id,
                'Skill_Code': skill_code,
                'Summarized_Effect': summarized_effect
            })

    return pd.DataFrame(summarized_skills)

# 결과 출력
summarized_skills_df = summarize_skill_data(df)
summarized_skills_df.to_csv("./csv/summarized_skills_df.csv", index=False)

print("✅ 압축된 데이터가 'summarized_skills_df.csv' 파일로 저장되었습니다.")

2025-02-09 23:32:26.761525: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Device set to use mps:0
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 100, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 100, but your input_length is only 21. Since this is a summarization task, where outputs short

✅ 압축된 데이터가 'summarized_skills_df.csv' 파일로 저장되었습니다.


In [3]:
import pandas as pd
from transformers import pipeline

# BART 모델 로드
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# CSV 파일 불러오기
df = pd.read_csv("./csv/compressed_condition_skill_data.csv")

# 요약 함수 정의
def summarize_text(text):
    if pd.isna(text) or text.strip() == "No Effect":  # 빈 값이나 "No Effect"일 경우
        return "No Effect"
    try:
        summary = summarizer(text, max_length=100, min_length=10, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error: {e}"

# 요약 실행
df['Summary_Effect'] = df['Processed_Effect'].apply(summarize_text)

# 결과 저장
output_csv_path = "./skills_summary/summarized_per_skill_code_bart_large_cnn_base.csv"
df.to_csv(output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decr

✅ 요약된 데이터가 './skills_summary/summarized_per_skill_code_bart_large_cnn_base.csv' 파일로 저장되었습니다.


In [4]:
import pandas as pd
from transformers import pipeline

# BART 모델 로드
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# CSV 파일 불러오기
df = pd.read_csv("./csv/compressed_condition_skill_data.csv")

# 요약 함수 정의
def summarize_text(text):
    if pd.isna(text) or text.strip() == "No Effect":  # 빈 값이나 "No Effect"일 경우
        return "No Effect"
    try:
        summary = summarizer(text, max_length=100, min_length=10, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error: {e}"

# 요약 실행
df['Summary_Effect'] = df['Processed_Effect'].apply(summarize_text)

# 결과 저장
output_csv_path = "./skills_summary/summarized_per_skill_code_distilbart_cnn_base.csv"
df.to_csv(output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 50. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decr

KeyboardInterrupt: 

In [5]:
import pandas as pd
from transformers import pipeline

# BART 모델 로드
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# CSV 파일 불러오기
df = pd.read_csv("./skills_summary/summarized_per_skill_code_erase_noeffect_bart_large_cnn_base.csv")

# 스킬 코드별로 효과 합치기
def summarize_skill_data(df):
    # Identity_ID와 스킬 코드별로 모든 효과를 합침
    skill_summary = defaultdict(lambda: defaultdict(str))

    for _, row in df.iterrows():
        identity_id = row['Identity_ID']
        skill_code = row['Skill_Code']
        if len(str(row['Processed_Effect'])) < len(str(row['Summary_Effect'])):
            full_text = f"{row['Processed_Effect']}"
        else:
            full_text = f"{row['Summary_Effect']}"
        skill_summary[identity_id][skill_code] += full_text + " "  # 효과 합침

    # 합쳐진 텍스트에 대해 요약
    summarized_skills = []
    for identity_id, skills in skill_summary.items():
        for skill_code, combined_effect in skills.items():
            summary = summarizer(combined_effect, max_length=100, min_length=10, do_sample=False)
            summarized_effect = summary[0]['summary_text']
            summarized_skills.append({
                'Identity_ID': identity_id,
                'Skill_Code': skill_code,
                'Summarized_Effect': summarized_effect
            })

    return pd.DataFrame(summarized_skills)

# 결과 출력
summarized_skills_df = summarize_skill_data(df)

# 결과 저장
output_csv_path = "./skills_summary/summarized_per_skill_code_noeffect_bart_large_cnn.csv"
summarized_skills_df.to_csv(output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Your max_length is set to 100, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 100, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Your max_length is set to 100, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decr

✅ 요약된 데이터가 './skills_summary/summarized_per_skill_code_noeffect_bart_large_cnn.csv' 파일로 저장되었습니다.


In [14]:
import pandas as pd
from transformers import pipeline

# BART 모델 로드
summarizer = pipeline("summarization", model="google/flan-t5-base")

# CSV 파일 불러오기
df = pd.read_csv("./skills_summary/summarized_per_skill_code_erase_noeffect_flan_t5_base.csv")

# 스킬 코드별로 효과 합치기
def summarize_skill_data(df):
    # Identity_ID와 스킬 코드별로 모든 효과를 합침
    skill_summary = defaultdict(lambda: defaultdict(str))

    for _, row in df.iterrows():
        identity_id = row['Identity_ID']
        skill_code = row['Skill_Code']
        if len(str(row['Processed_Effect'])) < len(str(row['Summary_Effect'])):
            full_text = f"{row['Processed_Effect']}"
        else:
            full_text = f"{row['Summary_Effect']}"
        skill_summary[identity_id][skill_code] += full_text + " "  # 효과 합침

    # 합쳐진 텍스트에 대해 요약
    summarized_skills = []
    for identity_id, skills in skill_summary.items():
        for skill_code, combined_effect in skills.items():
            summary = summarizer(combined_effect, max_length=100, min_length=10, do_sample=False)
            summarized_effect = summary[0]['summary_text']
            summarized_skills.append({
                'Identity_ID': identity_id,
                'Skill_Code': skill_code,
                'Summarized_Effect': summarized_effect
            })

    return pd.DataFrame(summarized_skills)

# 결과 출력
summarized_skills_df = summarize_skill_data(df)

# 결과 저장
output_csv_path = "./skills_summary/summarized_per_skill_code_noeffect_flan_t5_base.csv"
summarized_skills_df.to_csv(output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 100, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 100, but your input_length is only 72. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 100, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decr

✅ 요약된 데이터가 './skills_summary/summarized_per_skill_code_noeffect_flan_t5_base.csv' 파일로 저장되었습니다.


In [16]:
import pandas as pd
from transformers import pipeline

# BART 모델 로드
summarizer = pipeline("summarization", model="google/flan-t5-base")

# CSV 파일 불러오기
df = pd.read_csv("./skills_summary/summarized_per_skill_code_noeffect_flan_t5_base.csv")

# Identity_ID별로 효과 합치기
def summarize_by_identity(df):
    identity_summary = defaultdict(str)

    for _, row in df.iterrows():
        identity_id = row['Identity_ID']
        summarized_effect = str(row['Summarized_Effect'])
        identity_summary[identity_id] += summarized_effect + " "  # 효과 합침

    # 합쳐진 텍스트에 대해 요약
    summarized_identities = []
    for identity_id, combined_effect in identity_summary.items():
        summary = summarizer(combined_effect, max_length=100, min_length=10, do_sample=False)
        summarized_effect = summary[0]['summary_text']
        summarized_identities.append({
            'Identity_ID': identity_id,
            'Final_Summary': summarized_effect
        })

    return pd.DataFrame(summarized_identities)

# 결과 출력
summarized_identities_df = summarize_by_identity(df)

# 결과 저장
output_csv_path = "./identities_summary/summarized_per_identity_noeffect_flan_t5_base.csv"
summarized_identities_df.to_csv(output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
  test_elements = torch.tensor(test_elements)
Your max_length is set to 100, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 100, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 100, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


✅ 요약된 데이터가 './identities_summary/summarized_per_identity_noeffect_flan_t5_base.csv' 파일로 저장되었습니다.
