In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load models
# Sentence-embedding model: used below both to pick representative plans and to
# produce the vectors written into the submission.
model = SentenceTransformer("jhgan/ko-sbert-sts")

# Instruction-tuned causal LM used for the text-generation step.
# NOTE(review): trust_remote_code=True runs modeling code fetched from the hub
# repo; the load log recommends pinning a revision so that code cannot change
# between runs.
model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct:
- configuration_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct:
- modeling_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 2/2 [07:07<00:00, 213.61s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


In [3]:
# Load data
# train: labelled rows, test: rows to predict for, sample: submission template.
train, test, sample = map(
    pd.read_csv,
    (
        "./datas/train.csv",
        "./datas/test.csv",
        "./datas/sample_submission.csv",
    ),
)

In [4]:
# For every accident type ("인적사고") in the training data:
#   1. embed all of its prevention plans,
#   2. pick the plan with the highest mean cosine similarity to the group's
#      plans as the representative,
#   3. ask the LLM to repeat that plan verbatim and keep its answer.
grouped = train.groupby("인적사고")

res = {}           # accident type -> representative plan taken from train
res_enhanced = {}  # accident type -> LLM output for that representative plan
cosine_res = []    # similarities of each representative to the plans in its group

for name, group in tqdm(grouped):
    plan = group["재발방지대책 및 향후조치계획"]
    sentences = plan.tolist()
    vectors = model.encode(sentences, batch_size=32, show_progress_bar=True)

    # Pairwise similarity inside the group; the row with the largest mean wins.
    similarity = cosine_similarity(vectors, vectors)
    best_idx = similarity.mean(axis=1).argmax()

    cosine_res += similarity[best_idx].tolist()
    representative_plan = plan.iloc[best_idx]
    res[name] = representative_plan

    # Apply RAG - to comply with the competition rules...
    # The prompt text is part of the model input and must stay exactly as is.
    rag_prompt = f"""
    내가 두 번은 안 묻는다카이. 니가 받은 내용 그대로만 말하면 되는 기다. 알았나?
    니가 멋대로 뭐 바꾸거나 추가하면 안된다 이기야. 그냥 받은 내용 그~대로 말하면 되는 기라. 명심해라!

    {representative_plan}"""

    messages = [{"role": "system", "content": "전달받은 내용을 단 한 글자도 바꾸지 않고 완전히 그대로 출력합니다."}, {"role": "user", "content": rag_prompt}]

    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
    # Remember where the prompt ends so the completion can be sliced out below.
    prompt_length = input_ids.shape[-1]

    output = llm_model.generate(
        input_ids.to(llm_model.device),
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=256,
        do_sample=False,  # greedy decoding
        temperature=None,
    )

    # generate() returns prompt + completion for a causal LM. Decode only the
    # newly generated tokens instead of splitting the full text on the
    # "[|assistant|]" marker: that marker can be removed by
    # skip_special_tokens=True (leaving the whole prompt in the result) and
    # could also occur inside the echoed text itself.
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    assistant_response = generated_text.strip()
    res_enhanced[name] = assistant_response

Batches: 100%|██████████| 2/2 [00:00<00:00,  8.83it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00, 111.44it/s]
Batches: 100%|██████████| 60/60 [00:00<00:00, 130.45it/s]
Batches: 100%|██████████| 17/17 [00:00<00:00, 113.64it/s]
Batches: 100%|██████████| 80/80 [00:00<00:00, 122.19it/s]
Batches: 100%|██████████| 65/65 [00:00<00:00, 131.71it/s]
Batches: 100%|██████████| 47/47 [00:00<00:00, 127.49it/s]
Batches: 100%|██████████| 69/69 [00:00<00:00, 128.44it/s]
Batches: 100%|██████████| 7/7 [00:00<00:00, 119.82it/s]
Batches: 100%|██████████| 55/55 [00:00<00:00, 127.33it/s]
Batches: 100%|██████████| 24/24 [00:00<00:00, 114.91it/s]
Batches: 100%|██████████| 18/18 [00:00<00:00, 113.95it/s]
Batches: 100%|██████████| 10/10 [00:00<00:00, 110.70it/s]
Batches: 100%|██████████| 23/23 [00:00<00:00, 121.22it/s]
Batches: 100%|██████████| 109/109 [00:00<00:00, 121.12it/s]
Batches: 100%|██████████| 57/57 [00:00<00:00, 124.21it/s]
Batches: 100%|██████████| 8/8 [00:00<00:00, 113.52it/s]
Batches: 100%|███████

In [5]:
# Embed each representative plan (res) and its LLM output (res_enhanced) once
# per accident type so test rows can reuse the vectors.
res_v = {}
res_enhanced_v = {}

for k, v in res.items():
    res_v[k] = model.encode(v)
    res_enhanced_v[k] = model.encode(res_enhanced[k])

# Fill the submission template: row i of `test` decides what goes into row i of
# `sample`. Columns from position 2 onward receive the embedding vector.
# Rows whose accident type has no entry in res_enhanced keep the template's
# values; they are collected so the gap is reported instead of passing silently.
unmatched_rows = []

for i in range(len(test)):
    accident = test.loc[i, "인적사고"]

    if accident in res_enhanced:
        sample.loc[i, "재발방지대책 및 향후조치계획"] = res_enhanced[accident]
        sample.iloc[i, 2:] = res_enhanced_v[accident]
    else:
        unmatched_rows.append(i)

if unmatched_rows:
    print(
        f"{len(unmatched_rows)} test row(s) have an accident type with no representative plan; "
        f"their submission rows were left unchanged (first rows: {unmatched_rows[:10]})"
    )

sample.to_csv("nollm+rag_baseline_submission.csv", index=False, encoding="utf-8-sig")