In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# 모델 로드
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device) # 모델을 GPU단으로 이동

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [36]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 모델 및 토크나이저 로드
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    pad_token='<pad>', mask_token='<mask>')
model.to(device)

# 텍스트 생성 함수 정의
def generate_text(prompt, max_length=50, temperature=1.0):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    output = model.generate(input_ids, max_length=max_length, temperature=temperature, num_beams=5, no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# 텍스트 생성 예제
prompt = "오늘 날씨는"
generated_text = generate_text(prompt, max_length=500, temperature=0.8)
print('-'*1000)
print("Generated Text:", generated_text)

config.json: 100%|██████████| 606/606 [00:00<00:00, 1.62MB/s]
You are using a model of type llama to instantiate a model of type gpt2. This is not supported for all configurations of models and can yield errors.
model.safetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 43.6MB/s]
model-00001-of-00015.safetensors:  17%|█▋        | 157M/919M [00:13<01:06, 11.4MB/s]
Downloading shards:   0%|          | 0/15 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [25]:
data[['질문_1', '질문_1', '답변_1', '답변_2', '답변_3', '답변_4', '답변_5']].apply(lambda col: col.apply(lambda sentence: len(sentence)).mean())

질문_1     19.187888
질문_1     19.187888
답변_1     95.731366
답변_2    194.504658
답변_3    194.607143
답변_4    196.967391
답변_5    195.385093
dtype: float64

In [28]:
# 데이터 로드
data = pd.read_csv('./data/train.csv')

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2',
                                                    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    pad_token='<pad>', mask_token='<mask>')
# 데이터 포맷팅 및 토크나이징
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            # input_text = "### 질문: " + row[q_col] + tokenizer.eos_token + "### 답변: " + row[a_col]
            input_text = f"""### 지시문: \n 다음은 건축, 시공 관련 내용이야. 질문에 대해서 친절하게 답변해줘. \n ### 질문: \n {row[q_col]} \n ### 답변: \n{tokenizer.eos_token}{row[a_col]}""" 
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:00, 650.31it/s]

Done.





In [30]:
from transformers import get_cosine_schedule_with_warmup


# 모델 학습 하이퍼파라미터(Hyperparameter) 세팅
# 실제 필요에 따라 조정하세요.
CFG = {
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 15, # 학습 Epoch
    'WARMUP_STEPS': 500,
}

# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=CFG['WARMUP_STEPS'], num_training_steps=len(formatted_data))
model.train()

# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # 데이터를 GPU단으로 이동
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 진행률 표시줄에 평균 손실 업데이트
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # 에폭의 평균 손실을 출력
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained(f"./checkpoints/hansoldeco-kogpt2-instructed-{CFG['EPOCHS']}e")
tokenizer.save_pretrained(f"./checkpoints/hansoldeco-kogpt2-instructed-{CFG['EPOCHS']}e")

Epoch 1 - Avg Loss: 1.3916: 100%|██████████| 6440/6440 [03:10<00:00, 33.88it/s]


Epoch 1/15, Average Loss: 1.3915988604729035


Epoch 2 - Avg Loss: 1.0559: 100%|██████████| 6440/6440 [03:18<00:00, 32.44it/s]


Epoch 2/15, Average Loss: 1.0558796087152655


Epoch 3 - Avg Loss: 0.8658: 100%|██████████| 6440/6440 [03:07<00:00, 34.31it/s]


Epoch 3/15, Average Loss: 0.8657582309444128


Epoch 4 - Avg Loss: 0.6552: 100%|██████████| 6440/6440 [03:20<00:00, 32.05it/s]


Epoch 4/15, Average Loss: 0.6552328903181768


Epoch 5 - Avg Loss: 0.5687: 100%|██████████| 6440/6440 [03:12<00:00, 33.38it/s]


Epoch 5/15, Average Loss: 0.568725226271134


Epoch 6 - Avg Loss: 0.4379: 100%|██████████| 6440/6440 [03:13<00:00, 33.30it/s]


Epoch 6/15, Average Loss: 0.4378665391772245


Epoch 7 - Avg Loss: 0.3962: 100%|██████████| 6440/6440 [03:14<00:00, 33.07it/s]


Epoch 7/15, Average Loss: 0.3961999350325754


Epoch 8 - Avg Loss: 0.3059: 100%|██████████| 6440/6440 [03:10<00:00, 33.79it/s]


Epoch 8/15, Average Loss: 0.305864921217981


Epoch 9 - Avg Loss: 0.2738: 100%|██████████| 6440/6440 [03:16<00:00, 32.73it/s]


Epoch 9/15, Average Loss: 0.27378761607528845


Epoch 10 - Avg Loss: 0.2270: 100%|██████████| 6440/6440 [03:05<00:00, 34.72it/s]


Epoch 10/15, Average Loss: 0.2269822559626021


Epoch 11 - Avg Loss: 0.2203: 100%|██████████| 6440/6440 [03:17<00:00, 32.68it/s]


Epoch 11/15, Average Loss: 0.22029413271128484


Epoch 12 - Avg Loss: 0.1890: 100%|██████████| 6440/6440 [03:14<00:00, 33.07it/s]


Epoch 12/15, Average Loss: 0.18900005807174566


Epoch 13 - Avg Loss: 0.1870: 100%|██████████| 6440/6440 [03:17<00:00, 32.61it/s]


Epoch 13/15, Average Loss: 0.18703799568556295


Epoch 14 - Avg Loss: 0.1750: 100%|██████████| 6440/6440 [03:06<00:00, 34.56it/s]


Epoch 14/15, Average Loss: 0.17495571321385525


Epoch 15 - Avg Loss: 0.1509: 100%|██████████| 6440/6440 [03:13<00:00, 33.34it/s]


Epoch 15/15, Average Loss: 0.1508837421690779


('./checkpoints/hansoldeco-kogpt2-instructed-15/tokenizer_config.json',
 './checkpoints/hansoldeco-kogpt2-instructed-15/special_tokens_map.json',
 './checkpoints/hansoldeco-kogpt2-instructed-15/tokenizer.json')

# Load & Test

In [38]:
# 저장된 Fine-tuned 모델과 토크나이저 불러오기
model_dir = "./checkpoints/hansoldeco-kogpt2-instructed-15"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir,
                                                    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    pad_token='<pad>', mask_token='<mask>')

# Inference를 위한 test.csv 파일 로드
test = pd.read_csv('./data/test.csv')

# test.csv의 '질문'에 대한 '답변'을 저장할 리스트
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    # 입력 텍스트를 토큰화하고 모델 입력 형태로 변환
    # test_input_text = "질문: " + test_question + tokenizer.eos_token + "답변: "
    test_input_text = f"""### 지시문: \n 다음은 건축, 시공 관련 내용이야. 질문에 대해서 친절하게 답변해줘. \n ### 질문: \n {test_question} \n ### 답변:{tokenizer.eos_token}"""
    input_ids = tokenizer.encode(test_input_text, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=200,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|██████████| 130/130 [02:13<00:00,  1.03s/it]


In [40]:
preds

['방청페인트의 주된 단점은 철재 부식입니다. 이러한 문제는 건물의 내구성과 안전성을 고려하여 결정해야 합니다. 또한 페인트에 포함된 독성물질이나 연기 역시 중요한데, 이에 대한 주의깊게 살펴보는 것이 좋습니다.특히 페인트는 화학물질을 포함하고 있는 경우가 많은데, 이러한 유해물질은폐 기간 동안 노출될 수 있으므로 주의해야 합니다.따라서 페인트를 사용할 때에는 안전에 대해 신중한 접근이 필요합니다. 추가적으로 각 부분의 특성을 고려하여, 페인트의 장단점과 처리 방법을 신중히 고려해야 합니다. 만약 페인트가 건조되기 전에 다른 방수 처리를 통해 표면을 세척하는 등의 조치를 취한다면, 페인트의 내부가 상할 수 있습니다.',
 '해당 작업은 건물의 외벽 또는 내부 공간에 녹이 스면 됩니다. 이러한 경우에는 먼저 건물 주변의 벽체나 천장을 확인하여 부식이나 손상된 부분이 있는지 확인해야 합니다. 또한 외부 기상 조건과 실내 습도 사이의 온도차를 줄이는 것도 중요한데, 이를 위해서는 건물 내부의 온도와 습도를 적절히 조절하여 결로가 발생하지 않도록 주의해야 합니다. 마지막으로, 화재에 대한 안전성을 체크하는 것이 중요합니다. 예를 들어, 건물 주변에 설치된 방수 시스템이나 누수 구멍의 방화벽과 같은 구조적 제약 사항을 유의하고, 필요에 따라 보수나 수리 작업이 필요할 수 있습니다. 이러한 다양한 요소들을 종합적으로 고려하여 적절한 조치를 취하는 것이 중요합니다. 또한, 녹오염을 방지하기 위해 방수 시스템을 도입하거나 외부 환경 조건을 이용하여 녹이 침투할 수 있는 경우에도 각 부분의 녹이 조금씩',
 '큐블럭은 시간이 지나면 곰팡이가 생길 수 있는 경시현상이 있습니다. 이러한 현상은 건물 외부에서 외벽이나 바닥재와 같은 구조적 요소들도 함께 고려해야 합니다. 또한 균열, 백화현상, 그리고 철근의 부식을 방지하여 건물의 수명을 연장시키는 것도 중요합니다. 따라서 이러한 구조적 요소들을 종합적으로 고려하여 건물을 설계하고 시공하는 것이 환경에 더욱 이로울 것입니다. 따라서

# Submission

In [41]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [42]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.011785,0.063328,-0.01674,0.019605,0.096068,0.041541,0.00521,0.011576,0.003577,...,-0.006069,-0.092898,0.051836,-0.052177,-0.029583,0.010297,0.068943,-0.041931,-0.003002,0.021973
1,TEST_001,0.025359,0.068629,0.000225,0.027807,0.092003,-0.0509,-0.023755,0.023371,-0.009215,...,0.015039,-0.068274,0.103401,-0.034807,-0.003538,0.057399,-0.029201,-0.083902,-0.039216,0.025106
2,TEST_002,-0.028847,0.000879,-0.017899,0.013757,0.12065,-0.043101,0.002575,0.017365,0.044537,...,-0.053734,-0.057986,0.060782,-0.003203,-0.004732,0.027527,-0.007636,-0.022685,-0.04631,0.029414
3,TEST_003,-0.00705,0.000616,-0.019514,0.033853,0.081581,-0.033796,-0.064221,0.025045,-0.002043,...,-0.033327,-0.000841,0.04951,-0.059528,0.023882,0.011655,-0.0053,-0.023497,-0.023127,0.041698
4,TEST_004,0.015772,0.025752,-0.017363,-0.00482,0.102921,-0.017052,0.032527,0.002291,-0.010819,...,0.000268,-0.068008,0.067847,-0.035059,-0.030108,0.007864,0.018334,-0.033835,-0.016224,0.003315


In [35]:
submit.to_csv('kogpt2-instructed-15e.csv', index=False)