In [1]:
from google.colab import drive

# Mount Google Drive so the competition files under /content/drive are readable.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Manual device selection, left disabled; nothing below reads `device`
# (the matching model.to(device) call in the training cell is commented out too).
# NOTE(review): "gpu" is not a valid torch.device type; since the guard checks
# MPS availability, "mps" was presumably intended — confirm before re-enabling.
#device = torch.device("gpu") if torch.backends.mps.is_available() else torch.device("cpu")
#device

In [5]:
# Load data
# The competition files all live in one Drive folder; keep that folder in a
# single constant so the location only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/문장 유형 분류 AI 경진대회'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [6]:
# Create the label encoder and convert the string class labels in train.csv
# into integer ids in a single fit-and-transform step.
# Caution: the 'label' column is overwritten in place, so re-running this cell
# without reloading train_df would refit the encoder on the integer ids and
# inverse_transform would no longer return the original label strings.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])

In [7]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible. Note that train_df is rebound to the remaining 80%, so
# re-running this cell alone would split the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [8]:
# Tokenizer and model setup
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Size the classification head from the classes the label encoder actually
# learned instead of a hard-coded count: if the head had more outputs than the
# encoder has classes, an argmax over the logits could produce an id that
# label_encoder.inverse_transform cannot map back to a label string.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [9]:
# Dataset and data loader setup
class SentenceDataset(torch.utils.data.Dataset):
    """Labelled sentence dataset backed by a pre-tokenised DataFrame.

    All sentences in the '문장' column are tokenised once at construction time
    (truncation and padding enabled); the integer targets come from the
    'label' column.
    """

    def __init__(self, df, tokenizer):
        """Tokenise every sentence in ``df`` and store its labels.

        Args:
            df: DataFrame with a '문장' (sentence) column and a 'label' column.
            tokenizer: Callable taking a list of strings plus ``truncation`` and
                ``padding`` keyword arguments and returning a mapping of
                feature name -> per-sentence sequences.
        """
        sentences = df['문장'].tolist()
        self.encodings = tokenizer(sentences, truncation=True, padding=True)
        self.labels = df['label'].tolist()

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx``, including its 'labels' entry."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        """Number of samples, i.e. the number of stored labels."""
        return len(self.labels)

# Build the tokenised training and validation datasets from their frames.
train_dataset, val_dataset = (
    SentenceDataset(frame, tokenizer) for frame in (train_df, val_df)
)

# Plain PyTorch loaders over the same datasets. The Trainer call further down
# is given the datasets themselves, not these loaders.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)


In [10]:
# Training arguments
# Collected in one mapping so the hyperparameters can be read (and tweaked) at a glance.
training_config = dict(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    # run evaluation once per epoch
    evaluation_strategy='epoch',
    # optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

In [11]:
import os

# Switch off the tokenizers library's internal parallelism before training starts.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [13]:
# Run training.
# No explicit model.to(device) call is made; the model is handed to Trainer as-is.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.3294,0.83805
2,0.677,0.831495
3,0.4893,0.905176


TrainOutput(global_step=2481, training_loss=0.7481875067898843, metrics={'train_runtime': 2169.9594, 'train_samples_per_second': 18.293, 'train_steps_per_second': 1.143, 'total_flos': 6429827257862400.0, 'train_loss': 0.7481875067898843, 'epoch': 3.0})

In [17]:
# Inspect the training split; at this point 'label' holds the integer ids
# written by the label encoder, not the original label strings.
train_df

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
11971,TRAIN_11971,화양정(서울 성동구 살곶이목장 내에 있던 정자) 앞에 목책을 세우고 각 읍에 예치했...,사실형,긍정,과거,확실,17
12062,TRAIN_12062,A씨는 지난 2020년 11월 10일 오전 1시 10분쯤 인천시 미추홀구 자신의 모...,사실형,긍정,과거,확실,17
12184,TRAIN_12184,제조원가는 물론 판매관리비와 이익까지 과세표준에 포함되는 국산맥주와 달리 수입맥주는...,사실형,긍정,과거,확실,17
1964,TRAIN_01964,똑똑선물샵은 선물하는 대상과 의도에 맞는 적절한 상품을 AI가 큐레이션하는 서비스다.,사실형,긍정,현재,확실,21
10219,TRAIN_10219,B군의 머리카락이 비교적 짧았지만 마른 체형인데다 여장을 한 상황에서 성별을 구별하...,추론형,긍정,과거,확실,48
...,...,...,...,...,...,...,...
11284,TRAIN_11284,"20일 출시되는 일본산 게임 ＇우마무스메 프리티더비＇와 23일 한국산 ＇미르M＇, ...",예측형,긍정,현재,확실,38
11964,TRAIN_11964,여기에 설상가상 케인은 4주 이상 결장이 예상돼 손흥민의 복귀는 토트넘에게는 반가운...,추론형,긍정,현재,확실,52
5390,TRAIN_05390,단한 번도 경험해보지 못한 선거를 경험할 판이다.,추론형,긍정,현재,불확실,51
860,TRAIN_00860,"재고를 팔아 매출채권을 찍고 다시 현금이 유입되는 선순환이 원활히 작동해야 하는데,...",사실형,긍정,과거,확실,17


In [19]:
class SentenceDataset_pred(torch.utils.data.Dataset):
    """Unlabelled sentence dataset used at prediction time.

    Sentences in the '문장' column are tokenised once at construction time
    (truncation and padding enabled); samples carry no 'labels' entry.
    """

    def __init__(self, df, tokenizer):
        """Tokenise every sentence in ``df``.

        Args:
            df: DataFrame with a '문장' (sentence) column.
            tokenizer: Callable taking a list of strings plus ``truncation`` and
                ``padding`` keyword arguments; its result must offer ``items()``
                and an ``input_ids`` attribute.
        """
        sentences = df['문장'].tolist()
        self.encodings = tokenizer(sentences, truncation=True, padding=True)

    def __getitem__(self, idx):
        """Return the feature tensors for sample ``idx``."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

    def __len__(self):
        """Number of samples, taken from the length of the tokenised input ids."""
        token_rows = self.encodings.input_ids
        return len(token_rows)

# Tokenise the unlabelled test sentences.
test_dataset = SentenceDataset_pred(test_df, tokenizer)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

# Run inference, take the highest-scoring class id per row, and map the ids
# back to the original label strings with the fitted encoder.
predictions = trainer.predict(test_dataset=test_dataset)
logits = predictions.predictions
class_ids = np.argmax(logits, axis=1)
predicted_labels = label_encoder.inverse_transform(class_ids)

predicted_labels

array(['사실형-긍정-현재-확실', '사실형-긍정-현재-확실', '사실형-긍정-과거-확실', ...,
       '사실형-긍정-현재-확실', '추론형-긍정-미래-확실', '사실형-긍정-과거-확실'], dtype=object)

In [24]:
# Load the sample submission, which serves as the template for the output file.
submission = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/문장 유형 분류 AI 경진대회/sample_submission.csv'
)
submission.head(n=5)

Unnamed: 0,ID,label
0,TEST_0000,추론형-긍정-현재-확실
1,TEST_0001,추론형-긍정-현재-확실
2,TEST_0002,추론형-긍정-현재-확실
3,TEST_0003,추론형-긍정-현재-확실
4,TEST_0004,추론형-긍정-현재-확실


In [25]:
# Replace the placeholder labels with the model's predictions
# (matched to the rows purely by position).
submission = submission.assign(label=predicted_labels)
submission.head(n=5)

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-과거-확실
4,TEST_0004,사실형-긍정-과거-확실


In [26]:
# Write the submission file to Drive without the DataFrame index column.
submission.to_csv(
    path_or_buf='/content/drive/MyDrive/문장 유형 분류 AI 경진대회/submission_bert.csv',
    index=False,
)