In [79]:
!pip install transformers
!pip install accelerate -U
!pip install transformers[torch]



In [80]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [81]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
df = pd.read_csv('/content/drive/MyDrive/dataset/kotrain.tsv', sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,contents,label
0,김길태가 게이엿다면 어떻게됫을까?,1
1,옷입고해...,1
2,간간히 음담패설 나오지? 그게 일반인들이 너거들보는 시선이야 평생말이다 알겠냐?,1
3,인류의 문명은 인간의본성의 기초해 발전하고 만들어온거다 그런대 기본을 부정한다면 인...,1
4,썩은 가지는 쳐내야한다~~~그래야 건강한 나무가 된다~~~,1


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8836 entries, 0 to 8835
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   contents  8836 non-null   object
 1   label     8836 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 138.2+ KB


In [84]:
# train / test set 나누기
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

In [85]:
#중복제거
train_data.drop_duplicates(subset=["contents"],inplace=True)
test_data.drop_duplicates(subset=["contents"],inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7069 entries, 4043 to 7059
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   contents  7069 non-null   object
 1   label     7069 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 165.7+ KB


## 토크나이징

In [86]:
Model_name = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(Model_name)

In [87]:
tokenized_train_contents = tokenizer(
    list(train_data["contents"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [88]:
print(tokenized_train_contents[0].tokens)
print(tokenized_train_contents[0].ids)
print(tokenized_train_contents[0].attention_mask)

['ĠëıĻìĦ±ìķł', 'ìŀĲëĵ¤', 'Ġë°Ģ', 'ì§ĳ', 'êµ¬ìĹŃ', 'ìĹĲìĦľëĤĺ', 'Ġíķĺì§Ģ', 'ĠìĻľ', 'Ġìĭľë¯¼ëĵ¤', 'Ġë§İìĿĢ', 'ê³³ìĹĲìĦľ', 'ĠíķĺëĥĲê³ł', 'ĠãħħãħĤ', 'ĠìķĪ', 'êµ¬', 'íħĮ', 'ëŁ¬ë', '¥¼', 'Ġê¼Ń', 'Ġíķ´ìķ¼', 'ĠìĭľìĽĲ', 'íķĺê²łëĥĲ', '?', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [89]:
tokenized_test_contents = tokenizer(
    list(test_data["contents"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
    )

In [90]:
class CurseDataset(torch.utils.data.Dataset):
  def __init__(self, encodings,labels):
    self.encodings = encodings
    self.labels=labels

  def __getitem__(self,idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item["labels"] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [91]:
train_label = train_data["label"].values
test_label = test_data["label"].values

train_dataset = CurseDataset(tokenized_train_contents, train_label)
test_dataset=CurseDataset(tokenized_test_contents,test_label)

## 모델 학습

In [92]:
model = AutoModelForSequenceClassification.from_pretrained(Model_name,num_labels=2)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,  # 학습률을 2e-5로 설정
)

In [94]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  precision, recall, f1, _ = precision_recall_fscore_support(labels,preds,average='binary')
  acc = accuracy_score(labels, preds)
  return{
      'accuracy':acc,
      'f1':f1,
      'precision':precision,
      'recall':recall
  }

In [95]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [96]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.1394
1000,0.0214
1500,0.008
2000,0.0054


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2210, training_loss=0.039674627646062166, metrics={'train_runtime': 369.4716, 'train_samples_per_second': 191.327, 'train_steps_per_second': 5.982, 'total_flos': 4649830125849600.0, 'train_loss': 0.039674627646062166, 'epoch': 10.0})

## 모델 평가

In [97]:
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.0753539428114891,
 'eval_accuracy': 0.9864176570458404,
 'eval_f1': 0.9893522626441881,
 'eval_precision': 0.9875996457041629,
 'eval_recall': 0.9911111111111112,
 'eval_runtime': 2.788,
 'eval_samples_per_second': 633.799,
 'eval_steps_per_second': 10.043,
 'epoch': 10.0}

In [98]:
real_test = pd.read_csv('/content/drive/MyDrive/dataset/kotest.tsv', sep='\t', encoding='utf-8')
real_test.head()

Unnamed: 0,contents,label
0,기자들 이제 게이기삿거리 찾아다니겠네 ㅋㅋㅋ반응이월드컵,1.0
1,트럭앞에 게이창조라니게이창조라니 게이창조라니창조라니 게이창조라니아니 이게 무슨소리요...,1.0
2,게이라고 단체로 커밍아웃하는건신경안쓰는데 왜 팬티바람으로부대끼고있냐고 게이가 아니라...,1.0
3,당연히 외국이겠거니 들어와봤더니게이창조라니 창조할게 따로있지폭풍치질들아,1.0
4,저색기들 지금 어디박혀서 광란의파티를 즐기겠지,1.0


In [99]:
real_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2205 entries, 0 to 2204
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   contents  2205 non-null   object 
 1   label     2205 non-null   float64
dtypes: float64(1), object(1)
memory usage: 34.6+ KB


In [100]:
real_test = real_test.astype({"label":"int"})
real_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2205 entries, 0 to 2204
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   contents  2205 non-null   object
 1   label     2205 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 34.6+ KB


In [101]:
tokenized_real_test_contents = tokenizer(
    list(real_test["contents"]),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
    )

In [102]:
real_label = real_test["label"].values

In [103]:
real_test_dataset=CurseDataset(tokenized_real_test_contents,real_label)

In [104]:
trainer.evaluate(eval_dataset=real_test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.7559102773666382,
 'eval_accuracy': 0.8072562358276644,
 'eval_f1': 0.882304070894489,
 'eval_precision': 0.8037336024217961,
 'eval_recall': 0.9779005524861878,
 'eval_runtime': 3.487,
 'eval_samples_per_second': 632.353,
 'eval_steps_per_second': 10.037,
 'epoch': 10.0}