<a href="https://colab.research.google.com/github/hope04302/freeSearch/blob/main/mainCode/hate_detection_fix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 한국어 혐오 발언 탐지

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 필요 라이브러리 설치

In [17]:
!pip install transformers
!pip install datasets
!pip install torchmetrics
!pip install accelerate -U



## 데이터셋 로드

- 학습, 검증, 테스트 데이터셋 준비
- 라벨 정보

      class_label:
        names:
          0: origin
          1: physical
          2: politics
          3: profanity
          4: age
          5: gender
          6: race
          7: religion
          8: not_hate_speech

In [18]:
from datasets import load_dataset

train = load_dataset("jeanlee/kmhas_korean_hate_speech", split="train")
validation = load_dataset("jeanlee/kmhas_korean_hate_speech", split="validation")
test = load_dataset("jeanlee/kmhas_korean_hate_speech", split="test")

In [19]:
# 데이터 예제 출력

print(train)
print(validation)
print(test)
print(train['text'][0])
print(train['label'][0])

Dataset({
    features: ['text', 'label'],
    num_rows: 78977
})
Dataset({
    features: ['text', 'label'],
    num_rows: 8776
})
Dataset({
    features: ['text', 'label'],
    num_rows: 21939
})
"자한당틀딱들.. 악플질 고만해라."
[2, 4]


In [20]:
# 사전
print(train['text'][0:10])

# 리스트
print(train[0:10])


['"자한당틀딱들.. 악플질 고만해라."', '정치적으로 편향된 평론한은 분은 별로...', '적당히좀 쳐먹지.그랬냐??? 안그래도 문재인 때문에 나라 엉망진창인데...', '"안서는 아재들 풀발기 ㅋㄲㅋ"', '우와 ㅋ 능력자', '맛녀석 콩트보다 약했음맛녀석 애청자로써 70%실력발휘', '주영훈 솔직히 호감임 잉꼬부부로 소문났잖아', '이게주간아이돌이랑머가달라...', '아오 슈박 회사생활도 졑깥고 돈벌기 힘들어 죽겠구만 뭔 저딴것들 자꾸 tv나와서 사람 짜증나게하냐 외국서 편히살려면 아닥하고 살아라 대한민국서 취미로 돈벌어가지말고 좀 끄지라고!', '"문재인 하는게 뭐 별거있냐?ㅂㅅㅅㅋ가 하는짓인데 어련하겠어.ㅋㅋㅋ"']
{'text': ['"자한당틀딱들.. 악플질 고만해라."', '정치적으로 편향된 평론한은 분은 별로...', '적당히좀 쳐먹지.그랬냐??? 안그래도 문재인 때문에 나라 엉망진창인데...', '"안서는 아재들 풀발기 ㅋㄲㅋ"', '우와 ㅋ 능력자', '맛녀석 콩트보다 약했음맛녀석 애청자로써 70%실력발휘', '주영훈 솔직히 호감임 잉꼬부부로 소문났잖아', '이게주간아이돌이랑머가달라...', '아오 슈박 회사생활도 졑깥고 돈벌기 힘들어 죽겠구만 뭔 저딴것들 자꾸 tv나와서 사람 짜증나게하냐 외국서 편히살려면 아닥하고 살아라 대한민국서 취미로 돈벌어가지말고 좀 끄지라고!', '"문재인 하는게 뭐 별거있냐?ㅂㅅㅅㅋ가 하는짓인데 어련하겠어.ㅋㅋㅋ"'], 'label': [[2, 4], [8], [2], [4], [8], [8], [8], [8], [3], [2, 3]]}


## 모델 및 토크나이저 로드

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name_card = "bert-base-multilingual-cased"
num_labels = 9
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(name_card, do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained(name_card, num_labels=num_labels, problem_type="multi_label_classification")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [22]:
# 토크나이저 예시

ids = tokenizer.encode(train['text'][0])
tokenized_words = tokenizer.convert_ids_to_tokens(ids)
model_input = tokenizer(train['text'][0])

print(tokenized_words)
print(ids)
print(model_input)

['[CLS]', '"', '자', '##한', '##당', '##틀', '##딱', '##들', '.', '.', '악', '##플', '##질', '고', '##만', '##해', '##라', '.', '"', '[SEP]']
[101, 107, 9651, 11102, 21928, 119373, 118826, 27023, 119, 119, 9520, 119412, 48599, 8888, 19105, 14523, 17342, 119, 107, 102]
{'input_ids': [101, 107, 9651, 11102, 21928, 119373, 118826, 27023, 119, 119, 9520, 119412, 48599, 8888, 19105, 14523, 17342, 119, 107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [23]:
# 토크나이저 예시2

model_input = tokenizer(train['text'][0], max_length=4, truncation=True, padding="max_length")
print(model_input)
reverse = tokenizer.convert_ids_to_tokens(model_input['input_ids'])
print(reverse)

{'input_ids': [101, 107, 9651, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
['[CLS]', '"', '자', '[SEP]']


In [24]:
# 토크나이저 예시3

print(model_input)

another_name_card = 'roberta-base'
another_tokenizer = AutoTokenizer.from_pretrained(another_name_card, do_lower_case=False)
model_input = another_tokenizer(train['text'][0], max_length=4, truncation=True, padding="max_length")
print(model_input)

another_name_card = 'distilbert-base-uncased'
another_tokenizer = AutoTokenizer.from_pretrained(another_name_card, do_lower_case=False)
model_input = another_tokenizer(train['text'][0], max_length=4, truncation=True, padding="max_length")
print(model_input)


{'input_ids': [101, 107, 9651, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [0, 113, 43998, 2], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [101, 1000, 100, 102], 'attention_mask': [1, 1, 1, 1]}


In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

def tokenize_function(examples):
    model_input = tokenizer(examples['text'], max_length=128, truncation=True, padding="max_length")

    mlb = MultiLabelBinarizer(classes=[0,1,2,3,4,5,6,7,8])
    one_hot_labels = mlb.fit_transform(examples['label'])

    model_input['label'] = one_hot_labels

    return model_input

In [None]:
print(train)

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_valid = validation.map(tokenize_function, batched=True)

print(tokenized_train)

Dataset({
    features: ['text', 'label'],
    num_rows: 78977
})


Map:   0%|          | 0/8776 [00:00<?, ? examples/s]

## 모델 학습

In [None]:
from transformers import TrainingArguments

BATCH_SIZE = 64
EPOCHS = 1

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/freeSearch2/model',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_strategy="epoch",
    logging_dir='./logs',
    learning_rate=2e-5,
    run_name="v1",
    seed=42,
)

In [None]:
import torch
from torchmetrics import Accuracy, F1Score, HammingDistance, AUROC

def compute_metrics(eval_pred):
    threshold = 0.5

    logits, labels = eval_pred
    logits = torch.Tensor(logits)
    labels = torch.Tensor(labels).long()

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(logits))

    preds = torch.zeros(size = probs.size())
    preds[probs >= threshold] = 1

    accuracy = Accuracy(task='multilabel', num_labels=9)
    f1_macro = F1Score(task="multilabel", num_labels=9, average='macro')
    f1_micro = F1Score(task="multilabel", num_labels=9, average='micro')
    f1_weight = F1Score(task="multilabel", num_labels=9, average='weighted')
    auroc = AUROC(task='multilabel', num_labels=9, average='micro')
    hamming = HammingDistance(task="multiclass", num_classes=2)


    metrics = {'accuracy': accuracy(preds, labels),
               'f1_macro': f1_macro(preds, labels),
               'f1_micro': f1_micro(preds, labels),
               'f1_weighted': f1_weight(preds, labels),
               'auroc': auroc(preds, labels),
               'hamming_loss': hamming(preds, labels)}
    return metrics

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

In [None]:
trainer.save_model()

## 예측

In [None]:
tokenized_test = test.map(tokenize_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/Colab Notebooks/freeSearch2/model'
)
model.to(device)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.evaluate()