Korean hate speech detection model using
* pretrined - KcElectra (discriminator) (electra base model trained on 댓글)
* Dataset - Korean hate speech (labeled) 

In [1]:
import os
import sys
import pandas as pd
import numpy as np 
import torch
import random
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.nn.utils import clip_grad_norm_

In [2]:
# seed
seed = 7777
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
device = torch.device("cuda")

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# %cd /content/drive/MyDrive/data

In [6]:
# !git clone https://github.com/kocohub/korean-hate-speech.git

In [5]:
%cd /content/drive/MyDrive/data/korean-hate-speech/labeled

/content/drive/MyDrive/data/korean-hate-speech/labeled


In [6]:
df_train = pd.read_csv('train.tsv', sep='\t')#.dropna() (no NaN values to drop)
df_dev = pd.read_csv('dev.tsv', sep='\t')#.dropna() (no NaN values to drop)

In [7]:
print(f'train_dataset shape: {df_train.shape}')
print(f'dev_dataset shape: {df_dev.shape}')
df_train.head()

train_dataset shape: (7896, 4)
dev_dataset shape: (471, 4)


Unnamed: 0,comments,contain_gender_bias,bias,hate
0,(현재 호텔주인 심정) 아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속...,False,others,hate
1,....한국적인 미인의 대표적인 분...너무나 곱고아름다운모습...그모습뒤의 슬픔을...,False,none,none
2,"...못된 넘들...남의 고통을 즐겼던 넘들..이젠 마땅한 처벌을 받아야지..,그래...",False,none,hate
3,"1,2화 어설펐는데 3,4화 지나서부터는 갈수록 너무 재밌던데",False,none,none
4,1. 사람 얼굴 손톱으로 긁은것은 인격살해이고2. 동영상이 몰카냐? 메걸리안들 생각...,True,gender,hate


In [8]:
# print(f'{df_train.contain_gender_bias.value_counts()}\n')
# print(f'{df_train.bias.value_counts()}\n')
print(f'{df_train.hate.value_counts()}\n')
print(f'none: {3486/7896} of the dataset')
print(f'offensive: {2499/7896} of the dataset')
print(f'hate: {1911/7896} of the dataset')

none         3486
offensive    2499
hate         1911
Name: hate, dtype: int64

none: 0.44148936170212766 of the dataset
offensive: 0.31648936170212766 of the dataset
hate: 0.24202127659574468 of the dataset


Since the 'minority' of the dataset (hate) is above 20% of the total data count (13.3%) off from being a fully balanced dataset, I will proceed with current data distribution for now.

# Preprocessing


Furthermore, stemming/stopword removal will not be done as it is not effective in hate speech detection. 

Only,
* Chinese Characters
* Special Characters other
* URL
* HTML

will be removed

Hate Label:
* 0: None
* 1: Offensive
* 2: Hate

In [9]:
train_comment = df_train['comments'].to_list()
train_hate = df_train['hate'].to_list()
dev_hate = df_dev['hate'].to_list()

In [10]:
import html
import regex as re
from bs4 import BeautifulSoup

def preprocess(comment):
  sen = re.sub(r'(http|https)\S+', '', comment)                   # url
  sen = BeautifulSoup(html.unescape(sen), 'html.parser').text     # html parse
  sen = sen.replace("\n", " ")                                    # \n
  sen = re.sub('"',' ', sen)                                      # 따옴표 
  sen = re.sub("[^a-zA-Z0-9가-힣]", " ", sen)                  #영문, 한글, 숫자 만
  return sen


In [11]:
train_label = []
dev_label = []

for index in range(len(train_comment)):
    train_comment[index] = preprocess(train_comment[index])


# 0: [1,0,0]  1: [0,1,0]  2: [0,0,1]
for element in train_hate:
    if element == 'none':
        train_label.append([1,0,0])
    elif element == 'offensive':
        train_label.append([0,1,0])
    elif element == 'hate':
        train_label.append([0,0,1])

for element in dev_hate:
    if element == 'none':
        dev_label.append(0)
    elif element == 'offensive':
        dev_label.append(1)
    elif element == 'hate':
        dev_label.append(2)

In [12]:
print(train_label[:10])
print(train_comment[:3])

[[0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 1, 0]]
[' 현재 호텔주인 심정  아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속 추모받네    ', '    한국적인 미인의 대표적인 분   너무나 곱고아름다운모습   그모습뒤의 슬픔을 미처 알지못했네요 ', '   못된 넘들   남의 고통을 즐겼던 넘들  이젠 마땅한 처벌을 받아야지   그래야  공정한 사회지   심은대로 거두거라   ']


# Dataset/Dataloader

In [13]:
!pip install transformers
from transformers import AutoTokenizer, AutoModel, ElectraForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_constant_schedule 

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 89.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 90.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 61.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [14]:
kctokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
kcmodel = AutoModel.from_pretrained("beomi/KcELECTRA-base")
# temp_model = model = ElectraForSequenceClassification.from_pretrained(
#     "monologg/koelectra-base-v3-generator")
kcmodel.to(device)

# for name, param in temp_model.named_parameters():
#     print(name)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(50135, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0): ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [15]:
class CustomDataset(Dataset):
    def __init__(self, input_data, label_data):
        self.X = input_data #list
        self.Y = label_data #list

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [16]:
train_dataset = CustomDataset(train_comment, train_label)
valid_dataset = CustomDataset(df_dev['comments'].to_list(), dev_label)
print(f'Dataset length: {len(train_dataset)}')

Dataset length: 7896


In [17]:
def CustomCollateFn(batch):
    input_list, label_list = [], []

    for text, label in batch:
        input_list.append(text)
        label_list.append(label)

    tokenized_text = kctokenizer(input_list, add_special_tokens=True, padding='longest',
                                truncation=True, max_length=512, return_tensors='pt')
    label_list = torch.Tensor(label_list)

    return (tokenized_text, label_list)

In [18]:
def CustomCollateFn_dev(batch):
    input_list, label_list = [], []

    for text, label in batch:
        input_list.append(text)
        label_list.append(label)

    tokenized_text = kctokenizer(input_list, add_special_tokens=True, padding='longest',
                                truncation=True, max_length=512, return_tensors='pt')
    label_list = torch.Tensor(label_list).type(torch.LongTensor)

    return (tokenized_text, label_list)

In [19]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size = 32,
    sampler = RandomSampler(train_dataset),
    collate_fn = CustomCollateFn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = 64,
    sampler = SequentialSampler(valid_dataset),
    collate_fn = CustomCollateFn_dev,
)

# Classifier
* linear hidden layer size = 768 (same as electra classifier head)
* dropout rate = 0.1 (same as electra classifier head) 
* added ReLU and changed output to 3 from 2 (0,1,2)

In [21]:
# tokenized_temp = tokenizer(text=['현재 호텔주인 심정  아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속 추모받네'
# , '한국적인 미인의 대표적인 분   너무나 곱고아름다운모습   그모습뒤의 슬픔을 미처 알지못했네요'], truncation=True, padding='longest', return_tensors='pt')
# temp = model(input_ids=tokenized_temp['input_ids'], attention_mask=tokenized_temp['attention_mask']
#                                 , token_type_ids=tokenized_temp['token_type_ids'], output_hidden_states=True)





In [20]:
class HateClassifier(nn.Module):
    def __init__(self, hidden_size, n_label):
        super(HateClassifier, self).__init__()

        dropout_rate = 0.5
        linear_layer_size = 515 
        self.kcelectra = AutoModel.from_pretrained("beomi/KcELECTRA-base")
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, linear_layer_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(linear_layer_size, n_label),
        )

    def forward(self, input_ids = None, attention_mask = None, token_type_ids = None):
        output = self.kcelectra(input_ids=input_ids, attention_mask=attention_mask
                                , token_type_ids=token_type_ids)
        cls = output[0][:,0]
        logit = self.classifier(cls)

        return logit

In [21]:
epochs=4

In [22]:
def initializer(train_dataloader, epochs):

    model = HateClassifier(hidden_size=768, n_label=3)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    print(f'total step: {len(train_dataloader) * epochs}')

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0, #round(len(train_dataloader)*0.1),
        num_training_steps = len(train_dataloader) * epochs,

    )

    return model, optimizer, scheduler

In [25]:
# print(kcmodel.keys)

# Train

In [23]:
def save_checkpoint(path, model, optimizer, scheduler, epoch, loss):
    file_name = f'/content/drive/MyDrive/data/checkpoints/hatemodel_v4.ckpt.{epoch}'
    torch.save({
        'epoch':epoch,
        'model_state_dict':model.state_dict(),
        'optimizer_state_dict':optimizer.state_dict(),
        'scheduler_state_dict':scheduler.state_dict(),
        'loss':loss
    }, file_name)

    print(f'SAVING EPOCH {epoch} ...')

In [24]:
def train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs):
    for epoch in range(epochs):
        print(f'****** STARTING TO TRAIN EPOCH #{epoch} ******')

        total_loss = 0
        batch_loss = 0
        batch_count = 0

        model.train()
        model.to(device)

        for step, batch in enumerate(train_dataloader):
            batch_count += 1
            batch = tuple(items.to(device) for items in batch)

            (x_batch, y_batch) = batch

            model.zero_grad()

            logit = model(**x_batch)
            loss = loss_fct(logit, y_batch)

            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if(step % 10 == 0 and step != 0):
                learning_rate = optimizer.param_groups[0]['lr']
                print(f"Epoch: {epoch}, Step : {step}, LR : {learning_rate}, Avg Loss : {batch_loss / batch_count:.4f}")

                batch_loss, batch_count = 0,0

        print(f"Epoch {epoch} Total Mean Loss : {total_loss/(step+1):.4f}")
        print(f"*****Epoch {epoch} Train Finish*****\n")

        if valid_dataloader is not None:
            print(f"*****Epoch {epoch} Valid Start*****")
            valid_loss, valid_acc = validate(model, loss_fct, valid_dataloader)
            print(f"Epoch {epoch} Valid Loss : {valid_loss:.4f} Valid Acc : {valid_acc:.2f}")
            print(f"*****Epoch {epoch} Valid Finish*****\n")
        
        save_checkpoint(".", model, optimizer, scheduler, epoch, total_loss/(step+1))
    
    print('** Train Completed! **')


# Validate


In [25]:
def validate(model, loss_fct, valid_dataloader):

    model.eval()
    model.to(device)
    # model.cuda()
    # loss_fct.to(device)

    total_loss = 0
    total_acc = 0

    for step, batch in enumerate(valid_dataloader):
        batch = tuple(items.to(device) for items in batch)

        batch_x, batch_y = batch

        with torch.no_grad():
            logit = model(**batch_x)

        loss = loss_fct(logit, batch_y)
        total_loss += loss.item()

        # print(batch_y)
        probability = F.softmax(logit, dim=1) #(1,3)
        prediction = torch.argmax(probability, dim=1).flatten()
        accuracy = (prediction == batch_y).cpu().numpy().mean()

        total_acc += accuracy
    
    total_loss = total_loss / (step+1)
    total_acc = total_acc / (step+1) * 100

    return total_loss, total_acc


In [29]:
# # 001
# practice_data = ['현재 호텔주인 심정  아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속 추모받네']
# pt_data = kctokenizer(practice_data, add_special_tokens=True, padding='longest',
#                                 truncation=True, max_length=512, return_tensors='pt')
# tempo_model = HateClassifier(hidden_size=768, n_label=3)
# tempo_logit = tempo_model(**pt_data)
# print(tempo_logit)

# loss_fct = CrossEntropyLoss()
# tempoo = torch.Tensor([[ 0.0609, -0.0392,  0.2162]]) 
# temp_probs = F.softmax(tempoo, dim=1) #tensor([[0.3254, 0.2944, 0.3801]])
# temp_probs = torch.Tensor([[0,0,1], [1,0,0]])
# losss = loss_fct(tempoo, temp_probs)
# print(losss)
# preds = torch.argmax(temp_probs, dim=1).flatten() #tensor([2])
# print(preds)

# Training!

In [38]:
device = torch.device("cuda")
train_dataloader = DataLoader(
    train_dataset,
    batch_size = 32,
    sampler = RandomSampler(train_dataset),
    collate_fn = CustomCollateFn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = 64,
    sampler = SequentialSampler(valid_dataset),
    collate_fn = CustomCollateFn_dev,
)
# dropout_rate = 0.1
# AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# round(len(train_dataloader)*0.1)
# saving at /content/drive/MyDrive/data/checkpoints

epochs = 4
loss_fct = CrossEntropyLoss()
model, optimizer, scheduler = initializer(train_dataloader, epochs)
train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs)

# validate(model, loss_fct, valid_dataloader, device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total step: 988
****** STARTING TO TRAIN EPOCH #0 ******
Epoch: 0, Step : 10, LR : 1.9777327935222675e-05, Avg Loss : 1.0726
Epoch: 0, Step : 20, LR : 1.9574898785425103e-05, Avg Loss : 1.0804
Epoch: 0, Step : 30, LR : 1.937246963562753e-05, Avg Loss : 1.0528
Epoch: 0, Step : 40, LR : 1.917004048582996e-05, Avg Loss : 1.0441
Epoch: 0, Step : 50, LR : 1.896761133603239e-05, Avg Loss : 0.9925
Epoch: 0, Step : 60, LR : 1.876518218623482e-05, Avg Loss : 0.9789
Epoch: 0, Step : 70, LR : 1.856275303643725e-05, Avg Loss : 0.9478
Epoch: 0, Step : 80, LR : 1.8360323886639677e-05, Avg Loss : 0.8960
Epoch: 0, Step : 90, LR : 1.8157894736842107e-05, Avg Loss : 0.9114
Epoch: 0, Step : 100, LR : 1.7955465587044535e-05, Avg Loss : 0.8110
Epoch: 0, Step : 110, LR : 1.7753036437246965e-05, Avg Loss : 0.8387
Epoch: 0, Step : 120, LR : 1.7550607287449396e-05, Avg Loss : 0.8608
Epoch: 0, Step : 130, LR : 1.7348178137651823e-05, Avg Loss : 0.9555
Epoch: 0, Step : 140, LR : 1.714574898785425e-05, Avg Loss :

# V2
linear hidden layer size 768 -> 515
dropout: 0.1 -> 0.5

In [45]:
device = torch.device("cuda")
train_dataloader = DataLoader(
    train_dataset,
    batch_size = 32,
    sampler = RandomSampler(train_dataset),
    collate_fn = CustomCollateFn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = 64,
    sampler = SequentialSampler(valid_dataset),
    collate_fn = CustomCollateFn_dev,
)
# dropout_rate = 0.1
# AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# round(len(train_dataloader)*0.1)
# saving at /content/drive/MyDrive/data/checkpoints

epochs = 4
loss_fct = CrossEntropyLoss()
model, optimizer, scheduler = initializer(train_dataloader, epochs)
train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs)

# validate(model, loss_fct, valid_dataloader, device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total step: 988
****** STARTING TO TRAIN EPOCH #0 ******
Epoch: 0, Step : 10, LR : 1.9777327935222675e-05, Avg Loss : 1.0949
Epoch: 0, Step : 20, LR : 1.9574898785425103e-05, Avg Loss : 1.0832
Epoch: 0, Step : 30, LR : 1.937246963562753e-05, Avg Loss : 1.0732
Epoch: 0, Step : 40, LR : 1.917004048582996e-05, Avg Loss : 1.0272
Epoch: 0, Step : 50, LR : 1.896761133603239e-05, Avg Loss : 1.0252
Epoch: 0, Step : 60, LR : 1.876518218623482e-05, Avg Loss : 1.0310
Epoch: 0, Step : 70, LR : 1.856275303643725e-05, Avg Loss : 0.9780
Epoch: 0, Step : 80, LR : 1.8360323886639677e-05, Avg Loss : 0.9340
Epoch: 0, Step : 90, LR : 1.8157894736842107e-05, Avg Loss : 0.9419
Epoch: 0, Step : 100, LR : 1.7955465587044535e-05, Avg Loss : 0.8781
Epoch: 0, Step : 110, LR : 1.7753036437246965e-05, Avg Loss : 0.8556
Epoch: 0, Step : 120, LR : 1.7550607287449396e-05, Avg Loss : 0.8495
Epoch: 0, Step : 130, LR : 1.7348178137651823e-05, Avg Loss : 0.8661
Epoch: 0, Step : 140, LR : 1.714574898785425e-05, Avg Loss :

# V3
When preprocessing the data, only erased html/url while keeping all special characters and punctuations (with v2 settings)

In [26]:
device = torch.device("cuda")
train_dataloader = DataLoader(
    train_dataset,
    batch_size = 32,
    sampler = RandomSampler(train_dataset),
    collate_fn = CustomCollateFn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = 64,
    sampler = SequentialSampler(valid_dataset),
    collate_fn = CustomCollateFn_dev,
)
# dropout_rate = 0.1
# AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# round(len(train_dataloader)*0.1)
# saving at /content/drive/MyDrive/data/checkpoints

epochs = 4
loss_fct = CrossEntropyLoss()
model, optimizer, scheduler = initializer(train_dataloader, epochs)
train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs)

# validate(model, loss_fct, valid_dataloader, device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total step: 988
****** STARTING TO TRAIN EPOCH #0 ******
Epoch: 0, Step : 10, LR : 1.9777327935222675e-05, Avg Loss : 1.0867
Epoch: 0, Step : 20, LR : 1.9574898785425103e-05, Avg Loss : 1.0871
Epoch: 0, Step : 30, LR : 1.937246963562753e-05, Avg Loss : 1.0686
Epoch: 0, Step : 40, LR : 1.917004048582996e-05, Avg Loss : 1.0734
Epoch: 0, Step : 50, LR : 1.896761133603239e-05, Avg Loss : 1.0437
Epoch: 0, Step : 60, LR : 1.876518218623482e-05, Avg Loss : 1.0309
Epoch: 0, Step : 70, LR : 1.856275303643725e-05, Avg Loss : 1.0246
Epoch: 0, Step : 80, LR : 1.8360323886639677e-05, Avg Loss : 1.0117
Epoch: 0, Step : 90, LR : 1.8157894736842107e-05, Avg Loss : 0.9673
Epoch: 0, Step : 100, LR : 1.7955465587044535e-05, Avg Loss : 0.9543
Epoch: 0, Step : 110, LR : 1.7753036437246965e-05, Avg Loss : 0.9031
Epoch: 0, Step : 120, LR : 1.7550607287449396e-05, Avg Loss : 0.9343
Epoch: 0, Step : 130, LR : 1.7348178137651823e-05, Avg Loss : 0.8996
Epoch: 0, Step : 140, LR : 1.714574898785425e-05, Avg Loss :

# V4
KcElectra-base -> "monologg/koelectra-base-v3-discriminator"

In [26]:
device = torch.device("cuda")
train_dataloader = DataLoader(
    train_dataset,
    batch_size = 32,
    sampler = RandomSampler(train_dataset),
    collate_fn = CustomCollateFn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = 64,
    sampler = SequentialSampler(valid_dataset),
    collate_fn = CustomCollateFn_dev,
)
# dropout_rate = 0.1
# AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# round(len(train_dataloader)*0.1)
# saving at /content/drive/MyDrive/data/checkpoints

epochs = 4
loss_fct = CrossEntropyLoss()
model, optimizer, scheduler = initializer(train_dataloader, epochs)
train(model, loss_fct, scheduler, optimizer, train_dataloader, valid_dataloader, epochs)

# validate(model, loss_fct, valid_dataloader, device)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total step: 988
****** STARTING TO TRAIN EPOCH #0 ******
Epoch: 0, Step : 10, LR : 1.9777327935222675e-05, Avg Loss : 1.0920
Epoch: 0, Step : 20, LR : 1.9574898785425103e-05, Avg Loss : 1.0677
Epoch: 0, Step : 30, LR : 1.937246963562753e-05, Avg Loss : 1.0442
Epoch: 0, Step : 40, LR : 1.917004048582996e-05, Avg Loss : 1.0672
Epoch: 0, Step : 50, LR : 1.896761133603239e-05, Avg Loss : 1.0568
Epoch: 0, Step : 60, LR : 1.876518218623482e-05, Avg Loss : 1.0505
Epoch: 0, Step : 70, LR : 1.856275303643725e-05, Avg Loss : 1.0161
Epoch: 0, Step : 80, LR : 1.8360323886639677e-05, Avg Loss : 1.0202
Epoch: 0, Step : 90, LR : 1.8157894736842107e-05, Avg Loss : 1.0107
Epoch: 0, Step : 100, LR : 1.7955465587044535e-05, Avg Loss : 1.0018
Epoch: 0, Step : 110, LR : 1.7753036437246965e-05, Avg Loss : 0.9744
Epoch: 0, Step : 120, LR : 1.7550607287449396e-05, Avg Loss : 0.9357
Epoch: 0, Step : 130, LR : 1.7348178137651823e-05, Avg Loss : 0.9060
Epoch: 0, Step : 140, LR : 1.714574898785425e-05, Avg Loss :

## Out of all v1,v2,v3,v4 HateModel_v2 epoch1 had the highest Validation Accuracy with 74.0%


In [121]:
v2_1_checkpoint = torch.load('/content/drive/MyDrive/data/checkpoints/hatemodel_v2.ckpt.1')

In [122]:
v2_1_checkpoint.keys()

dict_keys(['epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'loss'])

In [123]:
model, optimizer, scheduler = initializer(train_dataloader, 1)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


total step: 247


In [124]:
model.load_state_dict(v2_1_checkpoint["model_state_dict"])

<All keys matched successfully>

In [125]:
unlabeled_test = pd.read_csv('/content/drive/MyDrive/data/test.hate.no_label.csv')

In [126]:
unlabeled_test.head()
unlabeled = unlabeled_test['comments'].to_list()
print(len(unlabeled))

974


In [127]:
def CustomCollateFn_unlabeled(batch):
    input_list = []

    for text in batch:
        input_list.append(text)

    tokenized_text = kctokenizer(input_list, add_special_tokens=True, padding='longest',
                                truncation=True, max_length=512, return_tensors='pt')

    return tokenized_text

In [128]:
unlabeled_dataloader = DataLoader(
    unlabeled,
    batch_size = 64,
    sampler = SequentialSampler(unlabeled),
    collate_fn = CustomCollateFn_unlabeled,
)

In [135]:
def predict(model, unlabeled_dataloader):
    model.eval()
    model.to(device)
    pred_list = []
    label_list = []
    
    for step, batch in enumerate(unlabeled_dataloader):

        comments = batch.to(device)
        with torch.no_grad():
            logit = model(**comments)
            probability = F.softmax(logit, dim=1)
            prediction = torch.argmax(probability, dim=1).flatten()
            pred_list.append(prediction.cpu())
    # print((pred_list))
    # print(len(pred_list))

    for element in pred_list:
        for label in element:
            if label == 0:
                label_list.append(0)
            elif label == 1:
                label_list.append(1)
            elif label == 2:
                label_list.append(2)
    print(label_list)
    return label_list


generated = predict(model, unlabeled_dataloader)

[2, 0, 0, 0, 1, 0, 1, 2, 1, 2, 2, 1, 1, 0, 2, 1, 2, 2, 1, 0, 1, 1, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 1, 1, 0, 0, 0, 0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 0, 0, 2, 2, 1, 1, 1, 0, 2, 0, 2, 0, 2, 1, 1, 2, 0, 0, 0, 2, 1, 2, 0, 0, 2, 2, 1, 1, 0, 2, 2, 1, 0, 2, 2, 0, 1, 2, 1, 0, 2, 1, 1, 2, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 1, 2, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 2, 2, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 1, 1, 0, 0, 1, 2, 1, 0, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0, 2, 1, 0, 1, 1, 0, 1, 2, 0, 2, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 2, 1, 0, 0, 2, 2, 1, 0, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 0, 2, 0, 1, 1, 2, 1, 2, 2, 0, 2, 1, 2, 2, 2, 1, 0, 0, 1, 2, 2, 0, 0, 0, 1, 0, 0, 2, 

In [136]:
df = pd.DataFrame(list(zip(unlabeled, generated)), columns =['comments', 'label']) 
df.head()
# df = pd.DataFrame(generated, columns =['label']) 
# df.head()
# print(len(df))

Unnamed: 0,comments,label
0,ㅋㅋㅋㅋ 그래도 조아해주는 팬들 많아서 좋겠다 ㅠㅠ 니들은 온유가 안만져줌 ㅠㅠ,2
1,둘다 넘 좋다~행복하세요,0
2,근데 만원이하는 현금결제만 하라고 써놓은집 우리나라에 엄청 많은데,0
3,원곡생각하나도 안나고 러블리즈 신곡나온줄!!! 너무 예쁘게 잘봤어요,0
4,장현승 얘도 참 이젠 짠하다...,1


In [137]:
print(generated)

[2, 0, 0, 0, 1, 0, 1, 2, 1, 2, 2, 1, 1, 0, 2, 1, 2, 2, 1, 0, 1, 1, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 1, 1, 0, 0, 0, 0, 2, 1, 0, 1, 2, 0, 1, 2, 0, 0, 0, 2, 2, 1, 1, 1, 0, 2, 0, 2, 0, 2, 1, 1, 2, 0, 0, 0, 2, 1, 2, 0, 0, 2, 2, 1, 1, 0, 2, 2, 1, 0, 2, 2, 0, 1, 2, 1, 0, 2, 1, 1, 2, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 1, 2, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 2, 2, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 1, 1, 0, 0, 1, 2, 1, 0, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 0, 0, 2, 1, 0, 1, 1, 0, 1, 2, 0, 2, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 2, 1, 0, 0, 2, 2, 1, 0, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 0, 2, 0, 1, 1, 2, 1, 2, 2, 0, 2, 1, 2, 2, 2, 1, 0, 0, 1, 2, 2, 0, 0, 0, 1, 0, 0, 2, 

In [138]:
df.to_csv('/content/drive/MyDrive/data/submission.csv', index=False)

## Although Korean-Hate-Speech model does provide with labeled training/dev data, it does not provide labeled test data (testing available only through kaggle competition) 

## Kaggle submission was graded with Score: 0.66421 (which I am not entirely sure if its the accuracy of the generated values or calculated differently)