In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from argparse import Namespace

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from pororo import Pororo


class UnsmileDataset(Dataset):
    '''
    Map-style dataset over a vectorized unsmile dataframe.

    unsmile_df(pandas.DataFrame): unsmile dataset with vectorized sentence('문장').
    Column 0 is read as the sentence vector and every remaining column as a label.
    '''
    def __init__(self, unsmile_df):
        self.unsmile_df = unsmile_df

    def __len__(self):
        '''number of rows (samples) in the wrapped dataframe'''
        return self.unsmile_df.shape[0]

    def __getitem__(self, index):
        '''
        index(int): positional row number.
        returns (sentence_vector, label_vector); label_vector is an int32 numpy array.
        '''
        # Address cells by explicit (row, column) position. The previous
        # `iloc[index][0]` looked up the integer 0 on a row Series whose labels
        # are column names, which only works through pandas' deprecated
        # positional fallback of Series.__getitem__.
        sentence_vector = self.unsmile_df.iloc[index, 0]
        label_vector = self.unsmile_df.iloc[index, 1:].to_numpy(dtype=np.int32) # note dtype
        return sentence_vector, label_vector
    
class MultiLayerPerceptron(nn.Module):
    '''
    Two-layer perceptron: Linear -> BatchNorm1d -> ReLU -> Linear.

    input: 768 dimension sentence vector transformed by Pororo sentence embedding
    output: 11 dimension vector which contains values for '여성/가족', .... , '개인지칭'
            (the last layer has no activation, so these are raw scores)
    '''
    input_dim = 768
    hidden_dim = 64
    output_dim = 11

    def __init__(self):
        super().__init__()
        # Layers are listed in forward order; the container stays `self.fc`
        # so parameter names (fc.0.*, fc.1.*, fc.3.*) are unchanged.
        layers = [
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.BatchNorm1d(self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        '''run x through the layer stack and return the result'''
        scores = self.fc(x)
        return scores
    
def prune_dataset(data_file, data_dir=None):
    '''
    read a tab-separated dataset file and randomly drop about half of the rows
    of every non-'clean' category, one category after another.

    data_file(str): file name inside data_dir
    data_dir(str, optional): directory holding the file. Defaults to
        <cwd>/dataset/korean_unsmile_dataset/ (pass another directory, e.g. a
        mounted drive, instead of editing this function).

    returns the pruned pandas dataframe. Row labels of the surviving rows are
    kept as read, so the index is no longer contiguous.
    The selection uses numpy's global random state; seed it for reproducibility.
    '''
    if data_dir is None:
        data_dir = os.getcwd() + '/dataset/korean_unsmile_dataset/'
    df = pd.read_csv(os.path.join(data_dir, data_file), sep='\t')

    # every column after the first one is treated as a category column
    categories = df.columns.to_list()[1:]

    for category in categories:
        if category == 'clean':
            continue

        # labels of the rows still present that carry this category
        indexes = df.index[df[category] == 1]

        # Select each of those rows for removal when its random draw exceeds 0.5.
        # The selection must be done by boolean masking: multiplying the mask
        # with the labels (as the previous code did) turns every rejected
        # entry into label 0 and therefore always removed row 0 as well.
        mask = np.random.random(len(indexes)) > 0.5
        df.drop(indexes[mask], inplace=True)

    return df

def vectorize_dataset(df, vectorizer):
    '''
    transform '문장' column's elements from string to numpy array,
    and return the pandas dataframe. Pororo is used for sentence embedding.

    df(pandas.DataFrame): first column holds the raw sentences, the remaining
        columns are carried over unchanged. The frame passed in is not modified.
    vectorizer(callable): called once per sentence; its return value is stored
        as the cell value of the first column.

    returns a new dataframe with the same index and column order as df.
    '''
    sentence_col = df.columns.to_list()[0]

    # iterate over the column values directly instead of df.loc[i], which
    # returns several rows when an index label is duplicated
    arr = [vectorizer(sentence) for sentence in df[sentence_col]]

    # Reuse df's own index. A default 0..n-1 index would make the column-wise
    # concat align on labels and produce NaN rows for any frame whose index is
    # not exactly 0..n-1 (e.g. after rows were dropped).
    s = pd.Series(arr, index=df.index, name=sentence_col)
    labels = df.drop(columns=sentence_col) # copy without the raw sentences
    return pd.concat([s, labels], axis=1)

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    '''
    generator that wraps the dataset in a DataLoader and yields
    (sentences, labels) batches already moved to `device`.

    - drop_last=True: when the dataset size is not a multiple of batch_size,
      the final, smaller batch is skipped
    - shuffle=True: the DataLoader reshuffles the data each time it is iterated
    '''
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    yield from ((batch_x.to(device), batch_y.to(device)) for batch_x, batch_y in loader)

def compute_metrics(y_pred, y_label):
    '''
    calculate accuracy and recall of batch-size data

    y_pred(torch.Tensor): raw model outputs; sigmoid is applied here and
        values above 0.5 count as a positive prediction
    y_label(torch.Tensor): 0/1 labels with the same shape as y_pred

    returns (accuracy, recall) as python floats, both computed over all
    elements of the batch at once (not per category). recall is 0.0 when the
    batch has no positive label, and accuracy is 0.0 for an empty batch,
    instead of raising ZeroDivisionError.
    '''

    y_label = y_label.cpu()
    y_pred = (torch.sigmoid(y_pred) > 0.5).cpu().int()

    element_num = y_label.numel()
    accuracy = (y_label == y_pred).sum().item() / element_num if element_num else 0.0

    positive_num = (y_label == 1).sum().item()
    # labels at the positions predicted positive; their sum is the true-positive count
    true_positive_num = y_label[y_pred == 1].sum().item()
    recall = true_positive_num / positive_num if positive_num else 0.0
    return accuracy, recall

def make_train_state(args):
    '''
    build the bookkeeping dict for a training run: one empty list per
    (split, metric) pair, keyed '<split>_<metric>'. `args` is accepted but not read.
    '''
    state = {}
    for metric in ('loss', 'acc', 'recall'):
        for split in ('train', 'test'):
            state[f'{split}_{metric}'] = []
    return state

def count(df):
    '''
    print how many hatred labels and how many clean labels the dataframe holds.

    df(pandas.DataFrame): first column is skipped; the remaining columns are
        summed per column. The second-to-last column is taken as the clean
        count (assumes the column order of the unsmile files, where 'clean' is
        second to last - confirm if the schema changes).
    Note that labels are counted, not rows: a row with two labels counts twice.
    '''
    # Sum only the label columns, once. Positions are addressed with iloc
    # because integer keys on a string-labelled Series (`df.sum()[-2]`) depend
    # on pandas' deprecated positional fallback.
    label_totals = df.iloc[:, 1:].sum()
    clean_data_num = label_totals.iloc[-2]
    hatred_data_num = label_totals.sum() - clean_data_num
    print(f'hatred data: {hatred_data_num}, clean data: {clean_data_num}')

In [None]:
# Directory that holds the unsmile TSV files, relative to the current working directory.
# (alternative location: os.getcwd() + '/drive/MyDrive/dataset/korean_unsmile_dataset-main/')
data_dir = f'{os.getcwd()}/dataset/korean_unsmile_dataset/'
train_data_file, test_data_file = 'unsmile_train_v1.0.tsv', 'unsmile_valid_v1.0.tsv'
# sentence-embedding model (lang='ko'); the cells below pass it to vectorize_dataset
vectorizer = Pororo(task='sentence_embedding', lang='ko')

In [None]:
# Author's note: sentences used to be vectorized on the fly, but that took
# quite a long time (about 150 sec) to vectorize one sentence, so every
# sentence is vectorized up front and kept in memory.

# 1. unbalanced dataset configuration
# train split: read -> vectorize -> wrap
train_df = pd.read_csv(f'{data_dir}{train_data_file}', sep='\t')
vectorized_train_df = vectorize_dataset(train_df, vectorizer)
train_data = UnsmileDataset(vectorized_train_df)

# test split: read -> vectorize -> wrap
test_df = pd.read_csv(f'{data_dir}{test_data_file}', sep='\t')
vectorized_test_df = vectorize_dataset(test_df, vectorizer)
test_data = UnsmileDataset(vectorized_test_df)

In [None]:
# 2. balanced dataset. reduce the number of hatred data (hatred:clean = 3:2)
# Rows are renumbered from 0 right after pruning, because the dropped rows
# leave gaps in the index (to prevent NaNs after joining).
pruned_train_df = prune_dataset('unsmile_train_v1.0.tsv').reset_index(drop=True)
pruned_test_df = prune_dataset('unsmile_valid_v1.0.tsv').reset_index(drop=True)

vectorized_pruned_train_df = vectorize_dataset(pruned_train_df, vectorizer)
train_data = UnsmileDataset(vectorized_pruned_train_df)

vectorized_pruned_test_df = vectorize_dataset(pruned_test_df, vectorizer)
test_data = UnsmileDataset(vectorized_pruned_test_df)

In [None]:
# 3. balanced dataset configuration. increase the number of clean data (hatred:clean = about 1:1)
train_df = pd.read_csv(data_dir + train_data_file, sep='\t')
test_df = pd.read_csv(data_dir + test_data_file, sep='\t')
vectorized_train_df = vectorize_dataset(train_df, vectorizer)
vectorized_test_df = vectorize_dataset(test_df, vectorizer)
count(vectorized_train_df)

# positions of the rows labelled clean, found with one vectorized comparison
# instead of materializing every row through iloc in a python loop
clean_data_indexes = np.flatnonzero(vectorized_train_df['clean'].to_numpy() == 1)
clean_df = vectorized_train_df.iloc[clean_data_indexes]

# append three extra copies, so every clean row appears four times in total
copied_dfs = [clean_df.copy() for _ in range(3)]
ors_vectorized_train_df = pd.concat([vectorized_train_df, *copied_dfs], axis=0, ignore_index=True)
count(ors_vectorized_train_df)

# only the train split is oversampled; the test split is used as read
train_data = UnsmileDataset(ors_vectorized_train_df)
test_data = UnsmileDataset(vectorized_test_df)

In [None]:
# hyper-parameters and device selection for this run
args = Namespace(
    batch_size=128,
    learning_rate=0.0005,
    num_epochs=30,
    cuda=False,
    device='cpu'
)

train_state = make_train_state(args)

# use the GPU whenever one is available
if torch.cuda.is_available():
    args.cuda = True
args.device = torch.device('cuda' if args.cuda else 'cpu')

# model
model = MultiLayerPerceptron()
model = model.to(args.device)

# loss and optimizer
# BCEWithLogitsLoss takes the raw model outputs, so no sigmoid is applied before it
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

# train starts
start_time = time.time()
print(f'<training start!>')
print(f'-total epochs: {args.num_epochs}')
print(f'-batch size: {args.batch_size}')
print(f'-learning rate: {args.learning_rate}')
print(f"-cuda {'avaialble' if args.cuda else 'not available'}")

# A progress mark is printed every `progress_step` batches (about 20 per epoch).
# Clamped to 1: with fewer than 20 batches the step would be 0 and the
# modulo below would raise ZeroDivisionError.
progress_step = max(1, (len(train_data) // args.batch_size) // 20)

for epoch_index in range(args.num_epochs):
    print(f'epoch{epoch_index + 1} : [', end='')

    # ---- training pass ----
    running_loss = 0.0
    running_acc = 0.0
    running_recall = 0.0
    model.train() # this has effects on certain modules (ex, dropout)

    batch_generator = generate_batches(train_data, args.batch_size, device=args.device)

    for batch_index, (x, y) in enumerate(batch_generator):
        if batch_index % progress_step == 0:
            print('>', end='')

        # set all gradients to zero
        optimizer.zero_grad()

        # forward pass
        y_pred = model(x)
        loss = loss_func(y_pred, y.float())

        # backward pass
        loss.backward()
        optimizer.step()

        # metrics: incremental update of the mean over the batches seen so far
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        batch_acc, batch_recall = compute_metrics(y_pred, y)
        running_acc += (batch_acc - running_acc) / (batch_index + 1)
        running_recall += (batch_recall - running_recall) / (batch_index + 1)

    print(']')
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    train_state['train_recall'].append(running_recall)

    # ---- evaluation pass ----
    running_loss = 0.0
    running_acc = 0.0
    running_recall = 0.0
    model.eval()

    # NOTE(review): generate_batches defaults to shuffle=True, drop_last=True,
    # so the last partial batch of the test set is not evaluated here.
    batch_generator = generate_batches(test_data, args.batch_size, device=args.device)

    # no parameter update happens here, so gradient tracking is switched off
    with torch.no_grad():
        for batch_index, (x, y) in enumerate(batch_generator):
            # forward pass
            y_pred = model(x)
            loss = loss_func(y_pred, y.float())

            # metrics: incremental update of the mean over the batches seen so far
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            batch_acc, batch_recall = compute_metrics(y_pred, y)
            running_acc += (batch_acc - running_acc) / (batch_index + 1)
            running_recall += (batch_recall - running_recall) / (batch_index + 1)

    train_state['test_loss'].append(running_loss)
    train_state['test_acc'].append(running_acc)
    train_state['test_recall'].append(running_recall)

# elapsed seconds = now - start (the operands were swapped, printing a negative value)
print(f'time flied: {time.time() - start_time}')
print('<end training!>')

In [None]:
# One panel per metric, train and test curves drawn in the same panel.
# (panel title, key suffix used in train_state)
panels = [('Loss', 'loss'), ('Accuracy', 'acc'), ('Recall', 'recall')]

for position, (title, key) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    plt.title(title)
    plt.xlabel('epoch')
    for split in ('train', 'test'):
        history = train_state[f'{split}_{key}']
        # x-axis follows the number of recorded epochs, so a run that stopped
        # before args.num_epochs can still be plotted
        plt.plot(np.arange(len(history)), history)
    plt.legend(['train', 'test'])

# render the figure (the previous bare plt.plot() call drew nothing)
plt.show()

print('-Final (moving average) values from the last epoch')
for label, key in (('loss', 'loss'), ('accuracy', 'acc'), ('recall', 'recall')):
    for split in ('train', 'test'):
        print(f"{split} {label}: {train_state[f'{split}_{key}'][-1]}")

In [None]:
# total test data metrics: accuracy, and precision / recall per category
model.eval() # make sure BatchNorm uses its running statistics

y_preds = []
y_labels = []

# every test sample is evaluated exactly once: no shuffling, keep the last partial batch
batch_generator = generate_batches(test_data, args.batch_size, shuffle=False,
                                   drop_last=False, device=args.device)

with torch.no_grad():
    for x, y in batch_generator:
        y_pred = (torch.sigmoid(model(x)) > 0.5).int()
        y_preds.append(y_pred.cpu().numpy())
        y_labels.append(y.cpu().numpy())

# shape: (number of test samples, number of categories)
y_preds = np.concatenate(y_preds, axis=0)
y_labels = np.concatenate(y_labels, axis=0)

# accuracy over every (sample, category) cell
accuracy = (y_preds == y_labels).sum() / y_preds.size

# category names come from the dataset that is actually evaluated
categories = list(test_data.unsmile_df.columns[1:])

precisions = []
recalls = []
for i in range(y_preds.shape[1]):

    pred_col = y_preds[:, i]
    label_col = y_labels[:, i]

    TP = ((pred_col == 1) & (label_col == 1)).sum()
    TP_plus_FP = (pred_col == 1).sum()
    TP_plus_FN = (label_col == 1).sum()

    # An undefined ratio (nothing predicted / nothing labelled positive) is
    # stored as NaN, so it is shown as 'nan' and left out of the averages.
    # The previous -1 placeholder was averaged in as if it were a precision.
    precisions.append(TP / TP_plus_FP if TP_plus_FP > 0 else np.nan)
    recalls.append(TP / TP_plus_FN if TP_plus_FN > 0 else np.nan)

print('-Results of test data using a trained model\ncategory\t\tprecision\t\trecall')
for category, precision, recall in zip(categories, precisions, recalls):
    print(f'{category}\t\t{precision}\t\t{recall}')

print(f'\naverage precision: {np.nanmean(precisions)}')
print(f'average recall: {np.nanmean(recalls)}\n')

recall_dict = dict(zip(categories, recalls))
print('-Sorted by recall\ncategory\t\trecall')
# ascending recall; NaN entries are placed last so they do not disturb the ordering
for category, recall in sorted(recall_dict.items(), key=lambda item: (np.isnan(item[1]), item[1])):
    print(f'{category}\t\t{recall}')


# Performance on '개인지칭' (individual targeting) and '기타 혐오' (other hate) is very low - why?
# Per the data statistics, '개인지칭', '기타 혐오' and '연령' (age) have the fewest samples, in that order. -> is the small sample count the problem?
# However, even though '악플/욕설' (malicious comments/profanity) has about 5x more samples than '연령', their recall does not differ much.
# Conversely, '지역' (region) and '종교' (religion), which have the highest recall, do not have especially many samples either.


In [None]:
### Baseline

               precision    recall  f1-score   support

     여성/가족       0.85      0.70      0.76       394
         남성       0.87      0.83      0.85       334
      성소수자       0.90      0.78      0.83       280
     인종/국적       0.87      0.79      0.82       426
         연령       0.92      0.75      0.83       146
         지역       0.87      0.88      0.88       260
         종교       0.87      0.86      0.87       290
      기타혐오       0.92      0.18      0.30       134
     악플/욕설       0.76      0.59      0.67       786
       clean       0.74      0.79      0.77       935

    micro avg      0.82      0.73      0.77      3985
    macro avg      0.86      0.72      0.76      3985
    weighted avg   0.82      0.73      0.77      3985
    samples avg    0.76      0.74      0.75      3985


In [None]:
# Shut down the running notebook kernel once the experiment above is finished.
# NOTE(review): this import sits in the last cell rather than with the other imports at the top.
import IPython
app = IPython.Application.instance()
# NOTE(review): the positional True is presumably the kernel's `restart` flag - confirm against the ipykernel documentation
app.kernel.do_shutdown(True)