In [2]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from argparse import Namespace
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from pororo import Pororo


class UnsmileDataset(Dataset):
    '''
    Map-style dataset over a vectorized unsmile dataframe.

    unsmile_df(pandas.DataFrame): unsmile dataset with vectorized sentence('문장')
    in the first column and the label columns in all remaining columns.
    '''
    def __init__(self, unsmile_df):
        self.unsmile_df = unsmile_df

    def __len__(self):
        '''number of rows (samples) in the dataframe'''
        return len(self.unsmile_df)

    def __getitem__(self, index):
        '''
        return (sentence_vector, label_vector) for the row at position `index`.

        sentence_vector: the object stored in the first column of that row
        label_vector: numpy int32 array built from the remaining columns
        '''
        # purely positional access: `iloc[index][0]` relied on the integer-key
        # fallback of a label-indexed Series, which pandas has deprecated
        sentence_vector = self.unsmile_df.iloc[index, 0]
        label_vector = self.unsmile_df.iloc[index, 1:].to_numpy(dtype=np.int32) # note dtype
        return sentence_vector, label_vector
    
class MultiLayerPerceptron(nn.Module):
    '''
    Feed-forward classifier with a single hidden layer.

    input: 768 dimension sentence vector transformed by Pororo sentence embedding
    output: 11 dimension vector which contains values for '여성/가족', .... , '개인지칭'
            (no activation is applied to the output layer)
    '''
    input_dim = 768
    hidden_dim = 512
    output_dim = 11

    def __init__(self):
        super().__init__()
        # Linear -> ReLU -> Dropout -> Linear, kept under the attribute `fc`
        layers = [
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(self.hidden_dim, self.output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        '''run `x` through the stack and return the output-layer values'''
        scores = self.fc(x)
        return scores
    
def prune_dataset(data_file, data_dir=None):
    '''
    read a tab-separated unsmile file and randomly drop about half of the rows
    of every category except 'clean'.

    data_file(str): file name of the tsv file inside `data_dir`
    data_dir(str, optional): directory holding the file; defaults to
        '<current working directory>/dataset/korean_unsmile_dataset'

    returns the pruned pandas dataframe (original index labels are kept).
    the selection uses numpy's global random state, so seed it for
    reproducible results.
    '''
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), 'dataset', 'korean_unsmile_dataset')
    df = pd.read_csv(os.path.join(data_dir, data_file), sep='\t')

    # every column after the first (the sentence column) is a category
    categories = df.columns.to_list()[1:]

    for category in categories:
        if category == 'clean':
            continue

        # index labels of the rows that (still) belong to this category
        indexes = df.index[df[category] == 1]

        # each of those rows is dropped with probability 0.5.
        # select by boolean mask: multiplying the mask with the labels would turn
        # every unselected entry into label 0 and drop row 0 by accident.
        mask = np.random.random(len(indexes)) > 0.5
        df = df.drop(indexes[mask])

    return df

def vectorize_dataset(df, vectorizer):
    '''
    transform '문장' column's elements from string to numpy array,
    and return the pandas dataframe. Pororo is used for sentence embedding.

    df(pandas.DataFrame): dataframe whose first column holds the raw sentences
    vectorizer(callable): called once per sentence; its return value is stored
        in place of the sentence

    returns a new dataframe with the same index and column order as `df`.
    `df` itself is left untouched, so the call can be repeated safely.
    '''
    sentence_col = df.columns.to_list()[0]

    vectors = [vectorizer(sentence) for sentence in df[sentence_col]]

    # reuse df's index: a fresh 0..n-1 index would misalign in the concat below
    # (and produce NaN rows) whenever df does not have the default index
    vectorized_sentences = pd.Series(vectors, index=df.index, name=sentence_col)
    labels = df.drop(columns=sentence_col) # remove a column with raw sentences
    return pd.concat([vectorized_sentences, labels], axis=1)

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    '''
    generator yielding (sentences, labels) batches that were moved to `device`.

    a DataLoader is built around `dataset` on every call:

    1. drop_last set to True: a trailing batch smaller than batch_size
    (dataset size not divisible by batch size) is not yielded

    2. shuffle set to True: the data is reshuffled each time a new generator
    is created, i.e. at every epoch
    '''

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

    for batch in loader:
        features, targets = batch
        yield features.to(device), targets.to(device)

def compute_metrics(y_pred, y_label, prev_precision, prev_recall=0.0):
    '''
    calculate precision and recall of batch-size data

    y_pred(torch.Tensor): raw model outputs; a label counts as predicted when
        sigmoid(output) > 0.5
    y_label(torch.Tensor): 0/1 labels with the same shape as y_pred
    prev_precision(float): returned as precision when nothing is predicted
        positive (precision is undefined for such a batch)
    prev_recall(float): returned as recall when the batch has no positive
        label (recall is undefined for such a batch); defaults to 0.0

    returns (precision, recall) computed over all labels of the batch
    '''

    y_label = y_label.cpu()
    y_pred = (torch.sigmoid(y_pred) > 0.5).cpu().int()

    true_positives = y_label[y_pred == 1].sum().item()
    predicted_positives = y_pred.sum().item()
    actual_positives = (y_label == 1).sum().item()

    # both denominators can be zero; fall back instead of raising ZeroDivisionError
    if predicted_positives == 0:
        precision = prev_precision
    else:
        precision = true_positives / predicted_positives

    if actual_positives == 0:
        recall = prev_recall
    else:
        recall = true_positives / actual_positives

    return precision, recall

def make_train_state(args):
    '''
    build the container that collects the per-epoch history.

    returns a dict with one fresh, empty list per tracked series
    (loss, precision and recall, each for train and test). `args` is accepted
    but not used.
    '''
    history_keys = (
        'train_loss', 'test_loss',
        'train_precision', 'test_precision',
        'train_recall', 'test_recall',
    )
    return {key: [] for key in history_keys}

def count(df):
    '''
    print how many hatred labels and how many clean labels the dataframe holds.

    df(pandas.DataFrame): first column is the sentence column, every other
    column is a 0/1 label column; the second-to-last column is taken as the
    clean column.
    '''
    # sum the label columns only: the sentence column is not a count
    label_totals = df.iloc[:, 1:].sum()

    # explicit positional lookup (integer keys on a label-indexed Series are deprecated)
    clean_data_num = label_totals.iloc[-2]
    hatred_data_num = label_totals.sum() - clean_data_num
    print(f'hatred data: {hatred_data_num}, clean data: {clean_data_num}')

In [3]:
# dataset directory (relative to the current working directory)
data_dir = f'{os.getcwd()}/dataset/korean_unsmile_dataset/'

# file names of the original train/validation splits
train_data_file, test_data_file = 'unsmile_train_v1.0.tsv', 'unsmile_valid_v1.0.tsv'

# file names of the cleaned train/validation splits
processed_train_data_file, processed_test_data_file = (
    'cleaned_unsmile_train_v1.0.tsv',
    'cleaned_unsmile_valid_v1.0.tsv',
)

# Pororo sentence-embedding model for Korean, used to vectorize the sentences
vectorizer = Pororo(task='sentence_embedding', lang='ko')

In [4]:
# read the original and the cleaned train/validation splits (tab-separated)
train_df, test_df, processed_train_df, processed_test_df = [
    pd.read_csv(data_dir + file_name, sep='\t')
    for file_name in (train_data_file, test_data_file,
                      processed_train_data_file, processed_test_data_file)
]

# sentences used to be vectorized on the fly, but embedding was far too slow
# (the original note reports about 150 sec), so every sentence is vectorized
# once here and the result is kept in memory.
(vectorized_train_df,
 vectorized_test_df,
 vectorized_processed_train_df,
 vectorized_processed_test_df) = [
    vectorize_dataset(frame, vectorizer)
    for frame in (train_df, test_df, processed_train_df, processed_test_df)
]

# original data
ud_train, ud_test = UnsmileDataset(vectorized_train_df), UnsmileDataset(vectorized_test_df)

# preprocessed data
ud_train1, ud_test1 = (UnsmileDataset(vectorized_processed_train_df),
                       UnsmileDataset(vectorized_processed_test_df))

In [79]:
# pick the datasets used for training and evaluation:
# (ud_train, ud_test) is the original data, (ud_train1, ud_test1) the cleaned data
train_data, test_data = ud_train, ud_test

In [111]:
# hyper-parameters and runtime settings
args = Namespace(
    batch_size=128,
    learning_rate=0.001,
    num_epochs=30,
    cuda=False,
    device='cpu'
)

# per-epoch history of loss / precision / recall for train and test
train_state = make_train_state(args)

# use the GPU when one is available
if torch.cuda.is_available():
    args.cuda = True
args.device = torch.device('cuda' if args.cuda else 'cpu')

# model
model = MultiLayerPerceptron()
model = model.to(args.device)

# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)

# one progress mark is printed every `progress_step` batches (about 20 marks per epoch).
# clamp to 1: with fewer than 20 batches per epoch the step would be 0 and the
# modulo in the training loop would raise ZeroDivisionError.
progress_step = max(1, (len(train_data) // args.batch_size) // 20)

# train starts
start_time = time.time()
print('<training start!>')
print(f'-learning rate: {args.learning_rate}')
print(f'-total epochs: {args.num_epochs}')
print(f'-batch size: {args.batch_size}')
print(f"-cuda {'available' if args.cuda else 'not available'}")

for epoch_index in range(args.num_epochs):
    print(f'epoch{epoch_index + 1} : [', end='')

    # ---- training pass ----
    running_loss = 0.0
    running_precision = 0.0
    running_recall = 0.0
    model.train() # this has effects on certain modules (ex, dropout)

    batch_generator = generate_batches(train_data, args.batch_size, device=args.device)

    for batch_index, (x, y) in enumerate(batch_generator):
        if batch_index % progress_step == 0:
            print('>', end='')

        # set all gradients to zero
        optimizer.zero_grad()

        # forward pass
        y_pred = model(x)
        loss = loss_func(y_pred, y.float())

        # backward pass
        loss.backward()
        optimizer.step()

        # metrics
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1) # moving average
        batch_precision, batch_recall = compute_metrics(y_pred, y, running_precision)
        running_precision += (batch_precision - running_precision) / (batch_index + 1)
        running_recall += (batch_recall - running_recall) / (batch_index + 1)

    print(']')
    train_state['train_loss'].append(running_loss)
    train_state['train_precision'].append(running_precision)
    train_state['train_recall'].append(running_recall)

    # ---- evaluation pass ----
    running_loss = 0.0
    running_precision = 0.0
    running_recall = 0.0
    model.eval() # disables dropout

    # shuffling is pointless for evaluation. drop_last keeps its default (True),
    # so a trailing partial batch of the test data is still skipped.
    batch_generator = generate_batches(test_data, args.batch_size, shuffle=False, device=args.device)

    # no parameter update happens here, so do not record autograd graphs
    with torch.no_grad():
        for batch_index, (x, y) in enumerate(batch_generator):
            # forward pass
            y_pred = model(x)
            loss = loss_func(y_pred, y.float())

            # metrics
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1) # moving average
            batch_precision, batch_recall = compute_metrics(y_pred, y, running_precision)
            running_precision += (batch_precision - running_precision) / (batch_index + 1)
            running_recall += (batch_recall - running_recall) / (batch_index + 1)

    train_state['test_loss'].append(running_loss)
    train_state['test_precision'].append(running_precision)
    train_state['test_recall'].append(running_recall)

print(f'time flied: {time.time() - start_time} sec')
print('<end training!>')

<training start!>
-learning rate: 0.001
-total epochs: 30
-batch size: 128
-cuda avaialble
epoch1 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch2 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch3 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch4 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch5 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch6 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch7 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch8 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch9 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch10 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch11 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch12 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch13 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch14 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch15 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch16 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch17 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch18 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch19 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch20 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch21 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch22 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch23 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch24 : [>>>>>>>>>>>>>>>>>>>>>>>>]
epoch25 : [>>>>>>>>>>>>>>>>>>>

In [112]:
from collections import defaultdict

# layout of the per-category containers: category index -> one value per iteration
#  precisions_per_categories = {0: [70, 71, ,,, 75],
#                               1: [50, 55, ,,, 54]}

# label column index -> category name shown in the report
itocol = {0: '여성/가족', 1: '남성', 2: '성소수자', 3: '인종/국적', 4: '연령',
          5: '지역', 6: '종교', 7: '기타 혐오', 8: '악플/욕설', 9: '깨끗', 10: '개인지칭'}

precisions_per_categories = defaultdict(list)
recalls_per_categories = defaultdict(list)
f1_scores_per_categories = defaultdict(list)

total_precisions = []
total_recalls = []
total_f1_scores = []

# number of evaluation passes whose metrics are averaged in the report.
# with dropout disabled (model.eval() below) every pass gives the same
# predictions, so this can be lowered to 1.
iterations = 5


def _safe_divide(numerator, denominator):
    '''return numerator / denominator, or 0.0 when the denominator is zero'''
    return float(numerator) / float(denominator) if denominator else 0.0


# make sure dropout is off: the model may still be in training mode,
# e.g. when the training cell was interrupted in the middle of an epoch
model.eval()

for _ in range(iterations):
    # evaluate every test sample exactly once, in order
    batch_generator = generate_batches(test_data, args.batch_size, shuffle=False,
                                       drop_last=False, device=args.device)

    y_preds = []
    y_labels = []

    # inference only: no autograd graphs needed
    with torch.no_grad():
        for x, y in batch_generator:
            y_pred = model(x)
            y_pred = (torch.sigmoid(y_pred) > 0.5).int()
            y_preds.append(y_pred.cpu().numpy())
            y_labels.append(y.cpu().numpy())

    # (number of samples, number of categories)
    y_preds = np.concatenate(y_preds)
    y_labels = np.concatenate(y_labels)

    # take the category count from the evaluated data itself
    col_num = y_labels.shape[1]

    # precision, recall and f1 score for each category
    # (a metric whose denominator is zero is reported as 0.0 instead of NaN)
    for col in range(col_num):
        pred_col = y_preds[:, col]
        label_col = y_labels[:, col]

        # labels that are both predicted and actually set
        true_positives = (label_col[pred_col == 1] == 1).sum()

        precision = _safe_divide(true_positives, pred_col.sum())
        recall = _safe_divide(true_positives, label_col.sum())
        f1_score = _safe_divide(2 * precision * recall, precision + recall)

        precisions_per_categories[col].append(precision)
        recalls_per_categories[col].append(recall)
        f1_scores_per_categories[col].append(f1_score)

    # precision, recall and f1 score for overall data
    true_positives = y_labels[y_preds == 1].sum()
    precision = _safe_divide(true_positives, y_preds.sum())
    recall = _safe_divide(true_positives, y_labels.sum())
    f1_score = _safe_divide(2 * precision * recall, precision + recall)

    total_precisions.append(precision)
    total_recalls.append(recall)
    total_f1_scores.append(f1_score)

In [113]:
# report: one line per category, then the overall numbers
# (every value is the average over the collected evaluation passes)
print('-Test data result of each category\ncategory      precision recall f1 score')
for i, category in itocol.items():
    avg_precision, avg_recall, avg_f1_score = [
        np.average(per_category[i])
        for per_category in (precisions_per_categories,
                             recalls_per_categories,
                             f1_scores_per_categories)
    ]
    print(f'{category:6}\t{avg_precision: .2f}\t{avg_recall: .2f}\t{avg_f1_score: .2f}')

# metrics computed over all categories together
print('\n-Overall metrics')
print(f'average precision: {np.average(total_precisions): .2f}')
print(f'average recall: {np.average(total_recalls): .2f}')
print(f'average f1_score: {np.average(total_f1_scores): .2f}')

-Test data result of each category
category      precision recall f1 score
여성/가족 	 0.74	 0.55	 0.63
남성    	 0.77	 0.56	 0.65
성소수자  	 0.89	 0.64	 0.74
인종/국적 	 0.76	 0.61	 0.68
연령    	 0.84	 0.42	 0.56
지역    	 0.84	 0.71	 0.77
종교    	 0.87	 0.78	 0.82
기타 혐오 	 0.75	 0.13	 0.23
악플/욕설 	 0.63	 0.38	 0.47
깨끗    	 0.73	 0.59	 0.65
개인지칭  	 0.67	 0.08	 0.14

-Overall metrics
average precision:  0.76
average recall:  0.54
average f1_score:  0.63
