In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from argparse import Namespace
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from pororo import Pororo


class UnsmileDataset(Dataset):
    '''
    Map-style dataset over a vectorized unsmile dataframe.

    unsmile_df(pandas.DataFrame): unsmile dataset with vectorized sentence('문장').
    Columns are read by position: column 0 is the sample id, column 2 is the
    sentence vector and every column from 3 onwards is a label value.
    Column 1 is not read by this class.
    '''
    def __init__(self, unsmile_df):
        self.unsmile_df = unsmile_df

    def __len__(self):
        return self.unsmile_df.shape[0]

    def __getitem__(self, index):
        '''
        Return the sample at positional `index` as a dict with keys
        'id', 'x' (sentence vector) and 'y' (int32 label vector).
        '''
        # Materialize the row once; every field below is read from it.
        row = self.unsmile_df.iloc[index]

        # The row is indexed by column *names*, so positional access must go
        # through .iloc (integer keys in row[...] are deprecated in pandas 2.x).
        sentence_id = row.iloc[0]
        sentence_vector = row.iloc[2]
        label_vector = row.iloc[3:].to_numpy(dtype=np.int32) # note dtype

        return {'id': sentence_id, 'x': sentence_vector, 'y': label_vector}
    
class MultiLayerPerceptron(nn.Module):
    '''
    Two-layer feed-forward classifier.

    input: 768 dimension sentence vector transformed by Pororo sentence embedding
    output: 11 dimension vector which contains values for '여성/가족', .... , '개인지칭'

    The network ends with a plain linear layer, so the returned values are
    unnormalized scores; no sigmoid is applied inside the model.
    '''
    input_dim = 768
    hidden_dim = 512
    output_dim = 11

    def __init__(self):
        super().__init__()
        # Layer order is significant: it determines the parameter names
        # (fc.0.*, fc.3.*) stored in the state dict.
        layers = [
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(self.hidden_dim, self.output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.fc(x)
        return scores
    
def prune_dataset(data_file, data_dir=None):
    '''
    Load a tab-separated dataset file and randomly thin out its non-clean rows.

    For every column after the first one, except 'clean', each row whose value
    in that column is 1 is dropped with probability 0.5 (decided by
    np.random.random, so seed numpy for reproducible results). Rows that carry
    several such labels get one independent draw per label they still survive to.

    data_file(str): file name inside data_dir
    data_dir(str, optional): directory holding the file; defaults to
        '<cwd>/dataset/korean_unsmile_dataset/'
    returns: the pruned pandas.DataFrame (original row labels are kept)
    '''
    if data_dir is None:
        data_dir = os.getcwd() + '/dataset/korean_unsmile_dataset/'
    df = pd.read_csv(os.path.join(data_dir, data_file), sep='\t')

    # assumes the first column holds the raw sentence and the rest are labels
    categories = df.columns.to_list()[1:]

    for category in categories:
        if category == 'clean':
            continue

        positive_rows = df.index[df[category] == 1].to_numpy()
        drop_mask = np.random.random(len(positive_rows)) > 0.5

        # Select the labels to drop with the boolean mask. Multiplying the mask
        # by the labels would turn every unselected entry into label 0 and
        # therefore always drop row 0.
        df = df.drop(positive_rows[drop_mask])

    return df

def vectorize_dataset(df, vectorizer, sentence_col_idx=2):
    '''
    transform '문장' column's elements from string to numpy array,
    and return the pandas dataframe. Pororo is used for sentence embedding.

    df(pandas.DataFrame): frame whose column at position sentence_col_idx holds
        the raw sentences. It is modified in place; pass a copy to keep the original.
    vectorizer(callable): called once per sentence; its return value is stored as-is
    sentence_col_idx(int): position of the sentence column (default 2)
    returns: the same dataframe object, with the sentence column replaced
    '''
    sentence_col_name = df.columns.to_list()[sentence_col_idx]

    vectors = [vectorizer(sentence) for sentence in df[sentence_col_name]]

    # Reuse the frame's own index so each vector lands on its source row.
    # A fresh 0..n-1 index would be aligned by label and leave NaN wherever
    # the frame's index is not a plain RangeIndex.
    # dtype=object keeps each vector as a single cell value.
    df[sentence_col_name] = pd.Series(vectors, index=df.index, dtype=object)
    return df

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    '''
    Generator over batches of `dataset`.

    Every batch is a dict with the same keys as a single sample; each value is
    moved to `device` before the batch is yielded.

    - drop_last=True: when the dataset size is not a multiple of batch_size,
      the final, smaller batch is skipped
    - shuffle=True: samples are drawn in a new random order on every call
    '''
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {key: value.to(device) for key, value in batch.items()}

def compute_metrics(y_pred, y_label, prev_precision, prev_recall=0.0):
    '''
    calculate precision and recall of batch-size data

    y_pred(torch.Tensor): raw model scores; sigmoid is applied here and a
        value above 0.5 counts as a positive prediction
    y_label(torch.Tensor): 0/1 labels with the same shape as y_pred
    prev_precision(float): returned as precision when nothing is predicted positive
    prev_recall(float): returned as recall when the batch has no positive label
    returns: (precision, recall), both computed over all elements of the batch
    '''

    y_label = y_label.cpu()
    y_pred = (torch.sigmoid(y_pred) > 0.5).cpu().int()

    predicted_positives = y_pred.sum().item()
    actual_positives = (y_label == 1).sum().item()
    # labels are 0/1, so summing the labels at predicted-positive positions
    # counts the true positives
    true_positives = y_label[y_pred == 1].sum().item()

    if predicted_positives == 0:
        precision = prev_precision
    else:
        precision = true_positives / predicted_positives

    # Guard the denominator: a batch without any positive label would
    # otherwise raise ZeroDivisionError.
    if actual_positives == 0:
        recall = prev_recall
    else:
        recall = true_positives / actual_positives

    return precision, recall

def make_train_state(args):
    '''
    Build an empty metric history: one fresh list per '<split>_<metric>' key,
    for the train/test splits and the loss/precision/recall metrics.
    `args` is accepted but not used.
    '''
    metrics = ('loss', 'precision', 'recall')
    splits = ('train', 'test')
    return {f'{split}_{metric}': [] for metric in metrics for split in splits}

def count(df):
    '''
    Print how many labels in `df` are 'clean' and how many are not.

    The first column is skipped (assumed to hold the raw sentence) and the
    second-to-last column is taken as the clean label. The hatred figure is the
    total of all remaining label values, so a row with several labels is counted
    once per label.
    '''
    # Sum only the label columns; summing the whole frame would also
    # concatenate every string of the first column.
    label_totals = df.iloc[:, 1:].sum()

    # .iloc makes the positional lookup explicit; integer keys on a
    # label-indexed Series are deprecated in pandas 2.x.
    clean_data_num = label_totals.iloc[-2]
    hatred_data_num = label_totals.sum() - clean_data_num
    print(f'hatred data: {hatred_data_num}, clean data: {clean_data_num}')

In [None]:
# Sentence-embedding model used to turn each raw sentence into a vector.
vectorizer = Pororo(task='sentence_embedding', lang='ko')

# Dataset location. data_dir keeps its trailing slash because file names are
# appended to it by plain string concatenation.
data_dir = f'{os.getcwd()}/dataset/korean_unsmile_dataset/'
train_data_file = 'unsmile_train_v1.0.tsv'
test_data_file = 'unsmile_valid_v1.0.tsv'

In [None]:
# Show full cell contents and every row when a dataframe is displayed.
for option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, None)

# Read both tab-separated splits from data_dir.
train_df, test_df = (
    pd.read_csv(f'{data_dir}{file_name}', sep='\t')
    for file_name in (train_data_file, test_data_file)
)
print(train_df.shape[0], test_df.shape[0])

In [None]:
# Stack both splits into one frame with a fresh 0..n-1 index, then check that
# no row was lost or duplicated.
merged_df = pd.concat((train_df, test_df), ignore_index=True)
print(merged_df.shape[0])
print(merged_df.shape[0] == train_df.shape[0] + test_df.shape[0])
merged_df.tail()

In [None]:
# Remove missing values: collect the positions of rows whose label columns
# (every column after the first) sum to zero, i.e. rows with no label at all.
label_totals = merged_df.iloc[:, 1:].sum(axis=1)
indexes = [position for position, total in enumerate(label_totals) if total == 0]
merged_df.iloc[indexes]

In [None]:
# Drop every row that carries no label (all columns after the first sum to 0)
# and renumber the remaining rows.
# The mask is recomputed from the current frame instead of reusing a list of
# row labels collected earlier. After reset_index those labels point at
# different rows, so re-running the cell would delete valid data. With the
# mask, a second run is a no-op.
unlabeled_mask = merged_df.iloc[:, 1:].sum(axis=1) == 0
merged_df = merged_df.loc[~unlabeled_mask].reset_index(drop=True)
print(len(merged_df))
merged_df.tail()

In [None]:
# shuffle, re-index and partition
# A fixed seed makes the row order, and therefore the partition each row is
# assigned to afterwards, identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffled_df = merged_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
shuffled_df.tail()

In [None]:
n = 3000  # rows per partition
num_rows = len(shuffled_df)

# 'id' is the row position. 'partition' grows by one every n rows and is capped
# at 5, so the last partition also takes whatever rows remain past 6 * n.
ids = pd.Series(list(range(num_rows)), name='id')
partitions = pd.Series([min(row // n, 5) for row in range(num_rows)], name='partition')

# Prepend both columns: resulting layout is id, partition, then the original columns.
shuffled_and_partitioned_df = pd.concat([ids, partitions, shuffled_df], axis=1)
shuffled_and_partitioned_df.tail()

In [None]:
# Embed every sentence. A copy is passed in so that the frame with the raw
# sentences stays available for later inspection.
vectorized_df = vectorize_dataset(
    df=shuffled_and_partitioned_df.copy(),
    vectorizer=vectorizer,
)
vectorized_df.tail(5)

In [None]:
# Display the tail of the frame that still holds the raw sentences
# (a copy of it was handed to vectorize_dataset).
shuffled_and_partitioned_df.tail()

In [None]:
# Training hyper-parameters. `cuda` records whether a GPU is available and
# `device` is the matching torch.device.
args = Namespace(
    batch_size=128,
    learning_rate=0.0005,
    num_epochs=30,
    cuda=torch.cuda.is_available(),
    device='cpu'
)
args.device = torch.device('cuda' if args.cuda else 'cpu')

In [None]:
K = 6 # number of partitions
partitions = list(range(K))
threshold = 0.9 # minimum sigmoid score for a prediction to be collected below

# category column index -> ids of held-out samples that the model trained on
# the other partitions scores above `threshold` for that category
high_indexes = defaultdict(list)


def _run_epoch(model, dataset, loss_func, optimizer=None, show_progress=False):
    '''
    Run one pass over `dataset` and return (loss, precision, recall), each the
    running mean of the per-batch values.

    With an optimizer the pass trains the model: batches are shuffled, the last
    partial batch is dropped and the parameters are updated after every batch.
    Without one the pass only evaluates: dropout is disabled, no gradients are
    tracked, and every sample is visited in order (the last partial batch is
    included, and weighs as much as a full batch in the running mean).
    '''
    is_training = optimizer is not None
    model.train(is_training) # this has effects on certain modules (ex, dropout)

    batch_generator = generate_batches(dataset, args.batch_size, shuffle=is_training,
                                       drop_last=is_training, device=args.device)

    # Print roughly 20 progress marks per epoch. The step is clamped to 1 so
    # that datasets with fewer than 20 batches do not cause a modulo by zero.
    progress_step = max(1, (len(dataset) // args.batch_size) // 20)

    running_loss = 0.0
    running_precision = 0.0
    running_recall = 0.0

    with torch.set_grad_enabled(is_training):
        for batch_index, batch_dict in enumerate(batch_generator):
            if show_progress and batch_index % progress_step == 0:
                print('>', end='')

            x = batch_dict['x']
            y = batch_dict['y']

            # forward pass
            y_pred = model(x)
            loss = loss_func(y_pred, y.float())

            if is_training:
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # metrics (incremental mean over the batches seen so far)
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            batch_precision, batch_recall = compute_metrics(y_pred, y, running_precision)
            running_precision += (batch_precision - running_precision) / (batch_index + 1)
            running_recall += (batch_recall - running_recall) / (batch_index + 1)

    return running_loss, running_precision, running_recall


for fold in range(K):
    # prepare data: partition `fold` is held out, all others are used for training
    test_partition = partitions[fold: fold + 1]
    train_partition = partitions[:fold] + partitions[fold + 1:]

    test_df = vectorized_df.loc[vectorized_df['partition'].isin(test_partition)]
    train_df = vectorized_df.loc[vectorized_df['partition'].isin(train_partition)]
    train_data = UnsmileDataset(train_df)
    test_data = UnsmileDataset(test_df)

    train_state = make_train_state(args)

    # model (a fresh one per fold)
    model = MultiLayerPerceptron().to(args.device)

    # loss and optimizer
    loss_func = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # train starts
    start_time = time.time()
    print('\n<training start!>')
    print(f'-learning rate: {args.learning_rate}')
    print(f'-total epochs: {args.num_epochs}')
    print(f'-batch size: {args.batch_size}')
    print(f"-cuda {'avaialble' if args.cuda else 'not available'}")

    for epoch_index in range(args.num_epochs):
        print(f'epoch{epoch_index + 1} : [', end='')
        train_loss, train_precision, train_recall = _run_epoch(
            model, train_data, loss_func, optimizer, show_progress=True)
        print(']')
        train_state['train_loss'].append(train_loss)
        train_state['train_precision'].append(train_precision)
        train_state['train_recall'].append(train_recall)

        test_loss, test_precision, test_recall = _run_epoch(model, test_data, loss_func)
        train_state['test_loss'].append(test_loss)
        train_state['test_precision'].append(test_precision)
        train_state['test_recall'].append(test_recall)

    print(f'time flied: {time.time() - start_time} sec')
    print('<end training!>')

    # data re-labeling using model results: record, per category, the ids of
    # held-out samples scored above `threshold`
    model.eval() # make sure dropout is off even when num_epochs is 0
    batch_generator = generate_batches(test_data, args.batch_size, shuffle=False,
                                       drop_last=False, device=args.device)
    with torch.no_grad():
        for batch_dict in batch_generator:
            confident = (torch.sigmoid(model(batch_dict['x'])) > threshold).cpu().numpy()
            sample_ids = batch_dict['id'].cpu().numpy()

            for col_idx in range(MultiLayerPerceptron.output_dim):
                high_indexes[col_idx].extend(sample_ids[confident[:, col_idx]].tolist())

In [None]:
# Position of each model output -> name of the label column it corresponds to.
itocol = dict(enumerate([
    '여성/가족',
    '남성',
    '성소수자',
    '인종/국적',
    '연령',
    '지역',
    '종교',
    '기타 혐오',
    '악플/욕설',
    'clean',
    '개인지칭',
]))

In [None]:
def find_label_disagreements(category_idx):
    '''
    Return the rows whose id was collected in high_indexes[category_idx]
    although their annotated value for that category is 0.

    category_idx(int): key of itocol / high_indexes
    returns: pandas.DataFrame slice of shuffled_and_partitioned_df (raw sentences)
    '''
    category = itocol[category_idx]
    # the collected ids equal the row labels of shuffled_and_partitioned_df
    flagged = shuffled_and_partitioned_df.loc[high_indexes[category_idx]]
    return flagged.loc[flagged[category] == 0]


# One report per category, replacing eleven copy-pasted cells: name, number of
# disagreeing rows, then the rows themselves.
for category_idx in sorted(itocol):
    suspects = find_label_disagreements(category_idx)
    print(f'category: {itocol[category_idx]}')
    print(len(suspects))
    display(suspects)