In [19]:
# built-in module
import os
import pickle
import random

# 3rd-party module
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorboard

# self-made module
import util

In [20]:
# ---------------------------------------------------------------------------
# experiment configuration
# every tunable value of this run is collected in one BaseConfig object
# ---------------------------------------------------------------------------
experiment_no = 30
config = util.config.BaseConfig(
    # experiment no
    experiment_no=experiment_no,
    # preprocess
    tokenizer='WordPunctTokenizer',
    filter='punctonly',
    min_count=5,
    max_seq_len=20,
    # dataset and lexicon
    stance_dataset='semeval2016',
    embedding_file='glove/glove.twitter.27B.200d.txt',
    lexicon_file='emolex_emotion',
    # hyperparameter
    embedding_dim=200,
    task_hidden_dim=100,
    shared_hidden_dim=100,
    num_rnn_layers=1,
    num_linear_layers=1,
    attention='dot',
    dropout=0.2,
    learning_rate=1e-4,
    clip_grad_value=0,
    weight_decay=0,
    lr_decay_step=10,
    lr_decay=1,
    nli_loss_weight=1.0,
    lexicon_loss_weight=0,
    random_seed=77,
    kfold=5,
    train_test_split=0.15,
    epoch=50,
    batch_size=32,
)

# ---------------------------------------------------------------------------
# reproducibility
# seed the Python, NumPy and PyTorch generators from the same config value
# ---------------------------------------------------------------------------
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(config.random_seed)
random.seed(config.random_seed)
np.random.seed(config.random_seed)
torch.manual_seed(config.random_seed)

# ---------------------------------------------------------------------------
# device selection
# use the first GPU when CUDA is available, otherwise stay on the CPU
# ---------------------------------------------------------------------------
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    # only the current CUDA device is seeded here (manual_seed, not
    # manual_seed_all)
    torch.cuda.manual_seed(config.random_seed)
    # ask cuDNN for deterministic kernels and switch off its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [21]:
# load data
# The stance dataset and its label vocabulary are selected together so the
# two can never disagree; an unsupported name fails here with a clear message
# instead of a NameError further down the cell.
if config.stance_dataset == 'semeval2016':
    stance_data_df = util.data.load_dataset('semeval2016_train')
    stance_label = {'favor': 0, 'against': 1, 'none': 2}
elif config.stance_dataset == 'fnc-1':
    stance_data_df = util.data.load_dataset('fnc_train')
    stance_label = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
else:
    raise ValueError(
        f'unsupported stance_dataset: {config.stance_dataset!r}')

nli_data_df = util.data.load_dataset('mnli_train')
nli_label = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

# initialize tokenizer
if config.tokenizer == 'BaseTokenizer':
    tokenizer = util.tokenizer.BaseTokenizer(config)
elif config.tokenizer == 'WordPunctTokenizer':
    tokenizer = util.tokenizer.WordPunctTokenizer(config)
else:
    raise ValueError(f'unsupported tokenizer: {config.tokenizer!r}')

# initialize embedding
# only file names containing 'glove' or 'fasttext' are handled
if any(name in config.embedding_file for name in ('glove', 'fasttext')):
    embedding = util.embedding.BaseEmbedding(embedding_dim=config.embedding_dim)
else:
    raise ValueError(
        f'unsupported embedding_file: {config.embedding_file!r}')

# get all tokens and embeddings
# unique target / claim sentences of both corpora are gathered so that the
# embedding loader is given every token seen in either dataset
all_sentence = []
all_sentence.extend(stance_data_df['target'].drop_duplicates().tolist())
all_sentence.extend(stance_data_df['claim'].drop_duplicates().tolist())
all_sentence.extend(nli_data_df['target'].drop_duplicates().tolist())
all_sentence.extend(nli_data_df['claim'].drop_duplicates().tolist())

all_tokens = tokenizer.get_all_tokens(all_sentence)
embedding.load_embedding(embedding_path=f'data/embedding/{config.embedding_file}',
                         tokens=all_tokens)

# build vocabulary dictionary
tokenizer.build_dict(embedding.word_dict)

# content encode to id
print('content encode --')
stance_data_df['target_encode'] = \
    tokenizer.encode(stance_data_df['target'].tolist())
stance_data_df['claim_encode'] = \
    tokenizer.encode(stance_data_df['claim'].tolist())
nli_data_df['target_encode'] = \
    tokenizer.encode(nli_data_df['target'].tolist())
nli_data_df['claim_encode'] = \
    tokenizer.encode(nli_data_df['claim'].tolist())

# label encode
# a label missing from the mapping raises KeyError rather than silently
# producing NaN
print('label encode --')
stance_data_df['label_encode'] = stance_data_df['label'].apply(
    lambda label: stance_label[label])
nli_data_df['label_encode'] = nli_data_df['label'].apply(
    lambda label: nli_label[label])

# load lexicon
lexicon = util.data.load_lexicon(lexicon=config.lexicon_file)

# content encode to lexicon vector
# the already id-encoded claims are passed to the tokenizer together with
# the lexicon
print('lexicon encode --')
stance_data_df['claim_lexicon'] = \
    tokenizer.encode_to_lexicon(stance_data_df['claim_encode'].tolist(), lexicon)
nli_data_df['claim_lexicon'] = \
    tokenizer.encode_to_lexicon(nli_data_df['claim_encode'].tolist(), lexicon)

loading SemEval2016 training data: 100%|██████████| 2814/2814 [00:00<00:00, 1300580.88it/s]
loading MultiNLI training data: 100%|██████████| 392702/392702 [00:01<00:00, 318210.61it/s]
get all tokens: 100%|██████████| 522232/522232 [00:01<00:00, 398041.95it/s]
load embedding: 100%|██████████| 1193514/1193514 [00:12<00:00, 94246.23it/s] 
content encode --
loading EmoLex lexicon data: 100%|██████████| 141820/141820 [00:00<00:00, 1991743.52it/s]label encode --
lexicon encode --



In [40]:
def print_format_string(input_list, width=20):
    """Print every item of ``input_list`` as a left-aligned, fixed-width column.

    All items are written on the same line and no trailing newline is
    emitted, so the caller has to call ``print()`` to end the row.

    Args:
        input_list: iterable of objects; each one is converted with ``str``.
        width: minimum column width in characters (default 20). Items longer
            than ``width`` are printed in full and are not truncated.
    """
    for item in input_list:
        # pad on the right so consecutive rows line up column by column
        print(f'{str(item):<{width}}', end='')

In [41]:
# Preview the first ten stance samples. For each one, print the raw claim,
# then the tokens decoded from its id encoding, then its lexicon vector,
# each on its own row.
temp_df = stance_data_df.iloc[:10]

for _, line in temp_df.iterrows():
    print(line['claim'])
    # the encoded claim is wrapped in a list and the first result is taken
    # (convert_ids_to_tokens presumably works on a batch -- TODO confirm)
    print_format_string(tokenizer.convert_ids_to_tokens([line['claim_encode']])[0])
    print()
    print_format_string(line['claim_lexicon'])
    # print_format_string does not emit a newline; end the lexicon row here
    # so the next claim does not get appended to it
    print()

dear lord thank u for all of ur blessings forgive my sins lord give me strength and energy for this busy day ahead #blessed #hope #semst
[bos]               dear                lord                thank               u                   for                 all                 of                  ur                  blessings           forgive             my                  sins                lord                give                me                  strength            and                 energy              [eos]               
0                   0                   1                   0                   0                   0                   0                   0                   0                   1                   0                   0                   1                   1                   0                   0                   1                   0                   0                   0                   blessed are the peacemakers, for they shall be called children