In [1]:
# built-in module
import sys
import os
import pickle
import random

# 3rd-party module
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import sklearn

# self-made module
sys.path.append(os.path.abspath(os.path.join('..')))
import util

In [2]:
# checkpoint selection: which experiment run and which epoch to evaluate
experiment_no, epoch = 7, 21

# directory holding the artifacts of the selected experiment
model_path = f'../model_v1.1/{experiment_no}'

In [3]:
# restore the config, tokenizer and embedding stored under `model_path`
config = util.config.BaseConfig().load(f'{model_path}/config.json')

# the tokenizer is built from the loaded config, then its saved state is read
tokenizer = util.tokenizer.WordPunctTokenizer(config)
tokenizer.load(f'{model_path}/tokenizer.pickle')

# NOTE(review): the .pickle files are presumably unpickled inside util --
# only point `model_path` at trusted artifacts
embedding = util.embedding.BaseEmbedding()
embedding.load(f'{model_path}/embedding.pickle')

In [4]:
# select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU (no random seed is set in this cell)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
# load data
# 'test' is passed straight to the project loader; presumably it selects the
# SemEval-2016 test split -- confirm against util.data
data_df = util.data.load_dataset_semeval2016_origin('test')

In [6]:
# preprocessed text -> tokens and token ids (columns are added in this order)
data_df['claim_tokenize'] = tokenizer.tokenize(
    data_df['claim_pre'].tolist())
data_df['target_encode'] = tokenizer.encode(
    data_df['target_pre'].tolist())
data_df['claim_encode'] = tokenizer.encode(
    data_df['claim_pre'].tolist())

# token ids -> tokens, computed from the encoded claim column
data_df['claim_decode'] = tokenizer.convert_ids_to_tokens(
    data_df['claim_encode'].tolist())

In [7]:
# stance label -> integer class id; a label missing from the table raises
# KeyError
stance_label = {'FAVOR': 0, 'AGAINST': 1, 'NONE': 2}
data_df['label_encode'] = data_df['label'].apply(stance_label.__getitem__)

In [8]:
# derive a lexicon vector for every claim from its token ids
data_df['claim_lexicon'] = tokenizer.encode_to_lexicon(
    data_df['claim_encode'].tolist())

In [9]:
# gather the encoded columns the dataset consumes, keyed by argument name
encoded_columns = {
    column: data_df[column]
    for column in ('target_encode', 'claim_encode',
                   'claim_lexicon', 'label_encode')
}
dataset = util.data.SingleTaskDataset(task_id=0, **encoded_columns)

# shuffle=False keeps batches in dataframe row order
dataloader = DataLoader(
    dataset=dataset,
    batch_size=config.batch_size,
    shuffle=False,
    collate_fn=util.data.SingleTaskDataset.collate_fn,
)

In [10]:
# define evaluate function
def evaluate(model, batch_iterator, device=None):
    """Run `model` over every batch and collect labels, predictions and weights.

    Args:
        model: torch module called as `model(task_id, x1, x2)` that returns
            `(pred_y, (task_weight, shared_weight))`.
        batch_iterator: iterable yielding `(task_id, x1, x2, lexicon, y)`
            batches. `lexicon` is part of the batch layout but is not passed
            to the model here.
        device: device the model inputs are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not depend on a
            notebook-level `device` variable.

    Returns:
        Tuple `(all_label_y, all_pred_y, all_task_weight, all_shared_weight)`
        of flat Python lists, each extended batch by batch in iteration order.
        `all_pred_y` holds the argmax over dimension 1 of `pred_y`.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = (first_param.device if first_param is not None
                  else torch.device('cpu'))

    all_label_y, all_pred_y = [], []
    all_task_weight, all_shared_weight = [], []

    model.eval()
    with torch.no_grad():
        # `lexicon` is unpacked only to match the batch layout; it is not
        # consumed by the forward pass, so it is not copied to the device
        for task_id, x1, x2, _lexicon, y in batch_iterator:
            x1 = x1.to(device)
            x2 = x2.to(device)

            # fed into model
            pred_y, (task_weight, shared_weight) = model(task_id, x1, x2)

            all_label_y.extend(y.tolist())
            all_pred_y.extend(torch.argmax(pred_y, dim=1).cpu().tolist())
            all_task_weight.extend(task_weight.tolist())
            all_shared_weight.extend(shared_weight.tolist())

    return (all_label_y, all_pred_y,
            all_task_weight, all_shared_weight)

In [11]:
# load model
model = util.model.BaseModel(config=config,
                             num_embeddings=embedding.get_num_embeddings(),
                             padding_idx=tokenizer.pad_token_id,
                             embedding_weight=embedding.vector)

# map_location remaps the stored tensors onto the device selected for this
# run, so a checkpoint saved on a GPU can still be opened on a CPU-only machine
# NOTE: torch.load unpickles the file -- only load trusted checkpoints
model.load_state_dict(
    torch.load(f'{model_path}/model_{epoch}.ckpt', map_location=device))
model = model.to(device)

In [12]:
# run the loaded model over every batch of the dataloader
(label, pred_label,
 task_weight, shared_weight) = evaluate(model, dataloader)

In [13]:
# attach the predicted labels and both weight lists to the dataframe
for column, values in (('label_pred', pred_label),
                       ('task_weight', task_weight),
                       ('shared_weight', shared_weight)):
    data_df[column] = values

In [14]:
# write out to csv
data_path = '../data/attn_weight'

# exist_ok=True creates the directory only when missing, without a separate
# existence check that could race with another process
os.makedirs(data_path, exist_ok=True)

data_df.to_csv(f'{data_path}/v1.1_test_weight.csv', index=False)