In [2]:
import os
# Switch the working directory to the project's `src` folder before the other
# cells run (presumably so the project-local imports and the relative data
# paths used later resolve — confirm). The location can be overridden with the
# KORBERTSUM_SRC environment variable; the original path remains the default.
os.chdir(os.environ.get("KORBERTSUM_SRC", "/home/dongwon/korbertsum/src"))

In [11]:
import easydict
import argparse
import glob
import os
import random
import signal
import time
import numpy as np
import torch
from pytorch_pretrained_bert import BertConfig

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import Summarizer
from prepro.data_builder import format_to_dict
from tensorboardX import SummaryWriter
from models.reporter import ReportMgr
from models.stats import Statistics
from others.logging import logger
from others.logging import logger, init_logger
from models.trainer import build_trainer

# Parameters for BertSum.
# Keys listed in `model_flags` may be overwritten inside summary() by the
# values stored in the loaded checkpoint.
args = easydict.EasyDict(dict(
    # --- task / encoder selection ---
    encoder='classifier',
    mode='summary',

    # --- file-system locations ---
    bert_data_path='/home/dongwon/korbertsum/bert_data/korean',
    model_path='/home/dongwon/korbertsum/models/bert_classifier',
    bert_model='/home/dongwon/001_bert_morp_pytorch',
    result_path='/home/dongwon/korbertsum/results/korean',
    temp_dir='.',
    bert_config_path='/home/dongwon/001_bert_morp_pytorch/bert_config.json',

    # --- batching / input options ---
    batch_size=1000,
    use_interval=True,

    # --- summarization-layer sizes ---
    hidden_size=128,
    ff_size=512,
    heads=4,
    inter_layers=2,
    rnn_size=512,

    # --- initialisation / optimisation ---
    param_init=0,
    param_init_glorot=True,
    dropout=0.1,
    optim='adam',
    lr=2e-3,
    report_every=1,
    save_checkpoint_steps=5,

    # --- decoding / evaluation switches ---
    block_trigram=True,
    recall_eval=False,

    # --- device setup; summary() selects "cpu" when visible_gpus is '-1' ---
    accum_count=1,
    world_size=1,
    visible_gpus='-1',
    gpu_ranks='0',

    # --- logging / checkpoint ---
    log_file='/home/dongwon/korbertsum/logs/bert_classifier',
    # The fine-tuned model to use must be specified here separately.
    test_from='/home/dongwon/korbertsum/models/bert_classifier/model_step_40000.pt',
))

In [5]:
def summary(args, b_list, device_id, pt, step):
    """Summarize one pre-processed input with a saved BertSum checkpoint.

    Args:
        args: Attribute-style configuration object. Reads ``visible_gpus``,
            ``test_from``, ``bert_config_path`` and ``batch_size``.
            NOTE: it is mutated in place — every option stored in the
            checkpoint whose name appears in the module-level ``model_flags``
            overwrites the corresponding attribute on ``args``.
        b_list: The dataset object; it is handed to the data loader as the
            single item of a one-shot generator.
        device_id: Forwarded unchanged to ``build_trainer``.
        pt: Path of the checkpoint to load. When empty (``''``) or ``None``,
            ``args.test_from`` is used instead.
        step: Forwarded unchanged to ``trainer.summary``.

    Returns:
        Whatever ``trainer.summary(test_iter, step)`` returns.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    # Any falsy `pt` (empty string or None) falls back to the configured
    # checkpoint, so a missing argument no longer reaches torch.load as None.
    test_from = pt if pt else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    # map_location keeps every tensor on the storage it was deserialized to
    # (i.e. CPU), independent of the device it was saved from.
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

    # Copy the architecture-related options saved with the checkpoint onto
    # `args`, so the model built below uses the values stored in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, _lazy_dataset_loader(b_list),
                                       args.batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(args, device_id, model, None)
    return trainer.summary(test_iter, step)

# Option names that summary() copies from a checkpoint's saved options onto `args`.
model_flags = [
    'hidden_size',
    'ff_size',
    'heads',
    'inter_layers',
    'encoder',
    'ff_actv',
    'use_interval',
    'rnn_size',
]
def _lazy_dataset_loader(pt_file):
    dataset = pt_file
    yield dataset

In [8]:
# POS-tag the input data
import json
from kiwipiepy import Kiwi

# Read the crawled article. The context manager closes the file handle, and
# the explicit UTF-8 encoding avoids depending on the platform default.
with open("../crawled_data/result (16).json", encoding="utf-8") as crawled_file:
    crawled = json.load(crawled_file)["content"]

kiwi = Kiwi(num_workers = 6)

# NOTE(review): each item of `tokenize_list` is iterated as a sequence of
# tokens below, so `crawled` is presumably a list of sentence strings — confirm.
tokenize_list = list(kiwi.tokenize(crawled))

# One list per sentence, each token rendered as "<form>/<tag>".
postagged = [
    [morph.form + "/" + morph.tag for morph in sentence]
    for sentence in tokenize_list
]

# Convert into the BERT input format
news = format_to_dict(postagged, crawled)
print(news)

[{'src': [2, 7525, 1974, 9, 1, 1, 19, 261, 23, 677, 13, 208, 14, 908, 53, 1383, 1716, 19, 205, 1, 977, 19, 1412, 12, 8, 7, 3, 2, 596, 18, 1, 12, 74, 2259, 911, 1, 3555, 14, 18, 334, 318, 22, 2596, 386, 16, 70, 12, 8, 7, 3, 2, 1, 5865, 1, 1261, 28, 1282, 9, 464, 57, 526, 145, 1, 21, 754, 24, 135, 14, 2862, 498, 12, 94, 1, 432, 318, 526, 7, 1, 3, 2, 1, 1054, 4343, 3938, 9, 1, 195, 208, 19, 908, 23, 1282, 9, 12, 8, 7, 3, 2, 2435, 15, 265, 734, 16, 1267, 4287, 12, 74, 410, 464, 91, 298, 2275, 1, 14, 2413, 9, 1, 55, 38, 12, 8, 7, 3, 2, 5166, 1, 14, 3119, 19, 40, 23, 1, 1, 94, 677, 13, 1, 15, 104, 1, 1, 1, 386, 14, 2413, 9, 12, 8, 7, 3, 2, 1, 3249, 2722, 11, 432, 1120, 22, 452, 15, 1, 17, 231, 24, 2413, 9, 12, 93, 110, 4249, 54, 87, 364, 24, 816, 20, 37, 12, 23, 231, 24, 122, 364, 14, 2322, 4349, 1036, 1, 20, 139, 36, 16, 2722, 11, 1120, 49, 224, 9, 12, 8, 7, 3, 2, 1, 1, 17, 231, 1, 2083, 16, 204, 12, 526, 7, 1, 3, 2, 1, 1054, 47, 461, 145, 1, 754, 24, 1, 4754, 15, 265, 1054, 1, 1953, 524, 1

In [13]:
# Summarize the prepared input. An empty `pt` makes summary() load the
# checkpoint configured in args.test_from; only the first element of the
# returned result is kept.
summary_result = summary(args, b_list=news, device_id=0, pt="", step=None)[0]
# Blank line first, then the summary text.
print("", summary_result, sep="\n")

{'encoder': 'transformer', 'mode': 'summary', 'bert_data_path': '/home/dongwon/korbertsum/bert_data/korean', 'model_path': '/home/dongwon/korbertsum/models/bert_classifier', 'bert_model': '/home/dongwon/001_bert_morp_pytorch', 'result_path': '/home/dongwon/korbertsum/results/korean', 'temp_dir': '.', 'bert_config_path': '/home/dongwon/001_bert_morp_pytorch/bert_config.json', 'batch_size': 1000, 'use_interval': True, 'hidden_size': 128, 'ff_size': 512, 'heads': 4, 'inter_layers': 2, 'rnn_size': 512, 'param_init': 0, 'param_init_glorot': True, 'dropout': 0.1, 'optim': 'adam', 'lr': 0.002, 'report_every': 1, 'save_checkpoint_steps': 5, 'block_trigram': True, 'recall_eval': False, 'accum_count': 1, 'world_size': 1, 'visible_gpus': '-1', 'gpu_ranks': '0', 'log_file': '/home/dongwon/korbertsum/logs/bert_classifier', 'test_from': '/home/dongwon/korbertsum/models/bert_classifier/model_step_40000.pt'}
gpu_rank 0
고기를 먹은 다음 단 음식이라니, 이 사람이 뭘 좀 아는군 싶어서 마음에 들었다.<q>더위를 가시게 해줄 시원한 하이볼 2잔을 주문하고, 화로에 소고