In [1]:
import os
# Make the project's src directory the working directory.
# Presumably the project-local imports in the next cell (distributed, models,
# others) and the relative data path used later depend on this — TODO confirm.
# NOTE(review): this is an absolute, machine-specific path.
os.chdir("/home/dongwon/korbertsum/src")

In [14]:
import easydict
import argparse
import glob
import os
import random
import signal
import time
import numpy as np
import torch
from pytorch_pretrained_bert import BertConfig

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import Summarizer
from tensorboardX import SummaryWriter
from models.reporter import ReportMgr
from models.stats import Statistics
from others.logging import logger
from others.logging import logger, init_logger
from models.trainer import build_trainer

# Run options, wrapped in an EasyDict so they can be read as attributes
# (e.g. args.test_from, args.visible_gpus).
args = easydict.EasyDict(dict(
    # Encoder choice and run mode.
    encoder='classifier',
    mode='summary',

    # Filesystem locations.
    bert_data_path='/home/dongwon/korbertsum/bert_data/korean',
    model_path='/home/dongwon/korbertsum/models/bert_classifier',
    bert_model='/home/dongwon/001_bert_morp_pytorch',
    result_path='/home/dongwon/korbertsum/results/korean',
    temp_dir='.',
    bert_config_path='/home/dongwon/001_bert_morp_pytorch/bert_config.json',

    # Batch size handed to the data loader.
    batch_size=1000,

    # Model settings; summary() overwrites those named in model_flags
    # with the values stored in the loaded checkpoint.
    use_interval=True,
    hidden_size=128,
    ff_size=512,
    heads=4,
    inter_layers=2,
    rnn_size=512,

    # Initialisation / optimisation settings.
    param_init=0,
    param_init_glorot=True,
    dropout=0.1,
    optim='adam',
    lr=2e-3,

    # Reporting, checkpointing and evaluation switches.
    report_every=1,
    save_checkpoint_steps=5,
    block_trigram=True,
    recall_eval=False,

    # Device settings; visible_gpus == '-1' makes summary() pick the CPU.
    accum_count=1,
    world_size=1,
    visible_gpus='-1',
    gpu_ranks='0',

    # Log file and the default checkpoint used when summary() gets an empty pt.
    log_file='/home/dongwon/korbertsum/logs/bert_classifier',
    test_from='/home/dongwon/korbertsum/models/bert_classifier/model_step_1000.pt',
))

In [15]:
def summary(args, b_list, device_id, pt, step):
    """Run a checkpointed Summarizer over a single in-memory dataset.

    Args:
        args: Option object read through attribute access. It is mutated in
            place: every option saved in the checkpoint whose name appears in
            ``model_flags`` overwrites the attribute of the same name.
        b_list: Dataset object handed to the data loader as its only dataset.
        device_id: Forwarded unchanged to ``build_trainer``.
        pt: Path of the checkpoint to load. When empty (``''`` or ``None``)
            ``args.test_from`` is used instead.
        step: Forwarded unchanged to ``trainer.summary``.

    Returns:
        Whatever ``trainer.summary(test_iter, step)`` returns.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    # Treat every empty value (not only '') as "no explicit checkpoint", so a
    # caller passing None gets the configured default rather than torch.load(None).
    test_from = pt if pt else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    # The identity map_location keeps the tensors where they are deserialized
    # instead of moving them to the device they were saved from.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

    # Restore the options saved with the checkpoint that are listed in model_flags.
    saved_opt = vars(checkpoint['opt'])
    for flag, value in saved_opt.items():
        if flag in model_flags:
            setattr(args, flag, value)
    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, _lazy_dataset_loader(b_list),
                                       args.batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(args, device_id, model, None)
    return trainer.summary(test_iter, step)

# Option names that summary() copies from a checkpoint's saved options onto args.
model_flags = [
    'hidden_size',
    'ff_size',
    'heads',
    'inter_layers',
    'encoder',
    'ff_actv',
    'use_interval',
    'rnn_size',
]
def _lazy_dataset_loader(pt_file):
    dataset = pt_file
    yield dataset

In [17]:
# POS tagging
import json
from kiwipiepy import Kiwi

# Read the crawled content inside a context manager so the file handle is
# closed; JSON text is UTF-8, so decode explicitly instead of relying on the
# locale default.
with open("../crawled_data/result.json", encoding="utf-8") as result_file:
    crawled = json.load(result_file)["content"]

kiwi = Kiwi(num_workers = 6)
# Assumes `crawled` is a sequence of strings, so tokenize() yields one token
# list per entry — TODO confirm against the crawler's output format.
tokenize_list = list(kiwi.tokenize(crawled))

# Render every token as "form/tag", keeping one list per tokenized entry.
postagged = [
    [token.form + "/" + token.tag for token in sen]
    for sen in tokenize_list
]

# Convert into BERT input
# NOTE(review): format_to_dict is not defined in the visible cells; it must
# already exist in the kernel (or be defined/imported above) for this to run.
news = format_to_dict(postagged, crawled)
print(news)

[{'src': [2, 616, 3168, 17, 1, 7925, 558, 81, 390, 3, 2, 1, 1, 1519, 10965, 2896, 182, 2322, 45, 1519, 319, 112, 20, 1, 1, 14, 2322, 20, 1, 19, 1412, 23, 37, 68, 3168, 54, 87, 16, 3119, 9, 7819, 457, 14, 2886, 9, 49, 2981, 252, 7, 239, 287, 475, 4910, 2546, 1, 1, 547, 1, 8892, 3, 2, 92, 1044, 16, 1, 9, 22, 543, 27, 1, 3203, 14, 3168, 24, 2322, 94, 1729, 19, 1412, 23, 37, 22, 33, 11, 808, 9, 48, 806, 9, 20, 77, 36, 84, 2886, 9, 1, 569, 24, 551, 9, 12, 8, 7, 1, 36, 21, 1044, 1, 13, 7925, 558, 81, 15, 312, 1, 2279, 85, 86, 11, 703, 9, 23, 37, 22, 33, 28, 276, 20, 111, 12, 8, 7, 3, 2, 1, 1, 477, 1, 1, 24, 673, 9, 22, 4910, 2546, 1, 1, 547, 1, 14, 18, 1, 1, 14, 2322, 94, 1, 19, 1412, 23, 37, 68, 3168, 19, 806, 102, 23, 208, 31, 3329, 569, 1, 15, 104, 2787, 13, 670, 16, 2645, 12, 8, 7, 3, 2, 670, 1, 3792, 1044, 1, 313, 109, 14, 119, 71, 90, 18, 115, 1, 125, 60, 519, 44, 429, 125, 298, 1776, 1461, 6532, 13, 92, 6276, 2743, 1, 14, 3119, 9, 49, 97, 20, 3113, 9, 12, 8, 7, 169, 3119, 364, 19, 508

In [18]:
# Summarize the prepared input; the empty pt makes summary() load the
# checkpoint at args.test_from. Only the first element of the result is kept.
summary_result = summary(args, b_list=news, device_id=0, pt="", step=None)[0]
print(summary_result)

{'encoder': 'transformer', 'mode': 'summary', 'bert_data_path': '/home/dongwon/korbertsum/bert_data/korean', 'model_path': '/home/dongwon/korbertsum/models/bert_classifier', 'bert_model': '/home/dongwon/001_bert_morp_pytorch', 'result_path': '/home/dongwon/korbertsum/results/korean', 'temp_dir': '.', 'bert_config_path': '/home/dongwon/001_bert_morp_pytorch/bert_config.json', 'batch_size': 1000, 'use_interval': True, 'hidden_size': 128, 'ff_size': 512, 'heads': 4, 'inter_layers': 2, 'rnn_size': 512, 'param_init': 0, 'param_init_glorot': True, 'dropout': 0.1, 'optim': 'adam', 'lr': 0.002, 'report_every': 1, 'save_checkpoint_steps': 5, 'block_trigram': True, 'recall_eval': False, 'accum_count': 1, 'world_size': 1, 'visible_gpus': '-1', 'gpu_ranks': '0', 'log_file': '/home/dongwon/korbertsum/logs/bert_classifier', 'test_from': '/home/dongwon/korbertsum/models/bert_classifier/model_step_1000.pt'}
gpu_rank 0
피해 남녀, 운전자 과실 100% 주장<q>영상 제보자이자 차량 운전자 A씨에 따르면 그는 지난 7월 25일 오후 7시께 인천 서구의 한 상가 지상 주