## Setting

In [None]:
# !pip install pyrouge --upgrade
# !pip install https://github.com/bheinzerling/pyrouge/archive/master.zip
# !pip install pyrouge
# !pip show pyrouge
# !git clone https://github.com/andersjo/pyrouge.git
# from pyrouge import Rouge155
# !pyrouge_set_rouge_path 'pyrouge/tools/ROUGE-1.5.5'

In [2]:
# !pip install transformers
# !pip install tensorboardX
# !pip install easydict

## Preprocessing

In [1]:
import os

os.chdir('KorBertSum/src')

import torch
import numpy as np
from models import data_loader, model_builder
from models.model_builder import Summarizer
from others.logging import logger, init_logger
from models.data_loader import load_dataset
from transformers import BertConfig, BertTokenizer
from tensorboardX import SummaryWriter
from models.reporter import ReportMgr
from models.stats import Statistics
import easydict
from multiprocessing.dummy import Pool as ThreadPool

In [3]:
def _tally_parameters(model):
    n_params = sum([p.nelement() for p in model.parameters()])
    return n_params

def build_trainer(args, device_id, model,
                  optim):
    """
    Create a `Trainer` wired to a tensorboard-backed report manager.

    Args:
        args: option namespace; this function reads `accum_count`,
            `world_size`, `gpu_ranks`, `model_path` and `report_every`.
        device_id (int): index into `args.gpu_ranks`. A negative value
            selects rank 0 and reports zero GPUs to the trainer.
        model: model handed to the `Trainer`; when truthy, its parameter
            count is logged.
        optim: optimizer handed to the `Trainer` (the notebook passes None
            for inference).

    Returns:
        Trainer: the configured trainer instance.
    """
    grad_accum_count = args.accum_count
    n_gpu = args.world_size

    if device_id >= 0:
        # `gpu_ranks` is indexed by device id; each entry is converted to int.
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0

    print('gpu_rank %d' % gpu_rank)

    # Tensorboard event files are written next to the model checkpoints.
    tensorboard_log_dir = args.model_path
    writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")

    report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)

    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager)

    if (model):
        n_params = _tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)

    return trainer

class Trainer(object):
    """
    Drives the sentence-scoring model: sentence ranking plus training helpers.

    Args:
        args: option namespace; `save_checkpoint_steps` is read on
            construction and `model_path` is read by `_save`.
        model: the model to run; switched to train mode on construction
            when truthy.
        optim: optimizer exposing `step()`; may be None when the trainer is
            only used for `summ`.
        grad_accum_count (int): number of batches whose gradients are
            accumulated before one optimizer step; must be > 0.
        n_gpu (int): number of GPUs; values > 1 enable gradient and
            statistics gathering across processes.
        gpu_rank (int): rank of this process; stored on the instance.
        report_manager: object used for progress reporting, or None to
            disable reporting.
    """

    def __init__(self, args, model, optim,
                 grad_accum_count=1, n_gpu=1, gpu_rank=1,
                 report_manager=None):
        # Basic attributes.
        self.args = args
        self.save_checkpoint_steps = args.save_checkpoint_steps
        self.model = model
        self.optim = optim
        self.grad_accum_count = grad_accum_count
        self.n_gpu = n_gpu
        self.gpu_rank = gpu_rank
        self.report_manager = report_manager
        # `_maybe_save` reads this attribute. The constructor takes no saver,
        # so default to None instead of leaving the attribute undefined.
        self.model_saver = None

        # Per-element loss; masking and reduction happen in
        # `_gradient_accumulation`.
        self.loss = torch.nn.BCELoss(reduction='none')
        assert grad_accum_count > 0
        # Set model in training mode.
        if (model):
            self.model.train()

    def summ(self, test_iter, step, cal_lead=False, cal_oracle=False):
        """Rank the sentences of each batch and return the last batch's ranking.

        Args:
            test_iter: iterable of batches exposing `src`, `labels`, `segs`,
                `clss`, `mask`, `mask_cls` and `batch_size`.
            step: unused; kept so existing callers keep working.
            cal_lead (bool): if True, skip the model and select sentences in
                document order.
            cal_oracle (bool): if True (and `cal_lead` is False), skip the
                model and select the sentences whose label equals 1.

        Returns:
            The selection computed for the LAST batch yielded by `test_iter`:
            a list of index lists for lead/oracle mode, otherwise a numpy
            array of sentence indices sorted by descending score per row.

        Raises:
            ValueError: if `test_iter` yields no batch at all.
        """
        if (not cal_lead and not cal_oracle):
            # Set model in validating mode.
            self.model.eval()

        saw_batch = False
        selected_ids = None

        with torch.no_grad():
            for batch in test_iter:
                saw_batch = True

                if (cal_lead):
                    # Every row shares one index list (same aliasing as list * n).
                    selected_ids = [list(range(batch.clss.size(1)))] * batch.batch_size
                elif (cal_oracle):
                    labels = batch.labels
                    selected_ids = [[j for j in range(batch.clss.size(1)) if labels[i][j] == 1]
                                    for i in range(batch.batch_size)]
                else:
                    sent_scores, mask = self.model(batch.src, batch.segs, batch.clss,
                                                   batch.mask, batch.mask_cls)
                    sent_scores = sent_scores + mask.float()
                    sent_scores = sent_scores.cpu().data.numpy()
                    # Negate so that argsort yields the highest score first.
                    selected_ids = np.argsort(-sent_scores, 1)

        if not saw_batch:
            # Without this guard the return below would raise an opaque
            # UnboundLocalError.
            raise ValueError("summ() received an iterator that yielded no batches")
        return selected_ids

    def _all_reduce_gradients(self):
        """Gather the gradients of all trainable parameters across GPUs."""
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad
                 and p.grad is not None]
        # NOTE(review): `distributed` is not imported in the cells visible
        # here; this path (n_gpu > 1) needs that module in scope - confirm.
        distributed.all_reduce_and_rescale_tensors(
            grads, float(1))

    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        """Run forward/backward over `true_batchs` and step the optimizer.

        With `grad_accum_count == 1` the optimizer steps after every batch;
        with a larger count gradients accumulate over all given batches and
        the optimizer steps once at the end.

        Args:
            true_batchs: iterable of batches (same attributes as in `summ`).
            normalization: value recorded with the loss in each `Statistics`.
            total_stats: statistics object updated with every batch.
            report_stats: statistics object updated with every batch.
        """
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

            loss = self.loss(sent_scores, labels.float())
            # Zero out the loss of masked positions, then reduce to a scalar.
            loss = (loss * mask.float()).sum()
            # `loss` is already reduced by .sum(), so numel() is 1 here.
            (loss / loss.numel()).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # Update the parameters after each batch when not accumulating.
            if self.grad_accum_count == 1:
                if self.n_gpu > 1:
                    self._all_reduce_gradients()
                self.optim.step()

        # In case of multi step gradient accumulation,
        # update only after accum batches.
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                self._all_reduce_gradients()
            self.optim.step()

    def _save(self, step):
        """Write a checkpoint for `step` unless the file already exists.

        The checkpoint holds the model state dict, `self.args` and
        `self.optim`, and is stored as `model_step_<step>.pt` under
        `args.model_path`.

        Returns:
            `(checkpoint, checkpoint_path)` when a file was written, or None
            when the path already existed and nothing was saved.
        """
        real_model = self.model

        model_state_dict = real_model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'opt': self.args,
            'optim': self.optim,
        }
        checkpoint_path = os.path.join(self.args.model_path, 'model_step_%d.pt' % step)
        logger.info("Saving checkpoint %s" % checkpoint_path)
        if (not os.path.exists(checkpoint_path)):
            torch.save(checkpoint, checkpoint_path)
            return checkpoint, checkpoint_path
        return None

    def _start_report_manager(self, start_time=None):
        """
        Start the report manager (if any), or set its start time when one
        is supplied.
        """
        if self.report_manager is not None:
            if start_time is None:
                self.report_manager.start()
            else:
                self.report_manager.start_time = start_time

    def _maybe_gather_stats(self, stat):
        """
        Gather statistics in multi-process cases.

        Args:
            stat: a `Statistics` object to gather, or None.

        Returns:
            The gathered statistics when `n_gpu > 1` and `stat` is not None,
            otherwise `stat` unchanged.
        """
        if stat is not None and self.n_gpu > 1:
            return Statistics.all_gather_stats(stat)
        return stat

    def _maybe_report_training(self, step, num_steps, learning_rate,
                               report_stats):
        """
        Forward training statistics to the report manager, if one is set.

        Returns:
            Whatever `report_manager.report_training` returns, or None when
            no report manager is set.
        """
        if self.report_manager is not None:
            return self.report_manager.report_training(
                step, num_steps, learning_rate, report_stats,
                multigpu=self.n_gpu > 1)

    def _report_step(self, learning_rate, step, train_stats=None,
                     valid_stats=None):
        """
        Forward per-step statistics to the report manager, if one is set.

        Returns:
            Whatever `report_manager.report_step` returns, or None when no
            report manager is set.
        """
        if self.report_manager is not None:
            return self.report_manager.report_step(
                learning_rate, step, train_stats=train_stats,
                valid_stats=valid_stats)

    def _maybe_save(self, step):
        """
        Save the model if a model saver has been attached to the trainer.

        A saver can be attached by assigning `trainer.model_saver`; without
        one this is a no-op.
        """
        # getattr keeps this safe for instances created without __init__.
        model_saver = getattr(self, 'model_saver', None)
        if model_saver is not None:
            model_saver.maybe_save(step)

class BertData():
    """Turns a list of sentences into the id sequences fed to the BERT summarizer."""

    def __init__(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.sep_vid = vocab['[SEP]']
        self.cls_vid = vocab['[CLS]']
        self.pad_vid = vocab['[PAD]']

    def preprocess(self, src):
        """Encode `src` as token ids, segment ids and [CLS] positions.

        Args:
            src: sequence of sentences. Each sentence is joined with single
                spaces, so a token list yields space-separated tokens.
                NOTE(review): `txt2input` passes plain strings, in which case
                the join puts a space between every character - confirm this
                matches how the checkpoint was trained.

        Returns:
            None when `src` is empty or fewer than 3 sentences remain after
            dropping sentences of length <= 1. Otherwise the tuple
            `(token_ids, labels, segments_ids, cls_ids, src_txt, tgt_txt)`,
            where `labels` and `tgt_txt` are always None.
        """
        if (len(src) == 0):
            return None

        # Joined text of every input sentence, before any filtering.
        joined_all = [' '.join(sentence) for sentence in src]
        kept_positions = [pos for pos, sentence in enumerate(src) if len(sentence) > 1]

        # Cap each sentence at 2000 items and the document at 1000 sentences.
        truncated = [src[pos][:2000] for pos in kept_positions][:1000]
        if (len(truncated) < 3):
            return None

        text = ' [SEP] [CLS] '.join(' '.join(sentence) for sentence in truncated)
        # Keep 510 sub-tokens, then add the outer [CLS] / [SEP] pair.
        tokens = ['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Each segment ends at a [SEP]; consecutive segments alternate 0 / 1.
        sep_positions = [-1] + [pos for pos, tok_id in enumerate(token_ids)
                                if tok_id == self.sep_vid]
        segments_ids = []
        for seg_no, (start, end) in enumerate(zip(sep_positions, sep_positions[1:])):
            segments_ids.extend([seg_no % 2] * (end - start))

        cls_ids = [pos for pos, tok_id in enumerate(token_ids) if tok_id == self.cls_vid]
        src_txt = [joined_all[pos] for pos in kept_positions]
        return token_ids, None, segments_ids, cls_ids, src_txt, None
    
def _lazy_dataset_loader(pt_file):
  yield  pt_file

## Params

In [4]:
# Runtime options for the summarizer. The keys listed in `model_flags` are
# overwritten further down with the values stored in the loaded checkpoint.
args = easydict.EasyDict({
    "encoder":'classifier',
    "mode":'test',
    "bert_data_path":'../bert_data/korean',
    "model_path":'../models/bert_classifier',
    "result_path":'../results',
    "temp_dir":'../temp',
    "batch_size":1000,
    "use_interval":True,
    "hidden_size":128,
    "ff_size":512,
    "heads":4,
    "inter_layers":2,
    "rnn_size":512,
    "param_init":0,
    "param_init_glorot":True,
    "dropout":0.1,
    "optim":'adam',
    "lr":2e-3,
    "report_every":1,
    "save_checkpoint_steps":100,
    "block_trigram":True,
    "recall_eval":False,

    "accum_count":1,
    "world_size":1,
    "visible_gpus":'0', # '-1' selects the CPU; any other value (here '0') selects "cuda"
    "gpu_ranks":'0',
    "log_file":'../logs/train.txt',
    "test_from":'../models/bert_classifier/model_step_65000.pt'
})
# Option names that are copied from the checkpoint's stored options into `args`.
model_flags = ['hidden_size', 'ff_size', 'heads', 'inter_layers','encoder','ff_actv', 'use_interval','rnn_size']

##############################################################################

# Model load: restore the fine-tuned checkpoint and build the summarizer that
# `test()` uses. Defines the notebook globals `device`, `device_id`, `step`,
# `checkpoint` and `model`.
pt = ''  # optional checkpoint path override; '' falls back to args.test_from

init_logger(args.log_file)
device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

# Parse the training step from a name such as ".../model_step_65000.pt".
cp = args.test_from
try:
    step = int(cp.split('.')[-2].split('_')[-1])
except (IndexError, ValueError):
    # No '.' in the path, or the trailing token is not an integer.
    step = 0

test_from = pt if pt != '' else args.test_from
logger.info('Loading checkpoint from %s' % test_from)
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
# map_location keeps every tensor on the CPU while loading.
checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

# Architecture options must match the checkpoint, so they override `args`.
opt = vars(checkpoint['opt'])
for k in opt.keys():
    if (k in model_flags):
        setattr(args, k, opt[k])

config = BertConfig.from_pretrained('bert-base-multilingual-cased')
model = Summarizer(args, device, load_pretrained_bert=False, bert_config = config)
model.load_cp(checkpoint)
model.eval()

In [6]:
def test(args, input_list):
    """Rank the sentences of `input_list` with the globally loaded model.

    Args:
        args: option namespace passed to the data loader and the trainer.
        input_list: list of example dicts, as produced by `txt2input`.

    Returns:
        tuple: `(result, input_list)`, where `result` is whatever
        `Trainer.summ` returns for the data.
    """
    # NOTE(review): a new trainer is built on every call, so the parameter
    # count is logged once per article.
    dataset_iter = _lazy_dataset_loader(input_list)
    test_iter = data_loader.Dataloader(args, dataset_iter,
                                       args.batch_size, device,
                                       shuffle=False, is_test=True)
    result = build_trainer(args, device_id, model, None).summ(test_iter, step)
    return result, input_list

##############################################################################

def txt2input(text, bertdata=None):
    """Convert raw article text into the single-example list used by `test`.

    Args:
        text (str): article text with one sentence per line; empty lines are
            dropped.
        bertdata: optional preprocessor exposing `preprocess(list_of_lines)`.
            When omitted, one shared `BertData` instance is created on first
            use and reused, so the tokenizer is not re-loaded on every call.

    Returns:
        list: a one-element list holding a dict with the keys `src`,
        `labels`, `segs`, `clss`, `src_txt` and `tgt_txt`.

    Raises:
        ValueError: if the preprocessor returns None for this text.
    """
    if bertdata is None:
        bertdata = getattr(txt2input, '_shared_bertdata', None)
        if bertdata is None:
            bertdata = txt2input._shared_bertdata = BertData()

    lines = [line for line in text.split('\n') if line]
    txt_data = bertdata.preprocess(lines)
    if txt_data is None:
        # Previously surfaced as "TypeError: 'NoneType' object is not subscriptable".
        raise ValueError(
            "text could not be preprocessed (got %d non-empty lines)" % len(lines))

    data_dict = {"src": txt_data[0],
                 "labels": [0, 1, 2],  # fixed placeholder, not derived from the text
                 "segs": txt_data[2],
                 "clss": txt_data[3],
                 "src_txt": txt_data[4],
                 "tgt_txt": None}
    return [data_dict]

## Data Processing

In [8]:
import pandas as pd
from tqdm import tqdm

# NOTE: change the input CSV path for each run.
df_covid = pd.read_csv("../test_content_f_54.csv")
df_covid = df_covid.drop(columns='Unnamed: 0')

# If the CSV has no 'index' column yet, add one before continuing, e.g.
#   df_covid['index'] = list(range(len(df_covid)))

# One plain list per column, then one (index, content, title, date, url)
# tuple per article.
idxs, texts, titles, dates, urls = (
    df_covid[column].tolist()
    for column in ('index', 'content', 'title', 'date', 'url')
)
idx_text = list(zip(idxs, texts, titles, dates, urls))

##############################################################################

# Cleaning rules for the article body:
#   * lines of 50 characters or fewer are dropped (the original note says the
#     reference paper removes lines shorter than 50 characters),
#   * e-mail addresses are removed,
#   * reporter bylines are removed.
import re

MIN_LINE_LENGTH = 50
EMAIL_PATTERN = re.compile(r"([\w\.-]+)@([\w\.-]+)(\.[\w\.]+)")


def _strip_reporter_bylines(line):
    """Remove every standalone "기자" token together with the token before it."""
    words = line.split(" ")
    while "기자" in words:
        pos = words.index("기자")
        del words[pos]
        # Only drop the preceding word when there is one. With pos == 0 the
        # old `del words[pos - 1]` removed the LAST word (index -1) or raised
        # IndexError on an empty list.
        if pos > 0:
            del words[pos - 1]
    return ' '.join(words)


def _clean_article_text(content):
    """Return `content` with short lines, e-mails and reporter bylines removed.

    Every kept line is terminated with a newline.
    """
    kept_lines = []
    for line in content.split('\n'):
        # The length check uses the raw line, before any removal.
        if len(line) <= MIN_LINE_LENGTH:
            continue
        line = EMAIL_PATTERN.sub("", line)
        if " 기자" in line:
            line = _strip_reporter_bylines(line)
        kept_lines.append(line + '\n')
    return ''.join(kept_lines)


for i in tqdm(range(len(idx_text))):
    # Tuples from zip() are immutable; switch to a list so the body can be replaced.
    idx_text[i] = list(idx_text[i])
    content = idx_text[i][1]
    # Non-string bodies (e.g. NaN for a missing cell) are left untouched.
    if isinstance(content, str) and content != "":
        idx_text[i][1] = _clean_article_text(content)

[2022-05-14 23:38:09,769 INFO] Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


Unnamed: 0,title,content,date,time,url,index
0,“8번째 확진자 군산 대중목욕탕·대형마트 등 방문”,"전북도, 방문한 내과 병원은 임시 휴업\n\n접촉자 72명에 대해 능동감시체계 돌입...",2020.2.2,오후 4시 50분 28초,https://www.hani.co.kr/arti/area/honam/926579....,26764
1,[세상읽기] 내가 그들의 사회안전망이다 / 류영재,1994년 르완다에서 다수민족인 후투족이 소수민족인 투치족을 학살하는 사태가 벌어졌...,2020.2.2,오후 6시 51분 55초,https://www.hani.co.kr/arti/opinion/column/926...,26770
2,11일만에 문 여는 중국 증시 9% 대폭락?,휴장기간 거래된 중국지수 ETF가 가늠자\n\n뉴욕증시 상장된 CSI300 ETF ...,2020.2.2,오후 6시 58분 43초,https://www.hani.co.kr/arti/economy/finance/92...,26771
3,2월 3일 알림,이해준 현대무용협회장\n\n◇ 한국현대무용협회는 지난달 31일 서울 대학로 한국장애...,2020.2.2,오후 7시 7분 11초,https://www.hani.co.kr/arti/society/ngo/926602...,26772
4,"손끝까지 꼼꼼히, 마스크 철벽…‘셀프 방역’이 가장 확실한 방역",지역사회 감염 불안감 커지지만\n\n위생수칙만 지켜도 피할 수 있어\n\n마스크 착...,2020.2.2,오후 8시 26분 10초,https://www.hani.co.kr/arti/economy/finance/92...,26775


## Inference

In [14]:
import threading
import time

# `inference` runs on many pool threads at once; this lock makes the
# read-modify-write of the global `df_covid` atomic so no drop is lost.
_DF_COVID_LOCK = threading.Lock()


def _summary_record(idx, title, text, date, url, summary):
    """Build one output row for `new_list`."""
    return {
        "index": idx,
        "title": title,
        "content": text,
        "date": date,
        "url": url,
        "summary": summary,
    }


def inference(texts):
    """Summarise one article, record the result and mark the article as done.

    Appends one record to the global `new_list` and drops the article's row
    from the global `df_covid`, so `df_covid` ends up holding only the
    articles that were not processed.

    Args:
        texts: sequence `(index, content, title, date, url)` for one article.

    Returns:
        None for an article whose content is empty or NaN, otherwise ''.
    """
    global df_covid
    # NOTE(review): the purpose of this fixed one-second pause is not evident
    # from the notebook; kept as is.
    time.sleep(1)

    idx = texts[0]
    text = texts[1]
    title = texts[2]
    date = texts[3]
    url = texts[4]

    # Resolve the row label first, so an unknown index fails before anything
    # is recorded.
    row_label = df_covid[df_covid['index'] == idx].index[0]

    is_empty = text == "" or pd.isna(text)
    if is_empty:
        summary = ""
    elif len(text.split('\n')) <= 3:
        # Too short for the summarizer: use the article itself as the summary.
        summary = text
    else:
        sentences = list(filter(None, text.split('\n')))
        ranking, _ = test(args, txt2input(text))
        # The top two ranked sentences of the (single) document, best first.
        top_sentences = [sentences[i] for i in ranking[0][:2]]
        # join() covers the one-sentence case that the old bare `except:` handled.
        summary = " ".join(top_sentences)

    new_list.append(_summary_record(idx, title, text, date, url, summary))

    with _DF_COVID_LOCK:
        df_covid = df_covid.drop(row_label, axis=0)

    return None if is_empty else ''

In [17]:
# Thread-pool driver: summarise every article in `idx_text` concurrently.
pool = ThreadPool(50)

new_list = []

try:
    for _ in tqdm(pool.imap_unordered(inference, idx_text), total=len(idx_text)):
        pass

    print("finish")

except (Exception, KeyboardInterrupt) as exc:
    # Stop the workers first, so nothing mutates the shared state while the
    # partial progress is written out.
    pool.close()
    pool.terminate()
    pool.join()

    # `inference` drops every processed row, so this file holds the articles
    # that are still to be summarised (the input for a resumed run).
    df_covid.to_csv("../test_content_f_55.csv")

    col_name = ["index", "title", "content", "date", "url", "summary"]
    news_df = pd.DataFrame(new_list, columns=col_name)

    csv_file_name = "../test_summary_55.csv"  # NOTE: change this name for each run

    news_df.to_csv(csv_file_name)

    # A RuntimeError ends the run quietly once progress is saved, as before.
    # Anything else is re-raised after the same cleanup, instead of leaking
    # the pool and skipping the save.
    if not isinstance(exc, RuntimeError):
        raise

  0%|          | 0/14 [00:00<?, ?it/s]

gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-14 23:38:19,099 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,310 INFO] * number of parameters: 177854209


gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-14 23:38:19,330 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,663 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,682 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,694 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,883 INFO] * number of parameters: 177854209


gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-14 23:38:19,885 INFO] * number of parameters: 177854209
[2022-05-14 23:38:19,996 INFO] * number of parameters: 177854209
[2022-05-14 23:38:20,000 INFO] * number of parameters: 177854209
[2022-05-14 23:38:20,002 INFO] * number of parameters: 177854209


gpu_rank 0
gpu_rank 0


[2022-05-14 23:38:20,889 INFO] * number of parameters: 177854209
 14%|█▍        | 2/14 [00:10<00:54,  4.55s/it][2022-05-14 23:38:21,758 INFO] * number of parameters: 177854209
[2022-05-14 23:38:21,937 INFO] * number of parameters: 177854209


gpu_rank 0


100%|██████████| 14/14 [00:11<00:00,  1.20it/s]

finish





In [18]:
# Run this cell when the thread-pool cell above finished normally: it shuts
# the pool down and writes both result files.

pool.close()
pool.terminate()
pool.join()

# `inference` drops each processed row from df_covid, so this file holds the
# articles that were not processed.
df_covid.to_csv("../test_content_f_55.csv")

# One row per summarised article, in the order the workers finished.
col_name = ["index", "title", "content", "date", "url", "summary"]
news_df = pd.DataFrame(new_list, columns=col_name)

csv_file_name = "../test_summary_55.csv" # NOTE: change this name for each run

news_df.to_csv(csv_file_name)
news_df.head()

Unnamed: 0,index,title,content,date,url,summary
0,26764,“8번째 확진자 군산 대중목욕탕·대형마트 등 방문”,신종 코로나바이러스 감염증 국내 8번 확진자가 지난달 말 귀국한 뒤 대형마트와 목욕...,2020.2.2,https://www.hani.co.kr/arti/area/honam/926579....,전북도와 군산시 등은 2일 “8번 확진자인 62살 여성은 지난달 23일 중국 우한에...
1,26854,[단독] 우한 알리는 중국 활동가의 페북 일기 “봉쇄가 공황 불러와”,신종 코로나 발병 이후 시민들의 이동이 통제되며 문을 닫는 우한의 상점들이 늘고 있...,2020.1.30,https://www.hani.co.kr/arti/society/society_ge...,생필품을 사기 위해 슈퍼마켓에 줄을 서 있는 우한 시민들. 쌀과 야채가 놓여져 있던...
2,26775,"손끝까지 꼼꼼히, 마스크 철벽…‘셀프 방역’이 가장 확실한 방역",국내에서도 신종 코로나바이러스 감염증 ‘2차 전파’ 사례가 발생하면서 지역사회 감염...,2020.2.2,https://www.hani.co.kr/arti/economy/finance/92...,국내에서도 신종 코로나바이러스 감염증 ‘2차 전파’ 사례가 발생하면서 지역사회 감염...
3,26970,"시중은행도 신종 코로나 비상대응 “손소독제 비치, 직원 마스크 착용”",케이이비(KEB)하나은행은 28일 오전 비상대책위원회를 열고 위기대응 단계를 ‘경계...,2020.1.28,https://www.hani.co.kr/arti/economy/finance/92...,케이이비(KEB)하나은행은 28일 오전 비상대책위원회를 열고 위기대응 단계를 ‘경계...
4,27218,"새내기 민나온, 데뷔전에서 ‘톱10’",새내기 민나온(19)이 어렵게 찾아온 미국여자프로골프(LPGA) 투어 데뷔 무대에서...,2007.4.30,https://www.hani.co.kr/arti/sports/golf/206180...,새내기 민나온(19)이 어렵게 찾아온 미국여자프로골프(LPGA) 투어 데뷔 무대에서...
