## Setup

In [None]:
# !pip install pyrouge --upgrade
# !pip install https://github.com/bheinzerling/pyrouge/archive/master.zip
# !pip install pyrouge
# !pip show pyrouge
# !git clone https://github.com/andersjo/pyrouge.git
# from pyrouge import Rouge155
# !pyrouge_set_rouge_path 'pyrouge/tools/ROUGE-1.5.5'

In [2]:
# !pip install transformers
# !pip install tensorboardX
# !pip install easydict

## Preprocessing

In [1]:
import os

# The project-local imports below (`models`, `others`) are resolved relative to
# KorBertSum/src, so the notebook switches its working directory there.
# The guard makes the cell safe to re-run: a second bare `os.chdir` with the
# same relative path would fail because we are already inside that directory.
_SRC_DIR = 'KorBertSum/src'
if not os.getcwd().replace(os.sep, '/').endswith(_SRC_DIR):
    os.chdir(_SRC_DIR)

import torch
import numpy as np
from models import data_loader, model_builder
from models.model_builder import Summarizer
from others.logging import logger, init_logger
from models.data_loader import load_dataset
from transformers import BertConfig, BertTokenizer
from tensorboardX import SummaryWriter
from models.reporter import ReportMgr
from models.stats import Statistics
import easydict
from multiprocessing.dummy import Pool as ThreadPool

In [2]:
def _tally_parameters(model):
    n_params = sum([p.nelement() for p in model.parameters()])
    return n_params

def build_trainer(args, device_id, model,
                  optim):
    """
    Build a `Trainer` together with its TensorBoard-backed report manager.

    Args:
        args: options object. This function reads `accum_count`,
            `world_size`, `gpu_ranks`, `model_path` and `report_every`;
            the object is also handed to `Trainer` unchanged.
        device_id (int): index into `args.gpu_ranks`. A negative value
            selects the CPU path (GPU rank 0, zero GPUs).
        model: the model the trainer will drive; may be None.
        optim: optimizer forwarded to `Trainer`; may be None for inference.

    Returns:
        Trainer: the configured trainer.
    """
    grad_accum_count = args.accum_count

    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
        n_gpu = args.world_size
    else:
        gpu_rank = 0
        n_gpu = 0

    print('gpu_rank %d' % gpu_rank)

    # TensorBoard event files are written into the model directory.
    writer = SummaryWriter(args.model_path, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)

    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager)

    # Explicit None check: relying on the truthiness of a module would skip
    # the log line for container modules that define __len__ and are empty.
    if model is not None:
        logger.info('* number of parameters: %d' % _tally_parameters(model))

    return trainer

class Trainer(object):
    """
    Holds a model/optimizer pair and provides sentence ranking (`summ`),
    gradient accumulation, checkpointing and reporting helpers.

    Args:
        args: options object. `save_checkpoint_steps` is read at
            construction time and `model_path` when saving a checkpoint.
        model: the model. It is called as
            `model(src, segs, clss, mask, mask_cls)` and must return a
            `(sent_scores, mask)` pair. May be None.
        optim: optimizer; `step()` is called on it after backward passes.
        grad_accum_count(int): number of batches whose gradients are
            accumulated before an optimizer step. Must be positive.
        n_gpu(int): number of GPUs; values above 1 enable the
            cross-GPU gradient/statistics gathering paths.
        gpu_rank(int): rank of this process.
        report_manager: object used for progress reports, or None.
    """

    def __init__(self,  args, model,  optim,
                  grad_accum_count=1, n_gpu=1, gpu_rank=1,
                  report_manager=None):
        # Basic attributes.
        self.args = args
        self.save_checkpoint_steps = args.save_checkpoint_steps
        self.model = model
        self.optim = optim
        self.grad_accum_count = grad_accum_count
        self.n_gpu = n_gpu
        self.gpu_rank = gpu_rank
        self.report_manager = report_manager

        # Per-element loss; masking and reduction happen in _gradient_accumulation.
        self.loss = torch.nn.BCELoss(reduction='none')
        assert grad_accum_count > 0
        # Set model in training mode.
        if model is not None:
            self.model.train()

    def summ(self, test_iter, step, cal_lead=False, cal_oracle=False):
        """
        Rank the sentences of each document in `test_iter`.

        Args:
            test_iter: iterable of batches exposing `src`, `labels`,
                `segs`, `clss`, `mask`, `mask_cls` and `batch_size`.
            step: accepted for interface compatibility; not used.
            cal_lead(bool): if True, skip the model and return the
                sentence indices in document order for every example.
            cal_oracle(bool): if True (and `cal_lead` is False), skip the
                model and return the indices whose label equals 1.

        Returns:
            The selection for the LAST batch only (earlier batches are
            overwritten): a list of index lists for the lead/oracle
            modes, or a numpy array of sentence indices sorted by
            descending score for the model mode. An empty list is
            returned when `test_iter` yields no batch.

        Note:
            No trigram blocking is applied here.
        """
        # Put the model in evaluation mode only when it is actually used.
        if (not cal_lead and not cal_oracle):
            self.model.eval()

        # Defined up front so an empty iterator cannot leave it unbound.
        selected_ids = []

        with torch.no_grad():
            for batch in test_iter:
                if (cal_lead):
                    selected_ids = [list(range(batch.clss.size(1)))] * batch.batch_size
                elif (cal_oracle):
                    labels = batch.labels
                    selected_ids = [[j for j in range(batch.clss.size(1)) if labels[i][j] == 1] for i in
                                    range(batch.batch_size)]
                else:
                    sent_scores, mask = self.model(batch.src, batch.segs, batch.clss,
                                                   batch.mask, batch.mask_cls)
                    sent_scores = sent_scores + mask.float()
                    sent_scores = sent_scores.cpu().data.numpy()
                    # Negate so that argsort yields highest score first.
                    selected_ids = np.argsort(-sent_scores, 1)
        return selected_ids

    def _all_reduce_gradients(self):
        """
        Collect the gradients of all trainable parameters and pass them to
        `distributed.all_reduce_and_rescale_tensors` with a factor of 1.
        """
        grads = [p.grad.data for p in self.model.parameters()
                 if p.requires_grad
                 and p.grad is not None]
        # NOTE(review): `distributed` is not imported in the visible part of
        # this file; this multi-GPU path raises NameError unless the name is
        # provided elsewhere. Confirm before training with n_gpu > 1.
        distributed.all_reduce_and_rescale_tensors(
            grads, float(1))

    def _gradient_accumulation(self, true_batchs, normalization, total_stats,
                               report_stats):
        """
        Run forward/backward over `true_batchs` and step the optimizer.

        With `grad_accum_count == 1` the gradients are zeroed and the
        optimizer stepped for every batch; otherwise gradients are zeroed
        once, accumulated over all batches and applied in a single step.

        Args:
            true_batchs: iterable of batches (same attributes as in `summ`).
            normalization: forwarded to `Statistics` for each batch.
            total_stats: statistics object updated with every batch.
            report_stats: statistics object updated with every batch.
        """
        if self.grad_accum_count > 1:
            self.model.zero_grad()

        for batch in true_batchs:
            if self.grad_accum_count == 1:
                self.model.zero_grad()

            src = batch.src
            labels = batch.labels
            segs = batch.segs
            clss = batch.clss
            mask = batch.mask
            mask_cls = batch.mask_cls

            sent_scores, mask = self.model(src, segs, clss, mask, mask_cls)

            loss = self.loss(sent_scores, labels.float())
            # Zero out masked positions, then reduce to a scalar.
            loss = (loss*mask.float()).sum()
            # `loss` is a scalar after .sum(), so numel() is 1 and this
            # division leaves the value unchanged.
            (loss/loss.numel()).backward()
            # loss.div(float(normalization)).backward()

            batch_stats = Statistics(float(loss.cpu().data.numpy()), normalization)

            total_stats.update(batch_stats)
            report_stats.update(batch_stats)

            # Update the parameters after every batch when not accumulating.
            if self.grad_accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    self._all_reduce_gradients()
                self.optim.step()

        # in case of multi step gradient accumulation,
        # update only after accum batches
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                self._all_reduce_gradients()
            self.optim.step()

    def _save(self, step):
        """
        Write a checkpoint for `step` into `args.model_path`.

        The checkpoint holds the model state dict, the options object and
        the optimizer. An existing file is never overwritten.

        Returns:
            `(checkpoint, checkpoint_path)` when a file was written,
            None when the checkpoint file already existed.
        """
        checkpoint = {
            'model': self.model.state_dict(),
            'opt': self.args,
            'optim': self.optim,
        }
        checkpoint_path = os.path.join(self.args.model_path, 'model_step_%d.pt' % step)
        if os.path.exists(checkpoint_path):
            return None

        # Log only when a file is really written.
        logger.info("Saving checkpoint %s" % checkpoint_path)
        torch.save(checkpoint, checkpoint_path)
        return checkpoint, checkpoint_path

    def _start_report_manager(self, start_time=None):
        """
        Start the report manager, if there is one.

        When `start_time` is given it is assigned to the manager instead
        of calling its `start()` method.
        """
        if self.report_manager is not None:
            if start_time is None:
                self.report_manager.start()
            else:
                self.report_manager.start_time = start_time

    def _maybe_gather_stats(self, stat):
        """
        Gather statistics across processes when more than one GPU is used.

        Args:
            stat: a Statistics object to gather, or None.

        Returns:
            The result of `Statistics.all_gather_stats(stat)` when
            `stat` is not None and `n_gpu > 1`; otherwise `stat` as given.
        """
        if stat is not None and self.n_gpu > 1:
            return Statistics.all_gather_stats(stat)
        return stat

    def _maybe_report_training(self, step, num_steps, learning_rate,
                               report_stats):
        """
        Forward training progress to the report manager, if there is one.

        Returns whatever `report_manager.report_training` returns, or
        None when no report manager is set.
        """
        if self.report_manager is not None:
            return self.report_manager.report_training(
                step, num_steps, learning_rate, report_stats,
                multigpu=self.n_gpu > 1)

    def _report_step(self, learning_rate, step, train_stats=None,
                     valid_stats=None):
        """
        Forward per-step statistics to the report manager, if there is one.

        Returns whatever `report_manager.report_step` returns, or None
        when no report manager is set.
        """
        if self.report_manager is not None:
            return self.report_manager.report_step(
                learning_rate, step, train_stats=train_stats,
                valid_stats=valid_stats)

    def _maybe_save(self, step):
        """
        Save the model through `self.model_saver` if one has been attached.

        `__init__` never sets `model_saver`, so the attribute is looked up
        defensively; without a saver this method does nothing.
        """
        model_saver = getattr(self, 'model_saver', None)
        if model_saver is not None:
            model_saver.maybe_save(step)

class BertData():
    """
    Converts a document (a sequence of sentences) into the id sequences a
    BERT-based summarizer consumes.
    """

    def __init__(self):
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.sep_vid = vocab['[SEP]']
        self.cls_vid = vocab['[CLS]']
        self.pad_vid = vocab['[PAD]']

    def preprocess(self, src):
        """
        Build model inputs for one document.

        Args:
            src: sequence of sentences. Each sentence is joined with
                single spaces via `' '.join(sentence)`.
                NOTE(review): when a sentence is a plain string (as
                `txt2input` in this notebook passes), that join inserts a
                space between every character - confirm this matches how
                the checkpoint was trained.

        Returns:
            None if `src` is empty or fewer than 3 sentences of length > 1
            remain; otherwise the tuple
            `(subtoken ids, None, segment ids, [CLS] positions,
            joined sentence texts, None)`.
        """
        if len(src) == 0:
            return None

        joined_all = [' '.join(sentence) for sentence in src]
        # Positions of the sentences that have more than one item.
        kept = [pos for pos, sentence in enumerate(src) if len(sentence) > 1]

        # Cap every sentence at 2000 items and the document at 1000 sentences.
        sentences = [src[pos][:2000] for pos in kept][:1000]
        if len(sentences) < 3:
            return None

        text = ' [SEP] [CLS] '.join(' '.join(sentence) for sentence in sentences)
        # 510 subtokens plus the surrounding [CLS]/[SEP] give at most 512.
        subtokens = ['[CLS]'] + self.tokenizer.tokenize(text)[:510] + ['[SEP]']
        subtoken_ids = self.tokenizer.convert_tokens_to_ids(subtokens)

        # Alternate segment id 0/1 for each span that ends in a [SEP].
        segments_ids = []
        previous_sep = -1
        sep_positions = [pos for pos, tok in enumerate(subtoken_ids) if tok == self.sep_vid]
        for seg_no, sep_pos in enumerate(sep_positions):
            segments_ids.extend([seg_no % 2] * (sep_pos - previous_sep))
            previous_sep = sep_pos

        cls_ids = [pos for pos, tok in enumerate(subtoken_ids) if tok == self.cls_vid]
        src_txt = [joined_all[pos] for pos in kept]
        return subtoken_ids, None, segments_ids, cls_ids, src_txt, None
    
def _lazy_dataset_loader(pt_file):
  yield  pt_file

## Params

In [3]:
# Run configuration. Entries marked "check" depend on the local setup.
args = easydict.EasyDict(dict(
    # --- model / run mode ---
    encoder='transformer',  # 'classifier' or 'transformer'
    mode='test',
    # --- paths ---
    bert_data_path='../bert_data/korean',
    model_path='../models/bert_trans_1',  # check
    result_path='../results',
    temp_dir='../temp',
    # --- architecture / optimisation ---
    batch_size=1000,
    use_interval=True,
    hidden_size=128,
    ff_size=512,
    heads=4,
    inter_layers=2,
    rnn_size=512,
    param_init=0,
    param_init_glorot=True,
    dropout=0.1,
    optim='adamW',  # check
    lr=2e-3,
    report_every=1,
    save_checkpoint_steps=100,
    block_trigram=True,
    recall_eval=False,
    # --- devices / logging / checkpoint ---
    accum_count=1,
    world_size=1,
    visible_gpus='0',  # '-1' selects the CPU, any other value selects CUDA; check
    gpu_ranks='0',
    log_file='../logs/train_trans_1.txt',  # check
    test_from='../models/bert_trans_1/model_step_65000.pt',  # check
))

# Options that are overwritten with the values stored in the checkpoint.
model_flags = [
    'hidden_size', 'ff_size', 'heads', 'inter_layers',
    'encoder', 'ff_actv', 'use_interval', 'rnn_size',
]

##############################################################################

# Model Load
# test(args, input_data, -1, '', None)
# Optional checkpoint override; when empty, args.test_from is loaded.
pt = ''

init_logger(args.log_file)
device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

# Recover the training step from a name such as ".../model_step_65000.pt".
# Only the parsing failures are handled; anything else should surface.
cp = args.test_from
try:
    step = int(cp.split('.')[-2].split('_')[-1])
except (IndexError, ValueError):
    step = 0

test_from = pt if pt != '' else args.test_from
logger.info('Loading checkpoint from %s' % test_from)
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
# The map_location lambda keeps every tensor on the CPU while loading.
checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

# Architecture options must match the checkpoint, so copy them over `args`.
opt = vars(checkpoint['opt'])
for k in opt.keys():
    if (k in model_flags):
        setattr(args, k, opt[k])

config = BertConfig.from_pretrained('bert-base-multilingual-cased')
model = Summarizer(args, device, load_pretrained_bert=False, bert_config = config)
model.load_cp(checkpoint)
model.eval()

[2022-05-16 16:02:44,689 INFO] Loading checkpoint from ../models/bert_trans_1/model_step_65000.pt


Summarizer(
  (bert): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNo

In [4]:
def test(args, input_list):
    """
    Rank the sentences of `input_list` with the globally loaded model.

    Relies on the notebook-level names `device`, `device_id`, `model` and
    `step`. A new trainer is built on every call.

    Returns:
        `(result, input_list)` where `result` is what `Trainer.summ`
        returned and `input_list` is the argument, unchanged.
    """
    dataset_iter = _lazy_dataset_loader(input_list)
    loader = data_loader.Dataloader(args, dataset_iter,
                                    args.batch_size, device,
                                    shuffle=False, is_test=True)
    ranker = build_trainer(args, device_id, model, None)
    ranking = ranker.summ(loader, step)
    return ranking, input_list

##############################################################################

def txt2input(text):
    """
    Convert newline-separated text into the single-document dataset that
    `test` expects.

    Args:
        text (str): article text with one sentence per line; empty lines
            are ignored.

    Returns:
        list: a one-element list holding a dict with the keys `src`,
        `labels`, `segs`, `clss`, `src_txt` and `tgt_txt`.

    Raises:
        ValueError: if `BertData.preprocess` rejects the text (it returns
            None when fewer than 3 lines longer than one character remain).
    """
    sentences = list(filter(None, text.split('\n')))
    processed = BertData().preprocess(sentences)
    if processed is None:
        # Previously this surfaced as "'NoneType' object is not subscriptable".
        raise ValueError(
            "text is too short to summarize: at least 3 non-trivial lines are required")

    src_ids, _labels, segs, clss, src_txt, _tgt_txt = processed
    return [{
        "src": src_ids,
        "labels": [0, 1, 2],  # placeholder; there are no gold labels at inference time
        "segs": segs,
        "clss": clss,
        "src_txt": src_txt,
        "tgt_txt": None,
    }]

## Data Processing

In [13]:
### Fetch the news dataset for a search keyword from the DB API

import requests
import json
import pandas as pd

NEWS_API_URL = 'http://ec2-3-34-47-218.ap-northeast-2.compute.amazonaws.com:5000/news'
SEARCH_QUERY = '커피'  # change the search keyword here

# `params` lets requests build and encode the query string; the timeout keeps
# the cell from hanging forever and raise_for_status surfaces HTTP errors
# instead of trying to parse an error page as JSON.
res = requests.get(NEWS_API_URL, params={'query': SEARCH_QUERY}, timeout=30)
res.raise_for_status()
news_dict = json.loads(res.text)  # a list of dicts, one per article

col_name = ["_id", "content", "date", "journal", "summary", "title", "url"]
news_df = pd.DataFrame(news_dict, columns=col_name)

news_df.head()

Unnamed: 0,_id,content,date,journal,summary,title,url
0,627cdd4d7f98dd6342b32231,최인아 객원논설위원·최인아책방 대표\n\n이제 마스크도 벗나 했지만 앞으로도 얼마간...,2022-04-16,동아일보,이제 마스크도 벗나 했지만 앞으로도 얼마간은 더 써야 하는 모양이다. 그래도 다음 ...,‘존재의 이유’를 말해 보라[동아광장/최인아],https://www.donga.com/news/Opinion/article/all...
1,627cdd4d7f98dd6342b32249,"‘엄선한 원두와 뛰어난 기술력의 조화, 황금 비율 커피 ‘맥심 모카골드’\n\n커피...",2022-04-20,동아일보,대한민국 국민이라면 누구나 아는 노란색 커피믹스가 있다. 바로 맥심 모카골드다. 맥...,‘홈카페’하면 맥심… 모카골드와 커피 한잔의 여유,https://www.donga.com/news/Economy/article/all...
2,627cdd4d7f98dd6342b3229f,SPC그룹 계열사 비알코리아가 운영하는 던킨이 ‘뉴웨이브 프로젝트’의 일환으로 고메...,2022-04-20,동아일보,SPC그룹 계열사 비알코리아가 운영하는 던킨이 ‘뉴웨이브 프로젝트’의 일환으로 고메...,"고메 도넛, 프리미엄 디저트로 자리잡아",https://www.donga.com/news/Economy/article/all...
3,627cdd4d7f98dd6342b32313,연구진이 개발한 VR 시스템 ‘게임체인지’ 화면. 커피를 요청하거나 버스에 탑승하는...,2022-04-18,동아일보,연구진이 개발한 VR 시스템 ‘게임체인지’ 화면. 커피를 요청하거나 버스에 탑승하는...,‘마음의 병’ 고치러 가상현실 로그인… 실제 치료효과 입증됐다,https://www.donga.com/news/It/article/all/2022...
4,627cdd4d7f98dd6342b323a9,서울 중구 덕수궁 석조전에서 커피를 맛보며 밤하늘을 감상하는 야간 체험 행사가 열린...,2022-04-21,동아일보,서울 중구 덕수궁 석조전에서 커피를 맛보며 밤하늘을 감상하는 야간 체험 행사가 열린...,커피 마시며 덕수궁 밤하늘 감상… 내달 3일부터 ‘밤의 석조전’ 행사,https://www.donga.com/news/Culture/article/all...


In [36]:
### Restore paragraph breaks in Dong-A Ilbo articles
from tqdm import tqdm
from more_itertools import locate

SENTENCE_ENDINGS = ".?!"
DONGA_JOURNAL = '동아일보'


def add_paragraph_breaks(text):
    """
    Insert a blank line after every sentence terminator that is directly
    followed by a character other than a space.

    Sentences inside a paragraph are separated by a space, so a terminator
    followed immediately by other text is taken as a paragraph boundary.
    Text without any terminator, or with only one, is returned intact
    (the earlier splitting logic lost it); non-string values such as
    NaN are returned unchanged.

    NOTE(review): the rule also fires inside tokens such as "3.5" or
    before a closing quote; this matches the earlier behaviour.
    """
    if not isinstance(text, str):
        return text

    ending_positions = set(locate(text, lambda ch: ch in SENTENCE_ENDINGS))
    pieces = []
    for pos, ch in enumerate(text):
        pieces.append(ch)
        if pos in ending_positions and pos + 1 < len(text) and text[pos + 1] != " ":
            pieces.append("\n\n")
    return "".join(pieces)


# Build new records rather than editing news_df in place; summaries are reset
# to an empty string for every article.
new_list = []
for row in tqdm(news_df.to_dict('records')):
    content = row['content']
    if row['journal'] == DONGA_JOURNAL:
        content = add_paragraph_breaks(content)
    new_list.append(
        {
            "_id" : row['_id'],
            "content" : content,
            "date" : row['date'],
            "journal" : row['journal'],
            "summary" : "",
            "title" : row['title'],
            "url" : row['url']
        }
    )

col_name = ["_id", "content", "date", "journal", "summary", "title", "url"]
news_df_test = pd.DataFrame(new_list, columns=col_name)

##############################################################################

### Build the [id, text] work list and clean every article
from tqdm import tqdm
import re

# Lines of 50 characters or fewer (bylines, photo captions, ...) are dropped;
# the comment in the original notebook attributes this threshold to the paper.
MIN_LINE_LENGTH = 50
EMAIL_PATTERN = re.compile(r"([\w\.-]+)@([\w\.-]+)(\.[\w\.]+)")
REPORTER_TOKEN = "기자"


def clean_article(text):
    """
    Drop short lines and strip e-mail addresses and reporter bylines.

    For every line longer than MIN_LINE_LENGTH characters:
      * e-mail addresses are removed;
      * if the line contains " 기자", each stand-alone "기자" token is
        removed together with the word in front of it (the reporter name).
        When "기자" is the first word there is nothing in front of it, so
        only the token itself is removed.
    Kept lines are joined back, each followed by a newline. Non-string
    values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    kept_lines = []
    for line in text.split('\n'):
        if len(line) <= MIN_LINE_LENGTH:
            continue
        line = EMAIL_PATTERN.sub("", line)
        if " " + REPORTER_TOKEN in line:
            words = line.split(" ")
            while REPORTER_TOKEN in words:
                token_at = words.index(REPORTER_TOKEN)
                # max(..., 0) prevents index -1 from deleting the last word.
                del words[max(token_at - 1, 0):token_at + 1]
            line = ' '.join(words)
        kept_lines.append(line + '\n')
    return ''.join(kept_lines)


idxs = news_df_test['_id'].tolist()
texts = news_df_test['content'].tolist()

# Each entry is a mutable [article id, cleaned text] pair.
idx_text = [
    [article_id, clean_article(content)]
    for article_id, content in tqdm(list(zip(idxs, texts)))
]

100%|██████████| 75/75 [00:00<00:00, 5172.19it/s]


## Inference

In [32]:
## Connect to the DB

from bson.objectid import ObjectId
from pymongo import MongoClient

# SECURITY: the connection string (user name and password included) used to be
# written out in this cell. Credentials must not live in a notebook - they end
# up in version control and in shared copies. The previously committed
# password should be considered leaked and rotated.
# Provide the URI through the environment instead, e.g.
#   export MONGODB_URI="mongodb+srv://<user>:<password>@<host>/<db>?retryWrites=true&w=majority"
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError("Set the MONGODB_URI environment variable before running this cell")

client = MongoClient(MONGODB_URI)
collection = client.database.news

# Example of a manual update:
# collection.update_one({'_id': ObjectId('627cdd2c7f98dd6342b32229')}, {'$set': {'summary':'test'}})

Plink failed to import tkinter.


In [33]:
import time

def inference(texts, num_sentences=2):
    """
    Summarize one article and store the summary in MongoDB.

    Args:
        texts: an `[article id, article text]` pair.
        num_sentences (int): how many top-ranked sentences make up the
            summary (default 2).

    Behaviour:
        * empty or missing text  -> the stored summary is ''.
        * fewer than 3 lines longer than one character -> the text itself
          is stored, because `BertData.preprocess` rejects such input.
        * otherwise the model ranks the sentences and the best
          `num_sentences` are joined with a space.

    Returns:
        str: always ''.
    """
    # NOTE(review): the reason for this delay is not visible here
    # (presumably throttling); kept as is.
    time.sleep(1)
    idx, text = texts[0], texts[1]

    if text == "" or pd.isna(text):
        summary = ''
    else:
        # Same filter as BertData.preprocess (length > 1), so the indices
        # returned by the model line up with this list.
        sentences = [line for line in text.split('\n') if len(line) > 1]
        if len(sentences) < 3:
            summary = text
        else:
            ranking = test(args, txt2input(text))[0][0]
            picked = [sentences[i] for i in ranking[:num_sentences] if i < len(sentences)]
            # join handles one or several picked sentences without try/except.
            summary = " ".join(picked)

    collection.update_one({'_id': ObjectId(idx)}, {'$set': {'summary': summary}})
    return ''

In [38]:
# Summarize all articles with a thread pool.
# The `with` block shuts the pool down once every result has been consumed,
# so worker threads do not pile up when the cell is re-run.
with ThreadPool(50) as pool:
    for _ in tqdm(pool.imap_unordered(inference, idx_text), total=len(idx_text)):
        pass

print("finish")

  1%|▏         | 1/75 [00:01<01:15,  1.02s/it]

gpu_rank 0


[2022-05-16 16:18:39,514 INFO] * number of parameters: 184162049
[2022-05-16 16:18:40,642 INFO] * number of parameters: 184162049
  9%|▉         | 7/75 [00:17<02:50,  2.51s/it]

gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:41,034 INFO] * number of parameters: 184162049
[2022-05-16 16:18:42,401 INFO] * number of parameters: 184162049
 11%|█         | 8/75 [00:21<03:05,  2.78s/it]

gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:42,427 INFO] * number of parameters: 184162049
[2022-05-16 16:18:42,609 INFO] * number of parameters: 184162049
 12%|█▏        | 9/75 [00:24<03:04,  2.80s/it]

gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:44,470 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


 15%|█▍        | 11/75 [00:25<02:02,  1.92s/it][2022-05-16 16:18:44,609 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:44,966 INFO] * number of parameters: 184162049
[2022-05-16 16:18:45,497 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0
gpu_rank 0


 16%|█▌        | 12/75 [00:27<01:56,  1.86s/it][2022-05-16 16:18:45,605 INFO] * number of parameters: 184162049
[2022-05-16 16:18:45,635 INFO] * number of parameters: 184162049
[2022-05-16 16:18:46,181 INFO] * number of parameters: 184162049
[2022-05-16 16:18:46,219 INFO] * number of parameters: 184162049
[2022-05-16 16:18:46,339 INFO] * number of parameters: 184162049
[2022-05-16 16:18:47,823 INFO] * number of parameters: 184162049
[2022-05-16 16:18:47,843 INFO] * number of parameters: 184162049
[2022-05-16 16:18:47,908 INFO] * number of parameters: 184162049
[2022-05-16 16:18:47,956 INFO] * number of parameters: 184162049
[2022-05-16 16:18:48,154 INFO] * number of parameters: 184162049
[2022-05-16 16:18:48,161 INFO] * number of parameters: 184162049
[2022-05-16 16:18:48,368 INFO] * number of parameters: 184162049
[2022-05-16 16:18:48,941 INFO] * number of parameters: 184162049
[2022-05-16 16:18:48,962 INFO] * number of parameters: 184162049
[2022-05-16 16:18:49,543 INFO] * number of 

gpu_rank 0


[2022-05-16 16:18:52,875 INFO] * number of parameters: 184162049
 17%|█▋        | 13/75 [00:27<01:35,  1.53s/it][2022-05-16 16:18:52,877 INFO] * number of parameters: 184162049
[2022-05-16 16:18:53,739 INFO] * number of parameters: 184162049
[2022-05-16 16:18:53,811 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:54,184 INFO] * number of parameters: 184162049
[2022-05-16 16:18:54,186 INFO] * number of parameters: 184162049
[2022-05-16 16:18:54,196 INFO] * number of parameters: 184162049
[2022-05-16 16:18:54,697 INFO] * number of parameters: 184162049
[2022-05-16 16:18:54,706 INFO] * number of parameters: 184162049
 19%|█▊        | 14/75 [00:38<04:14,  4.18s/it][2022-05-16 16:18:54,708 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0


[2022-05-16 16:18:54,718 INFO] * number of parameters: 184162049
 21%|██▏       | 16/75 [00:46<03:47,  3.85s/it][2022-05-16 16:18:55,122 INFO] * number of parameters: 184162049
 23%|██▎       | 17/75 [00:47<02:52,  2.98s/it][2022-05-16 16:18:56,166 INFO] * number of parameters: 184162049
 24%|██▍       | 18/75 [00:50<02:44,  2.88s/it][2022-05-16 16:19:12,064 INFO] * number of parameters: 184162049
[2022-05-16 16:19:12,655 INFO] * number of parameters: 184162049
 75%|███████▍  | 56/75 [00:54<00:03,  5.20it/s][2022-05-16 16:19:22,114 INFO] * number of parameters: 184162049
[2022-05-16 16:19:22,125 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0
gpu_rank 0


 76%|███████▌  | 57/75 [00:56<00:05,  3.16it/s][2022-05-16 16:19:23,538 INFO] * number of parameters: 184162049
 79%|███████▊  | 59/75 [00:57<00:05,  2.95it/s][2022-05-16 16:19:24,704 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:25,221 INFO] * number of parameters: 184162049


gpu_rank 0


[2022-05-16 16:19:26,096 INFO] * number of parameters: 184162049
 83%|████████▎ | 62/75 [01:00<00:07,  1.71it/s]

gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:28,541 INFO] * number of parameters: 184162049
[2022-05-16 16:19:28,796 INFO] * number of parameters: 184162049
[2022-05-16 16:19:28,798 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:28,941 INFO] * number of parameters: 184162049
[2022-05-16 16:19:28,980 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:30,757 INFO] * number of parameters: 184162049
 84%|████████▍ | 63/75 [01:05<00:14,  1.25s/it][2022-05-16 16:19:31,393 INFO] * number of parameters: 184162049
 85%|████████▌ | 64/75 [01:05<00:11,  1.06s/it]

gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:33,142 INFO] * number of parameters: 184162049
[2022-05-16 16:19:33,743 INFO] * number of parameters: 184162049
 91%|█████████ | 68/75 [01:06<00:04,  1.50it/s]

gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:34,077 INFO] * number of parameters: 184162049
 92%|█████████▏| 69/75 [01:07<00:03,  1.64it/s][2022-05-16 16:19:34,084 INFO] * number of parameters: 184162049


gpu_rank 0
gpu_rank 0


[2022-05-16 16:19:34,890 INFO] * number of parameters: 184162049
 93%|█████████▎| 70/75 [01:08<00:04,  1.19it/s][2022-05-16 16:19:35,391 INFO] * number of parameters: 184162049
100%|██████████| 75/75 [01:10<00:00,  1.07it/s]

finish





In [44]:
# Read back the document for the first entry of idx_text, e.g. to inspect the summary stored for it.
collection.find_one({'_id': ObjectId(idx_text[0][0])})

{'_id': ObjectId('627cdd4d7f98dd6342b32231'),
 'content': '최인아 객원논설위원·최인아책방 대표\n\n이제 마스크도 벗나 했지만 앞으로도 얼마간은 더 써야 하는 모양이다. 그래도 다음 주부턴 사적 모임 인원과 영업시간 제한을 전면 해제하는 등 2년 넘게 유지돼 온 사회적 거리 두기가 끝이 난다. 휴∼ 다행이다. 그동안 참 힘들었다. 모두의 고생이 컸지만 특히 자영업자들은 죽을 맛이었다. 나 또한 책방을 시작한 지 7년 차 자영업자로 같은 사정에 처했고 어려움을 겪었다.요즘 책방들은 책만 팔지 않는다. 커피 등 음료도 판다는 뜻이 아니다. 그곳에선 책만 읽어서는 결코 알 수 없는 경험들을 다채롭게 제공한다. 당장 우리 책방만 해도 책을 낸 저자와 독자가 직접 만나는 북 토크는 기본이고 ‘파친코’ 등 영어로 쓰인 소설을 원문으로 읽거나 논어 같은 고전을 배우는 수업, 아티스트를 모셔서 평소엔 듣기 어려운 크리에이티브에 대한 이야기를 듣는 등 다채로운 콘텐츠를 마련하고 있다. 우리 고객 한 분은 이렇게 말할 정도다. 책방의 저녁은 날마다 새로운 프로그램이 펼쳐지는 뮤지컬 공연장 같다고.이런 일을 기획하고 준비하자면 많은 수고가 들어갈 텐데 왜 이런 일을 벌이는 걸까? 우선 생존의 이유가 있다. 우리는 디지털 시대를 살고 있지 않은가. 게다가 우리나라 사람들은 온라인에서 보내는 시간이 길다. 얼마 전 발표된 자료에 따르면 한국 사람들은 일생 동안 34년을 인터넷을 하며 보낸다고 한다. 미국, 영국, 프랑스 같은 나라들보다 훨씬 길다. 웬만한 필요는 거의 다 온라인상에서 충족되는 셈이다. 도서도 온라인 구매가 점점 더 늘고 있다. 사정이 이러니 우리 같은 오프라인 책방, 동네 책방들은 책 판매만으론 임차료 내기도 어렵다.이래서였을까? 내가 책방을 하기로 마음먹고 가장 오래 성심을 다해 준비한 것이 우리 책방의 ‘존재의 이유’를 찾는 일이었다. 책을 읽지 않는 시대에 책방이 웬 말이냐고, 적은 나이가 아닌데 이제 실패하면 만회