In [2]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Adafactor
from datasets import load_metric, load_from_disk, load_dataset, Dataset, DatasetDict
import torch
import nltk
# Base checkpoint identifier: supplies the config, tokenizer and model
# architecture. The fine-tuned weights are loaded from a local file below.
model_name = 'KETI-AIR/ke-t5-large-newslike'
metric = load_metric('squad')
config = AutoConfig.from_pretrained(
    model_name,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    config=config,
    cache_dir=None,
)

# Deserialize the fine-tuned weights onto the CPU first. Without map_location,
# torch.load restores every tensor to the device it was saved from, which
# fails when that device is unavailable and otherwise places a second copy of
# the weights on the GPU before they are copied into the model.
model.load_state_dict(
    torch.load(
        '/opt/ml/code/models/train_dataset/pytorch_model.bin',
        map_location='cpu',
    )
)

model = model.to('cuda')

In [23]:
import os
from subprocess import Popen, PIPE, STDOUT
from haystack.document_store import ElasticsearchDocumentStore
from haystack.retriever import ElasticsearchRetriever
from haystack.pipeline import DocumentSearchPipeline
import json

import time
from subprocess import DEVNULL

# Launch Elasticsearch as a background child process.
# Its console output is discarded rather than sent to a PIPE: nothing in this
# notebook ever reads es_server.stdout, and an unread pipe blocks the child
# once the OS pipe buffer fills up.
es_server = Popen(
    ['/opt/ml/elasticsearch-7.12.1/bin/elasticsearch'],
    stdout=DEVNULL, stderr=STDOUT,
    # Switch the child to uid 1 before exec (the original comment calls this
    # "as daemon"; presumably because Elasticsearch will not run as root --
    # confirm).
    preexec_fn=lambda: os.setuid(1),
)
# Wait until ES has started. Plain Python instead of the IPython-only
# `! sleep 30`, so the cell is also valid outside a notebook.
time.sleep(30)

# Index definition handed to the document store as its custom mapping:
# one custom analyzer ("my_analyzer"), one BM25 similarity ("my_similarity"),
# both applied to the "title" and "text" fields.
mapping = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "nori_tokenizer",
                    # NOTE(review): "decompound_mode" and "stopwords" are set
                    # on the analyzer itself here; decompound_mode looks like
                    # a nori_tokenizer option, so these two keys may have no
                    # effect -- confirm against the Elasticsearch docs.
                    "decompound_mode": "mixed",
                    "stopwords": "_korean_",
                    "filter": [
                        "lowercase",
                        "my_shingle_f",
                        "nori_readingform",
                        "nori_number",
                    ],
                },
            },
            "filter": {
                "my_shingle_f": {"type": "shingle"},
            },
        },
        "similarity": {
            "my_similarity": {"type": "BM25"},
        },
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "my_analyzer",
                "similarity": "my_similarity",
            },
            "text": {
                "type": "text",
                "analyzer": "my_analyzer",
                "similarity": "my_similarity",
            },
        },
    },
}


# Connect to the local Elasticsearch instance; `mapping` is passed as the
# custom mapping for the "document" index.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    custom_mapping=mapping,
)

# Populate the index only when it is empty. Ask the store for a count instead
# of fetching every document into memory just to take len() of the result.
if document_store.get_document_count() == 0:
    # Explicit UTF-8 so reading the wiki dump does not depend on the locale.
    with open('/opt/ml/input/data/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
        wiki = json.load(f)
    # dict.fromkeys drops duplicate passages while keeping first-seen order.
    contexts = list(dict.fromkeys(v['text'] for v in wiki.values()))

    dicts = [{'text': context, 'meta': {}} for context in contexts]
    document_store.write_documents(dicts)

retriever = ElasticsearchRetriever(document_store)
pipe = DocumentSearchPipeline(retriever)

In [14]:
import string
import re
def normalize_answer(s):
    """Normalize an answer string.

    The steps are applied in this order:

    1. Quote and bracket symbols (' " 《 》 < > 〈 〉 ( ) ‘ ’) are replaced by
       a space, so the text on either side stays separated.
    2. The text is lower-cased.
    3. Every remaining character of ``string.punctuation`` is deleted.
    4. Runs of whitespace are collapsed to single spaces and the ends are
       trimmed.

    Args:
        s: The raw answer text.

    Returns:
        The normalized text.
    """
    # Symbols that act as word separators: mapped to a space, not deleted.
    # A translate table avoids the regex escapes ("\(" and "\)") that were
    # invalid escape sequences in the former non-raw pattern strings.
    separator_table = str.maketrans({ch: " " for ch in "'\"《》<>〈〉()‘’"})
    # Remaining ASCII punctuation is removed outright.
    punctuation_table = str.maketrans("", "", string.punctuation)

    text = s.translate(separator_table)
    text = text.lower()
    text = text.translate(punctuation_table)
    return " ".join(text.split())

In [7]:
# Load the dataset saved on disk and keep its 'validation' entry; the
# inference loop iterates over these examples.
test_dataset = load_from_disk('/opt/ml/input/data/data/test_dataset/')['validation']

In [22]:
from tqdm.auto import tqdm
import numpy as np
import json
import pickle

submit = {}         # example id -> best normalized prediction
submit_detail = {}  # example id -> per-window diagnostics
topk = 1            # number of documents requested from the retriever


def _clean_text(text):
    r"""Replace literal "\n" runs and "날짜=Y-M-D" tags with a space, trim the
    ends, then delete parenthesised runs of CJK ideographs."""
    text = re.sub(r'\\n+|날짜=[\d]+-[\d]+-[\d]+', ' ', text).strip()
    return re.sub(r'\([一-龥]+\)', '', text)


for t in tqdm(test_dataset):
    # The retriever is queried with the raw question; only the reader input
    # is cleaned.
    retrieved = pipe.run(t['question'], top_k_retriever=topk)
    # The cleaned question is identical for every retrieved document, so it
    # is built once per example rather than once per document.
    question = _clean_text(t['question'])

    pred_list = []
    prop = []
    docs = []
    doc_score = []
    doc_prop = []
    for context in retrieved['documents']:
        p = _clean_text(context['text'])

        # Only the passage (second segment) is truncated; overflow is
        # returned as additional windows that overlap by `stride` tokens.
        tok = tokenizer(
            f"question : {question} context : ",
            f"{p}",
            truncation='only_second',
            max_length=1041,
            stride=300,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=False,
        )

        for input_ids in tok['input_ids']:
            # Remove the first EOS token of the window (formerly the
            # hard-coded id 1), in place.
            input_ids.pop(input_ids.index(tokenizer.eos_token_id))

            outputs = model.generate(
                torch.tensor([input_ids], device=model.device),
                num_beams=5,
                num_return_sequences=5,
                return_dict_in_generate=True,
                output_scores=True,
            )
            # Only the first returned sequence and its score are used.
            pred = normalize_answer(
                tokenizer.decode(outputs.sequences[0], skip_special_tokens=True).strip()
            )
            pred_list.append(pred)
            # Store a plain Python float: keeping the CUDA tensor would pin
            # GPU memory for the whole run, is not reliably convertible by
            # np.argmax, and would make the pickled details require CUDA.
            prop.append(outputs.sequences_scores[0].item())
            docs.append(tokenizer.decode(input_ids))
            doc_score.append(context['score'])
            doc_prop.append(context['probability'])

    # np.argmax raises on an empty list, so an example with no retrieved
    # windows gets an empty answer instead of aborting the whole run.
    submit[t['id']] = pred_list[int(np.argmax(prop))] if pred_list else ''
    submit_detail[t['id']] = {'doc_score':doc_score, 'docs':docs, 'doc_prop':doc_prop, 'prop':prop, 'pred_list':pred_list}
    
# Destination for the submission file and the pickled per-example details.
output_path = '/opt/ml/code/outputs/test_dataset/'
pred_path = os.path.join(output_path, 'predictions.json')
pred_info_path = os.path.join(output_path, 'pred_info')

# exist_ok avoids the check-then-create race of isdir() followed by makedirs().
os.makedirs(output_path, exist_ok=True)

# ensure_ascii=False writes non-ASCII answers verbatim, so the file encoding
# is fixed to UTF-8 instead of being left to the locale default.
with open(pred_path, "w", encoding="utf-8") as writer:
    writer.write(json.dumps(submit, indent=4, ensure_ascii=False) + "\n")

with open(pred_info_path, "wb") as file:
    pickle.dump(submit_detail, file)