# 터미널에서 실행할 것

## elasticsearch 설치
```bash
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.12.1-linux-x86_64.tar.gz
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.12.1-linux-x86_64.tar.gz.sha512
shasum -a 512 -c elasticsearch-7.12.1-linux-x86_64.tar.gz.sha512 
tar -xzf elasticsearch-7.12.1-linux-x86_64.tar.gz
```

## haystack 설치
`pip install farm-haystack`

## 한국어 형태소 분석기 설치
```bash
cd elasticsearch-7.12.1
bin/elasticsearch-plugin install analysis-nori
cd ..

chown -R daemon:daemon elasticsearch-7.12.1
```

In [None]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Adafactor
from datasets import load_metric, load_from_disk, load_dataset, Dataset, DatasetDict
import torch
import nltk
# Hub identifier used for the config, the tokenizer and the model architecture.
# NOTE(review): the checkpoint path below says "ke-t5-large" while this name is
# "ke-t5-large-newslike" -- confirm both share the same architecture and vocabulary.
model_name = 'KETI-AIR/ke-t5-large-newslike'
# SQuAD metric object; nothing else in this cell uses it.
metric = load_metric('squad')
config = AutoConfig.from_pretrained(
    model_name,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    config=config,
    cache_dir=None,
)

# Replace the pretrained weights with the fine-tuned checkpoint.
# map_location='cpu' keeps the checkpoint tensors on the CPU while they are
# copied into the (still CPU-resident) model, so a checkpoint saved from a GPU
# is not materialised on the GPU a second time before `model.to('cuda')`.
model.load_state_dict(
    torch.load(
        '/opt/ml/p3-mrc-dok/t5_train_pred/KETI-AIR/ke-t5-large/outputs/pytorch_model.bin',
        map_location='cpu',
    )
)
# Length/padding settings kept at module level; they are not read in this cell.
max_source_length = 1040
max_target_length = 40
padding = False

model = model.to('cuda')
# Inference only: make sure dropout and other train-time behaviour are disabled.
model.eval()

In [None]:
import string
import re
def normalize_answer(s):
    """Normalize an answer string so predictions can be compared consistently.

    The steps run in this order:

    1. Replace quote and bracket symbols (ASCII and CJK variants) with a space.
    2. Lowercase the text.
    3. Delete every remaining ASCII punctuation character.
    4. Collapse runs of whitespace into single spaces and trim both ends.

    Args:
        s: The raw answer text.

    Returns:
        The normalized text.
    """
    # Quotes and brackets become spaces (rather than being deleted) so the
    # words they separate do not get glued together.
    symbols_to_space = str.maketrans(dict.fromkeys("'\"《》<>〈〉()‘’", " "))
    # Whatever ASCII punctuation is left after that is removed outright.
    punctuation_to_none = str.maketrans("", "", string.punctuation)

    text = s.translate(symbols_to_space).lower().translate(punctuation_to_none)
    return " ".join(text.split())

In [None]:
# Load the test dataset from its on-disk directory; the inference loop later
# iterates over its 'validation' split.
test_dataset = load_from_disk('/opt/ml/input/data/data/test_dataset/')

In [None]:
import os
import json
from tqdm import tqdm
from subprocess import Popen, PIPE, STDOUT
import time

# Start the Elasticsearch server as a background child process.
# stdout and stderr are merged into one pipe.
# NOTE(review): nothing in this notebook reads es_server.stdout; if the server
# logs enough to fill the pipe buffer it could block -- consider redirecting to
# a log file or DEVNULL if the captured output is not needed.
es_server = Popen(
    ['/opt/ml/elasticsearch-7.12.1/bin/elasticsearch'],
    stdout=PIPE,
    stderr=STDOUT,
    # Switch the child to uid 1 (the `daemon` user) before exec; presumably
    # needed because Elasticsearch will not start as root -- confirm.
    preexec_fn=lambda: os.setuid(1),
)
# Fixed wait to give Elasticsearch time to start. time.sleep is used instead of
# the IPython-only `! sleep 30` so the code also runs as plain Python.
time.sleep(30)

# Index definition passed to the document store as its custom mapping.
# - "my_analyzer": custom analyzer built on the nori tokenizer, followed by
#   lowercase, shingle, nori_readingform and nori_number token filters.
# - "my_similarity": BM25 scoring.
# - The "title" and "text" fields are both analysed text fields using the above.
# NOTE(review): "decompound_mode" and "stopwords" are set directly on the
# analyzer entry rather than on a tokenizer/filter definition -- confirm that
# Elasticsearch actually applies them for a "custom" analyzer.
mapping = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "nori_tokenizer",
                    "decompound_mode": "mixed",
                    "stopwords": "_korean_",
                    "filter": [
                        "lowercase",
                        "my_shingle_f",
                        "nori_readingform",
                        "nori_number",
                    ],
                },
            },
            "filter": {
                "my_shingle_f": {"type": "shingle"},
            },
        },
        "similarity": {
            "my_similarity": {"type": "BM25"},
        },
    },
    "mappings": {
        "properties": {
            # Both fields share the same definition; each gets its own dict.
            field_name: {
                "type": "text",
                "analyzer": "my_analyzer",
                "similarity": "my_similarity",
            }
            for field_name in ("title", "text")
        },
    },
}

from haystack.document_store import ElasticsearchDocumentStore
from haystack.retriever import ElasticsearchRetriever
from haystack.pipeline import DocumentSearchPipeline

# Connect to the local Elasticsearch node and use the "document" index with the
# custom analyzer/similarity mapping defined above.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    custom_mapping=mapping,
    timeout=60,
)

# Read the wiki dump explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('/opt/ml/input/data/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)
# dict.fromkeys drops duplicate passages while keeping first-seen order.
contexts = list(dict.fromkeys(v['text'] for v in wiki.values()))

# One document per unique passage, with empty metadata.
dicts = [
    {
        'text': context,
        'meta': {}
    } for context in tqdm(contexts)
]
# NOTE(review): this writes every passage each time the cell runs; confirm
# whether re-running against an already populated index duplicates documents.
document_store.write_documents(dicts)

# Retriever over the document store, wrapped in a search-only pipeline.
retriever = ElasticsearchRetriever(document_store)
pipe = DocumentSearchPipeline(retriever)

In [None]:
import numpy as np
import math

# Number of passages requested from the retriever per question.
topk = 5
# Maps each question id to its final predicted answer string.
submit = {}

# Matches a literal backslash followed by one or more "n" characters, or a
# `날짜=<digits>-<digits>-<digits>` fragment.
_NOISE_RE = re.compile(r'\\n+|날짜=[\d]+-[\d]+-[\d]+')
# Matches a parenthesised run of characters in the CJK range 一-龥.
_CJK_PAREN_RE = re.compile(r'\([一-龥]+\)')


def _clean_text(text):
    """Return `text` with noise fragments replaced by spaces, stripped, and
    parenthesised CJK-ideograph runs removed."""
    text = _NOISE_RE.sub(' ', text).strip()
    return _CJK_PAREN_RE.sub('', text)


for t in test_dataset['validation']:
    b = pipe.run(t['question'], top_k_retriever=topk)
    prop = []        # combined generation + retrieval log-score per chunk
    pred_list = []   # normalized answer per chunk, aligned with `prop`
    docs = []        # decoded model input per chunk (kept for inspection)
    doc_score = []   # retriever score per chunk (kept for inspection)
    doc_prop = []    # retriever probability per chunk (kept for inspection)

    # The question does not depend on the retrieved passage, so clean it and
    # build the prompt prefix once per question instead of once per passage.
    question = _clean_text(t['question'])
    prompt = f"question : {question} context : "

    for context in b['documents']:
        p = _clean_text(context['text'])
        # Long passages are split into overlapping chunks; only the passage
        # (second segment) is truncated, never the question prefix.
        tok = tokenizer(
            prompt,
            p,
            truncation='only_second',
            max_length=1041,
            stride=300,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=False,
        )

        for input_ids in tok['input_ids']:
            # Drop the first token with id 1 from the chunk (presumably the EOS
            # the tokenizer inserts after the question segment -- confirm).
            # Raises ValueError if the chunk contains no such token.
            input_ids.pop(input_ids.index(1))

            outputs = model.generate(
                torch.tensor([input_ids]).to('cuda'),
                num_beams=5,
                num_return_sequences=5,
                return_dict_in_generate=True,
                output_scores=True,
            )
            # Only the first returned sequence and its score are used.
            pred = normalize_answer(
                tokenizer.decode(outputs.sequences[0], skip_special_tokens=True).strip()
            )
            pred_list.append(pred)
            # Rank candidates by generation score plus the log of the
            # retriever's probability for the source passage.
            prop.append(outputs.sequences_scores[0].item() + math.log(context['probability']))
            docs.append(tokenizer.decode(input_ids))
            doc_score.append(context['score'])
            doc_prop.append(context['probability'])

    # np.argmax raises on an empty sequence; if the retriever returned no
    # documents, record an empty answer so every question id gets an entry.
    submit[t['id']] = pred_list[np.argmax(prop)] if pred_list else ''

In [None]:
# Write the predictions as pretty-printed JSON. ensure_ascii=False emits
# non-ASCII characters verbatim, so the file encoding is pinned to UTF-8
# instead of relying on the platform default.
with open("pred_t5_inference.json", "w", encoding="utf-8") as writer:
    writer.write(json.dumps(submit, indent=4, ensure_ascii=False) + "\n")