In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cd "/content/drive/My Drive/DSTC9-Track1/"

In [None]:
import json
import os
import logging
from collections import defaultdict
from itertools import chain

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
from tqdm import tqdm

logger = logging.getLogger(__name__)

SPECIAL_TOKENS = {
    "additional_special_tokens": ["<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"],
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>", "<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"]

task = "generation"
dataroot = '/content/drive/MyDrive/DSTC9-Track1/data_eval'

In [None]:
class DSTC9_DatasetWalker(object):
    def __init__(self, dataset, dataroot, labels=False, labels_file=None):
        path = os.path.join(os.path.abspath(dataroot))
            
        if dataset not in ['test']:
            raise ValueError('Wrong dataset name: %s' % (dataset))

        logs_file = os.path.join(path, dataset, 'logs.json')
        with open(logs_file, 'r') as f:
            self.logs = json.load(f)

        self.labels = None

        if labels is True:
            if labels_file is None:
                labels_file = os.path.join(path, dataset, 'labels.json')

            with open(labels_file, 'r') as f:
                self.labels = json.load(f)

    def __iter__(self):
        if self.labels is not None:
            for log, label in zip(self.logs, self.labels):
                yield(log, label)
        else:
            for log in self.logs:
                yield(log, None)

    def __len__(self, ):
        return len(self.logs)

In [None]:
class DSTC9_KnowledgeReader(object):
    def __init__(self, dataroot, knowledge_file):
        path = os.path.join(os.path.abspath(dataroot))

        with open(os.path.join(path, knowledge_file), 'r') as f:
            self.knowledge = json.load(f)

    def get_domain_list(self):
        return list(self.knowledge.keys())

    def get_entity_list(self, domain):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name")

        entity_ids = []
        for entity_id in self.knowledge[domain].keys():
            try:
                entity_id = int(entity_id)
                entity_ids.append(int(entity_id))
            except:
                pass

        result = []
        for entity_id in sorted(entity_ids):
            entity_name = self.knowledge[domain][str(entity_id)]['name']
            result.append({'id': entity_id, 'name': entity_name})

        return result

    def get_entity_name(self, domain, entity_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        result = self.knowledge[domain][str(entity_id)]['name'] or None

        return result


    def get_doc_list(self, domain=None, entity_id=None):
        if domain is None:
            domain_list = self.get_domain_list()
        else:
            if domain not in self.get_domain_list():
                raise ValueError("invalid domain name: %s" % domain)
            domain_list = [domain]

        result = []
        for domain in domain_list:
            if entity_id is None:
                for item_id, item_obj in self.knowledge[domain].items():
                    item_name = self.get_entity_name(domain, item_id)
                    
                    if item_id != '*':
                        item_id = int(item_id)

                    for doc_id, doc_obj in item_obj['docs'].items():
                        result.append({'domain': domain, 'entity_id': item_id, 'entity_name': item_name, 'doc_id': doc_id, 'doc': {'title': doc_obj['title'], 'body': doc_obj['body']}})
            else:
                if str(entity_id) not in self.knowledge[domain]:
                    raise ValueError("invalid entity id: %s" % str(entity_id))

                entity_name = self.get_entity_name(domain, entity_id)
                
                entity_obj = self.knowledge[domain][str(entity_id)]
                for doc_id, doc_obj in entity_obj['docs'].items():
                  
                    result.append({'domain': domain, 'entity_id': entity_id, 'entity_name': entity_name, 'doc_id': doc_id, 
                                   'doc': {'title': doc_obj['title'], 'body': doc_obj['body']}})
        return result

    def get_doc(self, domain, entity_id, doc_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        entity_name = self.get_entity_name(domain, entity_id)

        if str(doc_id) not in self.knowledge[domain][str(entity_id)]['docs']:
            raise ValueError("invalid doc id: %s" % str(doc_id))

        doc_obj = self.knowledge[domain][str(entity_id)]['docs'][str(doc_id)]
        result = {'domain': domain, 'entity_id': entity_id, 'entity_name': entity_name, 'doc_id': doc_id, 'doc': {'title': doc_obj['title'], 'body': doc_obj['body']}}

        return result

In [None]:
class DSTC9_BaseDataset(torch.utils.data.Dataset): 
  global dstc9
  df = pd.DataFrame({"history" : 'aaa', "knowledge": 'bbb', "response_text" : 'ccc'}, index = [0])
  dstc9 = pd.DataFrame(df, columns = ["history", "knowledge", "response_text"])

  def __init__(self, split_type, labels=True, labels_file = None) :
    
    self.dataroot = dataroot
    #self.tokenizer = tokenizer
    self.split_type = split_type

    #self.negative_sample_method = negative_sample_method
    #self.n_candidates = 2
    
    '''
    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
      self.SPECIAL_TOKENS["additional_special_tokens"]
    )
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    self.all_response_tokenized = []    
    '''
    self.dataset_walker = DSTC9_DatasetWalker(split_type, labels=labels, dataroot = self.dataroot, labels_file=labels_file)
    self.dialogs = self._prepare_conversations()
    #self.all_response_tokenized = list(map(eval, set(map(str, self.all_response_tokenized))))

    self.knowledge_reader = DSTC9_KnowledgeReader(self.dataroot, "knowledge.json")
    self.knowledge, self.snippets = self._prepare_knowledge() #knowledge_key baseline_dataset.py에서 추가
    #self.knowledge = self._prepare_knowledge() #knowledge_key baseline_dataset.py에서 추가

    self._create_examples()
    self.dstc9 = dstc9

  def _prepare_conversations(self):
    tokenized_dialogs = []
    for i, (log, label) in enumerate(tqdm(self.dataset_walker)) :
      dialog = {}
      dialog["id"] = i
      dialog["log"] = log
      '''
      if label is not None :
        if "response" in label:
          label["response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
              self.tokenizer.tokenize(label["response"])
          )
          self.all_response_tokenized.append(label["response_tokenized"])
      '''
      dialog["label"] = label
      tokenized_dialogs.append(dialog)

    return tokenized_dialogs

  def _knowledge_to_string(self, doc, name = ""):
    return doc["body"]

  def _prepare_knowledge(self): 
    knowledge = self.knowledge_reader.knowledge
    self.knowledge_docs = self.knowledge_reader.get_doc_list()
    
    #tokenized_snippets = dict()
    snippets = dict()
    for snippet in self.knowledge_docs :
      if snippet["domain"] in ["hotel", "restaurant"]:
        key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "", snippet["doc_id"])
        knowledge = self._knowledge_to_string(snippet["doc"], name = snippet["entity_name"] or "")
        
        snippets[key] = knowledge
    print(snippets)
      
      ##토큰화하는 과정!
      #tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(knowledge))
      #tokenized_snippets[key] = tokenized_knowledge[:1024]

    return knowledge, snippets #, tokenized_snippets ## 해결!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1

  def _create_examples(self):
    self.examples = []
    idx = 0
    for dialog in tqdm(self.dialogs):
      dialog_id = dialog["id"]
      label = dialog["label"]
      dialog = dialog["log"]
      if label is None:
        label = {"target": False}

      target = label["target"]

      if not target and task != "detection":
        continue
            

      if target:
        if "knowledge" not in label:
          label["knowledge"] = [self.knowledge_docs[0]]

        knowledge = label["knowledge"][0]

        if knowledge["domain"] in ["hotel", "restaurant"]:
          knowledge_key = "{}__{}__{}".format(knowledge["domain"], knowledge["entity_id"], knowledge["doc_id"])

          history = [turn["text"] for turn in dialog]
          gt_resp = label.get("response", "")

          ''' 
          # find snippets with same entity as candidates
          prefix = "{}__{}".format(knowledge["domain"], knowledge["entity_id"])
          knowledge_candidates = [
            cand
            for cand in self.snippets.keys() 
            #if "__".join(cand.split("__")[:-1]) == prefix
            if cand.startswith(prefix)
          ]
              
        if self.split_type == "train" and self.negative_sample_method == "oracle":
          # if there's not enough candidates during training, we just skip this example
          if len(knowledge_candidates) < self.n_candidates:
            continue
        '''
        used_knowledge = self.snippets[knowledge_key]
        #used_knowledge = used_knowledge[:-1]
      
      else:
        knowledge_candidates = None
        used_knowledge = []

      self.examples.append({
        "history": history,
        "knowledge": used_knowledge,
        #"candidates": knowledge_candidates,
        #"response": tokenized_gt_resp,
        "response_text": gt_resp,
        #"label": label,
        #"knowledge_seeking": target,
        #"dialog_id": dialog_id
      })
      
      dstc9.loc[idx] = [history, used_knowledge, gt_resp]
      idx += 1

    #dstc9 = dstc9.append(self.examples, ignore_index = True)
    #print(dstc9)
    #return self.examples
    

  def build_input_from_segments(self, knowledge, history, response, with_eos=True):
    """ Build a sequence of input from 3 segments: knowledge, history and last reply """
    instance = {}

    sequence = [[self.bos] + knowledge] + history + [response + ([self.eos] if with_eos else [])]
    print(sequence)
    sequence_with_speaker = [
      [self.speaker1 if (len(sequence) - i) % 2 == 0 else self.speaker2] + s
      for i, s in enumerate(sequence[1:])
    ]
    sequence = [sequence[0]] + sequence_with_speaker
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [self.speaker2 if i % 2 else self.speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]

    return instance, sequence

  def __getitem__(self, index):
    raise NotImplementedError
    
  def __len__(self):
    return len(self.examples)

__________________________________________________________

In [None]:
class DSTC9_ResponseGenerationDataset(DSTC9_BaseDataset):
    def __init__(self, split_type, labels=True, labels_file=None):
        super(DSTC9_ResponseGenerationDataset, self).__init__(split_type, labels, labels_file)

    def __getitem__(self, index):
        example = self.examples[index]
        instance, _ = self.build_input_from_segments(
            example["knowledge"],
            example["history"],
            example["response_text"]
        )
        print(instance)
        return instance

class DSTC9_ResponseGenerationEvalDataset(DSTC9_BaseDataset):
    def __init__(self, split_type, labels=True, labels_file=None):
        super(DSTC9_ResponseGenerationEvalDataset, self).__init__(split_type, labels, labels_file)

    def __getitem__(self, index):
        example = self.examples[index]
        return example

    def collate_fn(self, batch):
        return batch

In [None]:
dstc9_dataset = DSTC9_ResponseGenerationDataset(split_type = 'test')

In [None]:
print(dstc9.head())

                                             history  \
0  [Yes, I'm going to be in Chinatown, San Franci...   
1  [Yes, I'm going to be in Chinatown, San Franci...   
2  [Yes, I'm going to be in Chinatown, San Franci...   
3  [hi hey i'm planning a trip to san francisco a...   
4  [I would like a train to Cambridge that leaves...   

                                           knowledge  \
0  Orchard Garden Hotel doesn't offer any breakfast.   
1  Orchard Garden Hotel doesn't offer any breakfast.   
2  Orchard Garden Hotel doesn't offer any breakfast.   
3   The Grant Hotel check-in time starts at 3:00 PM.   
4          Yes, Fitzbillies has gluten free options.   

                                       response_text  
0  Unfortunately, this hotel doesn't offer any br...  
1  Unfortunately, this hotel doesn't offer any br...  
2  Unfortunately, this hotel doesn't offer any br...  
3  that's a great question looking into that i se...  
4  Yes, Fitzbillies does offer gluten-free option..

In [None]:
idx = 0
for i in dstc9['history']:
  dstc9['history'].loc[idx] = ' '.join(s for s in i)
  idx += 1

print(dstc9.head())

                                             history  \
0  Yes, I'm going to be in Chinatown, San Francis...   
1  Yes, I'm going to be in Chinatown, San Francis...   
2  Yes, I'm going to be in Chinatown, San Francis...   
3  hi hey i'm planning a trip to san francisco an...   
4  I would like a train to Cambridge that leaves ...   

                                           knowledge  \
0  Orchard Garden Hotel doesn't offer any breakfast.   
1  Orchard Garden Hotel doesn't offer any breakfast.   
2  Orchard Garden Hotel doesn't offer any breakfast.   
3   The Grant Hotel check-in time starts at 3:00 PM.   
4          Yes, Fitzbillies has gluten free options.   

                                       response_text  
0  Unfortunately, this hotel doesn't offer any br...  
1  Unfortunately, this hotel doesn't offer any br...  
2  Unfortunately, this hotel doesn't offer any br...  
3  that's a great question looking into that i se...  
4  Yes, Fitzbillies does offer gluten-free option..

In [None]:
print(len(dstc9))

1981


In [None]:
dstc9.drop_duplicates(inplace = True)
print(len(dstc9))
print(dstc9.head())

1185
                                             history  \
0  Yes, I'm going to be in Chinatown, San Francis...   
3  hi hey i'm planning a trip to san francisco an...   
4  I would like a train to Cambridge that leaves ...   
6  Can you tell me about a nice guest house in yo...   
7  Hi. I'm trying to find a cheap place to eat th...   

                                           knowledge  \
0  Orchard Garden Hotel doesn't offer any breakfast.   
3   The Grant Hotel check-in time starts at 3:00 PM.   
4          Yes, Fitzbillies has gluten free options.   
6    Parking is available off-road at Archway House.   
7                           Yes, reservations taken.   

                                       response_text  
0  Unfortunately, this hotel doesn't offer any br...  
3  that's a great question looking into that i se...  
4  Yes, Fitzbillies does offer gluten-free option...  
6  You can park off-road at this hotel. Do you ne...  
7  Yes, they do take reservations. Would you l

In [None]:
dstc9.reset_index(inplace = True)
print(dstc9.head())
print(len(dstc9))

   index                                            history  \
0      0  Yes, I'm going to be in Chinatown, San Francis...   
1      3  hi hey i'm planning a trip to san francisco an...   
2      4  I would like a train to Cambridge that leaves ...   
3      6  Can you tell me about a nice guest house in yo...   
4      7  Hi. I'm trying to find a cheap place to eat th...   

                                           knowledge  \
0  Orchard Garden Hotel doesn't offer any breakfast.   
1   The Grant Hotel check-in time starts at 3:00 PM.   
2          Yes, Fitzbillies has gluten free options.   
3    Parking is available off-road at Archway House.   
4                           Yes, reservations taken.   

                                       response_text  
0  Unfortunately, this hotel doesn't offer any br...  
1  that's a great question looking into that i se...  
2  Yes, Fitzbillies does offer gluten-free option...  
3  You can park off-road at this hotel. Do you ne...  
4  Yes, t

# Back-translation

In [None]:
# 이 부분은 처음 한번만 실행하면 됌.
# 코드 수정 - "The reason is that the last Ubuntu update update supports chromium driver just via snap."
# 최근 우분투 업데이트에서 크롬 드라이버 설치를 snap을 이용해서만 하도록 바뀜
# 고로 snap 없이 설치하는 아래 우회 코드로 변경
# 출처 : https://colab.research.google.com/drive/1cbEvuZOhkouYLda3RqiwtbM-o9hxGLyC
# 출처2 : https://stackoverflow.com/questions/75155063/selenium-use-chrome-on-colab-got-unexpectedly-exited

%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
apt-get install chromium chromium-driver

# Install selenium
pip install selenium

In [None]:
import sys
import numpy as np
import pandas as pd
import random
from tqdm.notebook import tqdm
import os
import re
import time
from collections import Counter

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def chrome_setting():
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')        # Head-less 설정
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome('chromedriver', options=options)
  return driver

In [None]:
driver = chrome_setting()

# ENG >>> KOR

In [None]:
def trans_to_ko(transed_list, transed_lang,start_index,final_index): 
  
  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))
  print(target_present)
  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(1.5)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'eng_to_kor_train_knowledge_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=en&tk='+transed_lang+'&st='+transed_list[i])
      time.sleep(2)
      element=WebDriverWait(driver, 5).until(target_present)
      time.sleep(0.2)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 5).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
data_path = '/content/drive/MyDrive/dstc11-track5/trans/'

In [None]:
trans_list = []
trans_to_ko(dstc9['knowledge'], 'ko', 0, len(dstc9))
np.save(data_path + 'eng_to_kor_train_knowledge.npy', trans_list)

In [None]:
driver = chrome_setting()

In [None]:
def trans_to_ko(transed_list, transed_lang,start_index,final_index): 
  
  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))
  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(1.5)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'eng_to_kor_train_response_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=en&tk='+transed_lang+'&st='+transed_list[i])
      time.sleep(2)
      element=WebDriverWait(driver, 5).until(target_present)
      time.sleep(0.2)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 5).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
trans_list = []
trans_to_ko(dstc9['response_text'], 'ko', 0, len(dstc9))
np.save(data_path + 'eng_to_kor_train_response.npy', trans_list)

In [None]:
driver = chrome_setting()

In [None]:
def trans_to_ko(transed_list, transed_lang,start_index,final_index): 
  
  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))
  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(1.5)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'eng_to_kor_train_history_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=en&tk='+transed_lang+'&st='+transed_list[i])
      time.sleep(2)
      element=WebDriverWait(driver, 5).until(target_present)
      print('here')
      time.sleep(0.2)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 5).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
trans_list = []
trans_to_ko(dstc9['history'], 'ko', 0, len(dstc9))
np.save(data_path + 'eng_to_kor_train_history.npy', trans_list)

In [None]:
data_path = "/content/drive/My Drive/dstc11-track5/trans/"

In [None]:
kor_data_knowledge = np.load(data_path + 'eng_to_kor_train_knowledge.npy')
kor_data_response = np.load(data_path + 'eng_to_kor_train_response.npy')
kor_data_history = np.load(data_path + 'eng_to_kor_train_history.npy')

kor_knowledge =[*kor_data_knowledge]
kor_response = [*kor_data_response]
kor_history = [*kor_data_history]

print(kor_history)
print(kor_response)
print(kor_knowledge)

#kor_data = [*kor_data_knowledge, *kor_data_response, *kor_data_history]
#kor_data = pd.DataFrame(kor_data, columns = ['kor_title'])

['네, 저는 샌프란시스코 차이나타운에 있을 예정이고 숙소와 지역 명소를 찾고 있습니다. 차이나타운에는 많은 숙박 옵션이 있습니다. 당신이 생각하고 있는 가격대를 알려주시겠습니까? 적당한 가격대에 별이 3개인 것으로 찾아주실 수 있나요? 네, 오차드 가든 호텔과 SW 호텔 모두 고객님의 조건에 맞습니다. 어떤 걸로 하시겠어요? 오차드 가든 호텔이 좋을 것 같습니다. 그곳에서 이용할 수 있는 아침 식사 옵션은 무엇이 있습니까?', '', '', '안녕하세요, 저는 샌프란시스코 여행을 계획하고 있는데 당신이 저를 도와주실 수 있는지 궁금합니다. 음, 첫 번째 요청은 내부 리치몬드에서 음, 식사를 할 수 있는 곳입니다. 물론 내부 리치몬드에서 꽤 많은 장소를 찾고 있지만, 저는 그것들이 신데렐라 베이커리와 카페라고 불리는 훌륭한 장소를 알고 있습니다 적당한 가격대로 좋습니다. 제가 확인해 보겠습니다. 행복한 시간이 있나요. 그들은 행복한 시간을 가지고 있지 않습니다. 그래서 저는 언제든지 레스토랑의 주소가 무엇인지 갈 수 있을 것입니다. 네, 그들은 완벽한 436 발보아 거리에 위치해 있고 음, 제가 간 후에 그곳에 박물관이나 갈 곳이 있을 가능성이 있습니다. 저는 확실히 어두워진 후에 탐험대를 추천합니다 재미있을 것 같네요. 어, 그들이 15번 부두에 위치해 있는 그곳의 주소는 무엇입니까? 그리고 그들을 방문하기 위해 예약이 필요한가요? 우리가 예약을 해야만 하는 것은 제가 그것이 어떻게 생겼는지 확실히 확인할 수 있습니다. 그들은 약속에 의해서만 있는 것이 아니기 때문에 당신은 방문하는 것이 좋습니다. 좋아요. 그리고 그 후에 우리는 피곤할 것이고 나는 잠을 잘 곳이 필요할 것입니다. 그래서 적당한 가격대의 유니온 스퀘어에 머물 곳이 있을까요? 아마도 2성급 호텔이 있을 것입니다. 제가 찾고 있는 것은 그러한 사양을 가진 5개의 다른 장소입니다 유니온 스퀘어에 있는 그랜트 호텔을 확인해보시길 추천합니다. 그리고 그랜트 호텔은 음 음, 제가 갈 박물관과 꽤 가까운

# Retry untranslated sentences

In [None]:
print('knowledge :', kor_knowledge.count(''))
print('history :', kor_history.count(''))
print('response :', kor_response.count(''))

knowledge : 458
history : 469
response : 458


In [None]:
!pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator
translator = Translator()

for i, k in enumerate(kor_knowledge) :
  if k == '':
    kor_knowledge[i] = translator.translate(dstc9['knowledge'][i], src = "en", dest = "ko").text
print('knowledge :', kor_knowledge.count(''))

for i, k in enumerate(kor_history) :
  if k == '':
    kor_history[i] = translator.translate(dstc9['history'][i], src = "en", dest = "ko").text
print('history :', kor_history.count(''))

for i, k in enumerate(kor_response) :
  if k == '':
    kor_response[i] = translator.translate(dstc9['response_text'][i], src = "en", dest = "ko").text

print('response :', kor_response.count(''))

knowledge : 0
history : 0
response : 0


In [None]:
print(kor_history)
print(kor_response)
print(kor_knowledge)

['네, 저는 샌프란시스코 차이나타운에 있을 예정이고 숙소와 지역 명소를 찾고 있습니다. 차이나타운에는 많은 숙박 옵션이 있습니다. 당신이 생각하고 있는 가격대를 알려주시겠습니까? 적당한 가격대에 별이 3개인 것으로 찾아주실 수 있나요? 네, 오차드 가든 호텔과 SW 호텔 모두 고객님의 조건에 맞습니다. 어떤 걸로 하시겠어요? 오차드 가든 호텔이 좋을 것 같습니다. 그곳에서 이용할 수 있는 아침 식사 옵션은 무엇이 있습니까?', '안녕하세요, 저는 샌프란시스코로의 여행을 계획하고 있습니다. 당신이 나를 도울 수 있는지 궁금합니다Inner Richmond의 위치이지만 Cinderella Bakery와 Cafe라고 불리는 훌륭한 장소를 알고 있습니다.그들은 행복한 시간을 가지고 있지 않기 때문에 나는 언제라도 갈 수 있어야합니다. 식당의 주소는 무엇입니까?박물관이나 그곳을 돌아 다닐 장소는 확실히 어두워 진 후 탐험가를 정말로 추천합니다.우리는 예약을해야합니까?그들은 약속만으로도 당신이 방문하는 것이 좋지 않습니다.어쩌면 호텔은 아마도 2 개의 별을 가진 호텔의 범위 제가 찾은 것이 5 개의 다른 위치라고 확신합니다.어두운 예, 완벽한 uhhh 그 호텔의 우편 번호와 주소는 문제 없음 문제 없음 우편 번호는 9 개의 4 개의 Zero 8이고 주소는 7 개의 5 개의 Three Bush Street Perfect이며 그 시간은 무엇입니까?', '17:00 이후에 떠나는 케임브리지로가는 기차를 원합니다.요구 사항에 맞는 열차가 많이 있습니다. 어디에서 떠나고 언제 떠나고 싶습니까?월요일에 스탠스 테드 공항에서 떠나고 싶습니다.자, 이러한 요구 사항에 맞는 7 개의 열차가 있습니다. 몇시에 도착하고 싶은 시간을 말씀해 주시겠습니까?나는 우리가 언제든지 거기에 갈 수 있다고 생각하지 않습니다.TR4096은 17:24에 떠나 17:52에 도착합니다. 제가 좌석을 예약 하시겠습니까?향후 참조에 대한 서면 확인을받을 수 있습니까?예약이 완료되면 이메일 확인이 표시됩니다.내가 

In [None]:
cd /content/drive/My Drive/dstc11-track5/trans/

/content/drive/My Drive/dstc11-track5/trans


In [None]:
import csv

with open('eng_to_kor_knowledge.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(kor_knowledge)

with open('eng_to_kor_history.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(kor_history)

with open('eng_to_kor_response.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(kor_response)

# KO >> ENG

In [None]:
def kor_to_trans(text_data, trans_lang,start_index,final_index):

  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))

  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(2)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'kor_to_eng_train_response_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=ko&tk='+trans_lang+'&st='+text_data[i])
      time.sleep(1.5)
      element=WebDriverWait(driver, 10).until(target_present)
      time.sleep(0.1)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 10).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
driver = chrome_setting()

In [None]:
trans_list = []
print(len(kor_response))
kor_to_trans(kor_response, 'en', 0, len(kor_response))
np.save(data_path + 'kor_to_eng_train_response.npy', trans_list)

1185


  0%|          | 0/1185 [00:00<?, ?it/s]

99th :  Yes, I accept AMEX and all major credit cards. Would you like me to make a reservation?
198th :  Boudin Bakery
297th :  They don't have high chairs available. Is that okay with you?
396th :  They accept credit cards. Would you like to make a reservation?
495th :  Lucky Star currently does not have a gluten-free option.Do you have any other questions?
error
594th :  Sorry, Presidio Parkway does not offer concierge services. Is there any other question I can help you with?
693th :  The Dojo Noodle Bar provides parking at the location.Is there anything else I can help you with today?
792th :  There is an average noise level for Champa.Would you like to make a reservation?
891th :  Yes, Orchard Garden Hotel provides room service to guests. Would you like to reserve a room now?
990th :  Yes, there are high chairs in La Mediterani. Do you have any other questions?
1089th :  Sorry, TV is not available in Ebisu.Can I help you with something else?


In [None]:
def kor_to_trans(text_data, trans_lang,start_index,final_index):

  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))

  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(2)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'kor_to_eng_train_knowledge_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=ko&tk='+trans_lang+'&st='+text_data[i])
      time.sleep(1.5)
      element=WebDriverWait(driver, 10).until(target_present)
      time.sleep(0.1)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 10).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
driver = chrome_setting()

In [None]:
trans_list = []
print(len(kor_knowledge))
kor_to_trans(kor_knowledge, 'en', 0, len(kor_knowledge))
np.save(data_path + 'kor_to_eng_train_knowledge.npy', trans_list)

1185


  0%|          | 0/1185 [00:00<?, ?it/s]

99th :  Coto accepts all major credit cards.
198th :  Boudin Bakery
297th :  Sugar Hill Kitchen does not have high chairs for infants.
396th :  Credit cards, Google Pay, and Apple Pay are available.
495th :  No, there is no gluten-free option.
error
594th :  No, the Presidio Parkway Inn does not provide concierge services.
693th :  Parking is available at the Dojo Noodle Bar.
792th :  The noise level of the Champa Garden is average.
891th :  Room service is available at the Orchard Garden Hotel.
990th :  Yes, La Mediterani provides high chairs for babies.
1089th :  Ebisu doesn't have a TV.


In [None]:
def kor_to_trans(text_data, trans_lang,start_index,final_index):

  target_present = EC.presence_of_element_located((By.XPATH, '//*[@id="txtTarget"]'))

  for i in tqdm(range(start_index,final_index)): 
    
    if (i!=0)&(i%99==0):
      time.sleep(2)
      print('{}th : '.format(i), backtrans)
      np.save(data_path+'kor_to_eng_train_history_{}_{}.npy'.format(start_index,final_index),trans_list)
    
    try:
      driver.get('https://papago.naver.com/?sk=ko&tk='+trans_lang+'&st='+text_data[i])
      time.sleep(1.5)
      element=WebDriverWait(driver, 10).until(target_present)
      time.sleep(0.1)
      backtrans = element.text 

      if (backtrans=='')|(backtrans==' '):
        element=WebDriverWait(driver, 10).until(target_present)
        backtrans = element.text 
        trans_list.append(backtrans)
      else:
        trans_list.append(backtrans)
    
    except:
      print('error')
      trans_list.append('')

In [None]:
driver = chrome_setting()

In [None]:
trans_list = []
print(len(kor_history))
kor_to_trans(kor_history, 'en', 0, len(kor_history))
np.save(data_path + 'kor_to_eng_train_history.npy', trans_list)

1185


  0%|          | 0/1185 [00:00<?, ?it/s]

error
99th :  Hello, I need a place to eat in the middle of the middle price range. What can you suggest? We have 21 foods, including serving British food and many other kinds. Could you narrow it down a little? Sure, can you find a British restaurant? Koto is very good. Can I reserve a seat for you? How can the restaurant help me and my guide dog? Currently, Coto does not have access to a wheelchair. Would that be okay? What kind of parking lot do they offer? The Gonville Hotel offers free parking to guests dining at the Coto location. Does it work for you? Do they get AMEX for payment?
198th :  Yes, I need help finding affordable restaurants at Fisherman's Wharf, which serves izakaya food.Sorry, Fisherman's Wharf doesn't have a restaurant serving izakaya food.Can you find anything else for you?How about a place that serves sandwiches instead?Boudin Bakery
error
297th :  Hi, can you help me find a place to eat in Marina? Marina has many restaurant options to choose from. What kind of 

In [None]:
eng_data_knowledge = np.load(data_path + 'kor_to_eng_train_knowledge.npy')
eng_data_response = np.load(data_path + 'kor_to_eng_train_response.npy')
eng_data_history = np.load(data_path + 'kor_to_eng_train_history.npy')

eng_knowledge =[*eng_data_knowledge]
eng_response = [*eng_data_response]
eng_history = [*eng_data_history]

print(eng_history)
print(eng_response)
print(eng_knowledge)

["Yes, I'm going to be in San Francisco Chinatown and I'm looking for accommodation and local attractions. There are many accommodation options in Chinatown. Could you tell me the price range you have in mind? Can you find a 3 star one at a reasonable price? Yes, both Orchard Garden Hotel and SW Hotel meet your requirements. What kind would you like, sir? Orchard Garden Hotel would be good. What are the breakfast options available there?", "Hello, I'm planning a trip to San Francisco. I wonder if you can help meIt's the location of Inner Richmond, but I know a great place called Cinderlla Bakery and Cafe.I should be able to go anytime because they don't have a happy time. What is the address of the restaurant?Museums and places to go around are definitely recommended for explorers after dark.Do we have to make a reservation?They don't want you to visit just by appointment.Maybe the hotel is a range of hotels with 2 stars I'm sure what I found is 5 different locations.Dark example, perf

In [81]:
import csv
with open(data_path + 'eng_to_kor_knowledge.csv', 'r', newline = '') as f :
  reader = csv.reader(f)
  for row in reader :
    kor_knowledge = row
print(kor_knowledge)

with open(data_path + 'eng_to_kor_history.csv', 'r', newline = '') as f :
  reader = csv.reader(f)
  for row in reader :
    kor_history = row
print(kor_history)

with open(data_path + 'eng_to_kor_response.csv', 'r', newline = '') as f :
  reader = csv.reader(f)
  for row in reader :
    kor_response = row
print(kor_response)

['오차드 가든 호텔은 조식을 제공하지 않습니다.', '그랜트 호텔 체크인 시간은 오후 3시에 시작됩니다.', '그렇습니다. 피츠빌리는 글루텐 프리 옵션이 있습니다.', '그랜트 호텔 체크인 시간은 오후 3시부터입니다.', '네, 핏츠빌리에는 글루텐 프리 옵션이 있습니다.', '아침 식사의 경우 손님은 일품 카르트 또는 대륙 옵션 중에서 선택할 수 있습니다.', 'Archway House에서는 오프로드 주차를 이용할 수 있습니다.', '네, 예약이 완료되었습니다.', 'El Shaddai는 매일 하우스 키핑 서비스를 제공하지 않습니다.', '조식은 아라카르트 또는 콘티넨탈 옵션 중에서 선택하실 수 있습니다.', '아침 식사는 7:30~10:30에 제공되는 가벼운 대륙간 식사입니다', '앤드류스 호텔에서는 흡연이 허용되지 않습니다.', 'El Shaddai는 매일 하우스키핑 서비스를 제공하지 않습니다.', 'City Stop Restaurant은 라이브 음악을 제공하지 않습니다.', 'Layne Hotel에서는 여분의 침대를 이용할 수 없습니다.', '예, 하이 의자를 사용할 수 있습니다.', '제공되는 자전거 주차', 'Tortellino에는 라이브 음악이 없습니다', '예, 세인트 레지스 샌프란시스코에는 리프트가 있습니다.', '인도의 집은 휠체어로 갈 수 있습니다.', 'Pho 2000은 채식주의자에게 우호적인 옵션이 없습니다.', '아니요, 야외 좌석이 없습니다.', 'Alexander Bed and Breakfast에는 현장에서 스파가 없습니다.', '네, 가능해요. 주차가 가능합니다.', 'Carolina Bed and Breakfast에서는 독일어, 이탈리아어, 스페인어, 영어를 사용할 수 있습니다.', '시티 스톱 레스토랑에서는 라이브 음악을 제공하지 않습니다.', '레이네 호텔에서는 엑스트라 베드를 이용할 수 없습니다.', 'Kirkwood House에는 현장 피트니스 센터가 없습니다.', '네, 높은 의자를 이용할 수 있습니다.', '자전거 주

In [82]:
print('knowledge :', kor_knowledge.count(''))
print('history :', kor_history.count(''))
print('response :', kor_response.count(''))

knowledge : 0
history : 0
response : 0


## Retry Untranslated Sentences

In [83]:
print('knowledge :', eng_knowledge.count(''))
print('history :', eng_history.count(''))
print('response :', eng_response.count(''))

knowledge : 3
history : 32
response : 2


In [84]:
from googletrans import Translator
translator = Translator()

for i, k in enumerate(eng_knowledge) :
  if k == '':
    #print(kor_knowledge[i], eng_knowledge[i])
    eng_knowledge[i] = translator.translate(kor_knowledge[i], src = "ko", dest = "en").text

print('knowledge :', eng_knowledge.count(''))

for i, k in enumerate(eng_history) :
  if k == '':
    #print(kor_history[i], eng_history[i])
    eng_history[i] = translator.translate(kor_history[i], src = "ko", dest = "en").text
print('history :', eng_history.count(''))

for i, k in enumerate(eng_response) :
  if k == '':
    #print(kor_response[i], eng_response[i])
    eng_response[i] = translator.translate(kor_response[i], src = "ko", dest = "en").text
print('response :', eng_response.count(''))

knowledge : 0
history : 0
response : 0


In [85]:
with open(data_path + 'knowledge.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(eng_knowledge)

with open(data_path + 'history.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(eng_history)

with open(data_path + 'response.csv', 'w', newline = '') as f :
  writer = csv.writer(f)
  writer.writerow(eng_response)