<a href="https://colab.research.google.com/github/jinsusong/study-NLP-BERT/blob/main/QA_CoQA_Chatbot_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# QA_CoQA Chatbot model

# 데이터 로드

In [None]:
# Install the Kaggle CLI and open Colab's file-upload dialog
# (presumably for kaggle.json, which the next cell copies into ~/.kaggle).
# NOTE(review): no cell visible in this notebook calls the Kaggle API — the
# dataset is read from a URL — so confirm this setup is still needed.
!pip install kaggle
from google.colab import files
files.upload()


In [None]:
# Place the uploaded kaggle.json in ~/.kaggle and restrict it to
# owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

!pip install transformers

In [None]:
# Download the CoQA v1.0 training JSON and preview the first rows.
COQA_TRAIN_URL = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
coqa = pd.read_json(COQA_TRAIN_URL)
coqa.head()

In [None]:
# Number of rows in the loaded CoQA frame.
len(coqa)

In [None]:
# Column labels of the loaded frame.
coqa.columns

In [None]:
# Drop the constant "version" column. drop(..., errors="ignore") keeps this cell
# idempotent: `del coqa["version"]` raises KeyError when the cell is run twice.
coqa = coqa.drop(columns=["version"], errors="ignore")

# Inspect the first record of the "data" column.
coqa['data'][0]

In [None]:
# Keys available in the first record of the "data" column.
coqa['data'][0].keys()

In [None]:
# Column names for the flattened (story, question, answer) table built in the
# preprocessing cell.
cols = ["text","question","answer"]

In [None]:
# Echo the column list to confirm its contents.
cols

# 데이터 전처리

In [None]:
# Flatten every conversation into one row per turn: the story text is repeated
# next to each question and the answer found at the same position.
comp_list = [
    [entry["story"], turn["input_text"], entry["answers"][pos]["input_text"]]
    for entry in coqa["data"]
    for pos, turn in enumerate(entry["questions"])
]

new_df = pd.DataFrame(comp_list, columns=cols)


In [None]:
# Persist the flattened table to CSV without writing the index column.
new_df.to_csv("CoQA_data.csv", index=False)

In [None]:
# Reload the flattened table from CoQA_data.csv and preview the first rows.
# NOTE(review): read_csv's default NA parsing may turn literal strings such as
# "NA" or "null" into NaN — confirm that is acceptable for questions/answers.
data = pd.read_csv("CoQA_data.csv")
data.head()

In [None]:
# Each row of `data` is one question/answer turn, so this is the turn count.
print("Number of question and answers: ", len(data))

# 모델링 

In [None]:
from transformers import TFBertForQuestionAnswering
from transformers import BertTokenizer

In [None]:
# Load the tokenizer and the TF question-answering model from the same
# pretrained checkpoint so their vocabularies match.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = TFBertForQuestionAnswering.from_pretrained(MODEL_NAME)

In [None]:
# Draw one random row index and pull that row's question and story text.
random_num = np.random.randint(len(data))

sampled_row = data.loc[random_num]
question, text = sampled_row["question"], sampled_row["text"]

In [None]:
# Show the sampled context and question; sep='\n' reproduces the line break
# that separate print() calls would insert between the pieces.
print('Context: \n', text, '\nQuestion: \n', question, sep='\n')


In [None]:
# Encode the (question, context) pair as TensorFlow tensors.
# Only the context (second sequence) is truncated, so a long story cannot push
# the pair past BERT's 512-token input limit and crash the forward pass.
input_ids = tokenizer(
    question,
    text,
    return_tensors="tf",
    truncation="only_second",
    max_length=512,
)

# Names of the tensors produced by the tokenizer.
input_ids.keys()

In [None]:
# Display the full tokenizer output.
input_ids

In [None]:
# Token ids of the first sequence in the encoded batch.
input_ids.input_ids[0]

In [None]:
# Length of the encoded question + context sequence.
print("The input has a total of {} tokens.".format(len(input_ids.input_ids[0])))

In [None]:
# Map the ids back to their token strings and list each token next to its id.
tokens = tokenizer.convert_ids_to_tokens(input_ids.input_ids[0])

# The loop variable is `token_id`, not `id`: at notebook top level a loop
# variable stays in the global namespace and would shadow the builtin id().
for token, token_id in zip(tokens, input_ids.input_ids[0]):
    # int() turns the scalar tensor into a plain int for the ',' format spec.
    print('{:8}{:8,}'.format(token, int(token_id)))

In [None]:
# Forward pass; the result exposes start_logits and end_logits, which the
# following cells use to locate the answer span.
output = model(input_ids)

In [None]:
# Dump the start and end logits, separated by blank lines.
print(output.start_logits, '\n', output.end_logits, sep='\n')

In [None]:
# Positions of the tokens with the highest start and end scores.
# argmax runs on the raw float logits: casting to int32 first truncates the
# scores, so distinct logits collapse to the same integer and the wrong
# (typically the first) position wins. Using argmax for the end position too
# always yields exactly one index, even when several logits tie for the max.
answer_start = tf.argmax(output.start_logits, axis=1)
answer_end = tf.argmax(output.end_logits, axis=1)

In [None]:
# Show the predicted start and end token positions.
print(answer_start, answer_end)

In [None]:
# Tokens inside the predicted span (end position inclusive).
tokens[int(answer_start):int(answer_end)+1]

In [None]:
# Show the predicted answer span, or an apology when the span is inverted.
start_idx, end_idx = int(answer_start), int(answer_end)

if end_idx >= start_idx:
    # Raw tokens joined with spaces; "##" continuation pieces are left as-is here.
    answer = " ".join(tokens[start_idx:end_idx + 1])
else:
    # No valid span: record that explicitly so the final print is skipped
    # instead of raising NameError (or showing a stale `answer` from an earlier run).
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

# NOTE: str.capitalize() upper-cases the first character and lower-cases the rest.
print("Text:\n{}".format(text.capitalize()))
print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))

In [None]:
# Show the sampled dataset row (text, question, answer) to compare the
# reference answer with the prediction.
data.loc[random_num]

In [None]:
# Start the answer string with the token at the predicted start position;
# the next cell appends the remaining tokens of the span.
answer = tokens[int(answer_start)]

In [None]:
# Rebuild the answer text from the tokens of the predicted span.
# The string is (re)initialised here so the cell is self-contained: re-running
# it no longer keeps appending to an `answer` left over from a previous run.
answer = tokens[int(answer_start)]
for piece in tokens[int(answer_start) + 1:int(answer_end) + 1]:
    if piece.startswith("##"):
        # "##" marks a continuation of the previous word: glue it on without a space.
        answer += piece[2:]
    else:
        answer += " " + piece

In [None]:
# Display the reconstructed answer string.
answer

In [None]:
def question_answer(question, text):
    """Print the answer to ``question`` extracted from ``text``.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        question: The question string.
        text: The context passage in which to look for the answer.

    Returns:
        None. The capitalised answer is printed; a fallback message is printed
        when the predicted span is inverted or starts at the ``[CLS]`` token.
    """
    fallback = "Unable to find the answer to your question."

    # Tokenize question and text as a pair. Only the context (second sequence)
    # is truncated, so a long passage cannot push the pair past BERT's
    # 512-token input limit and crash the forward pass.
    input_ids = tokenizer(
        question,
        text,
        return_tensors="tf",
        truncation="only_second",
        max_length=512,
    )

    # String version of the tokenized ids.
    tokens = tokenizer.convert_ids_to_tokens(input_ids.input_ids[0])

    # Model output for the encoded pair.
    output = model(input_ids)

    # Highest-scoring start and end positions. argmax runs on the raw float
    # logits: casting to int first truncates the scores and makes distinct
    # logits tie. argmax also yields exactly one index per side, even when
    # several logits share the maximum.
    answer_start = int(np.argmax(np.asarray(output.start_logits[0])))
    answer_end = int(np.argmax(np.asarray(output.end_logits[0])))

    if answer_end >= answer_start:
        # Rebuild the text, gluing "##" continuation pieces onto the previous word.
        answer = tokens[answer_start]
        for piece in tokens[answer_start + 1:answer_end + 1]:
            if piece.startswith("##"):
                answer += piece[2:]
            else:
                answer += " " + piece
    else:
        # Inverted span: previously `answer` stayed unbound and the code below
        # raised UnboundLocalError.
        answer = fallback

    if answer.startswith("[CLS]"):
        answer = fallback

    print("\nAnswer:\n{}".format(answer.capitalize()))