<a href="https://colab.research.google.com/github/hsong-77/transformer-practice/blob/main/qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries used by the cells below. Only farm-haystack is pinned (1.4.0).
# NOTE(review): transformers and datasets are installed without a version pin, so a
# fresh run may resolve to different releases over time — consider pinning them too.
!pip install transformers
!pip install datasets
!pip install farm-haystack[colab]==1.4.0

In [None]:
from datasets import get_dataset_config_names

# Fetch the configuration names published for the "subjqa" dataset; the bare
# name on the last line displays them as the cell output.
domains = get_dataset_config_names("subjqa")
domains

In [None]:
from datasets import load_dataset

# Load the "electronics" configuration of SubjQA; the bare name displays the
# loaded object as the cell output.
subjqa = load_dataset("subjqa", name="electronics")
subjqa

In [None]:
import pandas as pd

# One DataFrame per split, keyed by split name. The dataset is flattened first;
# the columns selected below use dotted names such as "answers.text".
dfs = {
  split: dset.to_pandas()
  for split, dset in subjqa.flatten().items()
}

# Columns needed to inspect a question/answer pair.
qa_cols = [
  "title",
  "question",
  "answers.text",
  "answers.answer_start",
  "context",
]
# Fixed random_state so the same two rows are shown on every run.
sample_df = dfs["train"].loc[:, qa_cols].sample(n=2, random_state=7)
sample_df

In [None]:
import matplotlib.pyplot as plt

# Count how many training questions begin with each of a few question words
# and show the counts as a horizontal bar chart.
question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]
counts = {}

for q in question_types:
  # Summing the boolean mask gives 0 when no question matches, whereas
  # value_counts()[True] raises KeyError in that case.
  counts[q] = dfs["train"]["question"].str.startswith(q).sum()

pd.Series(counts).sort_values().plot.barh()
plt.title("Frequency of Question Types")
plt.show()

In [None]:
# Print three example questions for each of three question words; the fixed
# random_state keeps the selection the same across runs.
for question_type in ["How", "What", "Is"]:
  type_mask = dfs["train"].question.str.startswith(question_type)
  examples = dfs["train"][type_mask].sample(n=3, random_state=42)
  for question in examples["question"]:
    print(question)

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier: used here for the tokenizer and reused further down
# when the question-answering model is loaded.
ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(ckpt)

In [None]:
# Toy question/context pair, tokenized together as one input.
# return_tensors="pt" asks the tokenizer for PyTorch tensors.
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""
inputs = tokenizer(question, context, return_tensors="pt")
inputs

In [None]:
# Decode the first encoded sequence back to text to see what the tokenizer produced.
print(tokenizer.decode(inputs["input_ids"][0]))

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(ckpt)

# Run the forward pass inside torch.no_grad(), i.e. with gradient tracking disabled.
with torch.no_grad():
  outputs = model(**inputs)
print(outputs)

In [None]:
import torch

# Greedy span extraction: take the highest-scoring start position and the
# highest-scoring end position, then decode the token ids between them.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

start_idx = start_logits.argmax()
# +1 because the slice end below is exclusive.
end_idx = end_logits.argmax() + 1
answer_span = inputs["input_ids"][0][start_idx:end_idx]
answer = tokenizer.decode(answer_span)
print(f"Question: {question}\nAnswer: {answer}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Plot the start and end logits per token, highlighting the arg-max token of
# each in a different colour.
s_scores = start_logits.detach().numpy().flatten()
e_scores = end_logits.detach().numpy().flatten()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
token_ids = range(len(tokens))

fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
# The maximum is computed once per array; the previous comprehension called
# np.max() again for every token.
colors = np.where(s_scores == s_scores.max(), "C1", "C0").tolist()
ax1.bar(x=token_ids, height=s_scores, color=colors)
ax1.set_ylabel("Start Scores")
colors = np.where(e_scores == e_scores.max(), "C1", "C0").tolist()
ax2.bar(x=token_ids, height=e_scores, color=colors)
ax2.set_ylabel("End Scores")
# The two panels share the x axis, so the token labels are set once.
plt.xticks(token_ids, tokens, rotation="vertical")
plt.show()

In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a question-answering pipeline
# and ask for the three best answers to the toy question.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
# `top_k` is the supported keyword; `topk` is its deprecated alias.
pipe(question=question, context=context, top_k=3)

In [None]:
def compute_input_length(row):
  """Return how many token ids the tokenizer produces for one row.

  `row` must provide "question" and "context"; both are encoded together
  with the notebook-level `tokenizer`.
  """
  encoding = tokenizer(row["question"], row["context"])
  return len(encoding["input_ids"])

# Store the token count of every training question-context pair in a new
# "n_tokens" column, then plot its distribution.
dfs["train"]["n_tokens"] = dfs["train"].apply(compute_input_length, axis=1)

fig, ax = plt.subplots()
dfs["train"]["n_tokens"].hist(bins=100, grid=False, ec="C0", ax=ax)
# Dashed vertical marker at 512 tokens.
ax.axvline(x=512, ymin=0, ymax=1, linestyle="--", color="C1",
           label="Maximum sequence length")
ax.set_xlabel("Number of tokens in question-context pair")
ax.set_ylabel("Count")
ax.legend()
plt.show()

In [None]:
# Tokenize the first training example with overflow enabled, so a long input
# comes back as several windows instead of a single sequence.
example = dfs["train"].iloc[0][["question", "context"]]
tokenized_example = tokenizer(
  example["question"],
  example["context"],
  return_overflowing_tokens=True,
  max_length=100,
  stride=25,
)

# First report the size of every window ...
for idx, window in enumerate(tokenized_example["input_ids"]):
  print(f"Window #{idx} has {len(window)} tokens")

# ... then show the decoded text of each one.
for window in tokenized_example["input_ids"]:
  print(f"{tokenizer.decode(window)} \n")

In [None]:
# Download the Elasticsearch 7.9.2 Linux tarball and unpack it into the working
# directory. wget -nc skips the download when the file already exists; -q
# suppresses its output.
url = """https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz"""
!wget -nc -q {url}
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz

In [None]:
import os
from subprocess import Popen, PIPE, STDOUT

# Start Elasticsearch as a background process.
# The unpacked directory is handed to daemon:daemon, and the child process
# switches to UID 1 (via preexec_fn) before the server binary is launched.
# NOTE(review): presumably UID 1 is the "daemon" user on the target image — confirm.
!chown -R daemon:daemon elasticsearch-7.9.2
es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Give Elasticsearch time to start: this is a fixed 30-second pause, not a
# readiness check.
# NOTE(review): if the server needs longer, the cells below will fail to connect.
!sleep 30

In [None]:
# Query the local HTTP endpoint on port 9200 to check that the server responds.
!curl -X GET "localhost:9200/?pretty"

In [None]:
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Create the document store with its default connection settings;
# return_embedding=True is the only option overridden here.
document_store = ElasticsearchDocumentStore(return_embedding=True)

In [None]:
# Start from a clean slate: if any documents or labels are already stored,
# empty both the "document" and the "label" index.
if len(document_store.get_all_documents()) > 0 or len(document_store.get_all_labels()) > 0:
  for index_name in ("document", "label"):
    document_store.delete_documents(index=index_name)

In [None]:
# Index every distinct review text of every split. The product id, question id
# and split name are stored as metadata on each document.
for split, df in dfs.items():
  docs = [
    {
      "content": row["context"],
      "id": row["review_id"],
      "meta": {"item_id": row["title"], "question_id": row["id"], "split": split},
    }
    for _, row in df.drop_duplicates(subset="context").iterrows()
  ]
  document_store.write_documents(documents=docs, index="document")

print(f"Loaded {document_store.get_document_count()} documents")

In [None]:
# sparse retriever
from haystack.nodes.retriever import BM25Retriever

# BM25 retriever that searches the documents written to `document_store` above.
bm25_retriever = BM25Retriever(document_store=document_store)

In [None]:
# Retrieve the three best-matching reviews for one query. The filters restrict
# the search to a single product (item_id) and to documents written with
# split == "train".
item_id = "B0074BW614"
query = "Is it good for reading?"
retrieved_docs = bm25_retriever.retrieve(query=query, top_k=3, filters={"item_id": [item_id], "split": ["train"]})

# Show the top-ranked hit.
print(retrieved_docs[0])

In [None]:
from haystack.nodes import FARMReader

# Build a FARMReader from the same checkpoint string used for the tokenizer
# and model cells earlier in the notebook.
ckpt = "deepset/minilm-uncased-squad2"
# Passed to FARMReader as max_seq_len / doc_stride, and reused later when the
# second reader is created.
# NOTE(review): 284 is an unusual sequence length — confirm it is not a typo for 384.
max_seq_length = 284
doc_stride = 128
reader = FARMReader(model_name_or_path=ckpt, progress_bar=False, max_seq_len=max_seq_length, doc_stride=doc_stride, return_no_answer=True)

# Sanity check on the toy question/context pair defined earlier.
print(reader.predict_on_texts(question=question, texts=[context], top_k=1))

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Chain the BM25 retriever and the reader into one pipeline.
# Note: this rebinds the notebook-level name `pipe`.
pipe = ExtractiveQAPipeline(reader=reader, retriever=bm25_retriever)

In [None]:
# Run the retriever + reader pipeline for `query`, restricted to one product in
# the training split, and print up to `n_answers` extracted answers.
n_answers = 3
preds = pipe.run(query=query, params={"Retriever": {"top_k": 3, "filters": {"item_id": [item_id], "split": ["train"]}}, "Reader": {"top_k": n_answers}})

print(f"Question: {preds['query']} \n")
# Iterate over the answers actually returned. Indexing range(n_answers)
# raises IndexError when the reader produces fewer than n_answers answers.
for idx, pred in enumerate(preds["answers"][:n_answers]):
  print(f"Answer {idx+1}: {pred.answer}")
  print(f"Review snippet: ...{pred.context}...")
  print("\n\n")

In [None]:
from haystack.nodes import retriever
# Evaluating the Retriever
from haystack.pipelines import DocumentSearchPipeline

# Retriever-only pipeline. Note: this rebinds the notebook-level name `pipe`.
pipe = DocumentSearchPipeline(retriever=bm25_retriever)

In [None]:
from haystack import Label, Answer, Document

# Build gold labels from the test split: one label per gold answer, or a single
# "no answer" label when a question has none.
labels = []
for i, row in dfs["test"].iterrows():
  # for filtering in the Retriever
  meta = {"item_id": row["title"], "question_id": row["id"]}
  has_answer = len(row["answers.text"]) > 0
  # A question without answers still yields exactly one label, carrying an
  # empty answer string and no_answer=True.
  gold_answers = row["answers.text"] if has_answer else [""]
  for answer in gold_answers:
    label = Label(query=row["question"], answer=Answer(answer=answer), origin="gold-label",
                  document=Document(content=row["context"], id=row["review_id"]),
                  meta=meta, is_correct_answer=True, is_correct_document=True,
                  no_answer=not has_answer,
                  filters={"item_id": [meta["item_id"]], "split": ["test"]})
    labels.append(label)

# Persist the labels, then read them back aggregated with "item_id" as the
# extra aggregation key.
document_store.write_labels(labels, index="label")
labels_agg = document_store.get_all_labels_aggregated(index="label", open_domain=True, aggregate_by_meta=["item_id"])

In [None]:
# choose top_k
def evaluate_retriever(retriever, topk_value=(1, 3, 5, 10, 20)):
  """Measure the recall of `retriever` for several values of k.

  The retriever is run once with the largest requested k; the smaller values
  are derived from that single run via `simulated_top_k_retriever`.

  Args:
    retriever: retriever node to wrap in a DocumentSearchPipeline.
    topk_value: non-empty iterable of k values to report. The default is a
      tuple rather than a list so it is not a shared mutable default.

  Returns:
    DataFrame indexed by k with a single "recall" column, filled from the
    "recall_single_hit" metric.

  Reads the notebook-level `labels_agg` for the gold labels.
  """
  # Materialise once so any iterable (including a generator) can be used for
  # both max() and the loop below.
  topk_value = list(topk_value)
  topk_results = {}
  max_top_k = max(topk_value)
  p = DocumentSearchPipeline(retriever=retriever)
  eval_result = p.eval(labels=labels_agg, params={"Retriever": {"top_k": max_top_k}})

  for topk in topk_value:
    metrics = eval_result.calculate_metrics(simulated_top_k_retriever=topk)
    topk_results[topk] = {"recall": metrics["Retriever"]["recall_single_hit"]}

  return pd.DataFrame.from_dict(topk_results, orient="index")

# Recall-vs-k table for the BM25 retriever, using the default k values.
bm25_topk_df = evaluate_retriever(bm25_retriever)

In [None]:
def plot_retriever_eval(dfs, retriever_names):
  """Plot the "recall" column of each frame as one line per retriever.

  Args:
    dfs: iterable of DataFrames indexed by k, each with a "recall" column.
    retriever_names: legend labels, paired with `dfs` by position.

  The x ticks are taken from the index of the last frame plotted. With no
  frames an empty, labelled chart is shown instead of failing on an
  undefined loop variable.
  """
  fig, ax = plt.subplots()
  last_df = None
  for df, retriever_name in zip(dfs, retriever_names):
    df.plot(y="recall", ax=ax, label=retriever_name)
    last_df = df
  if last_df is not None:
    ax.set_xticks(last_df.index)
  ax.set_ylabel("Top-k Recall")
  ax.set_xlabel("k")
  plt.show()
    
# Plot the BM25 recall curve on its own.
plot_retriever_eval([bm25_topk_df], ["BM25"])

In [None]:
# dense retriever
from haystack.nodes import DensePassageRetriever

# Dense retriever with separate encoder checkpoints for queries and passages.
dpr_retriever = DensePassageRetriever(document_store=document_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                      embed_title=False)

# Update the embeddings held in the document store using this retriever.
document_store.update_embeddings(retriever=dpr_retriever)

In [None]:
# Evaluate the dense retriever the same way as BM25 and plot both curves together.
dpr_topk_df = evaluate_retriever(dpr_retriever)
plot_retriever_eval([bm25_topk_df, dpr_topk_df], ["BM25", "DPR"])

In [None]:
# Evaluating the Reader
from haystack.modeling.evaluation.squad import compute_f1, compute_exact
from haystack.pipelines import Pipeline

def evaluate_reader(reader):
  """Evaluate `reader` in isolation and return its exact-match and F1 scores.

  The reader is the only node of a fresh Pipeline and is given the gold
  documents of every aggregated label directly, so no retriever is involved.
  Reads the notebook-level `labels_agg`.

  Returns:
    dict holding the "exact_match" and "f1" entries of the "Reader" metrics.
  """
  wanted_metrics = ('exact_match', 'f1')
  reader_pipeline = Pipeline()
  reader_pipeline.add_node(component=reader, name="Reader", inputs=["Query"])

  # One list of gold documents per aggregated label, in the same order.
  gold_documents = [
    [label.document for label in multilabel.labels]
    for multilabel in labels_agg
  ]
  eval_result = reader_pipeline.eval(labels=labels_agg, documents=gold_documents, params={})
  reader_metrics = eval_result.calculate_metrics(simulated_top_k_reader=1)["Reader"]

  return {name: score for name, score in reader_metrics.items() if name in wanted_metrics}

# Collect reader scores under a descriptive key; further entries are added
# by later cells.
reader_eval = {}
reader_eval["Fine-tune on SQuAD"] = evaluate_reader(reader)

In [None]:
def plot_reader_eval(reader_eval):
  """Draw a grouped bar chart of EM and F1 for every entry of `reader_eval`.

  Args:
    reader_eval: mapping of run name to a dict with "exact_match" and "f1".
  """
  # Columns are the runs; rows are forced into the order (exact_match, f1).
  scores = pd.DataFrame.from_dict(reader_eval).reindex(["exact_match", "f1"])
  fig, ax = plt.subplots()
  scores.plot(kind="bar", ylabel="Score", rot=0, ax=ax)
  ax.set_xticklabels(["EM", "F1"])
  ax.legend(loc='upper left')
  plt.show()

# Plot every reader score collected so far.
plot_reader_eval(reader_eval)

In [None]:
# domain adaptation
def create_paragraphs(df):
  """Convert the rows of `df` into a list of SQuAD-style paragraph dicts.

  Args:
    df: DataFrame with the columns "review_id", "context", "id", "question",
      "answers.text" and "answers.answer_start". The two answer columns must
      hold array-like cells that support `.tolist()`.

  Returns:
    One dict per distinct review_id, in order of first appearance:
    {"qas": [...], "context": <review text>}. Each "qas" entry is
    {"question", "id", "is_impossible", "answers"}; "answers" is a list of
    {"text", "answer_start"} dicts and is empty when the question has no
    answer (in which case is_impossible is True).
  """
  paragraphs = []
  id2context = dict(zip(df["review_id"], df["context"]))
  for review_id, review in id2context.items():
    qas = []
    # Boolean masks replace df.query(f"review_id == '{review_id}'"): building
    # the expression from the value breaks when an id contains a quote and
    # never matches non-string ids.
    review_df = df[df["review_id"] == review_id]
    id2question = dict(zip(review_df["id"], review_df["question"]))

    for qid, question in id2question.items():
      # As before, the question id is looked up in the whole frame and the
      # first matching row supplies the answers.
      question_rows = df[df["id"] == qid]
      ans_start_idxs = question_rows["answers.answer_start"].iloc[0].tolist()
      ans_text = question_rows["answers.text"].iloc[0].tolist()
      if len(ans_start_idxs):
        answers = [{"text": text, "answer_start": answer_start} for text, answer_start in zip(ans_text, ans_start_idxs)]
        is_impossible = False
      else:
        answers = []
        is_impossible = True
      qas.append({"question": question, "id": qid, "is_impossible": is_impossible, "answers": answers})

    paragraphs.append({"qas": qas, "context": review})

  return paragraphs

In [None]:
# data
import json

def convert_to_squad(dfs):
  """Write one SQuAD-style JSON file per split to the working directory.

  Args:
    dfs: mapping of split name to DataFrame. Each frame is grouped by
      "title" and every group is converted with `create_paragraphs`.

  Each file is named "electronics-<split>.json" and contains
  {"data": [{"title": ..., "paragraphs": [...]}, ...]}.
  """
  for split, df in dfs.items():
    paragraphs_by_title = df.groupby("title").apply(create_paragraphs)
    groups = paragraphs_by_title.to_frame(name="paragraphs").reset_index()
    subjqa_data = {"data": groups.to_dict(orient="records")}

    with open(f"electronics-{split}.json", "w+", encoding="utf-8") as f:
      json.dump(subjqa_data, f)

# Write the JSON file for every split held in `dfs`.
convert_to_squad(dfs)

In [None]:
# fine-tune
# File names for training and validation, following the
# "electronics-<split>.json" pattern written by convert_to_squad.
train_filename = "electronics-train.json"
dev_filename = "electronics-validation.json"

# NOTE(review): the training call below is commented out, so running this cell
# leaves `reader` unchanged. Uncomment it to actually fine-tune the reader.
# reader.train(data_dir=".", use_gpu=True, n_epochs=1, batch_size=16, train_filename=train_filename, dev_filename=dev_filename)

Example will not be converted for training/evaluation.
Example will not be converted for training/evaluation.
Example will not be converted for training/evaluation.
Example will not be converted for training/evaluation.
Preprocessing Dataset electronics-train.json: 100%|██████████| 1265/1265 [00:06<00:00, 195.76 Dicts/s]  
ERROR:haystack.modeling.data_handler.processor:Unable to convert 4 samples to features. Their ids are : 595-0-0, 572-0-0, 1099-0-1, 1167-0-0
INFO:haystack.modeling.data_handler.data_silo:
INFO:haystack.modeling.data_handler.data_silo:LOADING DEV DATA
INFO:haystack.modeling.data_handler.data_silo:Loading dev set from: electronics-validation.json
INFO:haystack.modeling.data_handler.data_silo:Multiprocessing disabled, using a single worker to convert 252dictionaries to pytorch datasets.
Preprocessing Dataset electronics-validation.json: 100%|██████████| 252/252 [00:00<00:00, 256.36 Dicts/s] 
ERROR:haystack.modeling.data_handler.processor:Unable to convert 4 samples to f

In [None]:
# Re-evaluate `reader` and store the result under a second key.
# NOTE(review): this is the same `reader` object scored under "Fine-tune on SQuAD";
# the two entries only differ if reader.train(...) was run in between — confirm.
reader_eval["Fine-tune on SQuAD + SubjQA"] = evaluate_reader(reader)
plot_reader_eval(reader_eval)

In [None]:
# fine-tune on SubjQA
# A second reader, created from a different checkpoint, with the same
# max_seq_len / doc_stride settings as the first reader.
minilm_ckpt = "microsoft/MiniLM-L12-H384-uncased"
minilm_reader = FARMReader(model_name_or_path=minilm_ckpt, progress_bar=False,
                           max_seq_len=max_seq_length, doc_stride=doc_stride,
                           return_no_answer=True)
# NOTE(review): the training call is commented out, so the evaluation below
# scores this reader exactly as it was loaded — confirm that is intended.
# minilm_reader.train(data_dir=".", use_gpu=True, n_epochs=1, batch_size=16, train_filename=train_filename, dev_filename=dev_filename)

reader_eval["Fine-tune on SubjQA"] = evaluate_reader(minilm_reader)
plot_reader_eval(reader_eval)

In [None]:
# Evaluating the Whole QA Pipeline
from haystack.pipelines import ExtractiveQAPipeline

# Evaluate the retriever and reader together on the aggregated gold labels and
# record the reader's top-1 scores.
pipe = ExtractiveQAPipeline(retriever=bm25_retriever, reader=reader)

eval_result = pipe.eval(labels=labels_agg, params={})
metrics = eval_result.calculate_metrics(simulated_top_k_reader=1)
# Keep only the EM and F1 entries. The previous comprehension used
# `for k in [...]` instead of `if k in [...]`, which rebound `k` and set both
# keys to the value of whichever metric came last.
reader_eval["QA Pipeline (top-1)"] = {k: v for k, v in metrics["Reader"].items() if k in ["exact_match", "f1"]}

In [None]:
# Compare the reader evaluated on its own with the scores recorded for the
# full retriever + reader pipeline.
plot_reader_eval({"Reader": reader_eval["Fine-tune on SQuAD + SubjQA"], 
                  "QA pipeline (top-1)": reader_eval["QA Pipeline (top-1)"]})

In [None]:
# Retrieval-Augmented Generation
from haystack.nodes import RAGenerator
from haystack.pipelines import GenerativeQAPipeline

# Generator node plus the dense retriever, combined into a generative QA pipeline.
# Note: this rebinds the notebook-level name `pipe`, which generate_answers reads.
generator = RAGenerator(model_name_or_path="facebook/rag-token-nq", embed_title=False, num_beams=5)
pipe = GenerativeQAPipeline(generator=generator, retriever=dpr_retriever)

def generate_answers(query, top_k_generator=3, item_id="B0074BW614"):
  """Run the generative QA pipeline for `query` and print the answers.

  Args:
    query: question text passed to the pipeline.
    top_k_generator: number of answers requested from the generator and the
      maximum number printed.
    item_id: product id used as the retriever filter. The default is the
      value that was previously hard-coded.

  Reads the notebook-level `pipe`, which must be the generative pipeline.
  """
  preds = pipe.run(query=query,
                   params={"Retriever": {"top_k": 5,
                                         "filters": {"item_id": [item_id]}},
                           "Generator": {"top_k": top_k_generator}})
  print(f"Question: {preds['query']} \n")
  # Iterate over the answers actually returned, so a shorter result list does
  # not raise IndexError.
  for idx, pred in enumerate(preds["answers"][:top_k_generator]):
    print(f"Answer {idx+1}: {pred.answer}")

In [None]:
# Ask the generative pipeline the earlier `query`, then a second question.
generate_answers(query)
generate_answers("What is the main drawback?")