In [116]:
from datasets import get_dataset_config_names
# suppress warning
import warnings
warnings.filterwarnings("ignore")

domains = get_dataset_config_names('subjqa')
domains

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [117]:
from datasets import load_dataset

subjqa  = load_dataset('subjqa', name='electronics')

In [118]:
sample = subjqa['train'][1]
print(sample['answers'])

{'text': ['Bass is weak as expected', 'Bass is weak as expected, even with EQ adjusted up'], 'answer_start': [1302, 1302], 'answer_subj_level': [1, 1], 'ans_subj_score': [0.5083333253860474, 0.5083333253860474], 'is_ans_subjective': [True, True]}


In [119]:
import pandas as pd

dfs = {
    split: dset.to_pandas() for split, dset in subjqa.flatten().items()
}

for split, df in dfs.items():
    print(f"Number of questions in {split}: {df['id'].nunique()}")

Number of questions in train: 1295
Number of questions in test: 358
Number of questions in validation: 255


In [120]:
qa_cols = ['title', 'question', 'answers.text', 'answers.answer_start', 'context']
sample_df = dfs['train'][qa_cols].sample(2, random_state=7)
sample_df

Unnamed: 0,title,question,answers.text,answers.answer_start,context
791,B005DKZTMG,Does the keyboard lightweight?,[this keyboard is compact],[215],I really like this keyboard. I give it 4 star...
1159,B00AAIPT76,How is the battery?,[],[],I bought this after the first spare gopro batt...


In [121]:
keyboard_qa = sample_df.iloc[0]
start_index = keyboard_qa['answers.answer_start'][0] # arr of len 1
end_index = start_index + len(keyboard_qa['answers.text'][0]) # arr of len 1
keyboard_qa['context'][start_index: end_index]

'this keyboard is compact'

In [122]:
from transformers import AutoTokenizer

model_ckpt = 'deepset/minilm-uncased-squad2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [123]:
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""

inputs = tokenizer(question, context, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  2129,  2172,  2189,  2064,  2023,  2907,  1029,   102,  2019,
         23378,  2003,  2055,  1015, 16914,  1013,  3371,  1010,  2061,  2055,
         25961,  2847,  5834,  2006,  5371,  2946,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}

In [124]:
tokenizer.decode(inputs['input_ids'][0])

'[CLS] how much music can this hold? [SEP] an mp3 is about 1 mb / minute, so about 6000 hours depending on file size. [SEP]'

In [125]:
import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
with torch.no_grad():
    outputs = model(**inputs)
    
print(outputs)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9862, -4.7750, -5.4025, -5.2378, -5.2863, -5.5117, -4.9819, -6.1880,
         -0.9862,  0.2596, -0.2144, -1.7136,  3.7806,  4.8561, -1.0546, -3.9097,
         -1.7374, -4.5944, -1.4278,  3.9949,  5.0391, -0.2018, -3.0193, -4.8549,
         -2.3107, -3.5110, -3.5713, -0.9862]]), end_logits=tensor([[-0.9623, -5.4733, -5.0326, -5.1639, -5.4278, -5.5151, -5.1749, -4.6233,
         -0.9623, -3.7855, -0.8715, -3.7745, -3.0161, -1.1780,  0.1758, -2.7365,
          4.8934,  0.3046, -3.1761, -3.2762,  0.8937,  5.6606, -0.3623, -4.9554,
         -3.2531, -0.0914,  1.6211, -0.9623]]), hidden_states=None, attentions=None)


In [126]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

print('Input ID shape: ', inputs.input_ids.shape)
print('Start logits shape: ', start_logits.shape)
print('End logits shape: ', end_logits.shape)

Input ID shape:  torch.Size([1, 28])
Start logits shape:  torch.Size([1, 28])
End logits shape:  torch.Size([1, 28])


In [127]:
start_idx = torch.argmax(start_logits)
end_idx = torch.argmax(end_logits) + 1
answer_span = inputs.input_ids[0, start_idx:end_idx]

run_result = tokenizer.decode(answer_span)
print('Q:', question)
print('A:', run_result)

Q: How much music can this hold?
A: 6000 hours


In [128]:
from transformers import pipeline

pipe = pipeline('question-answering', model=model, tokenizer=tokenizer)
pipe(question=question, context=context, top_k=3)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.26516246795654297,
  'start': 38,
  'end': 48,
  'answer': '6000 hours'},
 {'score': 0.22082945704460144,
  'start': 16,
  'end': 48,
  'answer': '1 MB/minute, so about 6000 hours'},
 {'score': 0.10253474116325378,
  'start': 16,
  'end': 27,
  'answer': '1 MB/minute'}]

In [129]:
pipe(question='how much can an MP3 hold in 1 minute?', context=context, top_k=3)

[{'score': 0.6698799729347229, 'start': 16, 'end': 20, 'answer': '1 MB'},
 {'score': 0.113197460770607, 'start': 18, 'end': 20, 'answer': 'MB'},
 {'score': 0.10320844501256943,
  'start': 16,
  'end': 27,
  'answer': '1 MB/minute'}]

## Dealing with long context by using sliding window.

In [130]:
example = dfs['train'].iloc[0][['question', 'context']]
tokenized_example = tokenizer(
    example['question'], 
    example['context'],
    
    # sliding window paramters
    return_overflowing_tokens=True,
    max_length=100,
    stride=25
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [131]:
len(tokenized_example['input_ids'])

2

In [132]:
for idx, window in enumerate(tokenized_example['input_ids']):
    print(f"Window {idx}:")
    print(tokenizer.decode(window))
    print(len(window), 'tokens')
    print()

Window 0:
[CLS] how is the bass? [SEP] i have had koss headphones in the past, pro 4aa and qz - 99. the koss portapro is portable and has great bass response. the work great with my android phone and can be " rolled up " to be carried in my motorcycle jacket or computer bag without getting crunched. they are very light and do not feel heavy or bear down on your ears even after listening to music with them on all day. the sound is [SEP]
100 tokens

Window 1:
[CLS] how is the bass? [SEP] and do not feel heavy or bear down on your ears even after listening to music with them on all day. the sound is night and day better than any ear - bud could be and are almost as good as the pro 4aa. they are " open air " headphones so you cannot match the bass to the sealed types, but it comes close. for $ 32, you cannot go wrong. [SEP]
88 tokens



## Use Haystack to build QA pipeline

In [133]:
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack import Document

document_store = ElasticsearchDocumentStore(
    hosts = "http://localhost:9200",
)

document_store.count_documents()

1875

In [134]:
for split, df in dfs.items():
    docs = [
        {
            'content': row['context'],
            'meta': {
                'item_id': row['title'],
                'question_id': row['id'],
                'split': split
            }
        }
        for _, row in df.drop_duplicates(subset='context').iterrows()
    ]
    # document_store.write_documents([
    #     Document(**doc)
    #     for doc in docs
    # ])

In [135]:
document_store.count_documents()

1875

In [136]:
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
es_retriever = ElasticsearchBM25Retriever(
    document_store=document_store,
)

In [137]:
item_id = "B0074BW614"
query = question
filters = {
    'operator': 'AND',
    'conditions': [
        {
            'field': 'meta.item_id',
            'value': item_id,
            'operator': '=='
        },
        {
            'field': 'meta.split',
            'value': 'train',
            'operator': '==',
        }
    ]
}


retrieved_docs = es_retriever.run(
    query=query,
    top_k=3,
    filters=filters
)

In [138]:
for doc in retrieved_docs['documents']:
    print(doc.meta['item_id'], doc.meta['split'])

B0074BW614 train
B0074BW614 train
B0074BW614 train


## Reader

In [139]:
from haystack.components.readers import ExtractiveReader

model_ckpt = 'deepset/minilm-uncased-squad2'
max_seq_length, doc_stride = 384, 128
reader = ExtractiveReader(
    model=model_ckpt,
)

In [140]:
reader.warm_up()
run_result = reader.run(
    query=question,
    documents=retrieved_docs['documents'],
    top_k=3
)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [141]:
print(question)

for ans in run_result['answers']:
    print(ans.data)

How much music can this hold?
no music
all my music from iTunes
Cloud Player
None


In [142]:
from haystack import Pipeline

pipe = Pipeline()
pipe.add_component(instance=es_retriever, name='retriever')
pipe.add_component(instance=reader, name='reader')

pipe.connect('retriever.documents', 'reader.documents')



{'reader': {'answers': [ExtractedAnswer(query='How much music can this hold?', score=0.6685347557067871, data='stream if a phone call comes in at the same time', document=Document(id=b6d3a0fbe7c2fd4a0e236444d07e3113ebe0d8685607e96ad69f5c14771b6c11, content: 'I never imagined I would spend $200 on a tiny wireless speaker system but the rave review by David P...', meta: {'item_id': 'B004E10KFG', 'question_id': '1b1d92228cac6fef9eee45496f6f101f', 'split': 'train'}, score: 11.0743265), context=None, document_offset=ExtractedAnswer.Span(start=1258, end=1306), context_offset=None, meta={}),
   ExtractedAnswer(query='How much music can this hold?', score=0.6322438716888428, data='12oz', document=Document(id=b6d3a0fbe7c2fd4a0e236444d07e3113ebe0d8685607e96ad69f5c14771b6c11, content: 'I never imagined I would spend $200 on a tiny wireless speaker system but the rave review by David P...', meta: {'item_id': 'B004E10KFG', 'question_id': '1b1d92228cac6fef9eee45496f6f101f', 'split': 'train'}, score:

In [143]:
run_result = pipe.run(
    data = {
        'retriever': {
            'query': question,
            'top_k': 3
        },
        'reader': {
            'query': question,
            'top_k': 3
        }
    }
)

In [148]:
print('Q:', question)
for ans in run_result['reader']['answers']:
    print('-', ans.data)

Q: How much music can this hold?
- stream if a phone call comes in at the same time
- 12oz
- the tiny unit can play music streamed via Bluetooth from both my iPhone and iPad anywhere in my apartment
- None


# Recap
## HF pipeline
pros: straight forward, easy to use.
- Usage: `pipe(question, context)`

cons: only works if you have the context ready.

## QA pipeline
Retriever: parse, dense.
- sparse: use word frequency (like TF-IDF)
- dense: use word embedding

Reader: extract answers from retrived documents.

Document store: sparse (tf-idf, BM25), dense(embedding, DPR).

**Evaluate performance**
- EM: exact match. stricter
- F1: f1 score. more relaxed
Relying on F1 only can be misleading -> track both metrics.

## Fine-tuning (domain adaptation)
SQuAD pipieline performs poorly on SubjQA. 

Fine-tuned on SQuAD and SubjQA yieds significant performance.

Q: why we don't fine-tune on SubjQA?

A: SubjQA has only over 1,000 samples which can lead to overfitting. SQuAD has over 100,000 samples.

<img src="./qa-perf.png" width="500px"> </img>

## RAG
- RAG is *generative* QA, while the above approach is called *extractive*.
- relies on dense vector embedding (both document and question).
- reader -> generator: generates passage from matched docs.