###  Load Dataset


In [1]:
from datasets import load_dataset

split = "train"
cache_dir = "./data_cache_doc2vec_solution_spans_notebook"

dialogue_dataset = load_dataset(
    "doc2dial",
    name="dialogue_domain",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

rc_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Downloading and preparing dataset doc2dial/dialogue_domain (download: 5.61 MiB, generated: 7.86 MiB, post-processed: Unknown size, total: 13.47 MiB) to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


Downloading:   0%|          | 0.00/5.88M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.
Downloading and preparing dataset doc2dial/document_domain (download: 5.61 MiB, generated: 195.38 MiB, post-processed: Unknown size, total: 200.99 MiB) to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.
Downloading and preparing dataset doc2dial/doc2dial_rc (download: 5.61 MiB, generated: 131.12 MiB, post-processed: Unknown size, total: 136.72 MiB) to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache_doc2vec_solution_spans_notebook/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.


# Train a model for each document

### Preprocessing

In [2]:
import pandas as pd

document_full_df = pd.DataFrame(data=document_dataset)

# key=span_id, value=text
def span_dict_for_doc(doc_id):
    #use pandas as way faster
    document = document_full_df.loc[document_full_df['doc_id'] == doc_id]
    spans_dict = {}
    for span in document['spans'].iloc[0]:
        spans_dict[span['id_sp']]=span['text_sp']

    return spans_dict



In [11]:
from gensim.utils import simple_preprocess
#Getting a list of spans per grounding document !!!! Language confusion: spans are the documents in gensim land and the list of spans (aka the whole document) is the corpora in gensim

#I only want to use documents from the document datasets that have dialogues (which not all have)
unique_doc_ids = list(set(dialogue_dataset['doc_id']))

#extract all the span texts for that doc id,
#!!!Index drama. For list index starts with 0, for spans index starts with 1, use dict to keep the sp_id and doc_id
#key=doc_id, value=span dict


raw_training_text_per_doc = {}
for doc_id in unique_doc_ids:
    # doc_spans[doc_id] = span_dict_for_doc(doc_id) #for spans
    raw_training_text_per_doc[doc_id] = span_dict_for_doc(doc_id) #for text_sec

tokenized_training_docs = {}
for doc_id in raw_training_text_per_doc:
    spans_dic = raw_training_text_per_doc[doc_id]
    preprocessed_spans = {}
    for key, value in spans_dic.items():
        #simplest preprocessing from gemsim
        preprocessed_spans[key] = simple_preprocess(value, deacc=True)
    tokenized_training_docs[doc_id] = preprocessed_spans

tokenized_training_docs

{'Benefits Planner: Retirement | Benefits For Your Spouse | Social Security Administration#1_0': {'1': ['benefits',
   'planner',
   'retirement'],
  '2': ['benefits', 'for', 'your', 'spouse'],
  '3': ['even',
   'if',
   'you',
   'have',
   'never',
   'worked',
   'under',
   'social',
   'security'],
  '4': ['you',
   'may',
   'be',
   'able',
   'to',
   'get',
   'spouse',
   'retirement',
   'benefits',
   'if',
   'you',
   'are',
   'at',
   'least',
   'years',
   'of',
   'age',
   'and',
   'your',
   'spouse',
   'is',
   'receiving',
   'retirement',
   'or',
   'disability',
   'benefits'],
  '5': ['you', 'can', 'also', 'qualify', 'for', 'medicare', 'at', 'age'],
  '6': ['if', 'you', 'are', 'divorced'],
  '7': ['you',
   'may',
   'still',
   'be',
   'able',
   'to',
   'get',
   'benefits',
   'on',
   'your',
   'ex',
   'spouse',
   'record'],
  '8': ['for',
   'information',
   'on',
   'the',
   'requirements',
   'for',
   'divorced',
   'spouse',
   'benefits'],

### training

In [13]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

models_for_doc = {}
for doc_id in tokenized_training_docs:
    tokenized_span_dic = tokenized_training_docs[doc_id]
    training_docs = [TaggedDocument(span, key) for key, span in tokenized_span_dic.items()]
    # Check how to fine tune the model https://radimrehurek.com/gensim/models/doc2vec.html
    model = Doc2Vec(training_docs, vector_size=10, window=4, min_count=1, workers=4, epochs=30)
    models_for_doc[doc_id] = model

len(models_for_doc)

415

### predict rc questions

In [21]:
predictions = []
references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id =example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)
    if sims[0][0] == '0':
        print(question_)
        print(doc_id)
        print(sims)
    # print(sims)
    # spans_dic = raw_training_text_per_doc[doc_id]
    # index_for_answer = sims[0][0]
    # most_likely_answer = list(spans_dic.values())[index_for_answer]
    # predicted_tag = list(spans_dic.keys())[index_for_answer]
    #
    # # most_likely_answer = training_docs[sims[0][0] - 1]
    # # most_likely_predicted_tag = most_likely_answer.tags
    # #find original text for tag
    # # most_likely_predicted_text = spans[most_likely_predicted_tag[0]-1]
    # print(f'Question: {question_}\n')
    # print(f'Predicted Answer tag {predicted_tag}: {most_likely_answer}\n')
    # print(f'Correct answer: {example["answers"]}\n')
    #
    # id_ = example["id"]
    # predictions.append(
    #     {'id': id_,
    #      'prediction_text':
    #          most_likely_answer,
    #      'no_answer_probability': 0.0
    #      }
    # )
    #
    # #just using their answers
    # references.append(
    #     {
    #         "id": id_,
    #         "answers": example["answers"],
    #     }
    # )

predictions[:5]


user:Hello, I forgot o update my address, can you help me with that?
Top 5 DMV Mistakes and How to Avoid Them#3_0
[('0', 0.9982489347457886)]
user:I need to know how much is the fee for my vehicle.
Commercial vehicle registration fees, vehicle use taxes and supplemental fees#3_0
[('0', 0.9873241186141968)]
user:What are the things that I need to organise before I am eligible to take my road test?
Prepare for your road test#3_0
[('0', 0.9988679885864258)]
user:Can I send an appeal by mail?
Appeal a TVB ticket conviction#1_0
[('0', 0.9967803359031677)]
user:I'm looking to appeal a conviction.
Appeal a TVB ticket conviction#1_0
[('0', 0.9387500286102295)]
user:How do I get proof of ownership?
Proof of ownership is not available#3_0
[('0', 0.9965589046478271)]
user:yes there is 	agent:is there a surviving spouse that wants to transfer ownership? 	user:no it won't 	agent:will the transfer be made according to the Estates , Powers & Trust Law of New York State? 	agent:Use this checklist to h

KeyboardInterrupt: 

In [24]:
spans = raw_training_text_per_doc['Top 5 DMV Mistakes and How to Avoid Them#3_0']

KeyboardInterrupt: 

### evaluate


In [37]:
from datasets import load_metric

print(f'Number of question in train dataset {rc_dataset.shape[0]}')
metric = load_metric("squad_v2")
metric.add_batch(predictions=predictions, references=references)
final_score = metric.compute()
final_score

Number of question in train dataset 20431


{'exact': 0.783123684596936,
 'f1': 17.21500881285785,
 'total': 20431,
 'HasAns_exact': 0.783123684596936,
 'HasAns_f1': 17.21500881285785,
 'HasAns_total': 20431,
 'best_exact': 0.783123684596936,
 'best_exact_thresh': 0.0,
 'best_f1': 17.21500881285785,
 'best_f1_thresh': 0.0}

### How does this compare to guessing at random?


In [46]:
import random

random_predictions = []
random_references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id =example['title']

    #pick a random text from the document
    texts = raw_training_text_per_doc[doc_id]
    most_likely_answer = texts[random.randint(0, len(texts)-1)]

    id_ = example["id"]
    random_predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    random_references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

random_predictions[:3]

[{'id': '9f44c1539efe6f7e79b02eb1b413aa43_1',
  'prediction_text': 'Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_3',
  'prediction_text': '\n\n2. Leaving the State Without Notifying DMV \n',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_5',
  'prediction_text': 'It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the police officer writing you a ticket. If you fail to keep your address current , you will miss a suspension order and may be charged with operating an unregistered vehicle and/or aggravated unlicensed operation, both misdemeanors. This rea

In [47]:
print(f'Results for random text predictions for each question')
metric = load_metric("squad_v2")
metric.add_batch(predictions=random_p   redictions, references=random_references)
final_score = metric.compute()
final_score

Results for random text predictions for each question


{'exact': 1.5907199843375264,
 'f1': 14.745765602471895,
 'total': 20431,
 'HasAns_exact': 1.5907199843375264,
 'HasAns_f1': 14.745765602471895,
 'HasAns_total': 20431,
 'best_exact': 1.5907199843375264,
 'best_exact_thresh': 0.0,
 'best_f1': 14.745765602471895,
 'best_f1_thresh': 0.0}

### Results for validation part of rc dataset

In [38]:
rc_validation_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split="validation",
    ignore_verifications=True,
    cache_dir=cache_dir,
)



Reusing dataset doc2dial (./data_cache_doc2vec_solution_notebook/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)


In [39]:
predictions = []
references = []
print(f'Number of question in train dataset {rc_validation_dataset.shape[0]}')

for example in rc_validation_dataset:
    question_ = example["question"]
    doc_id =example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)
    most_likely_answer = raw_training_text_per_doc[doc_id][sims[0][0]]
    # most_likely_answer = training_docs[sims[0][0] - 1]
    # most_likely_predicted_tag = most_likely_answer.tags
    #find original text for tag
    # most_likely_predicted_text = spans[most_likely_predicted_tag[0]-1]
    # print(f': {most_likely_answer.words}\n')
    # print(f'Predicted Answer: {most_likely_answer}\n')

    id_ = example["id"]
    predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

predictions[:5]

Number of question in train dataset 3972


KeyError: 'About New York State Inspections#3_0'

### Questions:
- Given that we don't have all the documents to train does that make sense for a Doc2Vec approach or does it make sense
to create a model per document?