###  Load Dataset


In [None]:
from datasets import load_dataset

split = "train"
cache_dir = "./data_cache_doc2vec_solution_notebook"

dialogue_dataset = load_dataset(
    "doc2dial",
    name="dialogue_domain",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

rc_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

# Train a model for each document

### Preprocessing

In [3]:
import pandas as pd

document_full_df = pd.DataFrame(data=document_dataset)

# key=span_id, value=text
def span_dict_for_doc(doc_id):
    #use pandas as way faster
    document = document_full_df.loc[document_full_df['doc_id'] == doc_id]
    spans_dict = {}
    for span in document['spans'].iloc[0]:
        spans_dict[span['id_sp']]=span['text_sp']

    return spans_dict



In [6]:
from gensim.utils import simple_preprocess
#Getting a list of spans per grounding document !!!! Language confusion: spans are the documents in gensim land and the list of spans (aka the whole document) is the corpora in gensim

#I only want to use documents from the document datasets that have dialogues (which not all have)
unique_doc_ids = list(set(dialogue_dataset['doc_id']))

#extract all the span texts for that doc id,
#!!!Index drama. For list index starts with 0, for spans index starts with 1, use dict to keep the sp_id and doc_id
#key=doc_id, value=span dict

#TODO consider using text-sec instead of spans

def text_sec_doc(doc_id):
    #use pandas as way faster
    document = document_full_df.loc[document_full_df['doc_id'] == doc_id]
    #TODO interestingly the spans also have a text_sect which is split into less small segments and probably better
    text_sec = []
    current_text_sec = ''

    for span in document['spans'].iloc[0]:
        text = span['text_sec']
        if current_text_sec != text: #found new sect
            text_sec.append(text)
            current_text_sec = text
    return text_sec

raw_training_text_per_doc = {}
for doc_id in unique_doc_ids:
    # doc_spans[doc_id] = span_dict_for_doc(doc_id) #for spans
    raw_training_text_per_doc[doc_id] = text_sec_doc(doc_id) #for text_sec

tokenized_training_docs = {}
for doc_id in raw_training_text_per_doc:
    text_sections = raw_training_text_per_doc[doc_id]
    #simplest preprocessing from gemsim
    tokenized_sections = [list(simple_preprocess(sec, deacc=True)) for sec in text_sections]
    tokenized_training_docs[doc_id] = tokenized_sections

list(tokenized_training_docs.keys())[:5]

['Buy or sell a vehicle (transfer ownership)#3_0',
 'Direct Deposit | Social Security Administration#2_0',
 'Benefits Planner: Retirement | Benefits By Year Of Birth | Social Security Administration#1_0',
 'Veterans Vocational Rehabilitation Programs | Veterans Affairs#1_0',
 'Standard Plan | Federal Student Aid#1_0']

### Training

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

models_for_doc = {}
for doc_id in tokenized_training_docs:
    tokenized_sec = tokenized_training_docs[doc_id]
    training_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_sec)]
    # Check how to fine tune the model https://radimrehurek.com/gensim/models/doc2vec.html
    model = Doc2Vec(training_docs, vector_size=10, window=4, min_count=1, workers=4, epochs=30)
    models_for_doc[doc_id] = model

len(models_for_doc)

415

### Predict rc questions

In [8]:
predictions = []
references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id =example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)
    most_likely_answer = raw_training_text_per_doc[doc_id][sims[0][0]]
    # most_likely_answer = training_docs[sims[0][0] - 1]
    # most_likely_predicted_tag = most_likely_answer.tags
    #find original text for tag
    # most_likely_predicted_text = spans[most_likely_predicted_tag[0]-1]
    # print(f': {most_likely_answer.words}\n')
    # print(f'Predicted Answer: {most_likely_answer}\n')

    id_ = example["id"]
    predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

predictions[:5]


[{'id': '9f44c1539efe6f7e79b02eb1b413aa43_1',
  'prediction_text': 'States communicate with each other , so when you move to another state, be sure to tie up any loose ends regarding your New York State license or registration. That means resolving any unanswered tickets, suspensions or revocations, and surrendering your license plates to NYS when you get to your new home state. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_3',
  'prediction_text': 'DMV maintains a point system to track dangerous drivers. Often , motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_5',
  'prediction_text': 'About ten percent of customers visiting a DMV office do not bring what they need to complete their tran

### Evaluate


In [9]:
from datasets import load_metric

print(f'Number of question in train dataset {rc_dataset.shape[0]}')
metric = load_metric("squad_v2")
metric.add_batch(predictions=predictions, references=references)
final_score = metric.compute()
final_score

Number of question in train dataset 20431


{'exact': 0.8467524839704371,
 'f1': 17.2646973229193,
 'total': 20431,
 'HasAns_exact': 0.8467524839704371,
 'HasAns_f1': 17.2646973229193,
 'HasAns_total': 20431,
 'best_exact': 0.8467524839704371,
 'best_exact_thresh': 0.0,
 'best_f1': 17.2646973229193,
 'best_f1_thresh': 0.0}

### How does this compare to guessing at random?


In [10]:
import random

random_predictions = []
random_references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id =example['title']

    #pick a random text from the document
    texts = raw_training_text_per_doc[doc_id]
    most_likely_answer = texts[random.randint(0, len(texts)-1)]

    id_ = example["id"]
    random_predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    random_references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

random_predictions[:3]

[{'id': '9f44c1539efe6f7e79b02eb1b413aa43_1',
  'prediction_text': 'By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_3',
  'prediction_text': 'The $300 DRA fee can be paid in $100 annual installments over three years. Motorists who fail to maintain an updated address with DMV may resolve their tickets with the court, but never receive their DRA assessment because we do not have their new address on record. Failure to pay the DRA will result in a suspended license. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_5',
  'prediction_text': '\n\n5. Not Bringing Proper Documentation to DMV Office \n',
  'no_answer_probability': 0.0}]

In [11]:
print(f'Results for random text predictions for each question')
metric = load_metric("squad_v2")
metric.add_batch(predictions=random_predictions, references=random_references)
final_score = metric.compute()
final_score

Results for random text predictions for each question


{'exact': 1.5075130928491018,
 'f1': 14.794680360709638,
 'total': 20431,
 'HasAns_exact': 1.5075130928491018,
 'HasAns_f1': 14.794680360709638,
 'HasAns_total': 20431,
 'best_exact': 1.5075130928491018,
 'best_exact_thresh': 0.0,
 'best_f1': 14.794680360709638,
 'best_f1_thresh': 0.0}

### Results for validation part of rc dataset

In [None]:
rc_validation_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split="validation",
    ignore_verifications=True,
    cache_dir=cache_dir,
)



In [13]:
predictions = []
references = []
print(f'Number of question in train dataset {rc_validation_dataset.shape[0]}')

for example in rc_validation_dataset:
    question_ = example["question"]
    doc_id =example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)
    most_likely_answer = raw_training_text_per_doc[doc_id][sims[0][0]]
    # most_likely_answer = training_docs[sims[0][0] - 1]
    # most_likely_predicted_tag = most_likely_answer.tags
    #find original text for tag
    # most_likely_predicted_text = spans[most_likely_predicted_tag[0]-1]
    # print(f': {most_likely_answer.words}\n')
    # print(f'Predicted Answer: {most_likely_answer}\n')

    id_ = example["id"]
    predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

predictions[:5]

Number of question in train dataset 3972


KeyError: 'About New York State Inspections#3_0'

### Questions:
- Given that we don't have all the documents to train does that make sense for a Doc2Vec approach or does it make sense
to create a model per document?