###  Load Dataset


In [1]:
from datasets import load_dataset

split = "train"
cache_dir = "./data_cache_doc2vec_solution_spans_notebook"

document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

rc_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Reusing dataset doc2dial (./data_cache_doc2vec_solution_spans_notebook/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)
Reusing dataset doc2dial (./data_cache_doc2vec_solution_spans_notebook/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)
Reusing dataset doc2dial (./data_cache_doc2vec_solution_spans_notebook/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)


# Train a model for each document

### Preprocessing

In [27]:
def span_dict_for_row(spans):
    spans_dict = {}
    for span in spans:
        spans_dict[span['id_sp']] = span['text_sp']

    return spans_dict


In [28]:
from gensim.utils import simple_preprocess
import pandas as pd

#Getting a list of spans per grounding document !!!! Language confusion: spans are the documents in gensim land and the list of spans (aka the whole document) is the corpora in gensim

document_full_df = pd.DataFrame(data=document_dataset)
raw_training_docs_per_doc = {}
for index, row in document_full_df.iterrows():
    raw_training_docs_per_doc[row['doc_id']] = span_dict_for_row(row['spans'])

tokenized_training_docs = {}
for doc_id in raw_training_docs_per_doc:
    spans_dic = raw_training_docs_per_doc[doc_id]
    preprocessed_spans = {}
    for key, value in spans_dic.items():
        #simplest preprocessing from gemsim
        preprocessed_spans[key] = simple_preprocess(value, deacc=True)
    tokenized_training_docs[doc_id] = preprocessed_spans

tokenized_training_docs

{'Benefits Planner: Survivors | Planning For Your Survivors | Social Security Administration#1_0': {'1': ['benefits',
   'planner',
   'survivors',
   'planning',
   'for',
   'your',
   'survivors'],
  '2': ['as', 'you', 'plan', 'for', 'the', 'future'],
  '3': ['you',
   'll',
   'want',
   'to',
   'think',
   'about',
   'what',
   'your',
   'family',
   'would',
   'need',
   'if',
   'you',
   'should',
   'die',
   'now'],
  '4': ['social',
   'security',
   'can',
   'help',
   'your',
   'family',
   'if',
   'you',
   'have',
   'earned',
   'enough',
   'social',
   'security',
   'credits',
   'through',
   'your',
   'work'],
  '5': ['you', 'can', 'earn', 'up', 'to', 'four', 'credits', 'each', 'year'],
  '6': ['in'],
  '7': ['for', 'example'],
  '8': ['you',
   'earn',
   'one',
   'credit',
   'for',
   'each',
   'of',
   'wages',
   'or',
   'self',
   'employment',
   'income'],
  '9': ['when', 'you', 'have', 'earned'],
  '10': ['you',
   'have',
   'earned',
   'your'

### training

In [29]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

models_for_doc = {}
for doc_id in tokenized_training_docs:
    tokenized_span_dic = tokenized_training_docs[doc_id]
    training_docs = [TaggedDocument(span, [key]) for key, span in tokenized_span_dic.items()]
    # Check how to fine tune the model https://radimrehurek.com/gensim/models/doc2vec.html
    model = Doc2Vec(training_docs, vector_size=10, window=4, min_count=1, workers=4, epochs=30)
    models_for_doc[doc_id] = model

len(models_for_doc)

488

### Predict rc questions

In [30]:
predictions = []
references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id = example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)

    spans_dic = raw_training_docs_per_doc[doc_id]
    tag_for_most_likely_answer = sims[0][0]
    most_likely_answer = spans_dic[tag_for_most_likely_answer]

    # print(f'Question: {question_}\n')
    # print(f'Predicted Answer tag {tag_for_answer}: {most_likely_answer}\n')
    # print(f'Correct answer: {example["answers"]}\n')

    id_ = example["id"]
    predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

predictions[:5]


[{'id': '9f44c1539efe6f7e79b02eb1b413aa43_1',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_3',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_5',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate

### Evaluate


In [32]:
from datasets import load_metric

print(f'Number of question in train dataset {rc_dataset.shape[0]}')
metric = load_metric("squad_v2")
metric.add_batch(predictions=predictions, references=references)
final_score = metric.compute()
final_score

Number of question in train dataset 20431


{'exact': 1.4292007243894083,
 'f1': 15.261395155668184,
 'total': 20431,
 'HasAns_exact': 1.4292007243894083,
 'HasAns_f1': 15.261395155668184,
 'HasAns_total': 20431,
 'best_exact': 1.4292007243894083,
 'best_exact_thresh': 0.0,
 'best_f1': 15.261395155668184,
 'best_f1_thresh': 0.0}

### How does this compare to guessing at random?


In [19]:
import random

random_predictions = []
random_references = []
for example in rc_dataset:
    question_ = example["question"]
    doc_id = example['title']

    #pick a random text from the document
    spans = raw_training_docs_per_doc[doc_id]
    most_likely_answer = spans[str(random.randint(1, len(spans)))]

    id_ = example["id"]
    random_predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    random_references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

random_predictions[:3]

[{'id': '9f44c1539efe6f7e79b02eb1b413aa43_1',
  'prediction_text': 'if it persists, your driver license!We suspend 300,000 registrations a year for failure to maintain insurance. ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_3',
  'prediction_text': 'Sign up or log into MyDMV [6 ] ',
  'no_answer_probability': 0.0},
 {'id': '9f44c1539efe6f7e79b02eb1b413aa43_5',
  'prediction_text': '\n\n3. Letting Insurance Lapse \n',
  'no_answer_probability': 0.0}]

In [31]:
print(f'Results for random text predictions for each question')
metric = load_metric("squad_v2")
metric.add_batch(predictions=random_predictions, references=random_references)
final_score = metric.compute()
final_score

Results for random text predictions for each question


{'exact': 1.140423865694288,
 'f1': 11.73321526956814,
 'total': 20431,
 'HasAns_exact': 1.140423865694288,
 'HasAns_f1': 11.73321526956814,
 'HasAns_total': 20431,
 'best_exact': 1.140423865694288,
 'best_exact_thresh': 0.0,
 'best_f1': 11.73321526956814,
 'best_f1_thresh': 0.0}

### Results for validation part of rc dataset

In [33]:
rc_validation_dataset = load_dataset(
    "doc2dial",
    name="doc2dial_rc",
    split="validation",
    ignore_verifications=True,
    cache_dir=cache_dir,
)



Reusing dataset doc2dial (./data_cache_doc2vec_solution_spans_notebook/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)


In [39]:
validation_predictions = []
validation_references = []
print(f'Number of question in validation dataset {rc_validation_dataset.shape[0]}')

for example in rc_validation_dataset:
    question_ = example["question"]
    doc_id = example['title']
    # it does better if the user and agent string are left
    # question_ = question_.replace('user:', '')
    # question_ = question_.replace('agent:', '')

    #preprocess question in the same way
    test_doc = simple_preprocess(question_, deacc=True)
    #calculate vector using model for that document
    model = models_for_doc[doc_id]
    vector = model.infer_vector(test_doc)
    #find the most similar document (spans)
    sims = model.dv.most_similar([vector], topn=1)

    #find what that relates to in the original document
    spans_dic = raw_training_docs_per_doc[doc_id]
    tag_for_most_likely_answer = sims[0][0]
    most_likely_answer = spans_dic[tag_for_most_likely_answer]

    id_ = example["id"]
    validation_predictions.append(
        {'id': id_,
         'prediction_text':
             most_likely_answer,
         'no_answer_probability': 0.0
         }
    )

    #just using their answers
    validation_references.append(
        {
            "id": id_,
            "answers": example["answers"],
        }
    )

validation_predictions[:5]

Number of question in validation dataset 3972


[{'id': 'dea7174409afbfe0af0ace21e7f318ae_1',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate. ',
  'no_answer_probability': 0.0},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_3',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate. ',
  'no_answer_probability': 0.0},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_5',
  'prediction_text': 'motorists convicted of a traffic ticket feel they have resolved all their motoring issues with the local court, but later learn that the Driver Responsibility Assessment DRA is a separate DMV charge based on the total points they accumulate

In [40]:
print(f'Results for validation dataset')
metric = load_metric("squad_v2")
metric.add_batch(predictions=validation_predictions, references=validation_references)
final_score = metric.compute()
final_score

Results for validation dataset


{'exact': 1.2588116817724069,
 'f1': 15.374366183963192,
 'total': 3972,
 'HasAns_exact': 1.2588116817724069,
 'HasAns_f1': 15.374366183963192,
 'HasAns_total': 3972,
 'best_exact': 1.2588116817724069,
 'best_exact_thresh': 0.0,
 'best_f1': 15.374366183963192,
 'best_f1_thresh': 0.0}

### Questions:
- Given that we don't have all the documents to train does that make sense for a Doc2Vec approach or does it make sense
to create a model per document?