<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/expanded_datasets_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/expanded_datasets_EDA.ipynb)

# Exploring Datasets

## Downloading Datasets
Download the datasets using Hugging Face's `datasets` library.

In [64]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import json

from pprint import pprint

In [65]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
from datasets import list_datasets, load_dataset_builder, get_dataset_config_names, load_dataset, load_from_disk

In [68]:
def summarize_dataset(dataset, config=None):
    """Print the description and feature schema of a dataset.

    A builder is obtained with ``load_dataset_builder(dataset, config)``; its
    ``info.description`` is printed, followed by a pretty-printed
    ``info.features``.

    Args:
        dataset: Dataset identifier forwarded to ``load_dataset_builder``.
        config: Optional configuration name, forwarded as the second
            positional argument. Defaults to ``None``.

    Returns:
        None. Everything is written to standard output.
    """
    metadata = load_dataset_builder(dataset, config).info
    print(f"Description:\n {metadata.description}")
    print("Features:")
    pprint(metadata.features)

In [69]:
dataset_list = list_datasets()

In [70]:
[dataset for dataset in dataset_list if "squad" in dataset]

['iapp_wiki_qa_squad',
 'squad',
 'squad_adversarial',
 'squad_es',
 'squad_it',
 'squad_kor_v1',
 'squad_kor_v2',
 'squad_v1_pt',
 'squad_v2',
 'squadshifts',
 'thaiqa_squad',
 'GEM/squad_v2',
 'Sabokou/qg_squad_modified',
 'Sabokou/qg_squad_modified_dev',
 'Tevatron/wikipedia-squad-corpus',
 'Tevatron/wikipedia-squad',
 'adamlin/coqa_squad',
 'lmqg/qg_squad',
 'caltonji/harrypotter_squad_v2',
 'caltonji/harrypotter_squad_v2_2',
 'dweb/squad_with_cola_scores',
 'lhoestq/custom_squad',
 'lhoestq/squad',
 'lijingxin/squad_zen',
 'lijingxin/squad_zh_1',
 'lincoln/newsquadfr',
 'philschmid/test_german_squad',
 'phoelti/squad_dev',
 'piEsposito/squad_20_ptbr',
 'qwant/squad_fr',
 'shivmoha/squad-unanswerable',
 'shivmoha/squad_adversarial_manual',
 'susumu2357/squad_v2_sv',
 'vershasaxena91/squad_multitask',
 'yuvalkirstain/quality_squad',
 'yuvalkirstain/quality_squad_debug',
 'yuvalkirstain/squad_full_doc',
 'yuvalkirstain/squad_seq2seq',
 'yuvalkirstain/squad_t5',
 'z-uo/squad-it',
 'Ni

In [8]:
[dataset for dataset in dataset_list if "trivia_qa" in dataset]

['trivia_qa']

In [9]:
[dataset for dataset in dataset_list if "natural" in dataset]

['natural_questions',
 'vasudevgupta/bigbird-tokenized-natural-questions',
 'vasudevgupta/natural-questions-validation',
 'LeboNLP/toxic-natural-utterances',
 'pscotti/naturalscenesdataset',
 'Veldrovive/split-webdataset-naturalscenes']

In [10]:
[dataset for dataset in dataset_list if "quac" in dataset]

['quac', 'Zaid/quac_expanded']

### SQuAD

In [71]:
print (get_dataset_config_names("squad"))

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.17k [00:00<?, ?B/s]

['plain_text']


In [72]:
summarize_dataset("squad")

Description:
 Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.

Features:
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None)}


In [73]:
# SQuAD is quick to download from Hugging Face.
# Uncomment the line below if you aren't accessing the data from the shared
# Google Drive folder.

# data_squad = load_dataset("squad")

# The following code assumes you have added a link to the shared
# "w266 NLP Final Project" folder in your Google Drive folder.
# Loading data from there is faster.

data_squad = load_from_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/squad.hf")

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [74]:
(type (data_squad))

datasets.dataset_dict.DatasetDict

In [75]:
# data_squad.save_to_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/squad.hf")

⚠️ WARNING ⚠️ — TriviaQA is a large dataset: downloading and pre-processing it generates about 17.4 GB and takes roughly 25 minutes.

### TriviaQA*

In [None]:
pprint(get_dataset_config_names("trivia_qa"))

Downloading builder script:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/41.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

['rc',
 'rc.nocontext',
 'unfiltered',
 'unfiltered.nocontext',
 'rc.web',
 'rc.web.nocontext',
 'rc.wikipedia',
 'rc.wikipedia.nocontext']


In [None]:
summarize_dataset("trivia_qa", "rc")

Description:
 TriviaqQA is a reading comprehension dataset containing over 650K
question-answer-evidence triples. TriviaqQA includes 95K question-answer
pairs authored by trivia enthusiasts and independently gathered evidence
documents, six per question on average, that provide high quality distant
supervision for answering the questions.

Features:
{'answer': {'aliases': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
            'matched_wiki_entity_name': Value(dtype='string', id=None),
            'normalized_aliases': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
            'normalized_matched_wiki_entity_name': Value(dtype='string', id=None),
            'normalized_value': Value(dtype='string', id=None),
            'type': Value(dtype='string', id=None),
            'value': Value(dtype='string', id=None)},
 'entity_pages': Sequence(feature={'doc_source': Value(dtype='string', id=None), 'filename': Value(dtype='string', id=None), '

In [None]:
# TriviaQA downloads and pre-processes to generate 17.4 GB -- it takes 25 mins total.
# Uncomment the next line only if you need to download it from Hugging Face:
# data_trivia = load_dataset("trivia_qa", "rc")

# Otherwise, load the copy previously saved under the Google Drive project folder.
data_trivia = load_from_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/trivia_qa_rc.hf")

In [None]:
(type (data_trivia))

datasets.dataset_dict.DatasetDict

In [None]:
# data_trivia.save_to_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/trivia_qa_rc.hf")

### Natural Questions**

In [None]:
pprint(get_dataset_config_names("natural_questions"))

Downloading builder script:   0%|          | 0.00/9.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

['default', 'dev']


In [None]:
summarize_dataset("natural_questions", "default")

Description:
 
The NQ corpus contains questions from real users, and it requires QA systems to
read and comprehend an entire Wikipedia article that may or may not contain the
answer to the question. The inclusion of real user questions, and the
requirement that solutions should read an entire page to find the answer, cause
NQ to be a more realistic and challenging task than prior QA datasets.

Features:
{'annotations': Sequence(feature={'id': Value(dtype='string', id=None), 'long_answer': {'start_token': Value(dtype='int64', id=None), 'end_token': Value(dtype='int64', id=None), 'start_byte': Value(dtype='int64', id=None), 'end_byte': Value(dtype='int64', id=None), 'candidate_index': Value(dtype='int64', id=None)}, 'short_answers': Sequence(feature={'start_token': Value(dtype='int64', id=None), 'end_token': Value(dtype='int64', id=None), 'start_byte': Value(dtype='int64', id=None), 'end_byte': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None)}, length=-1, id=None), '

In [None]:
# Natural Questions is too big for memory at 138 GB
# data_nq = load_dataset("natural_questions")

### QuAC

In [None]:
pprint(get_dataset_config_names("quac"))

Downloading builder script:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.2k [00:00<?, ?B/s]

['plain_text']


In [None]:
summarize_dataset("quac")

Description:
 Question Answering in Context is a dataset for modeling, understanding,
and participating in information seeking dialog. Data instances consist
of an interactive dialog between two crowd workers: (1) a student who
poses a sequence of freeform questions to learn as much as possible
about a hidden Wikipedia text, and (2) a teacher who answers the questions
by providing short excerpts (spans) from the text. QuAC introduces
challenges not found in existing machine comprehension datasets: its
questions are often more open-ended, unanswerable, or only meaningful
within the dialog context.

Features:
{'answers': Sequence(feature={'texts': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'answer_starts': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}, length=-1, id=None),
 'background': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'dialogue_id': Value(dtype='string', id=None),
 'followups': Sequence(featur

In [None]:
# Question Answering in Context (QuAC) downloads 136 MB and is quick.
# Uncomment the next line to download it from Hugging Face:
# data_quac = load_dataset("quac")

# Otherwise, load the copy previously saved under the Google Drive project folder.
data_quac = load_from_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/quac.hf")

In [None]:
(type (data_quac))

datasets.dataset_dict.DatasetDict

In [None]:
# data_quac.save_to_disk("/content/drive/MyDrive/w266 NLP Final Project/Data/quac.hf")

## Getting Familiar

### SQuAD

In [76]:
data_squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [77]:
data_squad['train'].info.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

In [83]:
# Look at first example
pprint(data_squad['train'][0])

{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the '
            "Main Building's gold dome is a golden statue of the Virgin Mary. "
            'Immediately in front of the Main Building and facing it, is a '
            'copper statue of Christ with arms upraised with the legend '
            '"Venite Ad Me Omnes". Next to the Main Building is the Basilica '
            'of the Sacred Heart. Immediately behind the basilica is the '
            'Grotto, a Marian place of prayer and reflection. It is a replica '
            'of the grotto at Lourdes, France where the Virgin Mary reputedly '
            'appeared to Saint Bernadette Soubirous in 1858. At the end of the '
            'main drive (and in a direct line that connects through 3 statues '
            'and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did t

In [86]:
# Shuffle the training split with a fixed seed and keep the first `count`
# examples, then tabulate them.
count = 25
sample = data_squad['train'].shuffle(seed=1962).select(range(count))
df = pd.DataFrame({
    # Only the first answer text listed for each example is kept.
    'answer': [entry['text'][0] for entry in sample['answers']],
    'context': sample['context'],
    'question': sample['question'],
})



In [87]:
df

Unnamed: 0,answer,context,question
0,biotech companies,"Prior to moving its headquarters to Chicago, a...",What type of businesses did Nickles want to at...
1,Tytus Woyciechowski,Four boarders at his parents' apartments becam...,To whom did Chopin reveal in letters which par...
2,the Endangered Species Committee,The question to be answered is whether a liste...,"If a species may be harmed, who holds final sa..."
3,China,"In Asian countries such as China, Korea, and J...",What country has the dog as part of its 12 ani...
4,45 years,Saint Athanasius of Alexandria (/ˌæθəˈneɪʃəs/;...,How long did his episcopate last?
5,"Cold War, First Gulf War, Kosovo War","Since 1947, Canadian military units have parti...",What are some of the wars the Canadian Militar...
6,Buddha,Tibet has various festivals that are commonly ...,What is worshipped during Tibet's various fest...
7,9.3%,"From 2001 to 2008, Mac sales increased continu...",What was Apples market share of all computer s...
8,Improvisation,Improvisation stands at the centre of Chopin's...,What is central to Chopin's process?
9,1861,"Alfred North Whitehead was born in Ramsgate, K...",What year was Whitehead born?


### TriviaQA

In [None]:
data_trivia

DatasetDict({
    train: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
        num_rows: 138384
    })
    validation: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
        num_rows: 17944
    })
    test: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
        num_rows: 17210
    })
})

In [None]:
data_trivia['train'].info.features

{'question': Value(dtype='string', id=None),
 'question_id': Value(dtype='string', id=None),
 'question_source': Value(dtype='string', id=None),
 'entity_pages': Sequence(feature={'doc_source': Value(dtype='string', id=None), 'filename': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'wiki_context': Value(dtype='string', id=None)}, length=-1, id=None),
 'search_results': Sequence(feature={'description': Value(dtype='string', id=None), 'filename': Value(dtype='string', id=None), 'rank': Value(dtype='int32', id=None), 'title': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'search_context': Value(dtype='string', id=None)}, length=-1, id=None),
 'answer': {'aliases': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  'normalized_aliases': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  'matched_wiki_entity_name': Value(dtype='string', id=None),
  'normalized_matched_wiki_entity_name': Value(dtyp

In [None]:
# Look at one example from the train split (index 5, not the first record)
pprint(data_trivia['train'][5])

{'answer': {'aliases': ['Chicago Bears',
                        'Chicago Staleys',
                        'Decatur Staleys',
                        'Chicago Bears football',
                        'Chicago bears',
                        'Save Da Planet',
                        'Chicago Gators'],
            'matched_wiki_entity_name': '',
            'normalized_aliases': ['chicago bears',
                                   'chicago staleys',
                                   'chicago gators',
                                   'decatur staleys',
                                   'save da planet',
                                   'chicago bears football'],
            'normalized_matched_wiki_entity_name': '',
            'normalized_value': 'chicago bears',
            'type': 'WikipediaEntity',
            'value': 'Chicago Bears'},
 'entity_pages': {'doc_source': ['TagMe'],
                  'filename': ['Super_Bowl_XX.txt'],
                  'title': ['Super Bowl XX'],
 

In [None]:
# Look at another example
pprint(data_trivia['train'][1962])

{'answer': {'aliases': ['Hydrogen carbide',
                        'Metane',
                        'Carbon tetrahydride',
                        'CH₄',
                        'Liquid methane',
                        'CH4 (disambiguation)',
                        'Methane plume',
                        'Marsh Gas',
                        'Methane gas',
                        'Carburetted hydrogen',
                        'Ch4',
                        'Liquid methane rocket fuel',
                        'Methyl hydride',
                        'Methan',
                        'CH4',
                        'Marsh gas,firedamp',
                        'Methane'],
            'matched_wiki_entity_name': '',
            'normalized_aliases': ['ch₄',
                                   'methyl hydride',
                                   'ch4 disambiguation',
                                   'metane',
                                   'carburetted hydrogen',
               

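### Natural Questions

Natural Questions was not loaded in this notebook (it is too big for memory at 138 GB), so there is nothing to explore in this section.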

### QuAC

In [None]:
data_quac

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'wikipedia_page_title', 'background', 'section_title', 'context', 'turn_ids', 'questions', 'followups', 'yesnos', 'answers', 'orig_answers'],
        num_rows: 11567
    })
    validation: Dataset({
        features: ['dialogue_id', 'wikipedia_page_title', 'background', 'section_title', 'context', 'turn_ids', 'questions', 'followups', 'yesnos', 'answers', 'orig_answers'],
        num_rows: 1000
    })
})

In [None]:
data_quac['train']

Dataset({
    features: ['dialogue_id', 'wikipedia_page_title', 'background', 'section_title', 'context', 'turn_ids', 'questions', 'followups', 'yesnos', 'answers', 'orig_answers'],
    num_rows: 11567
})

In [None]:
# Look at one example from the train split (index 12, not the first record)
pprint(data_quac['train'][12])

{'answers': {'answer_starts': [[46], [391], [2887], [2887]],
             'texts': [['May 25, 1803,'],
                       ['John Clarke,'],
                       ['CANNOTANSWER'],
                       ['CANNOTANSWER']]},
 'background': 'Ralph Waldo Emerson (May 25, 1803 - April 27, 1882) was an '
               'American essayist, lecturer, philosopher and poet who led the '
               'transcendentalist movement of the mid-19th century. He was '
               'seen as a champion of individualism and a prescient critic of '
               'the countervailing pressures of society, and he disseminated '
               'his thoughts through dozens of published essays and more than '
               '1,500 public lectures across the United States. Emerson '
               'gradually moved away from the religious and social beliefs of '
               'his contemporaries, formulating and expressing the philosophy '
               'of transcendentalism in his 1836 essay "Nature". 