In [1]:
import torch
from torch.utils.data import DataLoader

from accelerate import Accelerator, DistributedType
from datasets import load_dataset, load_metric
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from tqdm.auto import tqdm

import datasets
import transformers
     

In [2]:
from config import model_checkpoint 

In [3]:
# Load the "mnli" configuration of the "glue" dataset; the result holds every
# split that the configuration provides.
raw_datasets = load_dataset(path="glue", name="mnli")

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [4]:
# Show the loaded object to see which splits exist, their columns, and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [5]:

# Look at the first training row to see the raw fields of one example
# (premise, hypothesis, integer label, idx).
raw_datasets["train"][0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}

In [6]:

import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``datasets.ClassLabel`` are shown with
    their class names instead of integer ids. Values that are not a valid
    class index (for example the ``-1`` used for unlabeled rows) are left
    unchanged rather than being translated to a wrong name.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            positions (the result is handed to ``pd.DataFrame``), and exposes
            a ``features`` mapping of column name to feature type.
        num_examples: How many distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Draw distinct row positions without replacement. ``max`` keeps a
    # negative count behaving as "show nothing" instead of raising.
    picks = random.sample(range(len(dataset)), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            names = typ.names
            # Guard the lookup: a plain ``names[i]`` would turn -1 into the
            # last class name through negative indexing.
            df[column] = df[column].map(
                lambda i, names=names: names[i] if 0 <= i < len(names) else i
            )
    display(HTML(df.to_html()))

# Preview randomly chosen training rows (num_examples defaults to 10) with
# class-label columns shown by name.
show_random_elements(raw_datasets["train"])

Unnamed: 0,premise,hypothesis,label,idx
0,"In the $125,000-$40,000 scenario, for instance, a deficit-neutral solution would increase their taxes by $2,000 or so (compared with now) if they remain single and cut their taxes by about $550 if they get married.",Taxes for married people will increase.,contradiction,133058
1,"The Knesset, Israel's parliament, meets in a modern building decorated with major works by Chagall and other artists.",The Knesset holds their meetings in a structure decorated by Chagall's works.,entailment,384168
2,"He sat and sipped his tea, idly spouting off technical specs and humorous anecdotes about my life- anecdotes which never happened.",He drank his tea and talked about the funny things that happened to him in college.,neutral,213938
3,Users are those who operate or rely on agency information resources and include the managers and staff responsible for agency policies and programs supported by the acquisition.,Users are those who operate on agency information resources and include the managers.,entailment,101606
4,so but uh of course my husband did everything except my brother's a trim carpenter and he came in you know and did the inside for us and that helped and and uh yes and uh we had to hire course the plumbing and the brick and everything else nothing you know he did everything else,We let professionals handle all of the work on the construction.,contradiction,341465
5,"She is perfectly sane, if that is what you mean.",The woman states that the man is not mentally ill.,contradiction,66631
6,it doesn't matter to me,I have no opinions on it.,neutral,119855
7,Long-Term Simulations,The computer simulated long-term performance.,entailment,177279
8,"From Agde, take the D51 to Marseillan, another ancient port, initially settled by the Phoenicians.",The D51 will bring you from Agde to Marseillan.,entailment,213292
9,Gingrich rose to fame by destroying a powerful Democrat (House Speaker Jim Wright),Gingrich berated Jim Wright for his extramarital affair and his indecisiveness.,neutral,328157


In [7]:
# get_tokenizer is defined in the project-local config module (not shown here).
# NOTE(review): presumably it builds the tokenizer for model_checkpoint — confirm in config.
from config import get_tokenizer
tokenizer = get_tokenizer()

In [8]:
# Sanity check: pass two texts so they are encoded together as one sequence;
# the result carries input_ids and attention_mask.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")


{'input_ids': [0, 31414, 6, 42, 65, 3645, 328, 2, 2, 2409, 42, 3645, 1411, 19, 24, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}