In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
from datasets import load_dataset

In [2]:
# Fetch the MRPC configuration of the GLUE benchmark (downloaded on first use, cached afterwards).
raw_datasets = load_dataset('glue', name='mrpc')

Downloading: 28.8kB [00:00, 5.77MB/s]                   
Downloading: 28.7kB [00:00, 7.18MB/s]                   
Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to C:\Users\bravi\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...
Downloading: 6.22kB [00:00, 2.08MB/s]
Downloading: 1.05MB [00:01, 692kB/s]
Downloading: 441kB [00:00, 464kB/s]
Dataset glue downloaded and prepared to C:\Users\bravi\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [3]:
# Show the available splits together with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 408
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 1725
    })
})

In [37]:
# The split names are the keys of the loaded dataset dictionary.
raw_datasets.keys()

dict_keys(['train', 'validation', 'test'])

In [8]:
# Keep a handle on the training split for the inspection cells that follow.
raw_train_ds = raw_datasets['train']

In [11]:
# Show the training split's column names and row count.
raw_train_ds

Dataset({
    features: ['idx', 'label', 'sentence1', 'sentence2'],
    num_rows: 3668
})

In [12]:
# Inspect the type of each column, including the class names behind the integer `label`.
raw_train_ds.features


{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}

In [15]:
# Integer indexing returns a single example as a dict keyed by column name.
raw_train_ds[0]

{'idx': 0,
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}

In [39]:
# Slice the rows first and then pick the column: this reads only five rows,
# whereas selecting the column first materialises all of its values before slicing.
raw_train_ds[:5]['sentence1']

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [19]:
# Slice indexing returns a dict of columns, each holding a list with one entry per selected row.
raw_train_ds[:5]

{'idx': [0, 1, 2, 3, 4],
 'label': [1, 0, 1, 0, 1],
 'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
  'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
  'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
  'Tab shares jumped 20

#### Tokenizer with a pair of sentences

In [21]:
# Name of the pretrained checkpoint; the tokenizer is loaded from this same identifier.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [34]:
# Passing two strings encodes them together as one sentence pair.
inputs = tokenizer(
    'This is the first sentence.',
    'This is the second sentence.',
)
print(inputs)

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 6251, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [35]:
# Map the ids back to token strings to see the tokens added around and between the two sentences.
print(
    tokenizer.convert_ids_to_tokens(inputs['input_ids'])
)

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'sentence', '.', '[SEP]']


In [40]:
def tokenize_dataset(dataset):
    """Tokenize the sentence pairs of one dataset split.

    Reads the ``sentence1`` and ``sentence2`` columns of ``dataset`` and passes
    them together to the notebook-level ``tokenizer``, with padding and
    truncation enabled and NumPy output requested.

    Args:
        dataset: Object indexable by the column names ``'sentence1'`` and
            ``'sentence2'``.

    Returns:
        The ``.data`` attribute of the tokenizer's result.
    """
    sentence_pair = (dataset['sentence1'], dataset['sentence2'])
    batch = tokenizer(
        *sentence_pair,
        padding=True,
        truncation=True,
        return_tensors='np',
    )
    return batch.data

In [42]:
# Tokenize every split, keeping the result keyed by split name.
tokenized_dataset = {
    split_name: tokenize_dataset(split_ds)
    for split_name, split_ds in raw_datasets.items()
}

In [48]:
# Display the tokenized arrays for each split.
tokenized_dataset

{'train': {'input_ids': array([[  101,  2572,  3217, ...,     0,     0,     0],
         [  101,  9805,  3540, ...,     0,     0,     0],
         [  101,  2027,  2018, ...,     0,     0,     0],
         ...,
         [  101,  1000,  2057, ...,     0,     0,     0],
         [  101,  1996, 26828, ...,     0,     0,     0],
         [  101,  1996,  2382, ...,     0,     0,     0]]),
  'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]])},
 'validation': {'input_ids': array([[  101,  2002,  2056, ...,     0,     0,     0],
         [  101, 20201, 22948, ...,     0,     0,   