In [1]:
import torch 
from transformers import AutoTokenizer, T5EncoderModel, T5Config, T5Model, T5ForConditionalGeneration

In [2]:
encoder = T5EncoderModel.from_pretrained('t5-small')
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
print(f"Number of parameters in model: {model.num_parameters()/1000000}M")

Number of parameters in model: 60.506624M


In [40]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
encoder_outputs = model.encoder(**inputs)
print(encoder_outputs.keys())
print(encoder_outputs.last_hidden_state.shape)
decoder_inputs = {k:model._shift_right(v) for k, v in inputs.items()}
decoder_outputs = model.decoder(input_ids=decoder_inputs["input_ids"], encoder_hidden_states=encoder_outputs.last_hidden_state)
print(decoder_outputs.keys())
logits_ = model.lm_head(decoder_outputs.last_hidden_state * (model.model_dim**-0.5)) 
print(logits_[0])
outputs = model(**inputs, labels=inputs.input_ids)
print(outputs.keys())
print(outputs.logits[0])
print(outputs.logits.shape)

odict_keys(['last_hidden_state'])
torch.Size([1, 7, 512])
odict_keys(['last_hidden_state', 'past_key_values'])
tensor([[-11.4802,  -6.3112,  -8.8519,  ..., -40.1647, -40.2136, -40.1873],
        [-24.6288,  -8.8084,  -9.6085,  ..., -43.8241, -43.8325, -43.7878],
        [-25.8387,  -6.6168, -10.3323,  ..., -40.9006, -40.9082, -40.9096],
        ...,
        [-28.0055,  -6.4625, -10.2796,  ..., -46.8775, -46.9684, -46.9990],
        [-26.3766,  -7.5304, -11.4456,  ..., -44.3888, -44.4481, -44.4851],
        [-28.7336,  -3.8104, -10.0482,  ..., -46.8083, -46.8263, -46.8783]],
       grad_fn=<SelectBackward0>)
odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])
tensor([[-11.4802,  -6.3112,  -8.8519,  ..., -40.1647, -40.2136, -40.1873],
        [-24.6288,  -8.8084,  -9.6085,  ..., -43.8241, -43.8325, -43.7878],
        [-25.8387,  -6.6168, -10.3323,  ..., -40.9006, -40.9082, -40.9096],
        ...,
        [-28.0055,  -6.4625, -10.2796,  ..., -46.8775, -46.9684, 

In [15]:
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
# print num params human readable
print(f"Number of parameters in model: {model.num_parameters()/1000000}M")

Number of parameters in model: 14.350248M


In [16]:
# gpt2 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
print(f"Number of parameters in model: {model.num_parameters()/1000000}M")

Number of parameters in model: 124.439808M


## Bert
Let's check out distilbert

In [32]:
inputs.attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1]])

In [48]:
outputs["pooler_output"].shape

torch.Size([1, 312])

In [43]:
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
model = AutoModel.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
# print num params human readable
print(f"Number of parameters in model: {model.num_parameters()/1000000}M")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state)    
model2 = AutoModelForMaskedLM.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
outputs2 = model2(**inputs, labels=inputs["input_ids"])
print(outputs2.logits)
model3 = AutoModelForCausalLM.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", is_decoder=True)
outputs3 = model3(**inputs, labels=inputs["input_ids"])
print(outputs3.logits)

Number of parameters in model: 14.350248M
tensor([[[-0.3632,  0.1912, -0.1566,  ..., -0.0675, -0.0154,  0.0712],
         [-0.1966, -0.0148,  0.2135,  ..., -0.5316,  0.3616, -0.2079],
         [-0.0606, -0.0285, -0.0152,  ...,  0.0132, -0.8960,  0.3656],
         ...,
         [-0.3456,  0.2578, -0.2698,  ...,  0.0631,  0.2830,  0.3263],
         [-0.1478,  0.3253, -0.1482,  ...,  0.0215,  0.9744,  0.1024],
         [-0.0326, -0.2293, -0.0655,  ..., -0.2141, -0.5240,  0.3316]]],
       grad_fn=<NativeLayerNormBackward0>)
tensor([[[-0.0233,  0.3683,  0.1870,  ...,  0.3800,  0.7760,  0.3710],
         [-0.0709, -0.1214,  0.0082,  ..., -0.1908,  0.2418, -0.2694],
         [-0.0662, -0.7025, -0.5034,  ..., -0.8311, -0.4651, -0.0320],
         ...,
         [-0.0476, -0.4128, -0.1295,  ..., -0.3573,  0.1723, -0.0999],
         [ 0.0017,  0.2796,  0.6235,  ...,  0.4129,  0.6510, -0.1558],
         [ 0.0803,  0.7502,  0.5556,  ...,  0.0634, -0.0539,  0.0026]]],
       grad_fn=<ViewBackward0>)

odict_keys(['last_hidden_state', 'pooler_output'])

## GPT2
Let's check out gpt2

## T5
Let's check out T5

In [34]:
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

encoder_ouputs = encoder(input_ids)

In [36]:
print(outputs.encoder_last_hidden_state)
print(encoder_ouputs.last_hidden_state)

tensor([[[-0.0235,  0.0490, -0.1531,  ..., -0.1444, -0.0982, -0.0988],
         [ 0.2134,  0.0752, -0.2040,  ..., -0.1085, -0.1076, -0.4276],
         [-0.1410,  0.0597,  0.2624,  ...,  0.1803,  0.2338, -0.1288],
         ...,
         [-0.1496, -0.0140, -0.2103,  ...,  0.0344, -0.0903,  0.1334],
         [-0.2262,  0.0248, -0.0235,  ..., -0.1075,  0.0131,  0.0612],
         [ 0.2149,  0.0967, -0.0164,  ..., -0.0685,  0.1851,  0.0746]]],
       grad_fn=<MulBackward0>)
tensor([[[-0.0235,  0.0490, -0.1531,  ..., -0.1444, -0.0982, -0.0988],
         [ 0.2134,  0.0752, -0.2040,  ..., -0.1085, -0.1076, -0.4276],
         [-0.1410,  0.0597,  0.2624,  ...,  0.1803,  0.2338, -0.1288],
         ...,
         [-0.1496, -0.0140, -0.2103,  ...,  0.0344, -0.0903,  0.1334],
         [-0.2262,  0.0248, -0.0235,  ..., -0.1075,  0.0131,  0.0612],
         [ 0.2149,  0.0967, -0.0164,  ..., -0.0685,  0.1851,  0.0746]]],
       grad_fn=<MulBackward0>)


In [41]:
decoder_outputs = model.decoder(input_ids=input_ids, encoder_hidden_states=encoder_ouputs.last_hidden_state)
print(model.lm_head(decoder_outputs.last_hidden_state))
print(outputs.logits)

tensor([[[ -818.2573,  -210.1928,  -336.9014,  ..., -1344.4564,
          -1345.2698, -1345.4575],
         [ -999.7817,  -271.1569,  -446.1989,  ..., -1461.3430,
          -1461.7225, -1461.9854],
         [-1119.0250,  -337.6578,  -511.1584,  ..., -1698.6843,
          -1702.1517, -1700.9644],
         ...,
         [-1058.8584,  -303.8443,  -486.7072,  ..., -1663.9822,
          -1671.6787, -1671.0627],
         [ -985.0807,  -206.8462,  -393.3148,  ..., -1544.1208,
          -1549.5402, -1548.0869],
         [ -673.4078,   -71.0885,  -243.6629,  ..., -1063.8423,
          -1067.3953, -1066.8198]]], grad_fn=<UnsafeViewBackward0>)
tensor([[[-36.1622,  -9.2893, -14.8891,  ..., -59.4171, -59.4531, -59.4614],
         [-44.1845, -11.9835, -19.7194,  ..., -64.5828, -64.5996, -64.6112],
         [-49.4544, -14.9225, -22.5902,  ..., -75.0719, -75.2252, -75.1727],
         ...,
         [-46.7954, -13.4281, -21.5096,  ..., -73.5383, -73.8785, -73.8512],
         [-43.5348,  -9.1414, -17.382

In [43]:
outputs.logits.shape

torch.Size([1, 7, 32128])

In [31]:
decoder_outputs = model.decoder(encoder_hidden_states=encoder_ouputs.last_hidden_state, input_ids=input_ids)
decoder_outputs.last_hidden_state
model.lm_head(decoder_outputs.last_hidden_state)

tensor([[[ -818.2573,  -210.1928,  -336.9014,  ..., -1344.4564,
          -1345.2698, -1345.4575],
         [ -999.7817,  -271.1569,  -446.1989,  ..., -1461.3430,
          -1461.7225, -1461.9854],
         [-1119.0250,  -337.6578,  -511.1584,  ..., -1698.6843,
          -1702.1517, -1700.9644],
         ...,
         [-1058.8584,  -303.8443,  -486.7072,  ..., -1663.9822,
          -1671.6787, -1671.0627],
         [ -985.0807,  -206.8462,  -393.3148,  ..., -1544.1208,
          -1549.5402, -1548.0869],
         [ -673.4078,   -71.0885,  -243.6629,  ..., -1063.8423,
          -1067.3953, -1066.8198]]], grad_fn=<UnsafeViewBackward0>)

# Final setup

In [79]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')
text_encoder_model = T5EncoderModel.from_pretrained('t5-small')
text_decoder_model = T5ForConditionalGeneration.from_pretrained('t5-small', is_decoder=True, add_cross_attention=True)
inputs = tokenizer("Hello, my dog is cute but", return_tensors="pt")
outputs = text_encoder_model(**inputs)
outputs.last_hidden_state.shape
# print(outputs.last_hidden_state)
outputs = text_decoder_model(encoder_outputs=outputs, decoder_input_ids=inputs["input_ids"])
outputs.keys()
# print(outputs.encoder_last_hidden_state)
print(outputs.logits)

tensor([[[-36.0586, -11.3399, -15.3354,  ..., -60.5524, -60.5518, -60.5525],
         [-45.0051, -13.2414, -20.2115,  ..., -65.1386, -65.1400, -65.1418],
         [-50.9481, -16.4054, -23.7251,  ..., -77.4390, -77.5915, -77.5295],
         ...,
         [-47.4272, -12.9864, -19.9328,  ..., -73.8682, -74.1522, -74.0732],
         [-36.7683,  -5.9688, -14.8783,  ..., -55.6205, -55.8629, -55.8130],
         [-30.0457,  -3.7978, -11.6187,  ..., -47.4286, -47.6093, -47.5897]]],
       grad_fn=<UnsafeViewBackward0>)


In [85]:
gen = text_decoder_model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(tokenizer.decode(gen[0]))

<pad> Hello, Hello, my dog is cute but</s>


In [88]:
full_model = T5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer("Fill: I am ...", return_tensors="pt")
gen = full_model.generate(input_ids=inputs["input_ids"], max_new_tokens=20)
print(tokenizer.decode(gen[0]))

<pad> Fill: I am...</s>


In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
gen = model.generate(input_ids=inputs["input_ids"], max_length=20)
print(tokenizer.decode(gen[0]))

In [1]:
from transformers import T5TokenizerFast
tokenizer = T5TokenizerFast.from_pretrained('t5-small')


In [6]:
tokenizer(["abc stride", "def"], return_tensors="pt", padding=True)

{'input_ids': tensor([[ 703,   75, 5765, 1599,    1],
        [  20,   89,    1,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0]])}

Test out tokenization of dataset

In [39]:
from transformers import T5TokenizerFast
tokenizer = T5TokenizerFast.from_pretrained('t5-small')
print(tokenizer.eos_token_id, tokenizer.pad_token_id)

1 0


In [47]:
tokenizer.batch_decode(tokenizer.encode("<pad>", return_tensors="pt"), skip_special_tokens=True)

['']

In [40]:
from kbgen.data.datasets import GSM
dataset = GSM(tokenizer="t5-small")

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [21]:
{k:v for k, v in dataset.categories_str_to_id["phone.oem"].items() if v in [11, 95, 18, 81, 30 ,57]}

{'i-mobile': 11,
 'LG': 18,
 'Oppo': 30,
 'VK Mobile': 57,
 'Archos': 81,
 'Karbonn': 95}

In [25]:
for item_from_token_decoder, item_from_prototypes in zip(proto_str_to_id[field], examples):
    assert item_from_token_decoder == item_from_prototypes, f"{item_from_token_decoder.replace(' ', '.')} != {item_from_prototypes.replace(' ', '.')}"

In [5]:
list(dataset.categories_str_to_id["phone.network_edge"].items())[:4], len(dataset.categories_str_to_id["phone.network_edge"])

([('', 0), ('No', 1), ('Yes', 2), ('Class 6', 3)], 51)

In [24]:
dataset.categories_str_to_id["phone.network_edge"]

{'': 0,
 'No': 1,
 'Yes': 2,
 'Class 6': 3,
 'Class 10': 4,
 'Up to 236.8 kbps': 5,
 'Class 33': 6,
 'Class 12': 7,
 'Yes (SIM 1 only)': 8,
 'Class 32': 9,
 'Up to 560 kbps': 10,
 'Up to 384 kbps': 11,
 'Yes - 3G model': 12,
 'Class 12 (T-Mobile)': 13,
 'Up to 237 kbps': 14,
 'Class 11': 15,
 'Up to 236 kbps': 16,
 'Class 12, 296 / 177.6 kbits': 17,
 'Class 4': 18,
 'Up to 60 kbps': 19,
 'Up to 296 kbps': 20,
 'Class B': 21,
 'Class 6 (Up to 177.6 kbps)': 22,
 'Class 32, 296 / 177.6 kbits': 23,
 'Class 32, 296 kbits': 24,
 'Class 32, 296 / 178.8 kbits': 25,
 'Class 32, 296 kbps': 26,
 'Class 6 (downlink only)': 27,
 'Class 32, 296 kbps; DTM Class 11, 177 kbps': 28,
 'Class 32, 296 kbps; DTM Class 11, 178.8 kbps': 29,
 'Yes, DL only': 30,
 'Class 8': 31,
 'Class 32, 236.8 kbits': 32,
 'Class 32, up to 177 kbits': 33,
 'Class 11, 236.8 kbps': 34,
 'Class 32, 296 kbps; DTM Class 11, 236.8 kbps': 35,
 'Yes, 118.4 kbps': 36,
 'Up to 480 kbps': 37,
 'Yes - SCH-I605, SPH-L900': 38,
 'Yes -SGH