In [15]:
import datasets
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

In [5]:
# fetch the raw WikiText-2 corpus from the Hugging Face hub
# (the library caches it locally, so later calls reuse the download)
wikidata = datasets.load_dataset(path="wikitext", name="wikitext-2-raw-v1")

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /Users/felkner/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /Users/felkner/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# load the tokenizer belonging to the cased DistilBERT checkpoint
tokenizer_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [65]:
# pick one example (row 10) from the training split and tokenize it,
# asking the tokenizer for PyTorch tensors
train_split = wikidata['train']
sequence = train_split[10]['text']
tokenized_sequence = tokenizer(sequence, return_tensors='pt')

In [77]:
# overwrite position 5 of the first (and only) sequence in the batch with the
# tokenizer's mask token id; this edits the input_ids tensor in place
tokenized_sequence['input_ids'][0, 5] = tokenizer.mask_token_id

{'input_ids': tensor([[  101,  1109,  1342,   112,   188,   103,  1449,   117,  1103,   139,
          2646,  1942,  5301,  1449,   117,  1110,  2446,  1166,  2626,  1121,
         12226,  3781,  5132, 17758,   119,  1507,  6178,   117,  2139,  8247,
          1296,  2587,  1606,   170,  1499,   137,   118,   137,  1205,  7281,
          1104,  1103, 13777,  4520,   131,  1517,   170,  1959,  1110,  2700,
           117,  1103,  1591,  5279,  1103,  1959,  1213,  1103, 13777,  1107,
          1503,   137,   118,   137,  1825,   119,   138,  1959,  1169,  1178,
          2496,  1517,  1679,   137,   118,   137,  1885,   117,  1133,  2650,
          1169,  1129,  3609,  2967,  3587,  1120,  1103, 11013,  1104,  1168,
          2650,   112,  3587,   119,  2994,  1959,  1144,   170,  1768,  1105,
          2462,  1104,  2230,  2609,  1118,  1147,  6605,   144,  3984,  2176,
           119,  3725,  1106,  2551,  2650,  1169,  1129,  3346,  1106,   170,
          1423,  2862,   119,  1507, 1

In [48]:
# load the pretrained masked-language-model weights for the DistilBERT checkpoint
model_checkpoint = "distilbert-base-cased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [81]:
# run the masked sequence through the model; this is inference only, so
# switch autograd off to avoid recording a gradient graph for the forward pass
with torch.no_grad():
    output = model(**tokenized_sequence)

In [82]:
# locate the mask token in the first input sequence rather than repeating the
# position by hand (uses the first mask token; raises ValueError if none is
# present), then take that position's raw vocabulary scores -- these are
# logits, not probabilities, since no softmax is applied here
mask_index = tokenized_sequence['input_ids'][0].tolist().index(tokenizer.mask_token_id)
mask_logits = output.logits[0, mask_index]

In [88]:
# the highest-scoring vocabulary entry is the model's best guess for the mask
top_token_id = mask_logits.argmax(dim=-1)

In [94]:
# map the winning token id back to its text form and report it
top_token = tokenizer.decode(token_ids=top_token_id)
print("Most likely token for mask is", top_token)

Most likely token for mask is combat


In [96]:
# Plain-text input for the fill-mask pipeline: one word has been replaced by the
# literal "[MASK]" placeholder that the pipeline is asked to fill in.
# NOTE(review): presumably the text of wikidata['train'][10] with the token that
# was masked by id earlier swapped for "[MASK]" -- confirm against the dataset.
masked_sequence = ' The game \'s [MASK] system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character , and " Battle Potentials " , which are grown throughout the game and always grant boons to a character . To learn Battle Potentials , each character has a unique " Masters Table " , a grid @-@ based skill table that can be used to acquire and link different skills . Characters also have Special Abilities that grant them temporary boosts on the battlefield : Kurt can activate " Direct Command " and move around the battlefield without depleting his Action Point gauge , the character Reila can shift into her " Valkyria Form " and become invincible , while Imca can target multiple enemy units with her heavy weapon . \n'

In [97]:
# the fill-mask pipeline wraps tokenization, the forward pass and decoding
# in a single callable, so the manual steps are not needed

filler = pipeline(task='fill-mask', model='distilbert-base-cased')
filler(masked_sequence)

[{'score': 0.13662149012088776,
  'token': 4127,
  'token_str': 'combat',
  'sequence': 'The game\'s combat system, the BliTZ system, is carried over directly from Valkyira Chronicles. During missions, players select each unit using a top @ - @ down perspective of the battlefield map : once a character is selected, the player moves the character around the battlefield in third @ - @ person. A character can only act once per @ - @ turn, but characters can be granted multiple turns at the expense of other characters\'turns. Each character has a field and distance of movement limited by their Action Gauge. Up to nine characters can be assigned to a single mission. During gameplay, characters will call out if something happens to them, such as their health points ( HP ) getting low or being knocked out by enemy attacks. Each character has specific " Potentials ", skills unique to each character. They are divided into " Personal Potential ", which are innate skills that remain unaltered unl