In [12]:
import pandas as pd

This notebook uses the BERT large uncased model (`bert-large-uncased`) from Hugging Face:
https://huggingface.co/bert-large-uncased

In [2]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build a fill-mask pipeline backed by the bert-large-uncased checkpoint.
# No `device` argument is passed, so the model runs on CPU (see the warning
# printed below this cell).
unmasker = pipeline('fill-mask', model='bert-large-uncased')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [31]:
# Sanity check: ask the model to fill the single [MASK] in a simple sentence.
# With one mask the pipeline returns a flat list of candidate dicts
# ('score', 'token', 'token_str', 'sequence').
unmasker("the fox jumped over the [MASK].")

[{'score': 0.20809294283390045,
  'token': 8638,
  'token_str': 'fence',
  'sequence': 'the fox jumped over the fence.'},
 {'score': 0.10911596566438675,
  'token': 3341,
  'token_str': 'edge',
  'sequence': 'the fox jumped over the edge.'},
 {'score': 0.045163024216890335,
  'token': 2217,
  'token_str': 'side',
  'sequence': 'the fox jumped over the side.'},
 {'score': 0.03707710653543472,
  'token': 2813,
  'token_str': 'wall',
  'sequence': 'the fox jumped over the wall.'},
 {'score': 0.0314963273704052,
  'token': 18283,
  'token_str': 'ledge',
  'sequence': 'the fox jumped over the ledge.'}]

In [14]:
# Load the masked-tweet dataset; the path is relative to the working directory.
# Later cells read the 'masked_tweet' and 'index' columns of this frame.
df = pd.read_csv('data/masked_tweets.csv')
# Bare expression so the frame is rendered as the cell's output.
df

Unnamed: 0,id,tweets,funny,masked_word,index,masked_tweet
0,684187,"Sometimes, when I’m on a date with my wife of ...",1,mom,"[(143, 146)]","Sometimes, when I’m on a date with my wife of ..."
1,509083,Pizza rolls are shaped like little pillows bec...,1,Pizza,"[(0, 5)]",[MASK] rolls are shaped like little pillows be...
2,413339,[struggling to get out of a hammock] come here...,1,struggling,"[(1, 11)]",[[MASK] to get out of a hammock] come here and...
3,880531,If an astronaut goes really fast they’re a fas...,1,astronaut,"[(6, 15), (44, 53)]",If an [MASK] goes really fast they’re a f[MASK]
4,973171,I toss and turn in bed all night like a beauti...,1,toss,"[(2, 6)]",I [MASK] and turn in bed all night like a beau...
...,...,...,...,...,...,...
138,144630,if you sweat while you eat it should count as ...,1,sweat,"[(7, 12)]",if you [MASK] while you eat it should count as...
139,328630,god!! did anyone stop by your house and drop o...,1,opinion,"[(76, 83)]",god!! did anyone stop by your house and drop o...
140,971535,If you breakdance you buy dance.,1,breakdance,"[(7, 17)]",If you [MASK] you buy dance.
141,604980,"You can catch a lot of flies with honey, but y...",1,catch,"[(8, 13), (53, 58)]","You can [MASK] a lot of flies with honey, but ..."


In [33]:
# input: the fill-mask pipeline's response for a sentence containing exactly one [MASK]
#        (a flat list of prediction dicts).
# ex: [{'score': 0.20809294283390045,   'token': 8638,   'token_str': 'fence',   'sequence': 'the fox jumped over the fence.'},  {'score': 0.10911596566438675,   'token': 3341,   'token_str': 'edge',   'sequence': 'the fox jumped over the edge.'},  {'score': 0.045163024216890335,   'token': 2217,   'token_str': 'side',   'sequence': 'the fox jumped over the side.'},  {'score': 0.03707710653543472,   'token': 2813,   'token_str': 'wall',   'sequence': 'the fox jumped over the wall.'},  {'score': 0.0314963273704052,   'token': 18283,   'token_str': 'ledge',   'sequence': 'the fox jumped over the ledge.'}]

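# output: dictionary mapping each prediction's 'token_str' to its 'score'
# NOTE(review): the original note said "does not accept punctuations", but nothing here
# filters punctuation tokens — predictions such as '.' are kept as-is; confirm the intent.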
def process_bert_response(result):
    """Flatten a single-mask fill-mask response into a token -> score mapping.

    Args:
        result: iterable of prediction dicts, each providing at least the
            keys 'token_str' and 'score'.

    Returns:
        dict keyed by 'token_str' with the matching 'score' as value, in the
        order the predictions were given. If the same 'token_str' occurs more
        than once, the last score wins.
    """
    return {prediction['token_str']: prediction['score'] for prediction in result}

# Demo: run the unmasker on a one-mask sentence, then flatten the response
# into a {token_str: score} dict shown as the cell's output.
sample = unmasker("the fox jumped over the [MASK].")
process_bert_response(sample)

{'fence': 0.20809294283390045,
 'edge': 0.10911596566438675,
 'side': 0.045163024216890335,
 'wall': 0.03707710653543472,
 'ledge': 0.0314963273704052}

In [36]:
# Collect one {token_str: score} dict per [MASK] position for the first five
# tweets. Each dict is also printed, followed by the row label it came from.
bert_responses = []

for i, row in df[:5].iterrows():
    # top_k is passed per call rather than by rebuilding the pipeline, which
    # would reload the model weights just to change one option.
    response = unmasker(row['masked_tweet'], top_k=5)

    # The pipeline returns a flat list of candidate dicts for a single [MASK]
    # and a list of such lists (one per mask) when the text has several.
    # Branch on the actual response shape instead of eval()-ing the CSV's
    # 'index' column: this avoids executing text read from a file and cannot
    # disagree with the number of masks really present in the tweet.
    if response and isinstance(response[0], list):
        per_mask_candidates = response
    else:
        per_mask_candidates = [response]

    for candidates in per_mask_candidates:
        result = process_bert_response(candidates)
        bert_responses.append(result)
        print(result)
        print(i)

bert_responses

{'girl': 0.5899927616119385, 'woman': 0.22106695175170898, 'guy': 0.05336887389421463, 'man': 0.015112991444766521, 'female': 0.014234392903745174}
0
{'the': 0.21772967278957367, 'these': 0.10143017768859863, 'cinnamon': 0.06283179670572281, 'bread': 0.04623531177639961, 'egg': 0.04007622227072716}
1
{'trying': 0.49840620160102844, 'try': 0.14736302196979523, 'about': 0.05259213224053383, 'time': 0.025765905156731606, 'tries': 0.02252083830535412}
2
{'f': 0.3566504418849945, 'e': 0.16973876953125, 'a': 0.08504264801740646, 'x': 0.07494959235191345, 'm': 0.03575292229652405}
3
{'.': 0.9195104241371155, ';': 0.046818770468235016, '!': 0.031757812947034836, '?': 0.0014781301142647862, '|': 0.0002399297954980284}
3
{'toss': 0.9967106580734253, 'tossed': 0.0020156633108854294, 'tossing': 0.0010578269138932228, 'twist': 9.161982598016039e-05, 'rock': 4.442503995960578e-05}
4


[{'girl': 0.5899927616119385,
  'woman': 0.22106695175170898,
  'guy': 0.05336887389421463,
  'man': 0.015112991444766521,
  'female': 0.014234392903745174},
 {'the': 0.21772967278957367,
  'these': 0.10143017768859863,
  'cinnamon': 0.06283179670572281,
  'bread': 0.04623531177639961,
  'egg': 0.04007622227072716},
 {'trying': 0.49840620160102844,
  'try': 0.14736302196979523,
  'about': 0.05259213224053383,
  'time': 0.025765905156731606,
  'tries': 0.02252083830535412},
 {'f': 0.3566504418849945,
  'e': 0.16973876953125,
  'a': 0.08504264801740646,
  'x': 0.07494959235191345,
  'm': 0.03575292229652405},
 {'.': 0.9195104241371155,
  ';': 0.046818770468235016,
  '!': 0.031757812947034836,
  '?': 0.0014781301142647862,
  '|': 0.0002399297954980284},
 {'toss': 0.9967106580734253,
  'tossed': 0.0020156633108854294,
  'tossing': 0.0010578269138932228,
  'twist': 9.161982598016039e-05,
  'rock': 4.442503995960578e-05}]

In [38]:
# Show the masked text of the row labelled 3 (the tweet with two [MASK] tokens).
df.loc[3, 'masked_tweet']

'If an [MASK] goes really fast they’re a f[MASK]'

In [45]:
# Rebuild the pipeline so that each [MASK] position gets 30 candidates.
# NOTE(review): this rebinds the global `unmasker` and loads the checkpoint
# again (see the warning printed below); any later cell now gets top_k=30.
unmasker = pipeline('fill-mask', model='bert-large-uncased', top_k=30)
# Same text as df['masked_tweet'][3]. It contains two [MASK] tokens, so the
# result is a list holding one candidate list per mask.
unmasker('If an [MASK] goes really fast they’re a f[MASK]')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[[{'score': 0.3566504418849945,
   'token': 1042,
   'token_str': 'f',
   'sequence': '[CLS] if an f goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.16973876953125,
   'token': 1041,
   'token_str': 'e',
   'sequence': '[CLS] if an e goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.08504264801740646,
   'token': 1037,
   'token_str': 'a',
   'sequence': '[CLS] if an a goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.07494959235191345,
   'token': 1060,
   'token_str': 'x',
   'sequence': '[CLS] if an x goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.03575292229652405,
   'token': 1049,
   'token_str': 'm',
   'sequence': '[CLS] if an m goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.024925023317337036,
   'token': 1048,
   'token_str': 'l',
   'sequence': '[CLS] if an l goes really fast they ’ re a f [MASK] [SEP]'},
  {'score': 0.017454318702220917,
   'token': 1050,
   'token_str': 'n',
   'sequence': '[CLS] if an n goes r