In [None]:
# Notebook dependencies. Only plotly is pinned to an exact version; transformers and
# datasets are installed unpinned, so the versions picked up depend on when this cell runs.
!pip install transformers
!pip install plotly==4.14.1
!pip install datasets

In [None]:
import torch
import pandas as pd
import plotly.express as px
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from torch.utils.data import DataLoader

from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import euclidean, pdist, squareform
from sklearn import manifold          #use this for MDS computation

#visualization libs
import plotly.graph_objects as go
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Allow up to 800 characters per cell when displaying DataFrames, so long joke texts are shown in full.
pd.set_option('max_colwidth', 800)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() only accepts CUDA devices, so query it on the GPU path only;
# on a CPU-only machine just report "cpu" instead of raising.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"
print('using device: ', device_name, flush=True)

using device:  Tesla T4


In [None]:
# Load the pre-trained tokenizer that matches the 'bert-base-uncased' checkpoint used below.
# The vocabulary is downloaded on first use (see the progress bar in the output).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Load the pre-trained BERT encoder and ask it to return the hidden states of every layer,
# not only the last one.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Move the weights to the selected device (GPU when available).
model = model.to(device)
# Switch to "evaluation" mode; as the last expression this also displays the model.
model.eval()

In [None]:
# Load the combined jokes dataset; the CSV is expected in the current working directory.
df = pd.read_csv('jokes_combined.csv', encoding='utf-8')

In [None]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,text,score,category
0,"I hate how you cant even say black paint anymore Now I have to say ""Leroy can you please paint the fence?""",1.0,
1,What's the difference between a Jew in Nazi Germany and pizza ? Pizza doesn't scream when you put it in the oven . I'm so sorry.,0.0,
2,"I recently went to America... and being there really helped me learn about American culture. So I visited a shop and as I was leaving, the Shopkeeper said ""Have a nice day!"" But I didn't so I sued him.",0.0,
3,"Brian raises his hand and says, ""He's in Heaven."" A Sunday school teacher is concerned that his students might be a little confused about Jesus, so he asks his class, ""Where is Jesus today?"" Brian raises his hand and says, ""He's in Heaven."" Susan answers, ""He's in my heart."" Little Johnny waves his hand furiously and blurts out, ""He's in our bathroom!"" The teacher is surprised by this answer and asks Little Johnny how he knows this. ""Well,"" Little Johnny says, ""every morning, my Dad gets up, bangs on the bathroom door and yells 'Jesus Christ, are you still in there?'""",1.0,
4,"You hear about the University book store worker who was charged for stealing $20,000 worth of books? He got caught trying to sell the two books to a freshman.",0.0,


In [None]:
# Look at five random rows; no random_state is set, so the rows shown change on every run.
df.sample(5)

Unnamed: 0,text,score,category
32409,What you call a pre-op MtF transsexual who visits another city? A tourist trap.,0.0,
154062,What's long and hard on a black guy? The first grade,8.0,
105154,"Bacon Tree Two Mexicans are stuck in the desert after crossing into the United States, wandering aimlessly and starving. They are about to just lie down and wait for death, when all of a sudden Luis says...: ""Hey Pepe, do you smell what I smell. Ees bacon, I theenk."" ""Si, Luis, eet sure smells like bacon."" With renewed hope they struggle up the next sand dune, &there, in the distance, is a tree loaded with bacon. There's raw bacon, there's fried bacon, back bacon, double smoked bacon ... every imaginable kind of cured pork. ""Pepe, Pepe, we ees saved. Ees a bacon tree!"" ""Luis, maybe ees a meerage? We ees in the dessert, don't forget."" ""Pepe, since when deed you ever hear of a meerage that smell like bacon... ees no meerage, ees a bacon tree."" ...",2335.0,
158042,"What are your go to jokes? I was at an event the other day and someone asked ""So... anyone know any jokes?"" What's everyone's ""go to"" joke in social situations?",2.0,
131364,Whats black and thin and all over my private parts? Dead African Children,0.0,


In [None]:
# Tokenize a single example joke to inspect what the tokenizer returns.
# The out-of-vocabulary spelling "fingerz" (presumably intentional) shows the word-piece split
# into 'finger' + '##z' in the token list displayed further down.
inputs = tokenizer("Why do bald men cut holes in their pockets? So they can run their fingerz through their hair.")

In [None]:
# Show the raw encoding: input_ids, token_type_ids and attention_mask.
inputs

{'input_ids': [101, 2339, 2079, 13852, 2273, 3013, 8198, 1999, 2037, 10306, 1029, 2061, 2027, 2064, 2448, 2037, 4344, 2480, 2083, 2037, 2606, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Number of token ids produced for the example, including the special [CLS]/[SEP] tokens.
len(inputs['input_ids'])

23

In [None]:
# Map the ids back to their token strings to see how the sentence was split.
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'why',
 'do',
 'bald',
 'men',
 'cut',
 'holes',
 'in',
 'their',
 'pockets',
 '?',
 'so',
 'they',
 'can',
 'run',
 'their',
 'finger',
 '##z',
 'through',
 'their',
 'hair',
 '.',
 '[SEP]']

In [None]:
# Read the English stopword list: one entry per line, surrounding whitespace stripped.
stopwords = []
with open('stopwords-en.txt', encoding='utf-8') as stopword_file:
  stopwords.extend(entry.strip() for entry in stopword_file)

In [None]:
# Peek at the first ten stopwords.
stopwords[:10]

["'ll", "'tis", "'twas", "'ve", '10', '39', 'a', "a's", 'able', 'ableabout']

In [None]:
# Work on an independent copy of the first 1000 rows so that adding columns does not touch df.
subsample = df.iloc[:1000].copy()

In [None]:
from collections import Counter

In [None]:
def get_repetitions(text):
    """Return the most frequently repeated non-stopword token of ``text``.

    The text is split on whitespace only (punctuation stays attached to the tokens)
    and tokens found in the module-level ``stopwords`` list are ignored.

    @param    text (str): Text to analyse; callers are expected to lower-case it first
                  if case-insensitive counting is wanted.
    @return   (str or None): The token with the highest count if it occurs more than
                  once; None if nothing is repeated, or if the text is empty or
                  consists only of stopwords.
    """
    token_list = text.split()
    if not token_list:
        # Nothing to count (empty / whitespace-only text)
        return None

    # Build a set once per call so each membership test is O(1) instead of a list scan
    stop_set = set(stopwords)
    counts = Counter(token for token in token_list if token not in stop_set)
    if not counts:
        # Every token was a stopword; most_common(1) would be empty here
        return None

    # return word with most repetitions, if exists
    top_word, top_count = counts.most_common(1)[0]
    if top_count > 1:
        return top_word
    return None

In [None]:
%%time
# Add a 'repeated' column holding, for each lower-cased joke text, the token returned by
# get_repetitions (or None when it returns None).
subsample.loc[:, 'repeated'] = subsample['text'].str.lower().apply(get_repetitions)

CPU times: user 442 ms, sys: 0 ns, total: 442 ms
Wall time: 448 ms


In [None]:
# Check the new 'repeated' column on the first rows.
subsample.head(5)

Unnamed: 0,text,score,category,repeated
0,"I hate how you cant even say black paint anymore Now I have to say ""Leroy can you please paint the fence?""",1.0,,paint
1,What's the difference between a Jew in Nazi Germany and pizza ? Pizza doesn't scream when you put it in the oven . I'm so sorry.,0.0,,pizza
2,"I recently went to America... and being there really helped me learn about American culture. So I visited a shop and as I was leaving, the Shopkeeper said ""Have a nice day!"" But I didn't so I sued him.",0.0,,
3,"Brian raises his hand and says, ""He's in Heaven."" A Sunday school teacher is concerned that his students might be a little confused about Jesus, so he asks his class, ""Where is Jesus today?"" Brian raises his hand and says, ""He's in Heaven."" Susan answers, ""He's in my heart."" Little Johnny waves his hand furiously and blurts out, ""He's in our bathroom!"" The teacher is surprised by this answer and asks Little Johnny how he knows this. ""Well,"" Little Johnny says, ""every morning, my Dad gets up, bangs on the bathroom door and yells 'Jesus Christ, are you still in there?'""",1.0,,"""he's"
4,"You hear about the University book store worker who was charged for stealing $20,000 worth of books? He got caught trying to sell the two books to a freshman.",0.0,,


In [None]:
# Rows whose 'text' is missing (NaN).
na_rows = df[df['text'].isna()]

In [None]:
# Remove the rows without text. This rebinds df, and the index is not reset, so index labels
# may have gaps afterwards; code below that selects rows by position uses .iloc.
df = df.drop(na_rows.index)

In [None]:
# Size of the dataset after dropping rows without text.
df.shape

(204541, 3)

In [None]:
%%time
# tokenize without padding and truncation first
# (used below to measure the token count of every joke; texts longer than the model's
# maximum sequence length trigger the warning shown in the output)
encodings = tokenizer(df['text'].to_list())

Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 3min 15s, sys: 363 ms, total: 3min 15s
Wall time: 3min 16s


In [None]:
# Token count of every joke (length of each input_ids list).
lens = list(map(len, encodings['input_ids']))

In [None]:
# Wrap the token counts in a named Series (default 0..n-1 index) for describe()/plotting.
lens_series = pd.Series(lens, name='number_of_tokens')

In [None]:
# Display the per-joke token counts.
lens_series

0          28
1          35
2          53
3         146
4          36
         ... 
204536    226
204537    162
204538    269
204539    339
204540    118
Name: number_of_tokens, Length: 204541, dtype: int64

In [None]:
# Summary statistics of the token counts.
lens_series.describe()

count    204541.000000
mean         70.817274
std         154.053908
min           2.000000
25%          20.000000
50%          27.000000
75%          60.000000
max       14567.000000
Name: number_of_tokens, dtype: float64

In [None]:
# Histogram of token counts; the x axis is limited to 0-1000 so the long tail does not
# squash the bulk of the distribution.
fig = px.histogram(lens_series, x="number_of_tokens", range_x=[0, 1000])
fig.show()

In [None]:
# How many jokes exceed 512 tokens (the limit mentioned in the tokenizer warning above).
lens_series[lens_series > 512].shape

(2075,)

In [None]:
# take only jokes that have <= 300 tokens
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# lens_series has a default 0..n-1 index, so the collected values are row positions
# (assuming lens_series was built from the current df's texts, in order) and are used with .iloc.
short_indices = [i for i, length in lens_series.items() if length <= 300]
df_short = df.iloc[short_indices].copy()

In [None]:
# Number of jokes kept after the length filter.
df_short.shape

(196267, 3)

In [None]:
# take a subsample of this for testing
# (first 1000 short jokes, copied so later changes do not affect df_short)
df_sub = df_short.iloc[:1000].copy()

In [None]:
# source: https://colab.research.google.com/github/nidharap/Notebooks/blob/master/Word_Embeddings_BERT.ipynb#scrollTo=2OGCu6P6exFS
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, tokenizer_obj):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (iterable of str): Texts to be processed.
    @param    tokenizer_obj (callable): Tokenizer used to encode each text. It is called as
                  tokenizer_obj(text=..., padding='max_length', truncation=True) and must
                  return a mapping with 'input_ids' and 'attention_mask'.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    @return   attention_masks_without_special_tok (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model excluding the special tokens (CLS/SEP)
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # The tokenizer call will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        # Use the tokenizer that was passed in, not whatever global `tokenizer` happens to exist.
        encoded_sent = tokenizer_obj(
            text=sent,  # Preprocess sentence
            padding='max_length',         # Pad sentence to max length
            truncation=True,              # Cut texts that exceed the max length
            )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    # Create another mask that is useful when averaging all word vectors of a sentence
    # while excluding the CLS and SEP tokens. Work on a copy so attention_masks stays intact.
    attention_masks_without_special_tok = attention_masks.clone().detach()

    # set the CLS token index to 0 for all sentences
    attention_masks_without_special_tok[:,0] = 0

    # With CLS zeroed, each row now sums to (number of real tokens - 1), which is exactly
    # the position of the last real token, i.e. the SEP token.
    sent_len = attention_masks_without_special_tok.sum(1).tolist()

    # column indices to set to zero
    col_idx = torch.LongTensor(sent_len)
    # row indices for all rows
    row_idx = torch.arange(attention_masks.size(0)).long()

    # set the SEP indices for each sentence token to zero
    attention_masks_without_special_tok[row_idx, col_idx] = 0

    return input_ids, attention_masks, attention_masks_without_special_tok

In [None]:
%%time
#run sentences through the tokenizer
#input_ids, attention_masks, attention_masks_without_special_tok = preprocessing_for_bert(df_sub['text'].to_list(), tokenizer)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [None]:
%%time
# Tokenize the 1000-joke subsample with padding, ready to be batched for the model.
# NOTE: this rebinds `encodings`, replacing the unpadded full-dataset encodings computed earlier.
encodings = tokenizer(
            df_sub['text'].to_list(),
            padding='max_length',         # Pad sentence to max length
            truncation=True,              # Shouldn't be necessary for us
            #return_tensors="pt"
            )

CPU times: user 946 ms, sys: 22 ms, total: 968 ms
Wall time: 976 ms


In [None]:
# Inspect the encoded fields and their values.
# NOTE(review): this displays the padded id lists of all encoded texts, which is a lot of output.
encodings.items()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings.

    ``encodings`` is a mapping from field name to a per-example sequence of values;
    the dataset length is the number of entries under 'input_ids'.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build one example: every field's idx-th entry converted to a tensor
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [None]:
# Wrap the padded subsample encodings so a DataLoader can batch them.
dataset = Dataset(encodings)

In [None]:
# took small batch size for now to get less cluttered visualization
# (no shuffling is requested, so batches follow the dataset order)
data_loader = DataLoader(dataset, batch_size=2)

In [None]:
# number of batches in our dataset (number of examples / batch size, rounded up)
len(data_loader)

500

In [None]:
# Per-batch results collected by the loop below: the hidden states returned by the model,
# plus the input ids and attention masks of the same batch (needed later to label the plot).
all_hidden_states = []
all_input_ids = []
all_attention_masks = []
# need to process batch by batch, otherwise will run out of RAM
for batch in data_loader:
    # Load batch to the selected device (GPU when available)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # debug output: show the ids of the current batch
    print(input_ids)

    # inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        # outputs include:
        # 1) last_hidden_state (torch.FloatTensor of shape (batch_size, sequence_length, hidden_size)) – Sequence of hidden-states at the output of the last layer of the model.
        # 2) pooler_output (torch.FloatTensor of shape (batch_size, hidden_size)) – Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        # 3) hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True is passed or when config.output_hidden_states=True) – Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).
        # Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        # 4) attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True is passed or when config.output_attentions=True) – Tuple of torch.FloatTensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).
        # Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
        # 5) cross_attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True and config.add_cross_attention=True is passed or when config.output_attentions=True) – Tuple of torch.FloatTensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).
        # Attentions weights of the decoder’s cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads.

        # third entry of the outputs = hidden_states (the model was loaded with output_hidden_states=True)
        hidden_states = outputs[2]
        # this gives us a tuple of length 13 which includes a [batch_size, max_len, hidden_dim] hidden states for each layer 
        print("Total hidden layers:", len(hidden_states))
        print("First layer : hidden_states[0].shape ", hidden_states[0].shape)  # [batch_size, max_len, hidden_dim]
        # append current batch hidden states to all hidden states
        # (the tensors are stored as returned by the model, i.e. not moved back to the CPU here)
        all_hidden_states.append(hidden_states)
        # store current batch details for visualization purposes
        all_input_ids.append(batch['input_ids'])
        all_attention_masks.append(batch['attention_mask'])

    # Stop after the first batch: only all_hidden_states[0] is used by the cells below.
    break

tensor([[ 101, 1045, 5223,  ...,    0,    0,    0],
        [ 101, 2054, 1005,  ...,    0,    0,    0]], device='cuda:0')
Total hidden layers: 13
First layer : hidden_states[0].shape  torch.Size([2, 512, 768])


In [None]:
def get_vector(hidden_layers_form_arch, token_index=0, mode='average', top_n_layers=4):
  '''
  retrieve vectors for a token_index from the top n layers and return a concatenated, averaged or summed vector 

  The token position is selected in each layer first and the layers are combined afterwards,
  so no full [batch_size x seq_len x dim] intermediate is built per call.

  hidden_layers_form_arch: tuple returned by the transformer library, one
        [batch_size x seq_len x dim] tensor per layer
  token_index: index of the token for which a vector is desired
  mode=
        'average' : avg last n layers
        'concat': concatenate last n layers
        'sum' : sum last n layers
        'last': return embeddings only from last layer
        'second_last': return embeddings only from second last layer

  top_n_layers: number of top layers to concatenate/ average / sum

  returns: for an integer token_index a [batch_size x dim] tensor
        ([batch_size x (n * dim)] for 'concat'); None for an unknown mode
  '''
  def at_token(layer):
    #permute(1,0,2) swaps the batch and seq_len dims (as a view), so indexing the first
    #dim returns the vectors of every sentence at that token position
    return layer.permute(1,0,2)[token_index]

  if mode in ('concat', 'average', 'sum'):
    picked = [at_token(layer) for layer in hidden_layers_form_arch[-top_n_layers:]]

    if mode == 'concat':
      #concatenate the last n layer outputs along the hidden dimension
      return torch.cat(picked, dim=-1)

    stacked = torch.stack(picked)
    if mode == 'average':
      #avg last n layer outputs
      return stacked.mean(0)
    #sum last n layer outputs
    return stacked.sum(0)

  if mode == 'last':
    #last layer output
    return at_token(hidden_layers_form_arch[-1])

  if mode == 'second_last':
    #second last layer output
    return at_token(hidden_layers_form_arch[-2])

  return None

In [None]:
# this gives us the output of the last layer for tokens at position 1 in all sentences of the first batch
# -> one vector per sentence in the batch
get_vector(all_hidden_states[0], token_index=1, mode='last').shape

torch.Size([2, 768])

In [None]:
# get information about the batch we are going to visualize
# (rebinds input_ids / attention_mask, which the extraction loop also assigned, to the
# copies stored for batch 0)
input_ids = all_input_ids[0]
attention_mask = all_attention_masks[0]

In [None]:
#Lengths of each sentence: number of attended tokens per row of the attention mask
#(the count includes [CLS] and [SEP], as seen in the token list below). eval_vecs reads this global.
sent_lengths = attention_mask.sum(1).tolist()
sent_lengths

[28, 35]

In [None]:
#get the tokenized version of each sentence (text form, to label things in the plot)
tokenized_sents = [tokenizer.convert_ids_to_tokens(i) for i in input_ids]
#show the first 50 tokens of the first sentence; everything after [SEP] is padding
tokenized_sents[0][:50]

['[CLS]',
 'i',
 'hate',
 'how',
 'you',
 'can',
 '##t',
 'even',
 'say',
 'black',
 'paint',
 'anymore',
 'now',
 'i',
 'have',
 'to',
 'say',
 '"',
 'leroy',
 'can',
 'you',
 'please',
 'paint',
 'the',
 'fence',
 '?',
 '"',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [None]:
def plt_dists(dists, labels, dims=2, words_of_interest=[], title=""):
  '''
  Embed a precomputed distance matrix with MDS and show it as a labelled scatter plot.

  dists: precomputed (square) distance matrix
  labels: one text label per point, displayed next to its marker
  dims: number of MDS components; 3 gives a 3D plot, any other value gives a 2D plot
        of the first two components
  words_of_interest: labels containing any of these strings (substring match) get the
        highlight color
  title: title for the plot, omitted when empty
  '''
  #1 -> label contains one of the words of interest, 0 -> it does not
  #(labels carry a _SentenceNumber suffix, hence the substring test)
  color = [
      1 if any(word in label for word in words_of_interest) else 0
      for label in labels
  ]

  #https://community.plotly.com/t/plotly-colours-list/11730/6
  colorscale = [[0, 'darkcyan'], [1, 'white']]

  #project the distances into `dims` dimensions
  embedder = manifold.MDS(n_components=dims, dissimilarity="precomputed", random_state=60, max_iter=90000)
  coords = embedder.fit(dists).embedding_

  #marker styling is shared by both plot types, only the size differs
  is_3d = dims == 3
  marker = dict(
      size=10 if is_3d else 12,
      color=color,
      colorscale=colorscale,
      opacity=0.8,
  )

  if is_3d:
    trace = go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        mode='markers+text',
        textposition="top center",
        text=labels,
        marker=marker,
    )
  else:
    trace = go.Scatter(
        x=coords[:, 0],
        y=coords[:, 1],
        mode='markers+text',
        text=labels,
        textposition="top center",
        marker=marker,
    )

  fig = go.Figure(data=[trace])
  fig.update_layout(template="plotly_dark")
  if title != "":
    fig.update_layout(title_text=title)
  fig.show()

In [None]:
def eval_vecs(input_hidden_states, input_tokenized_sents, mode='concat', top_n_layers=4, viz_dims=2, words_with_diff_color=[], max_len=512, sent_lens=None):
  '''
  function to get a vector for each word in each sentence, add the sentence number to the end of each word,
  calculate cosine distance between each pair of words and then pass it to the visualization function

  inputs:
  input_hidden_states: hidden states retrieved from a BERT-like model
  input_tokenized_sents: tokenized sentences, used to assign labels for each point on the plot
  mode:   'average' : avg last n layers
          'concat': concatenate last n layers
          'sum' : sum last n layers
          'last':  embeddings only from last layer
          'second_last':  embeddings only from second last layer
  top_n_layers: top n layers to use for concat/sum etc.
  viz_dims: 2/3 for 2D or 3D plot
  words_with_diff_color: words that should be highlighted with different color on the plot
  max_len: token positions at or beyond this index are ignored
  sent_lens: number of tokens of each sentence, counting CLS and SEP; when None the
          module-level `sent_lengths` is used (the original behavior)

  raises ValueError when no sentence contributes any token
  '''
  lengths = list(sent_lengths if sent_lens is None else sent_lens)

  #a token is only collected while token_ind < sent_len-1, so positions from
  #(longest sentence - 1) onwards can never contribute; stopping there skips needless
  #get_vector calls and never indexes past what the sentences actually use
  stop = min(max_len, max(lengths, default=0) - 1)

  vecs = list()
  labels = list()
  #start at 1 to ignore CLS
  for token_ind in range(1, stop):
    vectors = get_vector(input_hidden_states, token_index=token_ind, mode=mode, top_n_layers=top_n_layers)
    for sent_ind, sent_len in enumerate(lengths):
      if token_ind < sent_len-1:
        #ignore SEP which will be at the last index of each sentence
        vecs.append(vectors[sent_ind].cpu())
        labels.append(input_tokenized_sents[sent_ind][token_ind]+"_"+str(sent_ind))

  if not vecs:
    raise ValueError("no tokens to visualize: every sentence is empty or max_len is too small")

  #create a numpy matrix to pass to cosine distance
  mat = torch.stack(vecs).detach().numpy()
  #call the plot function on the cosine distance matrix
  plt_dists(cosine_distances(mat), labels=labels, dims=viz_dims, words_of_interest=words_with_diff_color, title='Method: {}'.format(mode))

In [None]:
# Visualize the last-layer token vectors of the first batch in 3D, highlighting tokens that
# contain 'paint'. Sentence lengths are taken from the global sent_lengths computed above.
eval_vecs(all_hidden_states[0], tokenized_sents, mode='last', viz_dims=3, words_with_diff_color=['paint'], max_len=300)