<a href="https://colab.research.google.com/github/jsedoc/ConceptorDebias/blob/master/Experiments/BERT/Visualizing_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [0] Initialization 

In this notebook, we extract BERT embeddings from three layers (1, 12, and 24) with the flair library, computed over the Brown corpus. This section can be modified to use different layers of BERT or a different corpus.

We also define some functions that will be useful later in the notebook.

### Initializing BERT embeddings and Brown corpus

In [0]:
import numpy as np
from numpy.linalg import *
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline

In [0]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/4e/3a/2e777f65a71c1eaa259df44c44e39d7071ba8c7780a1564316a38bf86449/flair-0.4.2-py3-none-any.whl (136kB)
[K     |████████████████████████████████| 143kB 2.8MB/s 
Collecting mpld3==0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 54.0MB/s 
[?25hCollecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting bpemb>=0.2.9 (from flair)
  Downloading https://files.pythonhosted.org/packages/bc/70/468a9652095b370f797ed37ff77e742b11565c6fd79eaeca5f2e50b164a7/bpemb-0.3.0-py3-none-any.whl
Collecting pytorch-pretrained-bert>=0.6.1 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973

In [0]:
from flair.embeddings import BertEmbeddings
from flair.data import Sentence

# The three extractors share one pretrained checkpoint and differ only in the
# `layers` selector handed to flair. The numeric suffix of each variable is the
# layer label used by the rest of the notebook ('-1' is labelled as layer 24,
# presumably because it selects the last layer).
# NOTE(review): confirm whether flair reads '1' and '12' as 1-based layer
# numbers or as 0-based indices into the encoder outputs.
BERT_CHECKPOINT = 'bert-large-uncased'

embedding24 = BertEmbeddings(BERT_CHECKPOINT, layers='-1')
embedding1 = BertEmbeddings(BERT_CHECKPOINT, layers='1')
embedding12 = BertEmbeddings(BERT_CHECKPOINT, layers='12')

100%|██████████| 231508/231508 [00:00<00:00, 5315707.27B/s]
100%|██████████| 1248501532/1248501532 [00:21<00:00, 58044301.85B/s]


In [0]:
import nltk
from nltk.corpus import brown
from tqdm import tqdm_notebook as tqdm

# Download the Brown corpus and keep only its first 35000 sentences; every
# later cell works on this slice.
# NOTE(review): the saved progress-bar output of the embedding cell reports
# max=20000, so the stored outputs were presumably produced with a smaller slice.
nltk.download('brown')
brown_corpus = brown.sents()[:35000]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [0]:
def _embed_tokens(tokens, embedder):
    """Embed one tokenised sentence and return its token vectors as a NumPy array.

    Parameters
    ----------
    tokens : sequence of str
        The words of one sentence; they are joined with single spaces before
        being handed to flair.
    embedder : flair embedding object
        Anything exposing ``embed(sentence)`` (here: one of the BertEmbeddings).

    Returns
    -------
    numpy.ndarray
        The stacked per-token embeddings, one row per flair token.
    """
    # A fresh Sentence is built for every call, exactly as the original cell did
    # for every layer (presumably so vectors of different layers never end up on
    # the same token objects - TODO confirm).
    sentence = Sentence(' '.join(tokens))
    embedder.embed(sentence)
    stacked = torch.stack([token.embedding for token in sentence])
    # detach()/cpu() are no-ops for plain CPU tensors but keep .numpy() from
    # failing if the embeddings live on another device or track gradients.
    return stacked.detach().cpu().numpy()


# One list per layer; entry k of each list belongs to sentence k of brown_corpus.
brown_e24 = []
brown_e12 = []
brown_e1 = []
for s in tqdm(brown_corpus):
    brown_e24.append(_embed_tokens(s, embedding24))
    brown_e12.append(_embed_tokens(s, embedding12))
    brown_e1.append(_embed_tokens(s, embedding1))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [0]:
!pip install plotly



In [0]:
import plotly as py
print(py.__version__)
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

init_notebook_mode(connected = True)

def configure_plotly():
  """Emit an HTML snippet that loads require.js and registers plotly with it.

  The snippet is displayed as the output of the calling cell: it pulls
  require.js from the notebook's static files and maps the ``plotly`` module
  name to the plot.ly CDN build. Cells in this notebook call it before ``iplot``.
  """
  from IPython.display import HTML, display

  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''
  display(HTML(requirejs_setup))

3.6.1


In [0]:
# NumPy printing: suppress scientific notation and show six digits of precision.
np.set_printoptions(suppress=True, precision=6)
plt_style = 'seaborn-talk'

# Index-aligned tables: e_number[k] is the display label of the layer whose
# per-sentence embeddings are stored in brown_e[k].
e_number = ['1','12','24']
brown_e = [brown_e1, brown_e12, brown_e24]

### Defining helpful functions

In [0]:
def pick_embeddings(corpus,sent_embs,word_list):
    """Collect the embedding of every corpus token that appears in ``word_list``.

    Parameters
    ----------
    corpus : sequence of tokenised sentences (each a sequence of words).
    sent_embs : per-sentence token embeddings indexed like ``corpus``;
        ``sent_embs[i][j]`` is the vector of word ``j`` in sentence ``i``.
    word_list : container of the words to keep (exact, case-sensitive match).

    Returns
    -------
    tuple ``(X, labels, sents)`` of three equally long lists, in corpus order:
    the matching vectors, the matched word for each vector, and the sentence
    each occurrence came from.
    """
    # One (vector, word, sentence) triple per matching token occurrence.
    matches = [
        (sent_embs[sent_idx][tok_idx], token, sentence)
        for sent_idx, sentence in enumerate(corpus)
        for tok_idx, token in enumerate(sentence)
        if token in word_list
    ]
    vectors = [match[0] for match in matches]
    words = [match[1] for match in matches]
    sources = [match[2] for match in matches]
    return (vectors, words, sources)

# [1] BERT has high intra-token variance

First we demonstrate that BERT has a large amount of variance in the different embeddings of a single word relative to the amount of variance between different groups of words. Something interesting is that different layers appear to encode different information about the words. For example, layer 24 shows little to no distinction between 'he' and 'him', whilst layer 12 shows no distinction between 'he' and 'she'.

## He vs Him

### Using PCA Singular Values
We start by using PCA singular values as a measure of variance within groups. We calculate the top 10 principal components (`n_components` in the cell below) for each word in the list as well as for the pooled list so we can compare the results. Use the drop-down menu to compare variance across layers or to inspect a single layer. Click entries in the legend to hide or show individual traces.

In [0]:
## First define a list of words to study

word_list = ['he','him']
corpus = brown_corpus

## Now we plot the singular values for PCA transformed clouds corresponding to
## each word in the list as well as the cloud corresponding to the whole list
configure_plotly()

data = []
n_components = 10
## scikit-learn's default solver policy can pick the randomized SVD for large
## inputs, so the seed is fixed to get identical bars on every re-run.
pca = PCA(n_components = n_components, random_state = 0)

for i,embedding in enumerate(brown_e):

    ## Scan the corpus once per layer; the labels returned next to the vectors
    ## let us split the pooled cloud by word without re-scanning for each word.
    X, labels, _ = pick_embeddings(corpus, embedding, word_list)
    X = np.asarray(X)
    labels = np.asarray(labels)

    ## Pooled cloud: all occurrences of all words in word_list
    pca.fit(X)

    trace = go.Bar(
        x = np.arange(n_components) + 1,
        y = pca.singular_values_[0:n_components],
        name= 'Layer ' + e_number[i] + ': All Words',
        text = pca.singular_values_[0:n_components]
    )

    data.append(trace)

    for word in word_list:
        ## Refit on this word's occurrences only; the trace starts hidden and
        ## is revealed through the drop-down menu.
        pca.fit(X[labels == word])

        trace = go.Bar(
            x = np.arange(n_components) + 1,
            y = pca.singular_values_[0:n_components],
            name= 'Layer ' + e_number[i] + ': ' + word,
            text= pca.singular_values_[0:n_components],
            visible = False
        )

        data.append(trace)

## `data` holds, for every layer in order, one "All Words" trace followed by one
## trace per word, so the visibility masks below are built in groups of this size.
group_size = len(word_list) + 1

## Show only the pooled trace of every layer
compare_layers = [
    dict(label = 'Compare Layers',
         method = 'update',
         args = [{'visible': ([True] + [False]*len(word_list)) * len(brown_e)},
                 {'title': 'Comparing Layer PCA Singular Values'}])
]

## Show every trace of a single layer and hide the other layers
update_layers = [
    dict(label = 'Layer ' + e_number[i],
         method = 'update',
         args = [{'visible': [False]*(group_size*i) + [True]*group_size + [False]*(group_size*(len(brown_e)-i-1))},
                 {'title': 'Top PCA Singular Values - Layer ' + e_number[i]}])
    for i in range(len(brown_e))
]

all_data = [
    dict(label = 'See All Data',
         method = 'update',
         args = [{'visible': [True]*len(data)},
                 {'title': 'Comparing All Data'}])
]

updatemenus = [
    dict(active = 0,
        buttons = compare_layers + update_layers + all_data)
]

layout = dict(
    title='Comparing Layer PCA Singular Values',
    barmode = 'group',
    updatemenus = updatemenus
)

fig = dict(data=data, layout=layout)
iplot(fig)

### Plotting Top PCA Components

Notice that the variance along these components within each group is of similar magnitude to the variation across groups. Especially in layer 24, we do not even get good separation between 'he' and 'him' on these first two principal components. 

In [0]:
configure_plotly()

def wordlist_scatter(corpus, embedding, wordlist, component1 = 0, component2 = 1, title = 'PCA'):
    """Scatter-plot every occurrence of the given words on two principal components.

    A PCA is fitted on the pooled embeddings of all words in ``wordlist``; each
    word's occurrences are projected with that shared PCA and drawn as one
    Plotly trace, so all words share the same axes.

    Parameters
    ----------
    corpus : sequence of tokenised sentences, forwarded to ``pick_embeddings``.
    embedding : per-sentence token embeddings indexed like ``corpus``.
    wordlist : words to plot, one trace per word. Words that never occur in
        ``corpus`` are skipped.
    component1, component2 : zero-based indices of the principal components
        used for the x and y axes.
    title : figure title.

    Returns
    -------
    None. The figure is rendered with ``iplot``.

    Raises
    ------
    ValueError
        If none of the words in ``wordlist`` occurs in ``corpus``.
    """
    # Single pass over the corpus; the labels tell us which word each row is.
    X, labels, _ = pick_embeddings(corpus, embedding, wordlist)
    if len(X) == 0:
        raise ValueError('none of the words in wordlist occur in the corpus')
    X = np.asarray(X)
    labels = np.asarray(labels)

    pca = PCA()
    pca.fit(X)
    # Project the pooled cloud once, then slice it per word below.
    projected = pca.transform(X)

    data = []
    for word in wordlist:
        trans = projected[labels == word]
        if len(trans) == 0:
            # Nothing to draw for a word that never occurs.
            continue

        trace = go.Scattergl(
            x = trans[:,component1],
            y = trans[:,component2],
            name = word,
            mode = 'markers',
            text= word
        )

        data.append(trace)

    layout= go.Layout(
        title= title,
        hovermode= 'closest',
        width = 500,
        height = 500
    )

    fig= go.Figure(data=data, layout=layout)
    iplot(fig)

    return

# One scatter plot per BERT layer, titled with that layer's label.
for layer_idx in range(len(brown_e)):
    wordlist_scatter(
        brown_corpus,
        brown_e[layer_idx],
        word_list,
        title = 'Layer ' + e_number[layer_idx],
    )

## He vs She

### Using PCA Singular Values


In [0]:
## First define a list of words to study

word_list = ['he','she']
corpus = brown_corpus

## Now we plot the singular values for PCA transformed clouds corresponding to
## each word in the list as well as the cloud corresponding to the whole list
configure_plotly()

data = []
n_components = 10
## scikit-learn's default solver policy can pick the randomized SVD for large
## inputs, so the seed is fixed to get identical bars on every re-run.
pca = PCA(n_components = n_components, random_state = 0)

for i,embedding in enumerate(brown_e):

    ## Scan the corpus once per layer; the labels returned next to the vectors
    ## let us split the pooled cloud by word without re-scanning for each word.
    X, labels, _ = pick_embeddings(corpus, embedding, word_list)
    X = np.asarray(X)
    labels = np.asarray(labels)

    ## Pooled cloud: all occurrences of all words in word_list
    pca.fit(X)

    trace = go.Bar(
        x = np.arange(n_components) + 1,
        y = pca.singular_values_[0:n_components],
        name= 'Layer ' + e_number[i] + ': All Words',
        text = pca.singular_values_[0:n_components]
    )

    data.append(trace)

    for word in word_list:
        ## Refit on this word's occurrences only; the trace starts hidden and
        ## is revealed through the drop-down menu.
        pca.fit(X[labels == word])

        trace = go.Bar(
            x = np.arange(n_components) + 1,
            y = pca.singular_values_[0:n_components],
            name= 'Layer ' + e_number[i] + ': ' + word,
            text= pca.singular_values_[0:n_components],
            visible = False
        )

        data.append(trace)

## `data` holds, for every layer in order, one "All Words" trace followed by one
## trace per word, so the visibility masks below are built in groups of this size.
group_size = len(word_list) + 1

## Show only the pooled trace of every layer
compare_layers = [
    dict(label = 'Compare Layers',
         method = 'update',
         args = [{'visible': ([True] + [False]*len(word_list)) * len(brown_e)},
                 {'title': 'Comparing Layer PCA Singular Values'}])
]

## Show every trace of a single layer and hide the other layers
update_layers = [
    dict(label = 'Layer ' + e_number[i],
         method = 'update',
         args = [{'visible': [False]*(group_size*i) + [True]*group_size + [False]*(group_size*(len(brown_e)-i-1))},
                 {'title': 'Top PCA Singular Values - Layer ' + e_number[i]}])
    for i in range(len(brown_e))
]

all_data = [
    dict(label = 'See All Data',
         method = 'update',
         args = [{'visible': [True]*len(data)},
                 {'title': 'Comparing All Data'}])
]

updatemenus = [
    dict(active = 0,
        buttons = compare_layers + update_layers + all_data)
]

layout = dict(
    title='Comparing Layer PCA Singular Values',
    barmode = 'group',
    updatemenus = updatemenus
)

fig = dict(data=data, layout=layout)
iplot(fig)

### Plotting Top PCA Components


In [0]:
configure_plotly()

# wordlist_scatter is defined once, in the "Plotting Top PCA Components" cell of
# the "He vs Him" section; run that cell before this one. Defining an identical
# copy here would only re-bind the same name.
# word_list is the list set by the singular-value cell of this section.
for i, embedding in enumerate(brown_e):
    wordlist_scatter(brown_corpus, embedding, word_list, title = 'Layer ' + e_number[i])

# [2] 