In [0]:
import numpy as np
from itertools import combinations, filterfalse
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import random
import sys
import os
import pickle

In [2]:
# Install a pinned transformers release into the runtime (shell escape, runs outside the Python kernel).
!pip install transformers==2.8.0

Collecting transformers==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 48.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 43.8MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K   

In [4]:
# Mount Google Drive under /content/drive; get_eigvecs_dict later builds file
# paths under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Config

In [22]:
import torch
import transformers
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model size suffix used in the 'bert-<config>-uncased' checkpoint name and in the
# eigenvector file names built by get_eigvecs_dict.
config = 'large' # alternative: 'base'
# Number of encoder layers iterated over by the later cells: 12 for 'base', 24 otherwise.
nlayer = 12 if config == 'base' else 24
# Only used as part of the eigenvector file names in this notebook.
# presumably the number of samples the eigenvectors were fitted on — TODO confirm
nsamples = 50000

# Stock (unmodified) BERT masked-LM model and its tokenizer; the tokenizer is read
# as a global by get_bert_embedding.
model = transformers.BertForMaskedLM.from_pretrained('bert-'+config+'-uncased', output_hidden_states=True).to(device)
tokenizer = transformers.BertTokenizer.from_pretrained('bert-'+config+'-uncased')
# turn on eval mode (also the last expression of the cell, so the model repr is displayed)
model.eval()

HBox(children=(IntProgress(value=0, description='Downloading', max=1344997306, style=ProgressStyle(description…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementw

## New BERTs

In [0]:
import densray_bert

# WEAT Algorithm
The Word Embedding Association Test (WEAT), as proposed by Caliskan et al., is a statistical test analogous to the Implicit Association Test (IAT) which helps quantify human biases in textual data. WEAT uses the cosine similarity between word embeddings, which is analogous to the reaction time when subjects are asked to pair two concepts they find similar in the IAT. WEAT considers two sets of target words and two sets of attribute words of equal size. The null hypothesis is that there is no difference between the two sets of target words in terms of their relative similarity to the two sets of attribute words, measured as the cosine similarity between the embeddings. For example, consider the target sets as words representing *Career* and *Family* and let the two sets of attribute words be *Male* and *Female* in that order. The null hypothesis states that *Career* and *Family* are equally similar (mathematically, in terms of the mean cosine similarity between the word representations) to each of the words in the *Male* and *Female* word lists.

REF: https://gist.github.com/SandyRogers/e5c2e938502a75dcae25216e4fae2da5



## Test Statistic

The WEAT test statistic measures the differential association of the two sets of target words with the attribute.

To ground this, we cast WEAT in our formulation where $\mathcal{X}$ and $\mathcal{Y}$ are two sets of target
words (concretely, $\mathcal{X}$ might be *Career* words and $\mathcal{Y}$ *Family* words) and $\mathcal{A}$, $\mathcal{B}$ are two sets of attribute words ($\mathcal{A}$ might be "female" names and $\mathcal{B}$ "male" names) assumed to associate with the bias concept(s). WEAT is then
\begin{align*}
s(\mathcal{X}, &\mathcal{Y}, \mathcal{A}, \mathcal{B}) \\ &= \frac{1}{|\mathcal{X}|}\Bigg[\sum_{x \in \mathcal{X}}{\Big[\sum_{a\in \mathcal{A}}{s(x,a)} - \sum_{b\in \mathcal{B}}{s(x,b)}\Big]} \\ &\hbox{}  - \sum_{y \in \mathcal{Y}}{\Big[\sum_{a\in \mathcal{A}}{s(y,a)} - \sum_{b\in \mathcal{B}}{s(y,b)}\Big]}\Bigg],
\end{align*}
where $s(x,y) = \cos(\hbox{vec}(x), \hbox{vec}(y))$ and $\hbox{vec}(x) \in \mathbb{R}^k$ is the $k$-dimensional word embedding for word $x$. We assume that there is no overlap between any of the sets $\mathcal{X}$, $\mathcal{Y}$, $\mathcal{A}$, and $\mathcal{B}$.

Note that for this definition of WEAT, the cardinality of the sets must be equal, so $|\mathcal{A}|=|\mathcal{B}|$ and $|\mathcal{X}|=|\mathcal{Y}|$. Our  conceptor formulation given below relaxes this assumption.

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

def swAB(W, A, B):
    """Differential association of every row of W with the sets A and B.

        For each row w of W this is
        mean_a cos(w, a) - mean_b cos(w, b).

        Arguments
                W, A, B : n x d matrices of word embeddings stored row wise
        Returns
                1-D array holding one value per row of W
    """
    def mean_similarity(attributes):
        # cosine_similarity gives a (rows of W) x (rows of attributes) matrix;
        # averaging over axis 1 leaves one mean similarity per row of W.
        return np.mean(cosine_similarity(W, attributes), axis=1)

    to_A = mean_similarity(A)
    to_B = mean_similarity(B)
    return to_A - to_B
  
def test_statistic(X, Y, A, B):
    """WEAT test statistic for target matrices X, Y and attribute matrices A, B.

        Arguments
                X, Y, A, B : n x d matrix of word embeddings stored row wise
        Returns
                Sum of the per-row differential associations (swAB) of X
                minus the same sum for Y
    """
    association_X = swAB(X, A, B)
    association_Y = swAB(Y, A, B)
    # Builtin sum keeps the element-by-element accumulation order.
    total_X = sum(association_X)
    total_Y = sum(association_Y)
    return total_X - total_Y

## Effect Size (d-value)

The ''effect size'' is a normalized measure of how separated the two distributions are.

In [0]:
def weat_effect_size(X, Y, A, B, embd):
    """Computes the WEAT effect size (d-value) for the given target and attribute word lists

        The effect size is
        (mean_x s(x, A, B) - mean_y s(y, A, B)) / std_{w in X u Y} s(w, A, B)
        where s(w, A, B) is the differential association computed by swAB.

        Arguments
                X, Y : Lists of target words
                A, B : Lists of attribute words
                embd : Dictionary of word-to-embedding for all words
                       (looked up with lower-cased words)
        Returns
                Effect Size
    """
    def lookup(words):
        # Every word is lower-cased before the lookup, exactly as weat_p_value
        # does, so both functions select the same embeddings. Words without
        # an embedding are skipped.
        return np.array([embd[w.lower()] for w in words if w.lower() in embd])

    Xmat = lookup(X)
    Ymat = lookup(Y)
    Amat = lookup(A)
    Bmat = lookup(B)
    # The normalising standard deviation is taken over the union of both target
    # lists, with duplicates between X and Y counted once.
    XuYmat = lookup(set(X).union(Y))

    mean_difference = np.mean(swAB(Xmat, Amat, Bmat)) - np.mean(swAB(Ymat, Amat, Bmat))
    d = mean_difference / np.std(swAB(XuYmat, Amat, Bmat))
    return d

## P-Value

The one-sided P value measures the likelihood that a random re-partition of the target words ($\mathcal{X} \cup \mathcal{Y}$) into two sets would produce a test statistic greater than the observed one.

In [0]:
def random_permutation(iterable, r=None):
    """Draw r distinct items of the iterable in random order, as a tuple.

    When r is None the whole iterable is shuffled.
    """
    items = tuple(iterable)
    if r is None:
        r = len(items)
    drawn = random.sample(items, r)
    return tuple(drawn)

def weat_p_value(X, Y, A, B, embd, sample=None):
    """Computes the one-sided P value for the given target and attribute word lists

        The target words X + Y are repeatedly re-partitioned into (Xi, Yi); the
        P value is the fraction of partitions whose test statistic is strictly
        greater than the statistic of the original (X, Y) split.

        Arguments
                X, Y : Lists of target words
                A, B : Lists of attribute words
                embd : Dictionary of word-to-embedding for all words
                       (looked up with lower-cased words)
                sample : Number of random permutations used. If falsy, every
                         combination of min(len(X), len(Y)) words out of X + Y
                         is enumerated instead.
        Returns
                Fraction of partitions whose test statistic exceeds the observed one
    """
    # Re-seed both global generators on every call so that the sampled
    # partitions are the same from run to run.
    np.random.seed(42)
    random.seed(42)

    def lookup(words):
        # Lower-cased lookup; words without an embedding are skipped.
        return np.array([embd[w.lower()] for w in words if w.lower() in embd])

    size_of_permutation = min(len(X), len(Y))
    X_Y = X + Y
    test_stats_over_permutation = []

    Xmat = lookup(X)
    Ymat = lookup(Y)
    Amat = lookup(A)
    Bmat = lookup(B)

    if not sample:
        permutations = combinations(X_Y, size_of_permutation)
    else:
        permutations = [random_permutation(X_Y, size_of_permutation) for _ in range(sample)]

    for Xi in permutations:
        # Yi is everything in X + Y that was not drawn into Xi; the set makes
        # each membership test constant time.
        chosen = set(Xi)
        Yi = [w for w in X_Y if w not in chosen]
        test_stats_over_permutation.append(test_statistic(lookup(Xi), lookup(Yi), Amat, Bmat))

    unperturbed = test_statistic(Xmat, Ymat, Amat, Bmat)

    is_over = np.array([o > unperturbed for o in test_stats_over_permutation])

    return is_over.sum() / is_over.size

# Dataset

In [0]:
def category(type=1):
    """Returns the word lists for one gender-bias test with generic gendered attribute words.

        Arguments
                type : 1 career vs family, 2 maths vs arts, 3 science vs arts,
                       4 intelligence vs appearance, 5 strength vs weakness
        Returns
                (m_words, f_words, x_words, y_words), each a list of
                lower-cased words with all spaces removed
        Raises
                ValueError : if type is not one of 1..5
    """
    def split_words(csv):
        # Normalise a comma separated string into a list of lower-cased words.
        return csv.lower().replace(" ", "").split(',')

    m_words = split_words("male, man, boy, brother, he, him, his, son, father, uncle, grandfather")
    f_words = split_words("female, woman, girl, sister, she, her, hers, daughter, mother, aunt, grandmother")
    if type == 1:
        #B1: career vs family
        x_words = split_words("executive, management, professional, corporation, salary, office, business, career")
        y_words = split_words("home, parents, children, family, cousins, marriage, wedding, relatives")
    elif type == 2:
        #B2: maths vs arts
        x_words = split_words("math, algebra, geometry, calculus, equations, computation, numbers, addition")
        y_words = split_words("poetry, art, Shakespeare, dance, literature, novel, symphony, drama")
    elif type == 3:
        #B3: science vs arts
        x_words = split_words("science, technology, physics, chemistry, Einstein, NASA, experiment, astronomy")
        y_words = split_words("poetry, art, Shakespeare, dance, literature, novel, symphony, drama")
    elif type == 4:
        #B4: intelligence vs appearance
        x_words = split_words("precocious, resourceful, inquisitive, genius, inventive, astute, adaptable, reflective,discerning, intuitive, inquiring, judicious, analytical, apt, venerable, imaginative,shrewd, thoughtful, wise, smart, ingenious, clever, brilliant, logical, intelligent")
        y_words = split_words("alluring, voluptuous, blushing, homely, plump, sensual, gorgeous, slim, bald,athletic, fashionable, stout, ugly, muscular, slender, feeble, handsome, healthy,attractive, fat, weak, thin, pretty, beautiful, strong")
    elif type == 5:
        #B5: strength vs weakness
        x_words = split_words("power, strong, confident, dominant, potent, command, assert, loud, bold, succeed,triumph, leader, shout, dynamic, winner")
        y_words = split_words("weak, surrender, timid, vulnerable, weakness, wispy, withdraw, yield, failure, shy,follow, lose, fragile, afraid, loser")
    else:
        # Without this branch an unknown type only failed later, with an
        # UnboundLocalError on x_words at the return statement.
        raise ValueError("unknown category type: %r (expected 1-5)" % (type,))
    return m_words, f_words, x_words, y_words

def category_2(type=1):
    """Returns the word lists for one of the three alternative gender-bias tests.

        Arguments
                type : 1 Career/Family (with first names as m_words / f_words),
                       2 Math/Art, 3 Science/Art (for 2 and 3 the topic words
                       are returned as m_words / f_words and the gendered
                       words as x_words / y_words)
        Returns
                (m_words, f_words, x_words, y_words), each a list of
                lower-cased words with all spaces removed
        Raises
                ValueError : if type is not one of 1..3
    """
    def split_words(csv):
        # Normalise a comma separated string into a list of lower-cased words.
        return csv.lower().replace(" ", "").split(',')

    if type == 1:
        #Career/Family
        m_words = split_words('John, Paul, Mike, Kevin, Steve, Greg, Jeff, Bill')
        f_words = split_words('Amy, Joan, Lisa, Sarah, Diana, Kate, Ann, Donna')
        x_words = split_words('executive, management, professional, corporation, salary, office, business, career')
        y_words = split_words('home, parents, children, family, cousins, marriage, wedding, relatives')
    elif type == 2:
        #Math/Art
        m_words = split_words('math, algebra, geometry, calculus, equations, computation, numbers, addition')
        f_words = split_words('poetry, art, dance, literature, novel, symphony, drama, sculpture')
        x_words = split_words('male, man, boy, brother, he, him, his, son')
        y_words = split_words('female, woman, girl, sister, she, her, hers, daughter')
    elif type == 3:
        #Science/Art
        m_words = split_words('science, technology, physics, chemistry, Einstein, NASA, experiment, astronomy')
        f_words = split_words('poetry, art, Shakespeare, dance, literature, novel, symphony, drama')
        x_words = split_words('brother, father, uncle, grandfather, son, he, his, him')
        y_words = split_words('sister, mother, aunt, grandmother, daughter, she, hers, her')
    else:
        # Without this branch an unknown type only failed later, with an
        # UnboundLocalError on m_words at the return statement.
        raise ValueError("unknown category_2 type: %r (expected 1-3)" % (type,))
    return m_words, f_words, x_words, y_words

# Get Embeddings

In [0]:
import densray_bert


def get_eigvecs_dict(layer=-1):
    """Builds the mapping {str(layer index): (eigenvector file path, flag)} for all nlayer layers.

    The flag is True
        * for every layer when layer == -1,
        * for no layer when layer == -2,
        * only for the entry whose index equals layer otherwise.
    # presumably the flag tells densray_bert whether to apply the eigenvectors
    # at that layer — TODO confirm against densray_bert
    """
    # Everything in the file name except the layer index is shared by all entries.
    path_prefix = '/content/drive/My Drive/eigvecs_'+config+'_noavg_'+str(nsamples)+'_'
    # Layer indices are never negative, so for layer == -2 the comparison
    # index == layer is False for every entry.
    return {
        str(index): (path_prefix + str(index) + '.pt', layer == -1 or index == layer)
        for index in range(nlayer)
    }


def get_bert_embedding(model, wordlist, is_targets=1):
    """Embeds every word of wordlist by running it through model.bert inside a short template sentence.

    The template is "<word> is [MASK]." when is_targets is truthy and
    "[MASK] is <word>." otherwise. The vectors of the first output of
    model.bert at the word's token positions are averaged into one row.

        Arguments
                model : model exposing a .bert sub-module
                wordlist : iterable of words (strings)
                is_targets : selects the template and the token slice
        Returns
                Tensor on `device` with one row per word, in wordlist order
    """
    vecss = torch.Tensor().to(device)
    # Inference only: without no_grad every forward pass would record an
    # autograd graph that is never used and only costs memory.
    with torch.no_grad():
        for w in wordlist:
            if is_targets:
                text = w + ' is ' + tokenizer.mask_token + '.'
            else:
                text = tokenizer.mask_token + ' is ' + w + '.'
            input_ids = tokenizer.prepare_for_model(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)),
                                                    return_token_type_ids=False, return_tensors='pt')['input_ids'].to(device)
            # get output
            hidden = model.bert(input_ids)[0]#[2][nlayer]
            # assumes the encoded sequence is [CLS] ... [SEP] and that 'is', the
            # mask token and '.' are one token each, so the slices keep exactly
            # the (sub)tokens of the word — TODO confirm for the tokenizer in use
            if is_targets:
                word_vec = hidden[0][1:-4,:].mean(dim=0).unsqueeze(0)
            else:
                word_vec = hidden[0][3:-2,:].mean(dim=0).unsqueeze(0)
            vecss = torch.cat((vecss, word_vec))
    return vecss

def eval_per_layer(layer=-2):
    """Loads the DensRay BERT variant configured for `layer` and embeds the current word lists.

    The word lists are read from the notebook-level globals m_words, f_words,
    x_words and y_words, which must be set before calling.

        Arguments
                layer : passed to get_eigvecs_dict (-2: flag off for all layers,
                        -1: flag on for all layers, k >= 0: flag on for layer k only)
        Returns
                Dictionary mapping every word of the four lists to its embedding (numpy array)
    """
    # Build the per-layer configuration from the `layer` argument. The previous
    # code passed get_eigvecs_dict(l), i.e. it silently read a global `l` and
    # ignored the argument.
    eigvecs_dict = get_eigvecs_dict(layer)
    model = densray_bert.BertForMaskedLM_1.from_pretrained('bert-'+config+'-uncased', eigvecs_dict=eigvecs_dict).to(device)
    # turn on eval mode
    model.eval()
    embed = {}
    # m_words / f_words use the "[MASK] is <word>." template (is_targets=0),
    # x_words / y_words the "<word> is [MASK]." template (is_targets=1).
    for words, is_targets in ((m_words, 0), (f_words, 0), (x_words, 1), (y_words, 1)):
        vectors = get_bert_embedding(model, words, is_targets=is_targets).cpu().detach().numpy()
        for word, vector in zip(words, vectors):
            embed[word] = vector
    return embed

# Go!

In [29]:
# For each of the three category_2 tests, print the WEAT effect size and P value
# of the baseline next to those obtained for every layer setting.
# NOTE: m_words, f_words, x_words, y_words and l are notebook-level globals;
# eval_per_layer reads the word lists from them, so the names must not change.
for t in range(1,4):
    m_words, f_words, x_words, y_words = category_2(t)
    print('d    d_densray |d|-|d_densray|   p   p-densray   p_densray-p')
    l=-2
    # baseline: layer=-2 sets the per-layer flag to False everywhere (no densray)
    embed = eval_per_layer(layer=l)
    d =  weat_effect_size(x_words, y_words, m_words, f_words, embed)
    p = weat_p_value(x_words, y_words, m_words, f_words, embed, sample=1000)
    # densray: first row is l=-1 (flag on for all layers), then one row per single layer 0..nlayer-1
    for l in range(-1, nlayer):
        # a fresh model is loaded for every setting
        embed = eval_per_layer(layer=l)
        d_densray =  weat_effect_size(x_words, y_words, m_words, f_words, embed)
        p_densray = weat_p_value(x_words, y_words, m_words, f_words, embed, sample=1000)
        # columns: baseline d, densray d, reduction in |d|, baseline p, densray p, change in p
        print(round(d,4), round(d_densray,4), round(abs(d)-abs(d_densray),4), 
              round(p,4), round(p_densray,4), round(p_densray-p,4))
    print('\n')

d    d_densray |d|-|d_densray|   p   p-densray   p_densray-p
1.5705 0.8143 0.7562 0.0 0.053 0.053
1.5705 1.5282 0.0423 0.0 0.0 0.0
1.5705 1.5007 0.0698 0.0 0.0 0.0
1.5705 1.5095 0.0609 0.0 0.0 0.0
1.5705 1.3763 0.1942 0.0 0.001 0.001
1.5705 1.4469 0.1236 0.0 0.0 0.0
1.5705 1.5604 0.01 0.0 0.0 0.0
1.5705 1.4713 0.0992 0.0 0.0 0.0
1.5705 1.4277 0.1428 0.0 0.0 0.0
1.5705 1.4517 0.1188 0.0 0.0 0.0
1.5705 1.4118 0.1587 0.0 0.0 0.0
1.5705 1.5418 0.0287 0.0 0.0 0.0
1.5705 1.5268 0.0437 0.0 0.0 0.0
1.5705 1.5443 0.0262 0.0 0.0 0.0
1.5705 1.4957 0.0748 0.0 0.0 0.0
1.5705 1.1701 0.4003 0.0 0.009 0.009
1.5705 1.0482 0.5222 0.0 0.018 0.018
1.5705 0.9637 0.6068 0.0 0.026 0.026
1.5705 0.9731 0.5974 0.0 0.024 0.024
1.5705 1.0097 0.5608 0.0 0.021 0.021
1.5705 1.0048 0.5657 0.0 0.021 0.021
1.5705 1.0219 0.5485 0.0 0.023 0.023
1.5705 0.9975 0.5729 0.0 0.027 0.027
1.5705 1.0536 0.5169 0.0 0.019 0.019
1.5705 1.0563 0.5142 0.0 0.019 0.019


d    d_densray |d|-|d_densray|   p   p-densray   p_densray-p
-0.40

d    d_densray |d|-|d_densray|   p   p-densray   p_densray-p
1.0083 0.1539 0.8544 0.02 0.353 0.333
1.0083 0.7002 0.3082 0.02 0.089 0.069
1.0083 0.6382 0.3702 0.02 0.116 0.096
1.0083 0.6179 0.3904 0.02 0.122 0.102
1.0083 0.8179 0.1904 0.02 0.056 0.036
1.0083 0.792 0.2163 0.02 0.065 0.045
1.0083 1.06 -0.0517 0.02 0.019 -0.001
1.0083 0.9506 0.0577 0.02 0.013 -0.007
1.0083 0.9207 0.0877 0.02 0.015 -0.005
1.0083 1.2032 -0.1949 0.02 0.002 -0.018
1.0083 1.0765 -0.0681 0.02 0.003 -0.017
1.0083 1.2116 -0.2033 0.02 0.006 -0.014
1.0083 0.9429 0.0654 0.02 0.034 0.014
1.0083 0.9473 0.061 0.02 0.033 0.013
1.0083 1.1504 -0.142 0.02 0.004 -0.016
1.0083 1.0471 -0.0387 0.02 0.0 -0.02
1.0083 0.9258 0.0825 0.02 0.001 -0.019
1.0083 0.9291 0.0793 0.02 0.001 -0.019
1.0083 1.0881 -0.0798 0.02 0.002 -0.018
1.0083 1.1217 -0.1134 0.02 0.001 -0.019
1.0083 1.0004 0.0079 0.02 0.005 -0.015
1.0083 0.9316 0.0767 0.02 0.007 -0.013
1.0083 0.7355 0.2729 0.02 0.074 0.054
1.0083 0.7182 0.2901 0.02 0.077 0.057
1.0083 0.788 