In [1]:
import os
import numpy as np
import sklearn
import scipy
from tqdm import tqdm
from sklearn.model_selection import LeaveOneOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm
import random
import torch

In [2]:
def cosim(a, b):
    """Cosine similarity between the vectors `a` and `b`.

    Returns the integer 0 when the product of the two norms is zero, so the
    division below never sees a zero denominator.
    """
    scale = norm(a) * norm(b)
    if scale == 0:
        return 0
    return dot(a, b) / scale

In [3]:
def top_k_accuracy(y_hat, y_true, k = 5, THRESHOLD = 0.95):
    """Fraction of rows of `y_hat` with a close-enough row of `y_true` in their top `k`.

    For every row of `y_hat`, the cosine similarity to every row of `y_true`
    is computed and sorted in decreasing order. The row counts as a hit when
    at least one of the first `k` similarities is >= `THRESHOLD`.

    Note that a hit does not require the matching row of `y_true` to be the
    one that belongs to the prediction; only the similarity value is checked.

    Returns hits / number of rows in `y_hat`.
    """
    n_predictions = y_hat.shape[0]
    n_candidates = y_true.shape[0]

    hits = 0
    for row in range(n_predictions):
        # Similarity of this prediction to every candidate vector, best first
        scores = np.array([cosim(y_hat[row], y_true[col]) for col in range(n_candidates)])
        ranked = np.sort(scores)[::-1]

        if (ranked[:k] >= THRESHOLD).any():
            hits += 1

    return hits / n_predictions

In [4]:
random.seed(42)

def cross_validate_model(X, Y, n_splits=10, n_components=30):
    """Cross-validate a StandardScaler + PLSRegression mapping from X to Y.

    The rows of X and Y are shuffled with one shared permutation drawn from
    the global `random` module (seeded once, when this cell runs), then split
    into `n_splits` consecutive folds. For each fold a fresh pipeline is
    fitted on the training rows and its predictions for the held-out rows are
    scored with `top_k_accuracy` against the full shuffled Y (train and test
    rows alike).

    X               Array-like of input vectors, one row per item
    Y               Array-like of target vectors, one row per item
    n_splits        Number of folds for KFold
    n_components    Number of PLS components

    Returns the mean of the per-fold scores.
    Raises ValueError if X and Y do not have the same number of rows.
    """
    X, Y = np.asarray(X), np.asarray(Y)
    if len(X) != len(Y):
        raise ValueError(
            f"X and Y must have the same number of rows, got {len(X)} and {len(Y)}"
        )

    # Shuffle together: one permutation of the row indices, applied to both arrays
    order = list(range(len(X)))
    random.shuffle(order)
    X, Y = X[order], Y[order]

    # Cross validation
    crossVal = KFold(n_splits=n_splits)
    accuracies = []

    # The context manager closes the bar even if a fold raises
    with tqdm(total=crossVal.get_n_splits(X)) as progress:
        for train_index, test_index in crossVal.split(X):

            model = make_pipeline(StandardScaler(), PLSRegression(n_components=n_components))
            model.fit(X[train_index], Y[train_index])

            # Each predicted vector is compared against every gold-standard vector in Y
            y_hat = model.predict(X[test_index])
            accuracies.append(
                top_k_accuracy(y_hat, Y)
            )
            progress.update(1)

    return np.mean(accuracies)

### Binder features

In [5]:
filename = os.path.join("data", "binder_semantic_features", "word_ratings", "WordSet1_Ratings.csv")

# Map each word (CSV column 1) to its vector of ratings (CSV columns 5..68).
binder_features = {}
with open(filename, "r") as file:
    rows = file.readlines()

for line in rows[1:]:  # the first line is skipped
    if ",na," in line:  # NOTE: incomplete data issue -> the whole row is dropped
        continue
    fields = line.split(",")
    word = fields[1]
    features = [0 if x == "na" else float(x) for x in fields[5:69]]
    binder_features[word] = np.array(features)

In [6]:
# Number of words kept after dropping rows that contain ",na,"
len(binder_features)

434

In [7]:
from modules.lm import LanguageModel

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at model construction.
device = "cuda" if torch.cuda.is_available() else "cpu"
lm = LanguageModel(device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 1. GloVe embeddings performance

In [None]:
def get_word_activations(path, skip_lines=0):
    """
        Returns a dictionary of word vectors read from a space-separated text file
        path            Path to a UTF-8 text file with one entry per line: a word
                        followed by its space-separated float components
        skip_lines      Number of leading lines to skip (e.g. a header)
        data            Returned dictionary with key ['word'] -> {'activations': np.ndarray}

        Blank lines are ignored, as are empty fields produced by trailing or
        repeated spaces. If a word occurs more than once, the last line wins.
    """
    data = {}
    # Explicit UTF-8: otherwise the platform default encoding is used, which
    # can fail or mis-decode non-ASCII words (e.g. on Windows).
    with open(path, "r", encoding="utf-8") as datafile:
        lines = datafile.readlines()[skip_lines:] # skip header
    for line in tqdm(lines):
        # Split on single spaces only, so words containing other whitespace
        # characters stay intact.
        fields = line.rstrip("\r\n").split(" ")
        word = fields[0]
        if not word:
            continue
        activations = np.array([float(x) for x in fields[1:] if x])
        data[word] = {"activations": activations}
    return data

In [None]:
# GloVe vectors file; no header line is skipped
filename = os.path.join("data", "glove.6B", "glove.6B.100d.txt")
glove_embeddings = get_word_activations(filename, skip_lines=0)

In [None]:
# Words that have both a GloVe vector and Binder ratings, in GloVe file order
common_keys = [word for word in glove_embeddings if word in binder_features]
len(common_keys) # almost all the binder features!

In [None]:
# Row i of X and row i of Y describe the same word, common_keys[i]
X = np.array(
    [glove_embeddings[key]["activations"] for key in common_keys]
).astype(np.float32) # word embeddings
Y = np.array(
    [binder_features[key] for key in common_keys]
).astype(np.float32) # binder features

X.shape, Y.shape

In [None]:
# Persist the aligned GloVe / Binder matrices
experiments_dir = os.path.join("data", "experiments")
np.save(os.path.join(experiments_dir, "glove_binder_X.npy"), X)
np.save(os.path.join(experiments_dir, "glove_binder_Y.npy"), Y)

In [None]:
# Mean cross-validated score of the GloVe -> Binder mapping
cross_validate_model(X, Y)

### Random embeddings performance

In [None]:
# Gold-standard matrix: one row of Binder ratings per word, in dictionary order
Y = np.array(list(binder_features.values())).astype(np.float32) # binder features

# Random baseline: 300-dimensional uniform noise in [0, 1), one row per word.
# A dedicated seeded generator makes the baseline reproducible across runs
# (random.seed does not affect NumPy's random numbers).
baseline_rng = np.random.default_rng(42)
random_X = baseline_rng.random((Y.shape[0], 300))

In [None]:
# Mean cross-validated score when the inputs are random vectors (baseline)
cross_validate_model(random_X, Y)

### 2. BERT non-contextual embeddings performance

In [None]:
# Each Binder word is passed to the language model as both arguments
# (presumably word and context -- confirm against modules.lm), i.e. the word
# is embedded without any surrounding sentence.
embeddings = []
ratings = []
for word in tqdm(binder_features.keys()):
    embeddings.append(lm.get_contextualized_word_embedding(word, word))
    ratings.append(binder_features[word])

X = np.array(embeddings).astype(np.float32) # word embeddings
Y = np.array(ratings).astype(np.float32) # binder features

In [None]:
# Persist the aligned non-contextual BERT / Binder matrices
experiments_dir = os.path.join("data", "experiments")
np.save(os.path.join(experiments_dir, "bert_noctx_binder_X.npy"), X)
np.save(os.path.join(experiments_dir, "bert_noctx_binder_Y.npy"), Y)

In [None]:
# Mean cross-validated score of the non-contextual BERT -> Binder mapping
cross_validate_model(X, Y)

### BERT contextual embeddings performance

"Because they produce token vectors, following the method proposed by Bommasani, Davis, and Cardie (2020) and Vulić et al. (2020), we created type representations by randomly sampling 1,000 sentences for each target word from the Wikipedia corpus. We generated a contextualized embedding for each word token by feeding the sentence to the publicly available pre-trained models of ELMo and BERT and taking the token vector of the output layer. Finally, an embedding for each word was obtained by averaging its 1,000 contextualized vectors."

In [8]:
from datasets import load_dataset
# Load the "wikipedia" dataset with the "20220301.en" configuration; used below
# as the source of context sentences for each word.
wikipedia = load_dataset("wikipedia", "20220301.en")

Found cached dataset wikipedia (C:/Users/xdieg/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
import random
from copy import deepcopy
from tqdm import tqdm

def get_context_sentences(word, dataset, rnd = 0.5, max_words = 1000, progress = False, patience = 500, max_sentence_len = 200):
    """ Search word in db and grab the context sentences

        word                Target word; matched case-sensitively as the substring
                            " word " (padded with one space on each side)
        dataset             Iterable of rows that support row["text"]
        rnd                 Probability of using a row whose text contains the word
        max_words           Maximum number of sentences returned (despite its name)
        progress            Show a tqdm bar that counts collected sentences
        patience            Stop after this many consecutive rows that were not used
                            (word absent, or row skipped by the random sampling)
        max_sentence_len    Sentences with this many or more space-separated tokens
                            are discarded

        Returns the list of collected sentences, obtained by splitting the row
        text on "." after replacing newlines with spaces.
    """
    needle = f" {word} "
    word_sentences = []
    unused_rows = 0  # consecutive rows that contributed nothing

    progress_bar = tqdm(total=max_words) if progress else None
    try:
        for row in dataset:
            if len(word_sentences) >= max_words or unused_rows >= patience:
                break

            # Use a row that contains the word with probability `rnd` (random sampling)
            if needle in row["text"] and random.uniform(0, 1) <= rnd:
                unused_rows = 0
                text = row["text"].replace("\n", " ")
                # Pick all sentences containing that word
                for sentence in text.split("."):
                    if len(word_sentences) >= max_words:
                        break
                    if needle in sentence and len(sentence.split(" ")) < max_sentence_len:
                        word_sentences.append(sentence)
                        if progress_bar is not None:
                            progress_bar.update(1)
            else:
                unused_rows += 1
    finally:
        # Always release the bar, including on early exit or error
        if progress_bar is not None:
            progress_bar.close()
    return word_sentences

In [10]:
# Rows of the "train" split are the search space for context sentences
dataset = wikipedia["train"]

**Fetching 1000 contexts for each Binder word**

In [11]:
import torch

# One entry per Binder word, in dictionary order: the result of
# lm.get_contextualized_embeddings for the sentences sampled for that word.
# tqdm wraps the iteration directly, so the bar always matches the loop and is
# closed when the loop ends.
words = []
for word in tqdm(binder_features.keys()):
    context_sentences = get_context_sentences(word, dataset, progress=False, patience=5e3)
    words.append(lm.get_contextualized_embeddings(word, context_sentences, batch_size=4))

  7%|▋         | 32/434 [03:24<1:08:12, 10.18s/it]

**Clustering & visualizing the meanings for a given word**

In [None]:
# Pair each word with its embeddings. zip() stops at the shorter input, so this
# yields at most 38 pairs and never uses the last entry of `words`.
# NOTE(review): the 38 and the [:-1] look tied to one particular (interrupted?)
# run of the fetching loop -- confirm before relying on this after a re-run.
contextualized_words = list(zip(list(binder_features.keys())[:38], words[:-1]))
len(contextualized_words)

In [None]:
# Select one word (by position) and one layer (1-based) to inspect
idx, layer = 21, 6
word, word_layers = contextualized_words[idx]
# assumes word_layers is indexable by layer and holds tensors exposing .numpy() -- TODO confirm
layer_ctx_word = word_layers[layer - 1].numpy()
labels, meaning = lm.get_dominant_meaning(layer_ctx_word)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the selected layer's embeddings. random_state pins
# the random initialisation so the figure is the same on every run.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3, random_state=42).fit_transform(layer_ctx_word)

# One scatter series per label; also track the label with the most points
# (the first one wins on ties).
dominant_label = None
max_l = -1
for label in np.unique(labels):
    # Named `members` rather than `idx`, so the word index `idx` chosen in the
    # previous cell is not overwritten.
    members = np.where(labels == label)[0]
    if len(members) > max_l:
        max_l = len(members)
        dominant_label = label
    plt.scatter(X_embedded[members, 0], X_embedded[members, 1], label=label)

plt.title(f"layer: {layer} / word: '{word}' / # embeds: {layer_ctx_word.shape[0]}")
plt.legend()
plt.show()

In [None]:
# np.save(os.path.join("data", "experiments", "bert_binder_X.npy"), X)
# np.save(os.path.join("data", "experiments", "bert_binder_Y.npy"), Y)
# cross_validate_model(X, Y)