In [118]:
import pandas as pd
from minicons import cwe
from torch.utils.data import DataLoader
import torch
from collections import defaultdict
from tqdm import tqdm
import re
from model import FFNModule, FeatureNormPredictor, FFNParams, TrainingParams

In [119]:
# first want to clean the data
# uploading both csvs

do_df = pd.read_csv("dative_data/direct_object.csv")
po_df = pd.read_csv("dative_data/prepositional.csv")

In [120]:
# convert csvs into lists of sentence word pairs
do_list_natural = do_df[['sentence','recipient']].values.tolist()
do_list_alternate = do_df[['alternant','recipient']].values.tolist()
po_list_natural = po_df[['sentence','recipient']].values.tolist()
po_list_alternate = po_df[['alternant','recipient']].values.tolist()

In [121]:
do_list_natural[1]

['Rodolfo gives Mimi the pink bonnet he bought her, which he has kept as a souvenir of their love.',
 'mimi']

In [122]:
do_list_natural[1][1] in do_list_natural[1][0]

False

In [123]:
def _find_word_form(word, sentence):
  '''find how word occurs in sentence'''
  matches = re.finditer(word, sentence.lower())
  for match in matches:
    span = match.span()
    if (span[0] == 0 or sentence[span[0]-1] == ' ') and (span[1] == len(sentence) or sentence[span[1]] == ' ' or sentence[span[1]] == '.' or sentence[span[1]] == '!' or sentence[span[1]] == '?' or sentence[span[1]] == ',' or sentence[span[1]] == ';'):
      return tuple(span)
  return None
  print(f"no match found for {word} in {sentence}")
  raise Exception("no match found")

In [124]:
# have to convert the 'words' to spans
datasets = [do_list_natural, do_list_alternate, po_list_natural, po_list_alternate]
datasets_spans = []
for set in datasets:
    new_data = []
    for pair in set:
        sentence = pair[0]
        word = pair[1]

        # get the word's span
        wordspan = _find_word_form(word, sentence)
        if wordspan: 
            new_data.append((sentence, torch.tensor(wordspan)))
        else:
            print(f"no span found for {word} in {sentence}")
    datasets_spans.append(new_data)

no span found for the members of the sarcastic gamer community (the forums) in The Sarcastic Gamer Community Podcast is not a podcast, but an initiative designed to give the members of the Sarcastic Gamer Community (the forums) a chance to make their own podcast and have it featured on the site.
no span found for the members of the sarcastic gamer community (the forums) in The Sarcastic Gamer Community Podcast is not a podcast, but an initiative designed to give a chance to make their own podcast and have it featured on the site to the members of the Sarcastic Gamer Community (the forums).
no span found for many of the senators and their men , men who once considered him an enemy in Nor did Ptolemy take into account that Caesar was granting amnesty to many of the senators and their men, men who once considered him an enemy.
no span found for this 40 year travesty of justice in In May 2007 West joined a demonstration against "injustices faced by the Palestinian people resulting from the

In [49]:
"the members of the sarcastic gamer community (the forums)" in "The Sarcastic Gamer Community Podcast is not a podcast, but an initiative designed to give the members of the Sarcastic Gamer Community (the forums) a chance to make their own podcast and have it featured on the site.".lower()

True

In [85]:
len(datasets_spans[0])

993

In [53]:
re.findall("the members of the sarcastic gamer community (the forums)", "The Sarcastic Gamer Community Podcast is not a podcast, but an initiative designed to give the members of the Sarcastic Gamer Community (the forums) a chance to make their own podcast and have it featured on the site.".lower())

[]

In [None]:
# extract all the embeddings and do predictions
lms = ['bert-base-uncased', 'albert-xxlarge-v2', 'roberta-base']
# hierarchy is model: dataset: layer: average feature prediction vector
preds_per_model = {'bert-base-uncased': [], 'albert-xxlarge-v2': [], 'roberta-base': []}
num_features = 65 # binder
for model in lms:
    # load lm for embedding extraction
    lm = cwe.CWE(model)
    # load my models
    nick_name = str.split(model, "-")[0]
    classifiers = []
    for layer in range(13):
        path = f"saved_models/{nick_name}_models_all/{nick_name}_to_binder_layer{layer}.ckpt"
        my_mo = FeatureNormPredictor.load_from_checkpoint(
            checkpoint_path=path,
            map_location={"cuda:1":"cpu", "cuda:2":"cpu"}
        )
        my_mo.eval()
        classifiers.append(my_mo)
    
    for set in datasets_spans:
        # initialize a dictionary for our predictions to live in
        layerwise_predictions = defaultdict(lambda : torch.zeros(num_features).unsqueeze(0))

        dl = DataLoader(set, batch_size=16)
        # tqdm is progress bar
        for batch in tqdm(dl):
            sentences, words = batch
            batched_query = list(zip(sentences, words))
            # we have a 3d tensor: layer: sentence: embedding
            layer_embs = lm.extract_representation(batched_query, layer='all')
            for layer, embs in enumerate(layer_embs):
                for i in range(0, embs.size(0)):
                    emb = embs[i]
                    # this is a nan check
                    if layer == 0:
                        query = batched_query[i]
                        if emb.isnan().any():
                            print("nan detected in extracted embeddings, offending query:")
                            print(f"query: {query}")
                            raise Exception("nan found in embedding")
                    # perform prediction on this embedding
                    my_mo = classifiers[layer]
                    pred = torch.nn.functional.relu(my_mo(emb))
                    pred = pred.squeeze(0)
                    # add this to the calculation
                    layerwise_predictions[layer] += (pred/len(set))

        layerwise_predictions = dict(layerwise_predictions)
        preds_per_model[model].append(layerwise_predictions)


100%|██████████| 63/63 [00:09<00:00,  6.63it/s]
100%|██████████| 63/63 [00:09<00:00,  6.68it/s]
100%|██████████| 59/59 [00:09<00:00,  6.19it/s]
100%|██████████| 60/60 [00:09<00:00,  6.17it/s]
100%|██████████| 63/63 [02:53<00:00,  2.76s/it]
100%|██████████| 63/63 [02:53<00:00,  2.75s/it]
100%|██████████| 59/59 [02:54<00:00,  2.96s/it]
100%|██████████| 60/60 [02:58<00:00,  2.98s/it]
  0%|          | 0/63 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [106]:
# checking that structure is as expected
# embs_per_model[model][dataset][layer][int] should give a 1 d vector
preds_per_model['bert-base-uncased'][0][8]

tensor([[2.8957, 0.5429, 0.4082, 0.8506, 1.0457, 1.0956, 1.0349, 1.3926, 2.0159,
         0.6957, 0.6467, 2.0708, 1.0457, 1.6196, 1.9443, 1.3422, 0.4986, 1.1414,
         1.5599, 0.4212, 1.2448, 0.7276, 0.3622, 0.4850, 1.1003, 0.4663, 1.5891,
         0.4010, 0.6296, 1.0965, 1.4279, 0.6373, 1.6188, 0.7161, 0.5618, 1.3935,
         1.4818, 0.5627, 0.4510, 0.7963, 0.5461, 0.7202, 0.7615, 0.5377, 0.9508,
         1.2000, 1.4229, 1.6378, 1.2033, 1.0763, 0.9351, 2.0503, 0.9261, 1.7071,
         0.6029, 1.4353, 0.4673, 0.4514, 0.3626, 0.4858, 0.7164, 1.2765, 1.4978,
         1.6717, 1.4249]], grad_fn=<AsStridedBackward0>)

In [15]:
# predict features for each sentence in each list and average


["I' ll give you a tip.", 'you']