In [None]:
import os

import numpy as np
import torch


In [None]:
from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertForQuestionAnswering,
    BertModel,
)

# Tasks - bert, coref, ner, nli, paraphrase, qa, sa, srl, ss, sum, wsd
task = "bert"

# Maps every supported task name to (model class, checkpoint).
# The tokenizer is always loaded with AutoTokenizer from the same checkpoint
# as the model. "wsd" points at a local directory rather than a hub id.
TASK_MODELS = {
    "bert": (BertModel, "bert-base-uncased"),
    "coref": (AutoModel, "nielsr/coref-bert-base"),
    "ner": (AutoModelForTokenClassification, "dslim/bert-base-NER"),
    "nli": (AutoModel, "sentence-transformers/bert-base-nli-mean-tokens"),
    "paraphrase": (AutoModelForMaskedLM, "bert-base-cased-finetuned-mrpc"),
    "qa": (BertForQuestionAnswering, "deepset/bert-base-cased-squad2"),
    "sa": (
        AutoModelForSequenceClassification,
        "barissayil/bert-sentiment-analysis-sst",
    ),
    "srl": (AutoModel, "liaad/srl-en_mbert-base"),
    "ss": (
        AutoModelForTokenClassification,
        "vblagoje/bert-english-uncased-finetuned-chunk",
    ),
    "sum": (AutoModelForSeq2SeqLM, "lidiya/bart-base-samsum"),
    "wsd": (AutoModel, "./bert-wsd"),
}

# Fail here, with a clear message, instead of leaving `tokenizer`/`model`
# undefined (or stale from an earlier run) when the task name is mistyped.
if task not in TASK_MODELS:
    raise ValueError(
        f"Unknown task {task!r}; expected one of {sorted(TASK_MODELS)}"
    )

model_class, checkpoint = TASK_MODELS[task]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = model_class.from_pretrained(checkpoint)


In [None]:
def read_text(path):
    """Return the full contents of the text file at `path`.

    The file is opened in text mode and closed as soon as it has been read.
    """
    with open(path, "r") as stimuli_file:
        return stimuli_file.read()


def split_sentences(text):
    """Split `text` into one sentence per line.

    A single trailing empty entry (produced when the text ends with a
    newline) is removed. The last line is kept when the text does NOT end
    with a newline, so no real sentence is lost.
    """
    lines = text.split("\n")
    if lines and lines[-1] == "":
        lines.pop()
    return lines


stimuli_384 = read_text(
    "./pereira_dataset/stimuli_384sentences_dereferencedpronouns.txt"
)
stimuli_243 = read_text(
    "./pereira_dataset/stimuli_243sentences_dereferencedpronouns.txt"
)

# Sentences from the first file come first, followed by those of the second.
sentences = split_sentences(stimuli_384) + split_sentences(stimuli_243)


In [None]:
# One entry per sentence; each entry is a tuple of per-layer hidden-state
# tensors as returned by the model.
features = []

# Inference only: without no_grad() every stored hidden state would keep its
# autograd graph alive, so memory grows with each sentence processed.
with torch.no_grad():
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt")
        outputs = model(**inputs, output_hidden_states=True)
        if task == "sum":
            # The seq2seq output exposes the encoder's last layer as a single
            # tensor. Wrap it in a 1-tuple so it has the same
            # "sequence of layer tensors" layout as the other tasks.
            features.append((outputs.encoder_last_hidden_state,))
        else:
            features.append(outputs.hidden_states)


In [None]:
def average_token_embeddings(layer_output):
    """Average one layer's token embeddings into a single sentence vector.

    Parameters
    ----------
    layer_output : torch.Tensor
        Hidden states of one layer for one sentence. Expected layout is
        (1, num_tokens, hidden_size) -- the tokenizer is called on a single
        sentence at a time, so only batch element 0 is used.

    Returns
    -------
    numpy.ndarray
        float64 vector of length hidden_size: the mean over all tokens except
        the first and the last one (the positions where the tokenizer is
        expected to put its special tokens, e.g. [CLS]/[SEP]). If no tokens
        remain after dropping those two, numpy yields a NaN vector (with a
        RuntimeWarning).
    """
    # detach() allows conversion even if the tensor still carries a grad graph.
    token_embeddings = layer_output.detach().cpu().numpy().astype(np.float64)
    inner_tokens = token_embeddings[0, 1:-1]
    return inner_tokens.mean(axis=0)


# final_vectors[sentence_index][layer_index] -> averaged embedding vector
final_vectors = []
for sentence in features:
    # A bare tensor (rather than a tuple of layer tensors) is treated as a
    # single layer, so it is not iterated along its batch/token axes.
    if isinstance(sentence, torch.Tensor):
        sentence = (sentence,)
    final_vectors.append([average_token_embeddings(layer) for layer in sentence])


In [None]:
# Re-arrange the sentence-major nesting final_vectors[sentence][layer] into a
# layer-major array indexed as final_features[layer][sentence].
layer_count = len(final_vectors[0])
sentence_count = len(final_vectors)

final_features = np.array(
    [
        [final_vectors[sentence_idx][layer_idx] for sentence_idx in range(sentence_count)]
        for layer_idx in range(layer_count)
    ]
)
final_features.shape


In [None]:
# np.save does not create missing directories; make sure the output folder
# exists so the extracted features are not lost at the very last step.
output_path = f"./features/pereira_{task}.npy"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, final_features)
