In [5]:
# Mount Google Drive at /content/drive; the data files loaded later in this
# notebook are read from a folder under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# from google.colab import drive
# drive.mount('/content/drive')
!pip install transformers
import pandas as pd
import os

# path
dir = '/content/drive/MyDrive/Colab Notebooks/lab2resources/sentence-completion'
# dataloader
question_data = pd.read_csv(os.path.join(dir,"testing_data.csv"))
answer_data = pd.read_csv(os.path.join(dir,"test_answer.csv"))




import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
import re

# Multiple-choice setup: each candidate column label ('a)' .. 'e)') maps to
# its 1-based option number; `model` is the pretrained checkpoint name that is
# handed to the tokenizer/model loaders.
multi_choices = {f'{letter})': number for number, letter in enumerate('abcde', start=1)}
model = 'distilroberta-base'

class EvaluateLanguageModel():
  """Score a masked language model on multiple-choice sentence completion.

  For every question, each candidate word is substituted into the blank, the
  candidate's tokens are replaced by the tokenizer's mask token, and the
  model's logits for the original tokens at the masked positions are averaged.
  The candidate with the highest average is the prediction.

  Attributes set by ``__init__``:
    question_data, answer_data, multi_choices: the constructor arguments.
    tokenizer, model: objects loaded with ``from_pretrained``.
    sent_encodings, word_encodings, mask_idxs: output of ``encoder()``.
  ``run_and_eval_model`` additionally sets ``accuracy``.
  """

  def __init__(self, question_data, answer_data, multi_choices, model):
    """Prepare the data and load the tokenizer and model.

    Args:
      question_data: DataFrame with a 'question' column and one column per key
        of ``multi_choices``. It is modified in place: the columns
        'candidate_question_data' and 'answer' are added.
      answer_data: DataFrame with an 'answer' column holding the option letter
        (without the closing parenthesis), in the same row order as
        ``question_data``.
      multi_choices: dict mapping candidate column labels to option numbers.
      model: checkpoint name or path passed to ``from_pretrained``.
    """
    # Keep the arguments on the instance so every method works on what the
    # caller passed in, not on module-level names that happen to exist.
    self.question_data = question_data
    self.answer_data = answer_data
    self.multi_choices = multi_choices
    print(len(question_data))
    self.questioning_answering_process()
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.model = AutoModelForMaskedLM.from_pretrained(model)
    self.sent_encodings, self.word_encodings, self.mask_idxs = self.encoder()

  def run_and_eval_model(self):
    """Predict an option for every question and store the score in ``self.accuracy``."""
    output = self.making_prediction()
    self.accuracy = self.getting_model_accuracy(output, self.question_data['answer'])

  def _fill_blank(self, question, word, s):
    """Return ``question`` with every match of the pattern ``s`` replaced by ``word``."""
    # A callable replacement inserts the word literally; passing the word
    # itself would make re.sub interpret backslashes in it as escapes.
    return re.sub(s, lambda _match: word, question)

  def questioning_answering_process(self, s='_____'):
    """Add the filled-in candidate sentences and the answer number to ``question_data``.

    Args:
      s: regular-expression pattern that marks the blank in a question.

    Side effects:
      Adds/overwrites two columns of ``self.question_data``:
      'candidate_question_data' (list of sentences, one per key of
      ``multi_choices`` in key order) and 'answer' (the ``multi_choices`` value
      of the correct option, or None when the letter is not a known option).
    """
    choice_columns = list(self.multi_choices.keys())
    answer_idxs, candidate_question_data = [], []
    # Rows are paired with the answer key by position, so a question frame
    # whose index is not 0..n-1 still lines up with its answers.
    for position, (_, row) in enumerate(self.question_data.iterrows()):
      answer = self.answer_data.iloc[position]['answer'] + ')'
      answer_idxs.append(self.multi_choices.get(answer))
      candidate_question_data.append(
          [self._fill_blank(row.loc['question'], row.loc[c], s) for c in choice_columns])
    # An object Series keeps each per-question list as a single cell value.
    self.question_data['candidate_question_data'] = pd.Series(
        candidate_question_data, index=self.question_data.index, dtype=object)
    self.question_data['answer'] = answer_idxs

  def getting_sublist_idxs_from_list(self, word, sentence):
    """Find the first occurrence of the id sequence ``word`` inside ``sentence``.

    Args:
      word: 1-D array of token ids to look for.
      sentence: 1-D array of token ids to search in.

    Returns:
      The list of positions covered by the first match, or None when ``word``
      is empty or does not occur in ``sentence``.
    """
    word = np.asarray(word)
    sentence = np.asarray(sentence)
    if word.size == 0:
      return None
    for start in np.where(sentence == word[0])[0]:
      start = int(start)
      stop = start + len(word)
      # array_equal also copes with a window cut short by the end of the
      # sentence (different length -> simply not equal).
      if np.array_equal(sentence[start:stop], word):
        return list(range(start, stop))
    return None

  def _locate_candidate(self, word, sentence_ids):
    """Tokenize ``word`` and find where those tokens sit in ``sentence_ids``.

    Returns:
      (encoded_word, positions) for the first encoding that is found.

    Raises:
      ValueError: if the word cannot be found in the tokenized sentence.
    """
    # The leading-space form is tried first (presumably how the tokenizer
    # encodes a word in the middle of a sentence - TODO confirm); the bare form
    # is the fallback, e.g. for a blank at the very start of the sentence.
    for text in (" " + word, word):
      encoded_word = self.tokenizer.encode(text, add_special_tokens=False)
      positions = self.getting_sublist_idxs_from_list(np.array(encoded_word), sentence_ids)
      if positions is not None:
        return encoded_word, positions
    # Failing loudly here matters: indexing the id tensor with None would
    # overwrite the whole sentence with mask tokens instead of one word.
    raise ValueError(f'Could not locate candidate {word!r} in the tokenized sentence')

  def encoder(self):
    """Tokenize every candidate sentence and mask the candidate word in it.

    Each sentence is encoded with special tokens and padded to 128 tokens; the
    positions of the candidate word are then overwritten with the mask token.
    NOTE(review): the first occurrence of the word's tokens is masked, which
    is assumed to be the blank position.

    Returns:
      (sent_encodings, word_encodings, mask_idxs): three lists with one entry
      per question, each entry a list with one item per candidate - the
      masked encoding, the candidate's token ids, and the masked positions.

    Raises:
      ValueError: if a candidate word cannot be found in its sentence.
    """
    choice_columns = list(self.multi_choices.keys())
    sent_encodings, word_encodings, mask_idxs = [], [], []
    for _, row in self.question_data.iterrows():
      _sent_encodings, _word_encodings, _mask_idxs = [], [], []
      for word, sentence in zip(row[choice_columns], row.loc['candidate_question_data']):
        encoded_sent = self.tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
                                                  padding='max_length', max_length=128, return_attention_mask=True)
        # Row 0 is a view into the batch tensor, so writing to it edits the encoding.
        input_ids = encoded_sent['input_ids'][0]
        encoded_word, tokens_to_mask_idx = self._locate_candidate(word, np.array(input_ids))
        input_ids[tokens_to_mask_idx] = self.tokenizer.mask_token_id
        _sent_encodings.append(encoded_sent)
        _word_encodings.append(encoded_word)
        _mask_idxs.append(tokens_to_mask_idx)
      sent_encodings.append(_sent_encodings)
      word_encodings.append(_word_encodings)
      mask_idxs.append(_mask_idxs)
    return sent_encodings, word_encodings, mask_idxs

  def making_prediction(self):
    """Pick the best-scoring candidate for every question.

    Returns:
      A list with one ``multi_choices`` value per question: the option whose
      masked tokens received the highest mean logit.
    """
    # Candidates were built in key order, so the values in the same order
    # translate a candidate position back to its option number.
    choice_values = list(self.multi_choices.values())
    output = []
    for q_idx, (w, s, m) in enumerate(zip(self.word_encodings, self.sent_encodings, self.mask_idxs)):
      print(f'Question {q_idx}')
      predictions = []
      candidate_input_ids = torch.stack([inp_ids['input_ids'].squeeze(0) for inp_ids in s])
      candidate_attention_masks = torch.stack([am['attention_mask'].squeeze(0) for am in s])
      # Inference only: skip autograd bookkeeping for the forward pass.
      with torch.no_grad():
        candidate_logits = self.model(candidate_input_ids, attention_mask=candidate_attention_masks).logits
      for idx, (token, mask_idxs) in enumerate(zip(w, m)):
        # Logit of each original token at the position where it was masked.
        mask_token_logits = candidate_logits[idx, mask_idxs, token]
        candidate_score = float(torch.mean(mask_token_logits))
        predictions.append(candidate_score)
      output.append(choice_values[int(np.argmax(predictions))])
    return output

  def getting_model_accuracy(self, predictions, ground_truth):
    """Return the fraction of ``ground_truth`` entries matched by ``predictions``.

    Pairs are compared position by position; an empty ``ground_truth`` gives 0.0
    instead of a division by zero.
    """
    total = len(ground_truth)
    if total == 0:
      return 0.0
    correct = sum(1 for pred, gt in zip(predictions, ground_truth) if pred == gt)
    return correct / total

# Re-read the CSVs so the evaluator starts from untouched frames (it adds
# columns to question_data in place).
question_data = pd.read_csv(os.path.join(dir,"testing_data.csv"))
answer_data = pd.read_csv(os.path.join(dir,"test_answer.csv"))
evaluator_model = EvaluateLanguageModel(question_data, answer_data, multi_choices, model)

evaluator_model.run_and_eval_model()

# Show the score itself: a bare expression in the middle of a cell displays
# nothing, and printing the evaluator only shows its default object repr.
print(f'Accuracy: {evaluator_model.accuracy:.4f}')

# Fill-in-the-blank testing: scratch tokenizer experiments (left commented out)

# from transformers import BertTokenizer
# tz = BertTokenizer.from_pretrained("bert-base-cased")
# tz.convert_tokens_to_ids(["characteristically"])
# [100]

# sent = "He remains characteristically confident and optimistic."
# tz.tokenize(sent)
# ['He',
#  'remains',
#  'characteristic',
#  '##ally',
#  'confident',
#  'and',
#  'optimistic',
#  '.']

# tz.convert_tokens_to_ids(tz.tokenize(sent))
# [1124, 2606, 7987, 2716, 9588, 1105, 24876, 119]

# tz.convert_tokens_to_ids(["murderous"])

# sent = "He remains kensington confident and optimistic."
# tz.tokenize(sent)

# sent="mystic inevitable police customary dreary"

# tz.tokenize(sent)

# tz.convert_tokens_to_ids(tz.tokenize(sent))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

Downloading config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

Question 0
Question 1
Question 2
Question 3
Question 4
Question 5
Question 6
Question 7
Question 8
Question 9
Question 10
Question 11
Question 12
Question 13
Question 14
Question 15
Question 16
Question 17
Question 18
Question 19
Question 20
Question 21
Question 22
Question 23
Question 24
Question 25
Question 26
Question 27
Question 28
Question 29
Question 30
Question 31
Question 32
Question 33
Question 34
Question 35
Question 36
Question 37
Question 38
Question 39
Question 40
Question 41
Question 42
Question 43
Question 44
Question 45
Question 46
Question 47
Question 48
Question 49
Question 50
Question 51
Question 52
Question 53
Question 54
Question 55
Question 56
Question 57
Question 58
Question 59
Question 60
Question 61
Question 62
Question 63
Question 64
Question 65
Question 66
Question 67
Question 68
Question 69
Question 70
Question 71
Question 72
Question 73
Question 74
Question 75
Question 76
Question 77
Question 78
Question 79
Question 80
Question 81
Question 82
Question 83
Qu