In [12]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
Collecting filelock
  Using cached filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2.tar.gz (359 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tokenizers
  Building wheel for tokenizers (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tokenizers [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[51 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_

In [14]:
import json
import os

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import ElectraForQuestionAnswering
from transformers import ElectraTokenizerFast


ModuleNotFoundError: No module named 'transformers'

In [4]:
class QADataset(Dataset):
    """Dataset that turns a SQuAD-style JSON file into tokenized QA inputs.

    The JSON file is read once at construction time and tokenized as
    (context, question) pairs. In 'test' mode the question ids are kept next to
    the encodings; in every other mode the answers are kept and token-level
    ``start_positions`` / ``end_positions`` are added to the encodings.

    Args:
        data_dir: Path of the JSON file to load (despite the name, a file path).
        tokenizer: Callable tokenizer. The returned encodings must provide
            ``char_to_token``, so a "fast" tokenizer is required for non-test modes.
        max_seq_len: Value passed to the tokenizer as ``max_length``.
        mode: 'train' keeps the leading part of the groups, 'val' keeps the
            trailing ``VAL_RATIO`` part, 'test' reads questions without answers.
    """

    # Fraction of the top-level groups held out for the 'val' split.
    VAL_RATIO = 0.1

    def __init__(self, data_dir: str, tokenizer, max_seq_len: int, mode='train'):
        self.mode = mode
        # Context manager so the file handle is closed as soon as parsing is done.
        with open(data_dir, 'r', encoding='utf8') as f:
            self.data = json.load(f)

        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

        if mode == 'test':
            self.encodings, self.question_ids = self.preprocess()
        else:
            self.encodings, self.answers = self.preprocess()

    def __len__(self):
        """Return the number of encoded (context, question) examples."""
        return len(self.encodings.input_ids)

    def __getitem__(self, index: int):
        """Return every encoding field of example ``index`` as a tensor."""
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}

    def preprocess(self):
        """Read the raw data and tokenize it.

        Returns:
            ``(encodings, question_ids)`` in 'test' mode, otherwise
            ``(encodings, answers)`` where the encodings also carry
            ``start_positions`` and ``end_positions``.
        """
        contexts, questions, answers, question_ids = self.read_squad()
        if self.mode == 'test':
            encodings = self.tokenizer(contexts, questions, truncation=True, max_length=self.max_seq_len, padding=True)
            return encodings, question_ids

        # train or val: the data only provides 'answer_start', so derive 'answer_end' first.
        self.add_end_idx(answers, contexts)
        encodings = self.tokenizer(contexts, questions, truncation=True, max_length=self.max_seq_len, padding=True)
        self.add_token_positions(encodings, answers)
        return encodings, answers

    def read_squad(self):
        """Flatten the SQuAD-style structure into parallel lists.

        In 'train' / 'val' mode ``self.data['data']`` is replaced by the selected
        split (this in-place narrowing is kept from the original behaviour, so
        calling the method twice splits twice).

        Returns:
            ``(contexts, questions, answers, question_ids)``. ``answers`` stays
            empty in 'test' mode and ``question_ids`` stays empty otherwise.
        """
        contexts = []
        questions = []
        question_ids = []
        answers = []

        groups = self.data['data']

        # train - val split. The boundary is computed as an explicit index: a negative
        # slice such as [:-n] / [-n:] silently misbehaves when n == 0 (fewer than
        # 1 / VAL_RATIO groups), yielding an empty train set and a val set with everything.
        if self.mode in ('train', 'val'):
            split_at = len(groups) - int(len(groups) * self.VAL_RATIO)
            groups = groups[:split_at] if self.mode == 'train' else groups[split_at:]
            self.data['data'] = groups

        for group in groups:
            for passage in group['paragraphs']:
                context = passage['context']
                for qa in passage['qas']:
                    question = qa['question']
                    if self.mode == 'test':
                        contexts.append(context)
                        questions.append(question)
                        question_ids.append(qa['question_id'])
                        continue

                    # train or val: one example per listed answer.
                    for ans in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        if qa.get('is_impossible', False):
                            # Sentinel recognised by add_token_positions.
                            answers.append({'text': '', 'answer_start': -1})
                        else:
                            answers.append(ans)

        # return formatted data lists
        return contexts, questions, answers, question_ids

    def add_end_idx(self, answers, contexts):
        """Add 'answer_end' (exclusive char index) to every answer dict, in place.

        If the answer text is not found exactly at 'answer_start', offsets of
        -1, +1, -2, +2 characters are tried in that order and the first match
        fixes both 'answer_start' and 'answer_end'. When nothing matches the
        answer is left without 'answer_end'.
        """
        for answer, context in zip(answers, contexts):
            gold_text = answer['text']
            start_idx = answer['answer_start']
            end_idx = start_idx + len(gold_text)

            if context[start_idx:end_idx] == gold_text:
                answer['answer_end'] = end_idx
                continue

            # in case the indices are off 1-2 idxs
            for offset in (-1, 1, -2, 2):
                lo, hi = start_idx + offset, end_idx + offset
                # A negative start would wrap around to the end of the context and
                # could match there by accident, so such candidates are skipped.
                if lo >= 0 and context[lo:hi] == gold_text:
                    answer['answer_start'] = lo
                    answer['answer_end'] = hi
                    break  # keep the closest match instead of letting a farther one overwrite it

    def add_token_positions(self, encodings, answers):
        """Convert char-based answer spans to token positions and store them.

        Adds 'start_positions' and 'end_positions' to ``encodings`` via
        ``encodings.update``. Requires encodings with ``char_to_token``
        (i.e. produced by a fast tokenizer).

        Raises:
            AssertionError: if an answer that is not marked impossible has no 'answer_end'.
        """
        start_positions = []
        end_positions = []
        for i, answer in enumerate(answers):
            if answer['answer_start'] == -1:
                # set [CLS] token as answer if is_impossible
                # NOTE(review): the end label is 1 rather than 0; kept as in the original — confirm intended.
                start_positions.append(0)
                end_positions.append(1)
                continue

            assert 'answer_end' in answer.keys(), f'no answer_end at {i}'
            start = encodings.char_to_token(i, answer['answer_start'])
            end_char = answer['answer_end']
            end = encodings.char_to_token(i, end_char)

            # answer passage truncated
            if start is None:
                # Use the dataset's own tokenizer rather than a global named `tokenizer`.
                start = self.tokenizer.model_max_length

            # end position cannot be found, shift left until found; the walk stops at
            # char 0 so it can neither loop forever nor query a negative char index.
            shift = 1
            while end is None and shift <= end_char:
                end = encodings.char_to_token(i, end_char - shift)
                shift += 1
            if end is None:
                end = self.tokenizer.model_max_length

            start_positions.append(start)
            end_positions.append(end)

        # char-based -> token based
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [8]:
# Build the train / validation datasets and wrap them in batched DataLoaders.
# This cell relies on `tokenizer`, BATCH_SIZE, NUM_WORKERS, PIN_MEMORY and DROP_LAST
# being defined by an earlier cell; none of them is created here.

# Data directory: overridable through the MRC_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the default is the original location.
MRC_DATA_DIR = os.environ.get('MRC_DATA_DIR', '/Users/lhs/Desktop/GitHub/AIConnect_YDS_1/NLP_MRC/')

# torch.utils.data.Dataset : converts the raw data into model inputs
train_dataset = QADataset(data_dir=os.path.join(MRC_DATA_DIR, 'AI_train.json'),
                          tokenizer=tokenizer,
                          max_seq_len=512,
                          mode='train')
val_dataset = QADataset(data_dir=os.path.join(MRC_DATA_DIR, 'AI_val.json'),
                        tokenizer=tokenizer,
                        max_seq_len=512,
                        mode='val')

# torch.utils.data.DataLoader : returns the inputs batch by batch
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS,
                              shuffle=True,
                              pin_memory=PIN_MEMORY,
                              drop_last=DROP_LAST)

# NOTE(review): the validation loader also shuffles and honours DROP_LAST, as in the
# original cell — confirm this is intended for evaluation.
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=NUM_WORKERS,
                            shuffle=True,
                            pin_memory=PIN_MEMORY,
                            drop_last=DROP_LAST)

print(f"Load data, train:{len(train_dataset)} val:{len(val_dataset)}")

NameError: name 'tokenizer' is not defined