## Importing

In [1]:
import os
import sys

# Path variables.
GBSQA_PATH = './dataset/GBSQA/GBS_QA_FF.csv'
GBSQA_HOME_PATH = './dataset/GBSQA'
HF_CACHE_PATH = './HuggingFaceCache/models'  # where models downloaded from Hugging Face are stored
# Fixed: this used to be '/HuggingFaceCache/dataset' (filesystem root); every other path in this
# cell is relative to the working directory, so the dataset cache now sits next to the model cache.
DATA_CACHE_PATH = './HuggingFaceCache/dataset'  # where datasets downloaded from Hugging Face are stored

# The environment variables are assigned before the Hugging Face / torch imports that follow.
os.environ['TRANSFORMERS_CACHE'] = HF_CACHE_PATH  # Hugging Face model cache directory
os.environ['HF_DATASETS_CACHE'] = DATA_CACHE_PATH  # Hugging Face dataset cache directory
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'  # order the visible GPUs by PCI bus id
os.environ['CUDA_VISIBLE_DEVICES'] = '2'  # expose only the GPU assigned to this user
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import torch
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel
torch.cuda.current_device() # 0으로 출력되지만, 실제로 모델을 넣어 구동하면 지정한 그래픽카드에 할당될 것임.
import pickle
import evaluate

from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from aim.hugging_face import AimCallback
from aim import Run

# Reproducibility: feed the same seed to NumPy and to every PyTorch generator
# (CPU, current CUDA device, all CUDA devices), in that order.
random_seed = 42
for _seed_fn in (
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(random_seed)

# Disable cuDNN auto-tuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


### Pretraining 데이터 생성

In [2]:
def dataset_split(li, train_ratio=0.7, num_test=20):
    """Split a sequence into contiguous train / valid / test slices.

    The last ``num_test`` items form the test set, the first
    ``int(len(li) * train_ratio)`` items form the training set, and whatever
    lies between them is the validation set.

    The test set has priority: when the sequence is too short for the two
    ranges to be disjoint, the training slice is cut back so that no item
    appears in both train and test (the previous version let them overlap).

    Args:
        li: any sliceable sequence (list, NumPy array, ...).
        train_ratio: fraction of items requested for training.
        num_test: minimum number of items reserved for the test set.

    Returns:
        Tuple ``(train, valid, test)`` of slices of ``li``.
    """
    total = len(li)
    # Clamp at 0 so a sequence shorter than num_test does not produce a negative slice start.
    test_start = max(total - num_test, 0)
    # Never let the training slice run into the reserved test range.
    train_end = min(int(total * train_ratio), test_start)
    return li[:train_end], li[train_end:test_start], li[test_start:]

def save_shuffle_indices(shuffled_pair, original_pair):
    """Recover, for each shuffled pair, its position in the original list.

    Both arguments are sequences of 2-element pairs. Each shuffled pair is
    located in ``original_pair`` by comparing both elements, and the matching
    positions are returned as a squeezed NumPy array.
    """
    originals = np.array(original_pair)
    first_col, second_col = originals[:, 0], originals[:, 1]
    matches = [
        np.where((first_col == row[0]) & (second_col == row[1]))
        for row in np.array(shuffled_pair)
    ]
    return np.array(matches).squeeze()
    
def shuffle_array(array):
    """Shuffle ``array`` in place with ``np.random.shuffle`` and return that same object."""
    np.random.shuffle(array)
    return array

##
# get ready dataset
def load_preprocess_data(fpath: os.PathLike = './dataset/GBSQA/provision_intersected.csv'):
    """Load the GBSQA CSV, drop unusable rows and build a shuffled train/valid/test split.

    Args:
        fpath: CSV file to read. The argument used to be ignored in favour of a
            hard-coded path; the default resolves to that same file.

    Returns:
        Tuple ``(raw_dataset, valid_gbsqa, shuffle_indices)``:
        ``raw_dataset`` maps 'train' / 'valid' / 'test' to DataFrames holding the
        'Original answers' and 'provision_text' columns; ``valid_gbsqa`` is the
        filtered, de-duplicated frame; ``shuffle_indices`` is the array of row
        positions (into ``valid_gbsqa``) in shuffled order.
    """
    gbsqa = pd.read_csv(fpath)

    has_provision = gbsqa['provision'].notna()  # exclude rows without a provision entry
    has_label = gbsqa['Classification'].notna()  # keep only rows that carry a classification
    # Non-inplace drop_duplicates: avoids mutating a filtered slice of `gbsqa`.
    valid_gbsqa = gbsqa[has_provision & has_label].drop_duplicates(
        subset=['Original answers', 'provision_text']
    )

    text_pair = valid_gbsqa[['Original answers', 'provision_text']]
    pair_tuples = list(text_pair.itertuples(index=False, name=None))
    shuffled_pair = shuffle_array(pair_tuples.copy())
    # Positions are recovered by matching pair contents; the de-duplication above
    # is on the same two columns, so each pair occurs once.
    shuffle_indices = save_shuffle_indices(shuffled_pair, pair_tuples)

    train_indices, valid_indices, test_indices = dataset_split(shuffle_indices)
    raw_dataset = {
        'train': text_pair.iloc[train_indices],
        'valid': text_pair.iloc[valid_indices],
        'test': text_pair.iloc[test_indices],
    }

    return raw_dataset, valid_gbsqa, shuffle_indices

# Build the splits once; the classification cell below reuses `valid_gbsqa` and `shuffle_indices`.
raw_dataset, valid_gbsqa, shuffle_indices = load_preprocess_data()

### Classification dataset 생성

In [3]:
# Tokenizer loaded from the local model cache directory.
class_tokenizer = AutoTokenizer.from_pretrained(HF_CACHE_PATH + '/xlm_roberta_gbsqa_classification/tokenizer')

# Map the textual labels to integers; any value other than 'Yes' / 'No' becomes NaN.
labels = valid_gbsqa['Classification']
zero_one = labels.map({'Yes':1, 'No':0})

# Keep only rows with a recognised label. A new frame is built with .loc/.assign instead of
# assigning a column on a filtered slice, which raises pandas' SettingWithCopyWarning.
valid_gbsqa = valid_gbsqa.loc[zero_one.notna()].assign(Classification=zero_one)

# `shuffle_indices` holds row positions computed before this filter. If rows were dropped
# (or this cell was run twice, which maps every label to NaN), the positional lookup
# further down would be out of bounds, so fail here with an explicit message.
if len(valid_gbsqa) != np.size(shuffle_indices):
    raise ValueError(
        "Label filtering changed the row count of valid_gbsqa, so shuffle_indices no longer "
        "matches it; re-run the data-loading cell and check for labels other than 'Yes'/'No'."
    )

def shuffle_array(array):
    """Shuffle ``array`` in place with ``np.random.shuffle`` and return that same object.

    Repeats the definition from the pretraining-data cell.
    """
    np.random.shuffle(array)
    return array

def tokenize_preprocess_classification(examples):
    """Tokenize one example with ``class_tokenizer``.

    ``examples`` is unpacked into the tokenizer's positional arguments; the
    output is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=512)
    return class_tokenizer(*examples, **tokenizer_options)

def dataset_split(li, train_ratio=0.7, num_test=20):
    """Split a sequence into contiguous train / valid / test slices.

    The last ``num_test`` items form the test set, the first
    ``int(len(li) * train_ratio)`` items form the training set, and whatever
    lies between them is the validation set.

    The test set has priority: when the sequence is too short for the two
    ranges to be disjoint, the training slice is cut back so that no item
    appears in both train and test (the previous version let them overlap).

    Args:
        li: any sliceable sequence (list, NumPy array, ...).
        train_ratio: fraction of items requested for training.
        num_test: minimum number of items reserved for the test set.

    Returns:
        Tuple ``(train, valid, test)`` of slices of ``li``.
    """
    total = len(li)
    # Clamp at 0 so a sequence shorter than num_test does not produce a negative slice start.
    test_start = max(total - num_test, 0)
    # Never let the training slice run into the reserved test range.
    train_end = min(int(total * train_ratio), test_start)
    return li[:train_end], li[train_end:test_start], li[test_start:]

def save_shuffle_indices(shuffled_pair, original_pair):
    """Recover, for each shuffled pair, its position in the original list.

    Each 2-element pair of ``shuffled_pair`` is located in ``original_pair`` by
    comparing both elements; the matching positions come back as a squeezed
    NumPy array.
    """
    reference = np.array(original_pair)
    col_a = reference[:, 0]
    col_b = reference[:, 1]
    located = []
    for candidate in np.array(shuffled_pair):
        located.append(np.where((col_a == candidate[0]) & (col_b == candidate[1])))
    return np.array(located).squeeze()


# Take (answer, question, label) rows in the shuffled order computed by the data-loading cell,
# as plain tuples.
text_pair = list(
    valid_gbsqa[['Original answers', 'Revised questions', 'Classification']]
    .iloc[shuffle_indices]
    .itertuples(index=False, name=None)
)
tokenized_classification = {}
# The rows are round-tripped through a NumPy array before splitting
# (presumably why labels are parsed with int(float(...)) further down).
tp_np = np.array(text_pair)

(
    raw_dataset_classification_train,
    raw_dataset_classification_valid,
    raw_dataset_classification_test,
) = dataset_split(tp_np.tolist())

class GBSQADataset(torch.utils.data.Dataset):
    """Dataset over a list of per-example encoding dicts.

    Every value of the selected dict is converted to a ``torch.Tensor`` when
    an item is fetched.
    """

    def __init__(self, encodings):
        self.encodings = encodings
        self.length = len(encodings)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        record = self.encodings[idx]
        tensors = {}
        for field, value in record.items():
            tensors[field] = torch.tensor(value)
        return tensors

def _encode_classification_rows(rows):
    """Tokenize (text_a, text_b, label) rows and wrap the result in a GBSQADataset."""
    return GBSQADataset([
        dict({'labels': int(float(label))}, **tokenize_preprocess_classification([text_a, text_b]))
        for text_a, text_b, label in rows
    ])


# Same processing for each split, in train / valid / test order.
for _split_name, _split_rows in (
    ('train', raw_dataset_classification_train),
    ('valid', raw_dataset_classification_valid),
    ('test', raw_dataset_classification_test),
):
    tokenized_classification[_split_name] = _encode_classification_rows(_split_rows)


### Model pretraining on revised datasets

### Baseline model loading

In [4]:
# Usable only after the "Importing" cell has been executed.
# Short aliases for the Hugging Face checkpoints this notebook can start from.
model_repository = {
    'roberta': 'roberta-base',
    "deberta": "microsoft/deberta-v3-base",
    "xlm-roberta": "xlm-roberta-base",
    "xlm-large": "xlm-roberta-large",
}
model_selection = "xlm-large"
new_tokenizer = AutoTokenizer.from_pretrained(
    HF_CACHE_PATH + '/xlm_roberta_gbsqa_classification/tokenizer'
)
# Start from the published pre-trained checkpoint with a masked-LM head.
model = AutoModelForMaskedLM.from_pretrained(model_repository[model_selection])


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def tokenize_preprocess(examples):
    """Tokenize one example with ``new_tokenizer`` (defined in the model-loading cell).

    ``examples`` is unpacked into the tokenizer's positional arguments; the
    output is truncated to at most 512 tokens and is not padded here.
    """
    options = {'truncation': True, 'max_length': 512}
    return new_tokenizer(*examples, **options)
# Rows of each split as a list of per-row arrays.
train = list(raw_dataset['train'].to_numpy())
valid = list(raw_dataset['valid'].to_numpy())
tokenized_train = [tokenize_preprocess(pair) for pair in train]
tokenized_valid = [tokenize_preprocess(pair) for pair in valid]
# The validation examples are appended to the training list, so the list handed to the
# Trainer as train_dataset contains both.
tokenized_train += tokenized_valid


In [6]:

# Use the EOS token as the padding token for the MLM collator.
# NOTE(review): confirm this override is intended if the tokenizer already defines a pad token.
new_tokenizer.pad_token = new_tokenizer.eos_token # for MLM
# Masks 15% of the tokens for the masked-language-modelling objective and returns PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm_probability=0.15, return_tensors='pt')
# Training setting
# Freeze the DeBERTa backbone to save memory when that model is selected.
# Fixed: the check compared the `model_repository` dict with a string, which is always False;
# the selected alias lives in `model_selection`.
if model_selection == 'deberta':
    for param in model.deberta.parameters():
        param.requires_grad = False

# Log the run to Aim under a dedicated experiment name.
aim_callback = AimCallback(experiment="xlm-roberta-large model pretraining 512 tokens")

# Training arguments: no evaluation and no checkpoint saving during the run.
training_args = TrainingArguments(
    output_dir=HF_CACHE_PATH + '/xlm_roberta_gbsqa_classification/large_model/pretraining',
    # optimisation
    num_train_epochs=100,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    # bookkeeping
    logging_steps=1000,
    evaluation_strategy='no',
    save_strategy='no',
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    callbacks=[aim_callback],
)




ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Run the masked-LM pretraining with the Trainer configured in the previous cell.
trainer.train() # training

In [None]:
# Persist the pretrained weights; the encoding section reloads the model from this same directory.
trainer.model.save_pretrained(os.path.join(HF_CACHE_PATH, 'xlm_roberta_gbsqa_classification/revised_dataset_large_model/pretrained_tv'))

## Encoding text from pretrained model

### Generating encoded text for the classifier

In [None]:
# Load the pretrained checkpoint saved above, move it to the GPU and switch it to eval mode.
pretrained_bert = AutoModel.from_pretrained(
    os.path.join(HF_CACHE_PATH, "xlm_roberta_gbsqa_classification/revised_dataset_large_model/pretrained_tv")
).to('cuda').eval()
def text_encode(total_input, total_labels):
    """Encode every example with ``pretrained_bert`` and collect the hidden states.

    Args:
        total_input: mapping with 'input_ids' and 'attention_mask'; each entry
            is indexed per example and reshaped to a batch of one.
        total_labels: one label per example; its length sets the number of
            examples encoded, and it is returned unchanged.

    Returns:
        dict with 'encode_list' (a list holding one CPU ``last_hidden_state``
        tensor per example) and 'labels' (``total_labels``).
    """
    # Send inputs to whichever device the encoder lives on (it is moved to 'cuda' at load time).
    device = next(pretrained_bert.parameters()).device
    encoding_list = []
    # Inference only: without no_grad every forward pass would build an autograd graph.
    with torch.no_grad():
        for idx in range(len(total_labels)):
            input_ids = total_input['input_ids'][idx].view(1, -1).to(device)
            attention_mask = total_input['attention_mask'][idx].view(1, -1).to(device)
            hidden = pretrained_bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
            encoding_list.append(hidden.to('cpu').detach())
    return {'encode_list':encoding_list, 'labels':total_labels}

# Generate and cache the encoder outputs for every split.
for split in ['train', 'valid', 'test']:
    print(f'encoding {split}')
    total_input = {'input_ids':[], 'attention_mask':[]}
    total_labels = []
    for item in tokenized_classification[split]:
        total_input['input_ids'].append(item['input_ids'])
        total_input['attention_mask'].append(item['attention_mask'])
        total_labels.append(item['labels'])

    total_input['input_ids'] = torch.stack(total_input['input_ids'], dim=0)
    total_input['attention_mask'] = torch.stack(total_input['attention_mask'], dim=0)
    total_labels = torch.stack(total_labels, dim=0)
    to_save_pkl = text_encode(total_input, total_labels)

    encode_path = f'./dataset/GBSQA_encode/{split}/xlm-roberta-large-revised-{split}-encode3.pkl'
    # Create the per-split directory if it is missing instead of failing on open().
    os.makedirs(os.path.dirname(encode_path), exist_ok=True)
    # The context manager closes the file handle, which the bare open() call never did.
    with open(encode_path, "wb") as encode_file:
        pickle.dump(to_save_pkl, encode_file)

## Classifier Learning

In [5]:

class NL_classifier(torch.nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) over flattened encoder outputs.

    Args:
        input_dim: size of each flattened input vector. The default, 512 * 1024,
            is the value that was previously hard-coded (presumably 512 tokens
            times a hidden size of 1024 -- confirm against the encoder).
        hidden_dim: width of the hidden layer.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim=512 * 1024, hidden_dim=512, num_classes=2):
        super().__init__()
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return the raw class logits for a batch of flattened inputs."""
        return self.classifier(x)
class Encode_dataset(Dataset):
    """Pairs inputs ``x`` with targets ``y`` position by position."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        sample = self.x[index]
        target = self.y[index]
        return sample, target
    
aim_session = Run(experiment='simple_2_layer_classifier')

classifier = NL_classifier()
epoch = 200
learning_rate = 1e-4
optimizer = torch.optim.AdamW(classifier.parameters(), lr=learning_rate)
target_splits = ['train']

# Gather the cached encodings and labels of every requested split, then stack once.
encoded_parts, label_parts = [], []
for split in target_splits:
    # NOTE: pickle can execute arbitrary code on load; only read files written by the encoding cell.
    # The context manager closes the handle, which the bare open() call never did.
    with open(f'./dataset/GBSQA_encode/{split}/xlm-roberta-large-revised-{split}-encode3.pkl', "rb") as encode_file:
        test_encode_load = pickle.load(encode_file)
    encoded_parts.extend(test_encode_load['encode_list'])
    label_parts.extend(test_encode_load['labels'])
class_x = torch.stack(encoded_parts, dim=0)
class_y = torch.stack(label_parts, dim=0)
# Flatten each example to a single vector for the MLP.
class_x = class_x.view(class_x.shape[0], -1)

objective_fn = torch.nn.CrossEntropyLoss()
classifier = classifier.to('cuda')
dset = Encode_dataset(class_x, class_y)
dloader = DataLoader(dset, batch_size=16, shuffle=True)
steps = 0
for ep in range(epoch):
    for x, y in dloader:
        steps += 1
        x = x.to('cuda')
        y = y.to('cuda')
        logits = classifier(x)
        loss = objective_fn(logits, y)
        # Track the per-step training loss in Aim.
        aim_session.track(loss.item(), 'train_loss', steps)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# Saves the whole module object; the test cell loads it back with torch.load.
torch.save(classifier, './gbsqa_revised_classifier3.pt')

## Classification Test

In [7]:
class NL_classifier(torch.nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) over flattened encoder outputs.

    Defined again in this cell; the ``torch.load`` call below restores a
    whole saved module of this class.

    Args:
        input_dim: size of each flattened input vector. The default, 512 * 1024,
            is the value that was previously hard-coded (presumably 512 tokens
            times a hidden size of 1024 -- confirm against the encoder).
        hidden_dim: width of the hidden layer.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim=512 * 1024, hidden_dim=512, num_classes=2):
        super().__init__()
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return the raw class logits for a batch of flattened inputs."""
        return self.classifier(x)

# The classifier was saved as a whole module from the GPU. map_location='cpu' loads it
# straight onto the CPU, so this cell does not need a CUDA device.
classifier = torch.load('gbsqa_revised_classifier3.pt', map_location='cpu')
target_splits = ['test']

# Gather the cached encodings and labels of every requested split, then stack once.
encoded_parts, label_parts = [], []
for split in target_splits:
    # NOTE: pickle can execute arbitrary code on load; only read files written by the encoding cell.
    # The context manager closes the handle, which the bare open() call never did.
    with open(f'./dataset/GBSQA_encode/{split}/xlm-roberta-large-revised-{split}-encode3.pkl', "rb") as encode_file:
        test_encode_load = pickle.load(encode_file)
    encoded_parts.extend(test_encode_load['encode_list'])
    label_parts.extend(test_encode_load['labels'])
class_x = torch.stack(encoded_parts, dim=0)
class_y = torch.stack(label_parts, dim=0)

# Flatten each example to a single vector for the MLP.
class_x = class_x.view(class_x.shape[0], -1)
classifier = classifier.to('cpu')

classifier.eval()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    out = classifier(class_x)

prediction = np.argmax(out.detach().numpy(), axis=1)


In [8]:

# Metric objects from the `evaluate` library, used by compute_metrics below.
acc_metric, recall_metric, precision_metric = (
    evaluate.load(metric_name) for metric_name in ('accuracy', 'recall', 'precision')
)

def compute_metrics(eval_pred):
    """Compute accuracy, recall and precision from a ``(logits, labels)`` pair.

    Predictions are the arg-max over the last axis of ``logits``. Each metric's
    own result is stored under its name in the returned dict.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': acc_metric.compute(predictions=predictions, references=labels),
        'recall': recall_metric.compute(predictions=predictions.tolist(), references=labels.tolist()),
        'precision': precision_metric.compute(predictions=predictions, references=labels),
    }

# Score the logits produced in the previous cell (`out`) against the loaded labels (`class_y`).
print(compute_metrics((out.detach(), class_y.detach())))

{'accuracy': {'accuracy': 0.85}, 'recall': {'recall': 1.0}, 'precision': {'precision': 0.8333333333333334}}
