<a href="https://colab.research.google.com/github/js-lee-AI/Abnormal-detection/blob/master/research/fine-tuning/temp-DACON_(Data_preprocessing%2C_fine_tuning).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import

In [1]:
!pip install --upgrade pip
!pip install transformers pytorch-lightning sentencepiece datasets
!pip install tqdm==4.43.0
!pip install tokenizers==0.10.3
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip3 install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
!pip3 install torchvision

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.4 MB/s 
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-1.3.8-py3-none-any.whl (813 kB)
[K     |████████████████████████████████| 813 kB 40.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 43.8 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.10.2-py3-none-any.whl (542 kB)
[K     |████████████████████████████████| 542 kB 45.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.3 MB/s 
Collecting sacremoses
  Down

In [18]:
import pandas as pd
import torch
import re
from torch.utils.data import DataLoader
# from datasets import Dataset, DatasetDict
from sklearn.metrics import log_loss, accuracy_score,f1_score

from pytorch_lightning import LightningDataModule

from transformers import AutoTokenizer, AutoModelForSequenceClassification 

import re
import json
import os
import tqdm
import argparse

In [3]:
from typing import List, Optional, Union, Tuple
from tokenizers import Tokenizer, decoders, pre_tokenizers, AddedToken
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
import torch


class BrainBertTokenizer(BaseTokenizer):

    def __init__(
        self,
        vocab: Union[str, List],
        merges: List[Tuple[str, str]],
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        sep_token: str = "</s>",
        cls_token: str = "<s>",
        pad_token: str = "<pad>",
        unk_token: str = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        normalize: bool = True,
    ):
        bpe = BPE(
            vocab=vocab,
            merges=merges,
            unk_token=unk_token,
            fuse_unk=True,
        )

        tokenizer = Tokenizer(bpe)

        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        if normalize:
            tokenizer.normalizer = NFKC()

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False)
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False)
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)

        self.add_special_tokens([
            bos_token,
            eos_token,
            sep_token,
            cls_token,
            unk_token,
            pad_token,
        ])

    @staticmethod
    def from_file(
        vocab_filename: str,
        merges_filename: Union[str, None],
        **kwargs,
    ):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return BrainBertTokenizer(vocab, merges, **kwargs)

    def __call__(
        self,
        text: str,
        return_tensors: bool = True,
        add_special_tokens: str = "pt",
    ) -> Union[List[int], torch.Tensor]:
        """
        encode text for brainbert.
        Args:
            text (str): input sentence
            return_tensors (str): whether convert list of int to `torch.Tensor` or not
            add_special_tokens (bool): whether add <s>, </s> to encoding or not
        Returns:
            (List[str]): list of token ids
            (torch.Tensor): tensor of token ids
        """

        if add_special_tokens:
            text = f"<s>{text}</s>"

        input_ids = self.encode(text).ids

        if return_tensors == "pt":
            input_ids = torch.tensor(input_ids).unsqueeze(0).long()

        return input_ids

# Preprocessing

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
class Train_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


class Test_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [27]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import XLMRobertaTokenizer, BertTokenizerFast
from sklearn.model_selection import train_test_split


class DataModule(LightningDataModule):

    # text_map = ['요약문_연구내용', '과제명']

    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
        # self.dataset=DatasetDict()
        self.train_dataset = []
        self.val_dataset  = []
        self.test_dataset = []

        self.text_map = ['과제명', '사업명', '내역사업명', '요약문_한글키워드', '요약문_기대효과', '요약문_연구목표', '요약문_연구내용']
        self.random_seed = 45


        # korean/multilingual tokenizers [kobert, brainbert, xlm-roberta, albert, .. etc]
        if hparams.language == 'ko' or hparams.language == 'multi':
          if self.hparams.model_or_tokenizer_name == 'skt/kobert-base-v1':
            self.tokenizer = KoBERTTokenizer.from_pretrained(self.hparams.model_or_tokenizer_name)
          elif self.hparams.model_or_tokenizer_name == 'hyunwoongko/brainbert-base-ko-kornli':
                download_brainbert_tokenizer.from_file(
                vocab_filenmae='brainbert.merges.txt', 
                merges_filename='brainbert.vocam.json'
                )

                self.tokenizer = BrainBertTokenizer.from_file(
                    vocab_filenmae='brainbert.merges.txt', 
                    merges_filename='brainbert.vocam.json'
                )
          elif self.hparams.model_or_tokenizer_name == 'xlm-roberta-large':
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(self.hparams.model_or_tokenizer_name)
          elif self.hparams.model_or_tokenizer_name == 'kykim/albert-kor-base':
            self.tokenizer = BertTokenizerFast.from_pretrained(self.hparams.model_or_tokenizer_name)

          else :
            self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_or_tokenizer_name)

        # english tokenizers [ ,.. etc]
        elif hparams.language == 'en':
          pass


    def preprocessing(self, pandas_data):
        all_col = pandas_data.columns.values.tolist()
        # drop other columns except 'label'
        for col in all_col :
          if not col in self.text_map and col != 'label':
            pandas_data = pandas_data.drop(col, axis=1)        
        
        # training data에서 짧은 순서 -> 긴 순서로 컬럼 위치 변경
        if 'label' in pandas_data.columns.values.tolist() :
          temp = pandas_data['label']
          pandas_data = pandas_data[self.text_map]
          pandas_data['label'] = temp

        # fillna and delete everything but korean
        for col in self.text_map:
            pandas_data[col].fillna('NAN', inplace=True)
            if self.hparams.language == 'ko' :
              pandas_data[col] = pandas_data[col].apply(lambda x:re.sub("[^가-힣ㄱ-하-ㅣ]", " ",x))
        #if 'label' in pandas_data.columns:
            # for t5, label is feeded by str
            #pandas_data['label'] = pandas_data['label'].apply(str)
        return pandas_data

    def prepare_data(self, train_or_test):
        """
        train_or_test : one of [train], [test], [train test]
        이쪽부분에서 data agument 할 수 있을듯.
        """
        if 'train' not in train_or_test and 'test' not in train_or_test:
            raise Exception('please choose one of [train], [test], [train test]')
        
        # prepare train or test data
        if 'train' in train_or_test:
            pandas_data = pd.read_csv(self.hparams.data_path  + '/train.csv',index_col=0)
            pandas_data = self.preprocessing(pandas_data)

            # train과 test의 label 분포를 일정하게 만들기 위해서 sklearn의 train_test_split사용   
            train, valid, y_train, y_valid = train_test_split(
                                                      pandas_data[self.text_map], 
                                                      pandas_data['label'], 
                                                      test_size=self.hparams.train_test_split, 
                                                      stratify= pandas_data['label'],
                                                      random_state=self.random_seed
                                                      )
            # train['label'] = y_train
            # valid['label'] = y_valid
            
            # self.dataset['train'] = Dataset.from_pandas(train)
            # self.dataset['valid'] = Dataset.from_pandas(valid)


            
            self.train_dataset = Train_Dataset(train, y_train)
            self.val_dataset = Train_Dataset(valid, y_valid)

          
            # 인섭님 코드
            # train_valid = Dataset.from_pandas(pandas_data).train_test_split(self.hparams.train_test_split, seed=self.random_seed)
            # self.dataset['train'] = train_valid['train']
            # self.dataset['valid'] = train_valid['test']

        if 'test' in train_or_test:
            pandas_data = pd.read_csv(self.hparams.data_path + '/test.csv',index_col=0)
            pandas_data = self.preprocessing(pandas_data)
            # self.dataset['test'] = Dataset.from_pandas(pandas_data)
            self.test_dataset = Test_Dataset(pandas_data)
        
        
        
    def setup(self, stage: str):
        # convert data and tokenize
        # for split in self.dataset.keys():
        #     self.dataset[split] = self.dataset[split].map(
        #             self.convert_to_features,
        #             batched=True,
        #             remove_columns = self.dataset[split].column_names,
        #         )
            # self.dataset[split].set_format(type="torch", columns=self.dataset[split].column_names)
        for dataset in [self.train_dataset, self.val_dataset, self.test_dataset] :
          for split in dataset[0].keys():
              dataset[split] = convert_to_features(dataset[split])
            # self.dataset[split].set_format(type="torch", columns=self.dataset[split].column_names)


    def convert_to_features(self, example_batch, indices=None):
        """
        example_batch : dict(list)
        test_map = ['color','smell','fruit']
        test_example_batch = {'fruit':['apple', 'banana'],'color':['red','yellow'],'smell':['bad','good']}
        texts_or_text_pairs = [' '.join(i) for i in list(zip(*[test_example_batch[i] for i in test_map]))]
        print(texts_or_text_pairs) #['red bad apple', 'yellow good banana']
        """
        '''
        # 과제명과 요약문_연구내용을 붙여서 토크나이저에 넣음. self.text_map 설정으로 순서, 추가피쳐 넣을 수 있음.
        concatted_texts = [' '.join(i) for i in list(zip(*[example_batch[i] for i in self.text_map]))]
        # 토크나이저는 input_ids랑 attentions_mask를 줌. 여기서는 기본배치사이즈 1000일거임
        features = self.tokenizer.batch_encode_plus(
                concatted_texts, 
                max_length=self.hparams.max_seq_length, 
                padding='max_length',
                truncation=True
            )
        print(concatted_texts[0])
        

        # features = self.tokenizer(
        #     concatted_texts,
        #     padding='max_length',
        #     truncation=True,
        #     max_length=self.max_length,
        #     return_tensors='pt'
        # )


        if example_batch.get('label') is not None:
            features['labels'] = example_batch.get('label')
        
        return features
        '''
        pass

In [28]:
data_path = "/content/drive/MyDrive/dataset/open"
tokenizer_or_model_path = "skt/kobert-base-v1"

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", required=False, type=str, default=data_path)
parser.add_argument("--model_or_tokenizer_name", required=False, type=str, default=tokenizer_or_model_path)
parser.add_argument("--max_seq_length", required=False, type=int, default=512)
parser.add_argument("--train_test_split", required=False, type=float, default=0.1)
parser.add_argument("-f", "--file", required=False)
# language of dataset or model
parser.add_argument("--language", required=False, type=str, default='ko') # [ko, multi, en]
args = parser.parse_args()



dm = DataModule(args)
dm.prepare_data(['train','test'])
dm.setup('fit')
dm.dataset

loading file https://huggingface.co/skt/kobert-base-v1/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/6920ce54223b52af14e36b32047ced34c47ec88ac51f45ce0141aaa1054e3263.7eed87d19282a93a2d45e130f20b4d8e831cbf8e957f1476628fd4ab99ae977f
loading file https://huggingface.co/skt/kobert-base-v1/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/skt/kobert-base-v1/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/2ad28172340bc816ccd4ffc7a51682e0c5c89a88a0d618ab40eeb81a3980b356.3db0799720217f7da35e92d033f167ac40c8d2c02fa035130b7bb070f6355074
loading file https://huggingface.co/skt/kobert-base-v1/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ebd8df8703bef77849f188d10c4ca994fd45a1b41518401a08c13487ef55c723.55c5c51d9ae1a9f730238c32bd0fa05b12cd7d99d757ee0e6accc4c6e4085f40
loading file https://huggingface.co/skt/kobert-base-v1/resolve/main/tokenizer.json fro

TypeError: ignored

In [6]:
dm.dataset['train']['input_ids'][0] # original

tensor([   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [11]:
dm.tokenizer.decode(dm.dataset['train']['input_ids'][0])

'[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][CLS] 프로바이오틱스 조성물과 발효산물을 이용한 코코아 제품 및 과자류 개발 지역특화산업육성 지역주력산업육성 프로바이오틱스 유산균 발효산물 코코아 제품 과자류 제품 상용화로 전후방산업 동반성장 및 경제 활성화와 원천기술 확보 및 기술혁신형 기업 육성으로 인한 산업 고도화를 야기한다 건강 유지에 유용한 장내 세균에 대한 영양적 효과와 질병의 예방 및 치료에 미치는 효과로 인해 일

In [40]:

# replace '/' with '-' to save
tokenizer_or_model_path = "skt/kobert-base-v1"
drive_path = tokenizer_or_model_path.replace('/', '-')

path = "/content/drive/MyDrive/dataset/open/tokenized_dataset/"
os.mkdir(path + drive_path)

# save tokenized dataset (앞으로 데이터 여러 버전이 생길텐데, 데이터 버전도 경로에 추가하면 안헷갈릴듯?)
dm.dataset.save_to_disk("/content/drive/MyDrive/dataset/open/tokenized_dataset/" +drive_path)


## 여기서 부터는 load 임. 

# load tokenized dataset    (앞으로 데이터 여러 버전이 생길텐데, 데이터 버전도 경로에 추가하면 안헷갈릴듯?)
from datasets import load_from_disk, load_from_disk
train_dataset = load_from_disk(path + drive_path + "/train")
valid_dataset = load_from_disk(path + drive_path + "/valid")
test_dataset = load_from_disk(path + drive_path + "/test")

# make DatasetDict from loaded tokenized dataset
dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['valid'] = valid_dataset
dataset['test'] = test_dataset

In [41]:
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 156873
    })
    valid: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 17431
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 43576
    })
})

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 156873
    })
    valid: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 17431
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 43576
    })
})

In [None]:
len(train_pd)

174304

In [None]:
train_pd = preprocessing(train_pd)

train_pd.head(5)

Unnamed: 0_level_0,과제명,사업명,내역사업명,요약문_한글키워드,요약문_기대효과,요약문_연구목표,요약문_연구내용,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,농업기초기반연구,농산물안전성연구,뉴클레오티드 염기서열 분자마커 종 동정 침샘 전사체,새로운 돌발 및 외래해충의 신속 정확한 동정법 향상 돌발 및 외래해충의...,새로운 해충분류군의 동정기술 개발 및 유입확산 추적,가 외래 및 돌발해충의 발생조사 및 종 동정 대상해충 최근 새...,24
1,대장암의 내성 표적 인자 발굴 및 반응 예측 유전자 지도 구축...,이공학학술연구기반구축,지역대학우수과학자지원사업 년 년,대장암 항암제 내성 세포사멸 유전자발굴,내성 특이적 표적분자를 발굴하고 이를 이용한 효과 증진...,최종목표 감수성 표적 유전자를 발굴하고 내성제어 기전을 연구 발굴된...,차년도 를 통한 선천적 내성 표적 후보 유전자 ...,0
2,비목질계 셀룰로오스 식물자원을 활용한 기능성 부직포 및 고부가가치 뷰티케어 ...,중소기업기술혁신개발,혁신기업기술개발,기능성 샐룰로오스 파이버 천연섬유 기능성 부직포 뷰티 케어 제품 미용 솜,국내 독자적인 비목질계 셀룰로오스 자원의 파이버 및 부직포 제조 등의 기술 확보...,식물계자원 정련 및 최적 신서란 파이버 기초연구 개발 소비자 및 바이어들...,식물계자원 정련 및 최적 신서란 파이버 기초연구 개발 - ...,0
3,소화기 암 진단용 분자영상 형광프로브 개발,창업성장기술개발,창업사업화연계과제,분자 진단 형광 조영제 프로브 항체 대장암,암 진단기술의 차별성 소화기 암 특이 프로브 개발 - 최근 체외진단시장은 ...,암특이적 바이오마커 발굴 및 바이오마커에 대한 프로브 개발 소화기 암...,소화기 암 진단용 분자영상 형광프로브 개발 - 국소 도포형 소화기 암 분자 ...,0
4,위암환자의 항암제반응예측을 위한 발현검사,이공학개인기초연구지원,기본연구지원,제자리부합법 조직미세배열 마이크로 위암 항암제반응 젊은 연령 가족성 위암,-본 연구는 파라핀보관조직에서 로 및...,수술이 불가능한 위암환자는 생존기간은 개월 안팎에 지나지 않고 항암화학요법에 ...,- 검사의 정확성을 확인하기 위해 위암세포주 ...,0


Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-cvv4zxkb/kobert-tokenizer_772b1e5013ff488b903264c5733518f9
  Running command git clone -q https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-cvv4zxkb/kobert-tokenizer_772b1e5013ff488b903264c5733518f9
Building wheels for collected packages: kobert-tokenizer
  Building wheel for kobert-tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert-tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4627 sha256=3e05a2cf728aa2a967c5bc5ccabddfdec94ddf79d9513f4736da9e41fbe8efd7
  Stored in directory: /tmp/pip-ephem-wheel-cache-qpypxork/wheels/10/b4/d9/cb627bbfaefa266657b0b4e8127f7bf96d27376fa1a23897b4
Successfully built kobert-tokenizer
Installing collected packages: kobert-tokenizer
Successfully installed kobert-tokenizer-0.1


# Model

In [None]:
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import MT5EncoderModel
from transformers.modeling_outputs import SequenceClassifierOutput

class Mt5ForSequenceClassification(MT5EncoderModel):
    """
    # modify from DistilBertForSequenceClassification
    from transformers import MT5Config

    tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")

    config = MT5Config.from_pretrained("google/mt5-small",
                                    vocab_size=tokenizer.vocab_size,
                                    num_labels=22,
                                    pad_token_id=tokenizer.pad_token_id)
    model = Mt5ForSequenceClassification(config)

    article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    inputs = tokenizer(article, return_tensors="pt")
    labels = torch.tensor([1]).unsqueeze(0)
    outputs = model(**inputs)
    print(outputs.keys()) # odict_keys(['logits'])
    outputs = model(**inputs, labels=labels)
    print(outputs.keys()) # odict_keys(['loss', 'logits'])"""
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        config.update({"dim":config.d_model,
                       "seq_classif_dropout":0.1})
        #help(config.update)
        self.config = config
        #print(config)

        #self.distilbert = DistilBertModel(config)
        self.model = MT5EncoderModel(config)
        
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        model_output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_state = model_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + model_output[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=model_output.hidden_states,
            attentions=model_output.attentions,
        )

In [28]:
def mkdirForSave(model_name, path, dataset_number):
  # replace '/' with '-' to save
  drive_path = model_name.replace('/', '-')
  
  new_dir = path + drive_path + '/DataVersion-' + str(dataset_number)
  if not os.path.exists(new_dir):
    os.makedirs(new_dir)
    print("your '{}' folder has been successfully created.".format(new_dir))
  else:
    print("'{}' folder already exists.".format(new_dir))
 

In [30]:
mkdirForSave(
    model_name=tokenizer_or_model_path, 
    path="/content/drive/MyDrive/dataset/open/models/",
    dataset_number=1)

your '/content/drive/MyDrive/dataset/open/models/skt-kobert-base-v1/DataVersion-1' folder has been successfully created.


In [8]:
from transformers import BertForSequenceClassification, BertModel
from transformers import TrainingArguments, Trainer


if __name__ == '__main__':
  
  device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

  config= BertModel.from_pretrained(
      args.model_or_tokenizer_name,
      vocab_size=dm.tokenizer.vocab_size,
      num_labels=22,
      pad_token_id=dm.tokenizer.pad_token_id
  )

  model = BertForSequenceClassification.from_pretrained(
      'skt/kobert-base-v1', 
      pad_token_id = tokenizer.pad_token_id,
      num_labels=22)


  # training
  # fine-tuning
  
  training_args = TrainingArguments(
    output_dir="./", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    # per_device_train_batch_size=32, # batch size for training
    # per_device_eval_batch_size=64,  # batch size for evaluation
    # eval_steps = 400, # Number of update steps between two evaluations.
    # save_steps=800, # after # steps model is saved
    # warmup_steps=500,# number of warmup steps for learning rate scheduler)
  )





    # config = MT5Config.from_pretrained("google/mt5-small",
    #                                 vocab_size=tokenizer.vocab_size,
    #                                 num_labels=22,
    #                                 pad_token_id=tokenizer.pad_token_id)

  trainer = Trainer(
      model=model,
      args=training_args,
      num_labels = 22,
      train_dataset=dm.dataset['train'],
      eval_dataset=dm.dataset['valid'],
  )

  trainer.train()
  trainer.evaluate()
  trainer.save_model('./')


loading configuration file https://huggingface.co/skt/kobert-base-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/b8898dabd49ed32401ee6a6bc5eb011f12728750b44d08b151acf270bf1732ca.1007ab583c49854e3c65c61288a980ae4d25a4bbfa51b51915ec1772f02f992d
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",

IndexError: ignored

In [None]:
# classifier = pipeline('sentiment-analysis', model=)

# model_name = 
# pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained('kykim/bert-kor-base')

inputs = tokenizer(" , , , 인공지능 , AI",
                   padding='max_length',
                   truncation=True,
                   max_length=20)

# print(tokenizer.decode(inputs['input_ids']))
# print(inputs)
for ids in inputs['input_ids'] :
  print(tokenizer.decode(ids), end='\n')

[CLS]
,
,
,
인공지능
,
ai
[SEP]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
[PAD]
