# *Data load*

In [1]:
import json
import config
import pandas as pd
from pprint import pprint
from pshmodule.utils import filemanager as fm

In [2]:
print("data load")

data = []
with open(config.train_json, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.rstrip('\n|\r')))
df = pd.DataFrame(data)

data load


In [3]:
df.head()

Unnamed: 0,content,room_no,speaker
0,이제 위드 코로나 하면 회식 시작하겠구만,0,1
1,11월달부터 위드 코로나 하려나 보던데요,0,2
2,그러게 슬슬 준비하고 있네,0,1
3,회식 좋아하는 사람들에게 희소식이겠네요,0,2
4,그러게 신입 사원들은 이제 큰일이네,0,1


# *Definition*

##### 사전 크기, 사용자 정의 토큰 정의

In [4]:
vocab_size = 24000

In [5]:
user_defined_symbols = ["<pad>", "<unk>", "<cls>", "<sep>", "<mask>", "<bos>", "<eos>", "<tsep>", "<name>", "<url>"]

In [6]:
unused_token_num = 100
unused_list = [f"<unused{i}>" for i in range(unused_token_num)]
user_defined_symbols += unused_list

In [7]:
pprint(user_defined_symbols)

['<pad>',
 '<unk>',
 '<cls>',
 '<sep>',
 '<mask>',
 '<bos>',
 '<eos>',
 '<tsep>',
 '<name>',
 '<url>',
 '<unused0>',
 '<unused1>',
 '<unused2>',
 '<unused3>',
 '<unused4>',
 '<unused5>',
 '<unused6>',
 '<unused7>',
 '<unused8>',
 '<unused9>',
 '<unused10>',
 '<unused11>',
 '<unused12>',
 '<unused13>',
 '<unused14>',
 '<unused15>',
 '<unused16>',
 '<unused17>',
 '<unused18>',
 '<unused19>',
 '<unused20>',
 '<unused21>',
 '<unused22>',
 '<unused23>',
 '<unused24>',
 '<unused25>',
 '<unused26>',
 '<unused27>',
 '<unused28>',
 '<unused29>',
 '<unused30>',
 '<unused31>',
 '<unused32>',
 '<unused33>',
 '<unused34>',
 '<unused35>',
 '<unused36>',
 '<unused37>',
 '<unused38>',
 '<unused39>',
 '<unused40>',
 '<unused41>',
 '<unused42>',
 '<unused43>',
 '<unused44>',
 '<unused45>',
 '<unused46>',
 '<unused47>',
 '<unused48>',
 '<unused49>',
 '<unused50>',
 '<unused51>',
 '<unused52>',
 '<unused53>',
 '<unused54>',
 '<unused55>',
 '<unused56>',
 '<unused57>',
 '<unused58>',
 '<unused59>',
 '<unus

# *Tokenizer Train*

##### Huggingface BPE Tokenizer

In [8]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
from transformers import GPT2TokenizerFast, AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [9]:
tokenizer = Tokenizer(models.BPE())
print(f"tokenizer : {tokenizer}")

tokenizer : <tokenizers.Tokenizer object at 0x55bc44353040>


In [10]:
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Metaspace()])
tokenizer.decoders = decoders.Metaspace()

In [11]:
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size, 
    show_progress=True,
    special_tokens=user_defined_symbols,
)

In [12]:
def gen():
    for row in df.content:
        yield row

In [13]:
tokenizer.train_from_iterator(gen(), trainer=trainer)

In [14]:
tokenizer.model.save(config.tokenizer_path+"temp")

['/home/jovyan/shpark-datashare/generation_model/GPTNeoX/outputs/tokenizer/temp/vocab.json',
 '/home/jovyan/shpark-datashare/generation_model/GPTNeoX/outputs/tokenizer/temp/merges.txt']

##### test

In [15]:
output = tokenizer.encode("본 고안은 이러한 특성을 이용해 사용한다.")
print(output.ids)

[3099, 2887, 19238, 4120, 2643, 3344, 7225, 8284, 4001, 20697]


In [16]:
tokenizer.decode(output.ids)

'▁본 ▁고 안은 ▁이러 한 ▁특 성을 ▁이용해 ▁사용 한다.'

In [17]:
tokenizer.decoder = decoders.BPEDecoder(suffix='_')
tokenizer.decode(output.ids)

'▁본▁고안은▁이러한▁특성을▁이용해▁사용한다.'

# *Save according to form*

In [18]:
tokenizer_for_load = GPT2TokenizerFast.from_pretrained(config.tokenizer_path+"temp")  # 로드

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


##### Definition

In [19]:
tokenizer_for_load.pad_token = "<pad>"
tokenizer_for_load.unk_token = "<unk>"
tokenizer_for_load.cls_token = "<cls>"
tokenizer_for_load.sep_token = "<sep>"
tokenizer_for_load.mask_token = "<mask>"
tokenizer_for_load.bos_token = "<bos>"
tokenizer_for_load.eos_token = "<eos>"

special_tokens_dict = {'additional_special_tokens': user_defined_symbols}
tokenizer_for_load.add_special_tokens(special_tokens_dict)

0

In [20]:
tokenizer_for_load.save_pretrained(config.tokenizer_path, legacy_format=False)

('/home/jovyan/shpark-datashare/generation_model/GPTNeoX/outputs/tokenizer/tokenizer_config.json',
 '/home/jovyan/shpark-datashare/generation_model/GPTNeoX/outputs/tokenizer/special_tokens_map.json',
 '/home/jovyan/shpark-datashare/generation_model/GPTNeoX/outputs/tokenizer/tokenizer.json')

in `tokenizer.json`  
```json
    "normalizer": {
        "type": "Sequence",
        "normalizers": [
            {
                "type": "NFKC"
            },
            {
                "type": "BertNormalizer",
                "clean_text": false,
                "handle_chinese_chars": false,
                "strip_accents": false,
                "lowercase": false
            }
        ]
    },
    "pre_tokenizer": {
        "type": "Sequence",
        "pretokenizers": [
            {
                "type": "Metaspace",
                "replacement": "▁",
                "add_prefix_space": true
            }
        ]
    },
    "post_processor": null,
    "decoder": {
        "type": "Metaspace",
        "replacement": "▁",
        "add_prefix_space": true
    },
```

in `tokenizer_config.json`  
```json
,
    "model_type": "gpt2"
```

rename `tokenizer_config.json` => `config.json`

# *Test*

In [21]:
t = AutoTokenizer.from_pretrained(config.tokenizer_path)

In [22]:
e = t("본 고안은 이러한 특성을 이용해 사용한다.")
print(e)
print(t.decode(e['input_ids']))

{'input_ids': [3099, 2887, 19238, 4120, 2643, 3344, 7225, 8284, 4001, 20697], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
본 고안은 이러한 특성을 이용해 사용한다.
