In [1]:
import os
from pathlib import Path

import transformers, datasets, torch
from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder for the dataset files: <current working directory>/my_model/datasets.
DATASET_DIR = Path(os.getcwd()).joinpath("my_model", "datasets")

In [3]:
# Build the split-name -> JSON-file mapping for the files stored under
# DATASET_DIR/processed_output, then load all three splits in one call.
processed_dir = DATASET_DIR / "processed_output"
data_files = {
    split: str(processed_dir / f"{split}.json")
    for split in ("train", "test", "val")
}
dataset = load_dataset("json", data_files=data_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    val: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [4]:
def create_full_article(row):
    """Combine a row's title and body into one labelled text field.

    Args:
        row: Mapping that provides 'title' and 'body' entries.

    Returns:
        A dict with the single key 'full_article' whose value is the title
        prefixed with "TITLE: " followed by the body prefixed with "BODY: ".
    """
    template = "TITLE: {title} \n\n BODY: {body}"
    combined = template.format(title=row['title'], body=row['body'])
    return {'full_article': combined}

# Run create_full_article over the loaded data so each row also carries the
# 'full_article' field that the function returns.
dataset = dataset.map(create_full_article)

In [5]:
# Sanity check: show the first training article. print() is used so the
# newlines inside the text are rendered instead of shown as escapes.
print(dataset['train'][0]['full_article'])

TITLE: INCO SEES NO MAJOR IMPACT FROM DOW REMOVAL 

 BODY: Inco Ltd said it did not expect its
earlier reported removal from the Dow Jones industrial index to
make a major impact on the company's stock.
    "We don't think that individuals or institutions buy our
shares because we were one of the Dow Jones industrials,"
spokesman Ken Cherney said in reply to a query.
    Inco closed 1-3/8 lower at 19-3/8 in second most active
trading on the Toronto Stock Exchange.
    The Wall Street Journal, which selects the index, said Inco
was dropped to make the index more representative of the
market. Inco, the non-Communist world's largest nickel
producer, was a member of the index since 1928.
    Replacing Inco and Owens-Illinois Inc will be Coca-Cola Co
and Boeing Co, effective tomorrow.
    Nickel analyst Ilmar Martens at Walwyn Stodgell Cochran
Murray Ltd said Inco's removal from the index would likely
spark short-term selling pressure on the stock.
    "Some investors who have Inco may sudd

## Training our own tokenizer

In [6]:
def _make_training_corpus(batch_size=1000):
    """Return a generator over the training articles, one batch at a time.

    Each item is the 'full_article' column of `batch_size` consecutive rows
    of dataset['train'] (the last batch may be shorter). The generator is
    lazy and can be consumed only once; call this function again to obtain
    a fresh one.
    """
    return (
        dataset['train'][start:start + batch_size]['full_article']
        for start in range(0, len(dataset['train']), batch_size)
    )


training_corpus = _make_training_corpus()
print(type(training_corpus))

<class 'generator'>


The code below retrains the tokenizer's vocabulary from scratch while keeping the tokenizer's architecture intact. Behind the scenes, the template of the old tokenizer is kept, but its vocabulary is discarded and a new one is learned from our training corpus.

In [7]:
# Load the pretrained GPT-2 tokenizer; it serves as the template for the new one.
# (AutoTokenizer is imported in the notebook's import cell.)
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Retrain (not fine-tune): learn a new 52,000-entry vocabulary from our corpus
# using the old tokenizer's configuration.
# NOTE: training_corpus is a generator and is exhausted by this call, so
# re-create it before re-running this cell.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=52_000)






In [10]:
# Pick one article from the test split (index 2) as the sample text used to
# compare the two tokenizers below.
example = dataset['test'][2]['full_article']
example

"TITLE: DOMINION <D> SAYS GROUP DROPS OPPOSITION TO PLAN \n\n BODY: Dominion Resources Inc said it\nhas reahced an understanding under which the National Coal\nAssociation would withdraw its opposition to the company's\npetition for exemptions from the Powerplant and Industrial Fuel\nUse Act of 1978.\n    The company needs the exemptions from the U.S. Department\nof Energy to allow it to build the combined cycle generating\nunits Chesterfield 7 and 8.  Chesterfield 7, a 210 megawatt\nunit planned for service in June 1990 at a cost of about 130\nmln dlrs, would initially burn natural gas. Companion unit\nChesterfield 8 would be built by June 1992.\n    Dominion said the coal group withdrew its opposition to the\nplants after Dominion said it would be willing to pursue\ninstallation of a coal gasifier for Chesterfield under proper\neconomic conditions, so that the units could burn coal gas.\n Reuter\n\x03"

In [11]:
# Baseline: tokens produced for the sample by the original GPT-2 tokenizer.
old_tokenizer.tokenize(example)

['TIT',
 'LE',
 ':',
 'ĠDOM',
 'IN',
 'ION',
 'Ġ<',
 'D',
 '>',
 'ĠSAY',
 'S',
 'ĠGROUP',
 'ĠDR',
 'OPS',
 'ĠO',
 'PP',
 'OS',
 'ITION',
 'ĠTO',
 'ĠPLAN',
 'Ġ',
 'ĊĊ',
 'ĠB',
 'ODY',
 ':',
 'ĠDominion',
 'ĠResources',
 'ĠInc',
 'Ġsaid',
 'Ġit',
 'Ċ',
 'has',
 'Ġre',
 'ah',
 'ced',
 'Ġan',
 'Ġunderstanding',
 'Ġunder',
 'Ġwhich',
 'Ġthe',
 'ĠNational',
 'ĠCoal',
 'Ċ',
 'Ass',
 'ociation',
 'Ġwould',
 'Ġwithdraw',
 'Ġits',
 'Ġopposition',
 'Ġto',
 'Ġthe',
 'Ġcompany',
 "'s",
 'Ċ',
 'pet',
 'ition',
 'Ġfor',
 'Ġexemptions',
 'Ġfrom',
 'Ġthe',
 'ĠPower',
 'plant',
 'Ġand',
 'ĠIndustrial',
 'ĠFuel',
 'Ċ',
 'Use',
 'ĠAct',
 'Ġof',
 'Ġ1978',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġcompany',
 'Ġneeds',
 'Ġthe',
 'Ġexemptions',
 'Ġfrom',
 'Ġthe',
 'ĠU',
 '.',
 'S',
 '.',
 'ĠDepartment',
 'Ċ',
 'of',
 'ĠEnergy',
 'Ġto',
 'Ġallow',
 'Ġit',
 'Ġto',
 'Ġbuild',
 'Ġthe',
 'Ġcombined',
 'Ġcycle',
 'Ġgenerating',
 'Ċ',
 'units',
 'ĠChester',
 'field',
 'Ġ7',
 'Ġand',
 'Ġ8',
 '.',
 'Ġ',
 'ĠChester',
 

In [12]:
# Same sample, tokenized with the tokenizer retrained on our corpus.
tokenizer.tokenize(example)

['TITLE',
 ':',
 'ĠDOMINION',
 'Ġ<',
 'D',
 '>',
 'ĠSAYS',
 'ĠGROUP',
 'ĠDROPS',
 'ĠOPPOSITION',
 'ĠTO',
 'ĠPLAN',
 'ĠĊĊ',
 'ĠBODY',
 ':',
 'ĠDominion',
 'ĠResources',
 'ĠInc',
 'Ġsaid',
 'Ġit',
 'Ċ',
 'has',
 'Ġre',
 'ah',
 'ced',
 'Ġan',
 'Ġunderstanding',
 'Ġunder',
 'Ġwhich',
 'Ġthe',
 'ĠNational',
 'ĠCoal',
 'Ċ',
 'Association',
 'Ġwould',
 'Ġwithdraw',
 'Ġits',
 'Ġopposition',
 'Ġto',
 'Ġthe',
 'Ġcompany',
 "'s",
 'Ċ',
 'petition',
 'Ġfor',
 'Ġexemptions',
 'Ġfrom',
 'Ġthe',
 'ĠPower',
 'plant',
 'Ġand',
 'ĠIndustrial',
 'ĠFuel',
 'Ċ',
 'U',
 'se',
 'ĠAct',
 'Ġof',
 'Ġ1978',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġneeds',
 'Ġthe',
 'Ġexemptions',
 'Ġfrom',
 'Ġthe',
 'ĠU',
 '.',
 'S',
 '.',
 'ĠDepartment',
 'Ċ',
 'of',
 'ĠEnergy',
 'Ġto',
 'Ġallow',
 'Ġit',
 'Ġto',
 'Ġbuild',
 'Ġthe',
 'Ġcombined',
 'Ġcycle',
 'Ġgenerating',
 'Ċ',
 'units',
 'ĠChester',
 'field',
 'Ġ7',
 'Ġand',
 'Ġ8',
 '.',
 'Ġ',
 'ĠChester',
 'field',
 'Ġ7',
 ',',
 'Ġa',
 'Ġ210',
 'Ġmegawatt',
 'Ċ',
 'unit',
 'Ġ