In [1]:
# Fine tuning based on https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py

import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)


In [2]:
# Logger named after the current module.
logger = logging.getLogger(__name__)


# Keys of MODEL_WITH_LM_HEAD_MAPPING, and the `model_type` attribute read
# from each of those keys.
MODEL_CONFIG_CLASSES = [config_cls for config_cls in MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple([config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES])

In [11]:
def get_dataset(
    data_path,
    tokenizer: "PreTrainedTokenizer",
    line_by_line: bool = True,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
    overwrite_cache: bool = False,
    block_size: Optional[int] = None,
):
    """Build a language-modeling dataset from a plain-text file.

    Args:
        data_path: Path of the text file to load.
        tokenizer: Tokenizer passed through to the dataset class. Its
            ``model_max_length`` is the default block size.
        line_by_line: When true, return a ``LineByLineTextDataset``;
            otherwise return a ``TextDataset``.
        evaluate: Currently unused. Kept so existing callers that pass it
            keep working.
        cache_dir: Forwarded to ``TextDataset`` only.
        overwrite_cache: Forwarded to ``TextDataset`` only.
        block_size: Block size handed to the dataset class. ``None`` (the
            default) means ``tokenizer.model_max_length``.

    Returns:
        The constructed ``LineByLineTextDataset`` or ``TextDataset``.
    """
    if block_size is None:
        # `model_max_length` is the attribute the rest of this notebook uses;
        # `max_len` is its deprecated alias and is missing from newer
        # transformers releases.
        block_size = tokenizer.model_max_length

    if line_by_line:
        # Input is a line-by-line text dataset.
        return LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=data_path,
            block_size=block_size,
        )

    return TextDataset(
        tokenizer=tokenizer,
        file_path=data_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
        cache_dir=cache_dir,
    )

In [12]:
# Set up: seed, model identifiers, and the pretrained config/tokenizer/model.
set_seed(0)
model_name = 'gpt2'        # identifier of the pretrained weights loaded below
model_path = 'gpt2-model'  # directory the training cell resumes from, if it exists
cache_dir = None
config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelWithLMHead.from_pretrained(
    model_name,
    # `from_tf` must describe the weights actually being loaded (model_name),
    # not the resume directory held in model_path.
    from_tf=".ckpt" in model_name,
    config=config,
    cache_dir=cache_dir,
)
block_size = tokenizer.model_max_length



In [13]:
# Load the training and validation text files through get_dataset, with
# line_by_line disabled and the shared tokenizer / cache directory.
train_path = "../Data Processing Related/train.txt"
valid_path = "../Data Processing Related/valid.txt"

train_dataset = get_dataset(
    train_path,
    tokenizer=tokenizer,
    line_by_line=False,
    cache_dir=cache_dir,
)
eval_dataset = get_dataset(
    valid_path,
    tokenizer=tokenizer,
    line_by_line=False,
    evaluate=True,
    cache_dir=cache_dir,
)



In [14]:
# Batch collator with masked-LM switched off (mlm=False); mlm_probability
# is passed along at 0.15, the value the notebook labels the MLM default.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,
)

In [15]:
# Wire the model, both datasets and the collator into a Trainer.
# No TrainingArguments are passed here.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)



In [None]:
# Train
# Resume from `model_path` only when it names an existing directory;
# otherwise pass None to trainer.train.
if model_path is None or not os.path.isdir(model_path):
    model_path = None
trainer.train(model_path=model_path)
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
# The output directory is read from the Trainer's own arguments: no
# `training_args` variable is defined anywhere in this notebook, so
# referencing it here raised NameError after training had finished.
if trainer.is_world_master():
    tokenizer.save_pretrained(trainer.args.output_dir)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=70.0, style=ProgressStyle(description_wid…

In [9]:
# Environment check: display the installed PyTorch version.
import torch
torch.__version__

'1.3.1'

In [None]:
# Evaluate