### Loding dataset

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
eli5 = load_dataset("eli5", split="train_asks[:5000]")

Found cached dataset eli5 (/home/lklimkiewicz/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [7]:
# nice
eli5 = eli5.train_test_split(test_size=0.2)

In [8]:
eli5['train'][0]

{'q_id': 'lloin',
 'title': 'How would one minimize turbulence in a hookah hose?',
 'selftext': "If any of the folks of Askscience have smoked hookah before, you may have found that you get the best hit by pulling nice and slowly. It would appear that the quality of the hit would depend on how turbulent the flow is in the hookah. We can probably assume roughly constant density of smoke plus air in the hose, but I don't know the actual smoke concentration in the hose. How would someone go about finding out exactly how hard to pull so as to maximize flow rate while minimizing turbulence? \n\n(Don't be afraid to bring on the maths.)",
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['c2tq9cn'],
  'text': ['I think [this](_URL_0_)(pdf) would be a great help to you.\nIf you know the fluid density and dynamic viscosity as well as the pipe diameter etc, you can determine the reynolds number which will tell you whether the flow will be prrimarily laminar or turbulent. Working

### Tokenizing

In [11]:
# nice!
eli5 = eli5.flatten()

In [12]:
eli5["train"][0]

{'q_id': 'lloin',
 'title': 'How would one minimize turbulence in a hookah hose?',
 'selftext': "If any of the folks of Askscience have smoked hookah before, you may have found that you get the best hit by pulling nice and slowly. It would appear that the quality of the hit would depend on how turbulent the flow is in the hookah. We can probably assume roughly constant density of smoke plus air in the hose, but I don't know the actual smoke concentration in the hose. How would someone go about finding out exactly how hard to pull so as to maximize flow rate while minimizing turbulence? \n\n(Don't be afraid to bring on the maths.)",
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c2tq9cn'],
 'answers.text': ['I think [this](_URL_0_)(pdf) would be a great help to you.\nIf you know the fluid density and dynamic viscosity as well as the pipe diameter etc, you can determine the reynolds number which will tell you whether the flow will be prrimarily laminar or turbulent. Work

In [None]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [15]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [None]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=eli5["train"].column_names,
)

### Splitting to fit context window

In [20]:
block_size = 128

In [19]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [21]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
lm_dataset['train'].column_names

['input_ids', 'attention_mask', 'labels']

In [25]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Model

In [26]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [None]:
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [36]:
training_args = TrainingArguments(
    output_dir="logs",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1
)

In [37]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [39]:
trainer.train()

  0%|          | 0/1128 [00:00<?, ?it/s]

{'loss': 3.6864, 'learning_rate': 0.0, 'epoch': 0.44}
{'loss': 3.7018, 'learning_rate': 0.0, 'epoch': 0.89}


  0%|          | 0/282 [00:00<?, ?it/s]

{'eval_loss': 3.717766046524048, 'eval_runtime': 9.9532, 'eval_samples_per_second': 226.561, 'eval_steps_per_second': 28.333, 'epoch': 1.0}
{'train_runtime': 141.1638, 'train_samples_per_second': 63.897, 'train_steps_per_second': 7.991, 'train_loss': 3.696293391234486, 'epoch': 1.0}


TrainOutput(global_step=1128, training_loss=3.696293391234486, metrics={'train_runtime': 141.1638, 'train_samples_per_second': 63.897, 'train_steps_per_second': 7.991, 'train_loss': 3.696293391234486, 'epoch': 1.0})

## Evaluate

In [31]:
# get "perplexity"
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/282 [00:00<?, ?it/s]

Perplexity: 41.17


## Inferrence with pipeline

In [40]:
from transformers import pipeline

In [43]:
generator = pipeline("text-generation", model="logs/checkpoint-1000")

In [44]:
generator("Somatic hypermutation allows the immune system to")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Somatic hypermutation allows the immune system to do a better job of protecting itself against viruses from exposure).  I think its definitely [misunderstood and interpreted](_URL_2_). \n\nIt actually has some important advantages,'}]

## Manual inferrence

In [45]:
prompt = "Somatic hypermutation allows the immune system to"

In [48]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("logs/checkpoint-1000")
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [49]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("logs/checkpoint-1000")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [50]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["Somatic hypermutation allows the immune system to respond to pathogens and viruses and make it more capable of detecting or detecting pathogens.The body's own immune system also has a lot to work with in the same way. It's often the body's own immune system, which is actually involved in producing most of the body's own immune responses, so you wouldn't be able to be immune to any diseases in your body's immune system. \n\nAlso, we're not really aware of any particular immune response, just to be clear: there"]