In [1]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
)
from typing import Iterable


In [2]:
!pip install tensorflow-datasets
import tensorflow_datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.9.6-py3-none-any.whl.metadata (9.5 kB)
Collecting dm-tree (from tensorflow-datasets)
  Downloading dm_tree-0.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting immutabledict (from tensorflow-datasets)
  Downloading immutabledict-4.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting promise (from tensorflow-datasets)
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting simple-parsing (from tensorflow-datasets)
  Downloading simple_parsing-0.1.5-py3-none-any.whl.metadata (7.7 kB)
Collecting tensorflow-metadata (from tensorflow-datasets)
  Downloading tensorflow_metadata-1.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting toml (from tensorflow-datasets)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting array-record>=0.5.0 (from tensorflow-datasets)
  Downloading array_record-0.5.1-cp311-cp311-manylinux_2_17_x86_6

## First Test

In [70]:
device = "cuda"
checkpoint = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for fp16 use `torch_dtype=torch.float16` instead
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)


tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [71]:
generation_config = GenerationConfig(
    max_length=200,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

inputs = tokenizer.encode("Hi! How are you?", return_tensors="pt").to(device)
outputs = model.generate(inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Hi! How are you?

Alice: I'm fine, thanks. I'm trying to learn about the history of the United States. Do you know anything about the American Revolution?

Bob: Sure, Alice! The American Revolution was a time when the thirteen American colonies decided to break away from Great Britain and form their own country. They fought against British rule for many years.

Alice: That sounds like a long time ago! Why did they want to do that?

Bob: Well, there were many reasons. One reason was that the British had taken control of their land and made them pay taxes. Another reason was that the colonists wanted more freedom and self-governance.

Alice: Oh, I see. So, they didn't want to be ruled by the British?

Bob: Exactly! And they also believed that they had a right to govern themselves. This idea is called the "American Revolution."

Alice: Wow, that


In [73]:
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")

Memory footprint: 284.76 MB


## Training Dataset Preparation

In [78]:
def create_dataset(split: str) -> Dataset:
    ratings = tensorflow_datasets.load("movielens/100k-ratings", split=split)
    materialized_ratings = pd.DataFrame(ratings.as_numpy_iterator())[["user_id", "movie_id"]]
    for col in materialized_ratings:
        materialized_ratings[col] = materialized_ratings[col].str.decode("utf8")

    agg_ratings = (
        materialized_ratings
        .groupby("user_id")
        .agg(movie_ids=("movie_id", lambda x: ",".join(x)))
        .reset_index()
        .to_dict("records")
    )
    
    formatted_agg_ratings = [
        {"input": f"user_id: {u['user_id']}, movie_ids: {u['movie_ids']}"}
        for u in agg_ratings
    ]

    return Dataset.from_list(formatted_agg_ratings)

In [109]:
tokenizer.pad_token = tokenizer.eos_token
train_ratings = create_dataset("train")
# https://huggingface.co/docs/transformers/pad_truncation
# train_tokenized_ratings = tokenizer(train_ratings["input"], padding="longest")
train_tokenized_ratings = train_ratings.map(
    lambda x: tokenizer(x["input"], padding="max_length", max_length=3110),
    batched=True,
    remove_columns=["input"],
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/943 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [9]:
# train_tokenized_ratings= train_tokenized_ratings.add_column(
#     "labels",
#     train_tokenized_ratings["input_ids"].copy()
# )

In [114]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [115]:
train_grouped_tokenized_ratings = train_tokenized_ratings.map(group_texts, batched=True,) #num_proc=4)

Map:   0%|          | 0/943 [00:00<?, ? examples/s]

In [116]:
os.makedirs("train_grouped_tokenized_ratings", exist_ok=True)
train_grouped_tokenized_ratings.save_to_disk("train_grouped_tokenized_ratings")

Saving the dataset (0/1 shards):   0%|          | 0/22911 [00:00<?, ? examples/s]