In [1]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
csv_path = 'data/book_genre_prediction.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...
4652,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ..."
4653,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...
4654,4654,Red Rising,fantasy,"""I live for the dream that my children will be..."
4655,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [3]:
def make_prompt(row):
    return f"Summarize the book {row['title']}."

def make_target(row):
    return row['summary'] if isinstance(row['summary'], str) else ''

In [4]:
df = df.dropna(subset=['title', 'genre', 'summary'])
df['prompt'] = df.apply(lambda row: make_prompt(row), axis=1)
df['target'] = df.apply(lambda row: make_target(row), axis=1)
df

Unnamed: 0,index,title,genre,summary,prompt,target
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,Summarize the book Drowned Wednesday.,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",Summarize the book The Lost Hero.,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,Summarize the book The Eyes of the Overworld.,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,Summarize the book Magic's Promise.,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,Summarize the book Taran Wanderer.,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...,...,...
4652,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",Summarize the book Hounded.,"Atticus O’Sullivan, last of the Druids, lives ..."
4653,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,Summarize the book Charlie and the Chocolate F...,Charlie Bucket's wonderful adventure begins wh...
4654,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",Summarize the book Red Rising.,"""I live for the dream that my children will be..."
4655,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",Summarize the book Frostbite.,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [5]:
print(df.iloc[0,4])
print(df.iloc[0,5])

Summarize the book Drowned Wednesday.
 Drowned Wednesday is the first Trustee among the Morrow Days who is on Arthur's side and wishes the Will to be fulfilled. She appears as a leviathan/whale and suffers from Gluttony. The book begins when Leaf is visiting Arthur and they are discussing the invitation that Drowned Wednesday sent him. Arthur had been admitted to hospital because of the damage done to his leg when he attempted to enter Tuesday's Treasure Tower. Suddenly, the hospital room becomes flooded with water as the two are transported to the Border Sea of the House. Leaf is snatched away by a large ship with green sails, known as the Flying Mantis, while Arthur remains in his bed. When the Medallion given him by the immortal called the Mariner apparently fails to summon help, Arthur is without hope. Eventually, a buoy marking the pirate Elishar Feverfew's treasure floats toward him. As soon as Arthur opens it, his hand is marked with a bloody red colour. Arthur now has the Red H

In [6]:
hf_dataset = Dataset.from_pandas(df[['prompt', 'target']])

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [8]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 4070'

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map='auto', quantization_config=bnb_config)

In [11]:
model = prepare_model_for_kbit_training(model)

In [12]:
lora_config = LoraConfig(
    #r=8,
    #lora_alpha=32,
    #target_modules=["q_proj", "v_proj"], 
    #lora_dropout=0.05,
    #bias="none",
    task_type="CAUSAL_LM",
)

In [13]:
model = get_peft_model(model, lora_config)

In [14]:
next(model.parameters()).device

device(type='cuda', index=0)

In [15]:
torch.cuda.memory_allocated()

1039134208

In [16]:
def format_chat(examples):
    messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot.",
    },
    {"role": "user", "content": examples["prompt"]},
    {"role":"assistant", "content": examples["target"]}
    ]

    chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    
    return {"formatted_chat": chat}


In [17]:
chat_dataset = hf_dataset.map(format_chat, batched=False)

Map:   0%|          | 0/4657 [00:00<?, ? examples/s]

Map: 100%|██████████| 4657/4657 [00:00<00:00, 14424.77 examples/s]


In [18]:
def tokenize_function(examples):
    return tokenizer(examples['formatted_chat'])

In [19]:
tokenized_dataset = chat_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4657 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2119 > 2048). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 4657/4657 [00:00<00:00, 4926.33 examples/s]


In [20]:
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

In [21]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
training_args = TrainingArguments(
    output_dir='./TinyLlama_finetuned',
    overwrite_output_dir=True,
    #num_train_epochs=2,
    per_device_train_batch_size=2,
    #save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True,
    logging_steps=100,
)

In [23]:
tokenized_dataset

Dataset({
    features: ['prompt', 'target', 'formatted_chat', 'input_ids', 'attention_mask'],
    num_rows: 4657
})

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


  return fn(*args, **kwargs)


Step,Training Loss
100,2.4843
200,2.3976
300,2.3675
400,2.2429
500,2.2715
600,2.2477
700,2.2296
800,2.186
900,2.1977
1000,2.2732


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=6288, training_loss=2.2199099251938836, metrics={'train_runtime': 3136.3605, 'train_samples_per_second': 4.009, 'train_steps_per_second': 2.005, 'total_flos': 6.646918823353958e+16, 'train_loss': 2.2199099251938836, 'epoch': 3.0})

In [26]:
trainer.save_model('./TinyLlama_finetuned')
tokenizer.save_pretrained('./TinyLlama_finetuned') 

('./TinyLlama_finetuned/tokenizer_config.json',
 './TinyLlama_finetuned/special_tokens_map.json',
 './TinyLlama_finetuned/tokenizer.model',
 './TinyLlama_finetuned/added_tokens.json',
 './TinyLlama_finetuned/tokenizer.json')