In [3]:
from transformers import LlamaForCausalLM, LlamaTokenizer,TrainingArguments, Trainer
from datasets import Dataset
# from torch.utils.data import Dataset, random_split
import pandas as pd
import torch
import re
import os

In [4]:
checkpoint = 'llama-2-7b-converted'

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = LlamaTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = LlamaForCausalLM.from_pretrained(checkpoint,torch_dtype=torch.float16).to(device)  
max_seq_len = 500

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


In [6]:
class HoroscopeDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt.strip('\"') + '<|endoftext|>', truncation=True,
                                    max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [7]:
horoscope = pd.read_csv('horoscope.csv')
horoscope.head()

Unnamed: 0,zodiac,area,text
0,aries,daily,\nYou should feel grounded and connected with ...
1,aries,love,"\nAs the heavens turn, we watch their captivat..."
2,aries,work,"\nRedirect your energy, Aries. You may feel in..."
3,aries,dating,\nIt's okay to have a minor outburst every now...
4,taurus,daily,\nThe moon continues its journey through your ...


In [10]:
# clean the text column
horoscope['text'] = horoscope['text'].str.strip('\"')
horoscope_small = horoscope[['zodiac','text']]

# load the dataset into the Dataset object
dataset = Dataset.from_pandas(horoscope_small)

# a tokenization function to apply on every text in the dataset
def horoscope_tokenizer(horoscope):
    return tokenizer(horoscope['text'], padding='max_length', max_length=max_seq_len, truncation=True)

tokenized_datasets = dataset.map(horoscope_tokenizer)
# cleaned_datasets = tokenized_datasets.remove_columns(['day', 'month', 'year', 'sign', 'text', '__index_level_0__'])


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map: 100%|██████████| 48/48 [00:00<00:00, 834.56 examples/s]


In [None]:
# lets make sure the size of the first input is set to 100, so the padding worked.
assert (len(tokenized_datasets['text'][0]['input_ids'])) == max_seq_len


In [11]:
tokenized_datasets['text'][0]

"\nYou should feel grounded and connected with the world around you, dear Aries, as the moon continues its journey through earthy Taurus. Consider what can be added to your plate when Luna and Jupiter unite this afternoon, especially when it comes to money-making ideas. This cosmic climate brings luck to your finances, though you may find yourself in a generous mood as well. You'll crave the finer things in life later today when the moon and Uranus unite, and you shouldn't feel guilty about treating yourself to a small indulgence or two as a reward for all your hard work.\n"

In [7]:
horoscope_small = horoscope[['zodiac','text']]

In [8]:
max_length = max([len(tokenizer.encode(description)) for description in horoscope_small])
dataset = HoroscopeDataset(horoscope_small, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [15]:
horoscope_small

Unnamed: 0,zodiac,text
0,aries,\nYou should feel grounded and connected with ...
1,aries,"\nAs the heavens turn, we watch their captivat..."
2,aries,"\nRedirect your energy, Aries. You may feel in..."
3,aries,\nIt's okay to have a minor outburst every now...
4,taurus,\nThe moon continues its journey through your ...
5,taurus,"\nA miraculous, gorgeous cosmic alignment has ..."
6,taurus,"\nComparison is the thief of joy, Taurus. This..."
7,taurus,"\nIf you're single right now, what are you wai..."
8,gemini,\nTake note of any interesting dreams early th...
9,gemini,\nWe are firmly planted within earthy cosmic v...


In [14]:
len(dataset)

2

In [9]:
dataset[0]

(tensor([    1,   529, 29989,  2962]), tensor([1, 1, 1, 1]))

In [10]:
dataset[1]

(tensor([    1,   529, 29989,  2962]), tensor([1, 1, 1, 1]))

In [11]:
dataset[-1]

(tensor([    1,   529, 29989,  2962]), tensor([1, 1, 1, 1]))

In [None]:
training_args = TrainingArguments(output_dir='/output/model_checkpoints/', num_train_epochs=5,
                                    logging_steps=1000, save_steps=20000,
                                    per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                    warmup_steps=100, weight_decay=0.01, logging_dir='./logs')
Trainer(model=model, args=training_args, train_dataset=train_dataset,
          eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                                'attention_mask': torch.stack([f[1] for f in data]),
                                                                'labels': torch.stack([f[0] for f in data])}).train()
