In [1]:
# Just like shaekspear we will try and train
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.a")

# Prepare model for k-bit training
tokenizer.pad_token = "!"
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1


config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
dataset = load_dataset("harpreetsahota/modern-to-shakesperean-translation")
print("dataset", dataset)
train_data = dataset["train"]


def generate_prompt(user_query):
    sys_msg = "Translate the given text to Shakespearean style."
    p = (
        "<s> [INST]"
        + sys_msg
        + "\n"
        + user_query["modern"]
        + "[/INST]"
        + user_query["shakespearean"]
        + "</s>"
    )
    return p


tokenize = lambda prompt: tokenizer(
    prompt + tokenizer.eos_token,
    truncation=True,
    max_length=CUTOFF_LEN,
    padding="max_length",
)
train_data = train_data.shuffle().map(
    lambda x: tokenize(generate_prompt(x)), remove_columns=["modern", "shakespearean"]
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 22.8k/22.8k [00:00<00:00, 110kB/s]


Generating train split:   0%|          | 0/274 [00:00<?, ? examples/s]

dataset DatasetDict({
    train: Dataset({
        features: ['modern', 'shakespearean'],
        num_rows: 274
    })
})


Map:   0%|          | 0/274 [00:00<?, ? examples/s]

In [2]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)


# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
tokenizer.pad_token = "!"
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1


config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

dataset DatasetDict({
    train: Dataset({
        features: ['modern', 'shakespearean'],
        num_rows: 274
    })
})


In [3]:
dataset = load_dataset("harpreetsahota/modern-to-shakesperean-translation")
print("dataset", dataset)
train_data = dataset["train"]

dataset DatasetDict({
    train: Dataset({
        features: ['modern', 'shakespearean'],
        num_rows: 274
    })
})


In [4]:
def generate_prompt(user_query):
    sys_msg = "Translate the given text to Shakespearean style."
    p = (
        "<s> [INST]"
        + sys_msg
        + "\n"
        + user_query["modern"]
        + "[/INST]"
        + user_query["shakespearean"]
        + "</s>"
    )
    return p


tokenize = lambda prompt: tokenizer(
    prompt + tokenizer.eos_token,
    truncation=True,
    max_length=CUTOFF_LEN,
    padding="max_length",
)
train_data = train_data.shuffle().map(
    lambda x: tokenize(generate_prompt(x)), remove_columns=["modern", "shakespearean"]
)


trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir="mixtral-moe-lora-instruct-shapeskeare",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


model.config.use_cache = False
trainer.train()

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
2,7.2439
4,5.1218
6,3.7626
8,2.7293
10,2.333
12,2.0782
14,1.9141
16,1.8325
18,1.7845
20,1.6993


KeyboardInterrupt: 

In [19]:
import os

os.getcwd()

'/root/mlx_week_7'

In [22]:
# Resume training example
atarainer.train(
    resume_from_checkpoint="mixtral-moe-lora-instruct-shapeskeare/checkpoint-62"
)



Step,Training Loss
64,1.218


KeyboardInterrupt: 

In [None]:
def generate_prompt(user_query):
    sys_msg = "Translate the given text to Shakespearean style."
    p = (
        "[INST]"
        + sys_msg
        + "\n"
        + user_query["modern"]
        + "[/INST]"
    )
    return p

In [33]:
tokens = tokenizer.encode(
    "[INST] Convert the following to Shakespearean style: 'What the hell is going on here?'[/INST]"
)

In [31]:
output = model.generate(
    torch.tensor(tokens).unsqueeze(0),
    max_length=100,
    # do_sample=True,
    temperature=0.1,
    pad_token_id=tokenizer.pad_token_id,
    num_return_sequences=1,

)



In [38]:
tokenizer.decode(output[0], skip_special_tokens=True)

"Convert the following to Shakespearean style: 'What the hell is going on here?'\n\nWhat the devil is transpiring here?"