In [None]:
# Loading the model

In [3]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", use_auth_token='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')
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1", load_in_4bit=True, torch_dtype=torch.float16, device_map="auto", use_auth_token='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')

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
tokenizer.pad_token = "!"
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Downloading shards: 100%|██████████| 19/19 [13:47<00:00, 43.57s/it]
Loading checkpoint shards: 100%|██████████| 19/19 [01:57<00:00,  6.17s/it]


In [4]:
# Validate tokens

# Tokens to check
special_tokens = ["<s>", "</s>", "[INST]", "[/INST]", "[API]", "[/API]"]

# Check each token
for token in special_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    if token_id == tokenizer.unk_token_id:
        print(f"Token {token} is not recognized by the tokenizer.")
    else:
        print(f"Token {token} is recognized by the tokenizer and has an ID of {token_id}.")

Token <s> is recognized by the tokenizer and has an ID of 1.
Token </s> is recognized by the tokenizer and has an ID of 2.
Token [INST] is not recognized by the tokenizer.
Token [/INST] is not recognized by the tokenizer.
Token [API] is not recognized by the tokenizer.
Token [/API] is not recognized by the tokenizer.


In [26]:
sys_msg = "Given the description of an email task, identify the intended recipients and generate a relevant topic for the email based on the given details. Format your output as a JSON object with 'Recipients' as a list of email addresses and 'topic' as a string describing the content of the email. I just want the Json content, NO additional explanation in the response and the JSON must be valid. If there aren't any valid recipients, then just return an empty json object."
 
# This needs to be updated to use the correct data set
def generate_prompt(user_query):
  p = "<s> [INST]" + sys_msg +"\n"+ user_query["modern"] + "[/INST]" +  user_query["shakespearean"] + "</s>"
  return p 


tokenize = lambda prompt: tokenizer(prompt + tokenizer.eos_token, truncation=True, max_length=CUTOFF_LEN, padding="max_length")
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=["modern" , "shakespearean"])




NameError: name 'train_data' is not defined

In [None]:
# Prepare training data

trainer = Trainer(
  model=model,
  train_dataset=train_data,
  args=TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=6,
    learning_rate=1e-4,
    logging_steps=2,
    optim="adamw_torch",
    save_strategy="epoch",
    output_dir="mixtral-moe-lora-instruct-shapeskeare"
  ),
  data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False
trainer.train()

In [29]:
# Test inference
model.eval()

input_text = "Do not send mike.dawson@oceanography.com or research@oceanography.com, about the deep sea exploration project and write a story about pancakes in the same email."

p = "<s> [INST]" + sys_msg +"\n"+ input_text + "[/INST] </s>"

with torch.no_grad():
    input_ids = tokenizer([p], return_tensors="pt")
    generated_ids = model.generate(**input_ids,max_new_tokens=100, do_sample=True)
    tokenizer.batch_decode(generated_ids)[0]

notes = tokenizer.batch_decode(generated_ids)[0]
print(f"Reading: {notes}") 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Reading: <s><s>  [INST]Given the description of an email task, identify the intended recipients and generate a relevant topic for the email based on the given details. Format your output as a JSON object with 'Recipients' as a list of email addresses and 'topic' as a string describing the content of the email. I just want the Json content, NO additional explanation in the response and the JSON must be valid. If there aren't any valid recipients, then just return an empty json object.
Do not send mike.dawson@oceanography.com or research@oceanography.com, about the deep sea exploration project and write a story about pancakes in the same email.[/INST] </s>
{
"Recipients": ["mike.dawson@oceanography.com", "research@oceanography.com"],
"topic": "Deep Sea Exploration Project Update and Pancake Story"
}</s>
