# MIXTRAL 8x7B - Mixture of Experts

This will not run on the free T4 GPU from Google Colab. You will need A100 to run this.

### Install Required Packages

In [1]:
!pip install transformers trl accelerate torch bitsandbytes peft datasets -qU
!pip install flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m961.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

#### Load HF Dataset

We need a dataset to fine-tune a model, for this example we will be using a subset of the `mosaicml/instruct-v3` dataset.

In [4]:
import pandas as pd
df = pd.read_csv("/df_final_llm (1).csv")

In [5]:
df.head()

Unnamed: 0,email,entities
0,fm door Quinto Di Treviso to HKG / Totali Quan...,"{'pickup_location': ""Quinto Di Treviso Viale d..."
1,", mi quotereste quanto segue per Favore? , Ori...","{'pickup_location': 'Colico LC', 'quantity': '..."
2,"PORT JEBEL ALI Buon pomeriggio, , potreste quo...","{'delivery_port': 'JEBEL ALI Jebel Ali', 'pick..."
3,"d'oglio /cat lai Buon giorno a tutti , Da Quin...","{'pickup_location': ""d'oglio Quinzano d'Oglio""..."
4,CUCINE INOX // LCL ex. Italy to HK .Atelier Gr...,"{'pickup_state': 'Italy Italy Italy', 'deliver..."


In [6]:
INS = """<s>[INST]Your task is to extract the information corresponding to the provided labels from the below given email.

### Labels:
* pickup_location: Street address of the origin location of goods.
* pickup_cap: Postal code or ZIP code of the pickup location.
* pickup_port: Port of pickup, often used in international shipping.
* pickup_state: Only Country of pickup location.
* delivery_location: Street address of the destination location of goods.
* delivery_cap: Postal code or ZIP code of delivery location.
* delivery_port: Port of delivery, similar to pickup port.
* delivery_state: State or region of delivery location.
* total_quantity: Overall quantity of shipped items (e.g., pieces, boxes). Calculate the total_quantity by summing the quantity of all packages.
* total_weight: Total weight of the shipment (e.g., kg, lbs). Calculate the total_weight by summing the weights of all packages.
* total_volume: Total volume of the shipment (e.g., cubic meters, cubic feet). Calculate the total_volume by summing the volumes of all packages.
* quantity: Individual Quantity of a specific item being shipped.
* package_type: Individual Type of packaging used (e.g., pallets, cartons).
* weight: Individual Weight of a specific package.
* measures: Individual Dimensions or measurements of a package.
* stackable: Indicates whether the shipment is stackable (True or False).
* volume: Individual Volume of a specific package.
* commodity: Type of goods or commodities being shipped.
* company: Name of the email sending company, also the shipping company or carrier.
* incoterms: Choose available options: EXW, FCA, FAS, FOB, CFR, CIF, CPT, CIP, DAP, DPU, DDP.

For attributes with multiple values, such as measures, volume, weight, package_type, and quantity, provide each value separately in a JSON format.
"""

In [7]:
from datasets import load_dataset
for i in range(len(df)):
  email = df['email'][i]
  output = df['entities'][i]
  prompt = f"""{INS} ### Input data:{email} ### Output:### Response"""
  df['email'].iloc[i] = prompt
# Assuming df is your DataFrame
instruct_tune_dataset = df.rename(columns={"email": "prompt", "entities": "response"})
instruct_tune_dataset.head()

Unnamed: 0,prompt,response
0,<s>[INST]Your task is to extract the informati...,"{'pickup_location': ""Quinto Di Treviso Viale d..."
1,<s>[INST]Your task is to extract the informati...,"{'pickup_location': 'Colico LC', 'quantity': '..."
2,<s>[INST]Your task is to extract the informati...,"{'delivery_port': 'JEBEL ALI Jebel Ali', 'pick..."
3,<s>[INST]Your task is to extract the informati...,"{'pickup_location': ""d'oglio Quinzano d'Oglio""..."
4,<s>[INST]Your task is to extract the informati...,"{'pickup_state': 'Italy Italy Italy', 'deliver..."


#### Data structure

The dataset contains three different columns. We are only interested in the columns `prompt` and `response`. There are 9 different possible source value in the `source` column. We are only interested in one of them.

In [9]:
from datasets import Dataset
#Importing the dataset
instruct_tune_dataset = Dataset.from_pandas(instruct_tune_dataset)
instruct_tune_dataset=instruct_tune_dataset.train_test_split(test_size=0.2)
instruct_tune_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 562
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 141
    })
})

We will use just a small subset of the data for this training example.





#### Create Formatted Prompt

In the following function we'll be merging our `prompt` and `response` columns by creating the following template:

```
<s>[INST] Use the provided input to create an instruction that could have been used to generate the response with an LLM.

{input} [/INST]

{response}</s>
```

In [10]:
instruct_tune_dataset["train"][0]

{'prompt': '<s>[INST]Your task is to extract the information corresponding to the provided labels from the below given email.\n\n### Labels:\n* pickup_location: Street address of the origin location of goods.\n* pickup_cap: Postal code or ZIP code of the pickup location.\n* pickup_port: Port of pickup, often used in international shipping.\n* pickup_state: Only Country of pickup location.\n* delivery_location: Street address of the destination location of goods.\n* delivery_cap: Postal code or ZIP code of delivery location.\n* delivery_port: Port of delivery, similar to pickup port.\n* delivery_state: State or region of delivery location.\n* total_quantity: Overall quantity of shipped items (e.g., pieces, boxes). Calculate the total_quantity by summing the quantity of all packages.\n* total_weight: Total weight of the shipment (e.g., kg, lbs). Calculate the total_weight by summing the weights of all packages.\n* total_volume: Total volume of the shipment (e.g., cubic meters, cubic feet

In [11]:
def create_prompt(sample):
  bos_token = "<s>"
  original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  system_message = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  response = sample["prompt"].replace(original_system_message, "").replace("\n\n### Instruction\n", "").replace("\n### Response\n", "").strip()
  input = sample["response"]
  eos_token = "</s>"

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += system_message
  full_prompt += "\n" + input
  full_prompt += "[/INST]"
  full_prompt += response
  full_prompt += eos_token

  return full_prompt

In [12]:
create_prompt(instruct_tune_dataset["train"][0])

"<s>[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM.\n{'quantity': 'n.1 n.1', 'package_type': 'pedana pedana', 'measures': '80x120x70 80x120x70', 'weight': 'kg.250 kg.250', 'incoterms': 'CIF', 'delivery_port': 'Port of Spain Trinitad', 'pickup_location': 'PROTECO Srl , Via Neive 77 , Castagnito ( CN )', 'pickup_cap': '12050', 'pickup_state': 'ITALY', 'company': 'www.alpiworld.com'}[/INST]<s>[INST]Your task is to extract the information corresponding to the provided labels from the below given email.\n\n### Labels:\n* pickup_location: Street address of the origin location of goods.\n* pickup_cap: Postal code or ZIP code of the pickup location.\n* pickup_port: Port of pickup, often used in international shipping.\n* pickup_state: Only Country of pickup location.\n* delivery_location: Street address of the destination location of goods.\n* delivery_cap: Postal code or ZIP code of delivery location.\n* delivery_port: Port

### Map the Dataset

In [13]:
# mapped_data = instruct_tune_dataset.map(create_prompt)

### Loading the Base Model

Load the model in `4bit`, with double quantization, with `bfloat16` as the compute dtype.

In this case we are using the instruct-tuned model - instead of the base model. For fine-tuning a base model will need a lot more data!

In [14]:
model_id = "mistralai/Mixtral-8x7B-v0.1"

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [16]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [1]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=nf4_config,
    use_cache=True,
    attn_implementation="flash_attention_2",cache_dir="new_cache_dir/"
)

NameError: name 'AutoModelForCausalLM' is not defined

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="new_cache_dir/")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Let's example how well the model does at this task currently:

In [19]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=512,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [20]:
prompt="""[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM. \nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[\INST]"""

generate_response(prompt, model)

NameError: name 'model' is not defined

### Tokenization

In [None]:
def tokenize_prompts(prompt):
    return tokenizer(create_prompt(prompt))

tokenized_train_dataset = instruct_tune_dataset["train"].map(tokenize_prompts)
tokenized_val_dataset = instruct_tune_dataset["test"].map(tokenize_prompts)

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset):
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=50, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    # plt.xlim([0, 1500])
    plt.show()


plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

In [None]:
def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset):
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=50, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.xlim([0, 2048])
    plt.show()


plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

In [None]:
print(model)

### Setting up the Training
we will be using the `huggingface` and the `peft` library!

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
        target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    task_type="CAUSAL_LM"
)

we need to prepare the model to be trained in 4bit so we will use the  `prepare_model_for_kbit_training` function from peft

> Indented block



In [None]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
print_trainable_parameters(model)

In [None]:
print(model)

### Hyper-paramters for training
These parameters will depend on how long you want to run training for.
Most important to consider:

`num_train_epochs/max_steps`: How many iterations over the data you want to do, BE CAREFUL, don't try too many, you will over-fit!!!!!

`learning_rate`: Controls the speed of convergence


In [None]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    print(torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
  output_dir = "Mixtral_Alpace_v2",
  #num_train_epochs=5,
  max_steps = 1000, # comment out this line if you want to train in epochs
  per_device_train_batch_size = 32,
  warmup_steps = 0.03,
  logging_steps=10,
  save_strategy="epoch",
  #evaluation_strategy="epoch",
  evaluation_strategy="steps",
  push_to_hub=True,
  eval_steps=10, # comment out this line if you want to evaluate at the end of each epoch
  learning_rate=2.5e-5,
  bf16=True,
  # lr_scheduler_type='constant',
)

Setting up the trainer.

`max_seq_length`: Context window size


In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from trl import SFTTrainer

max_seq_length = 1024

trainer = SFTTrainer(
  model=model,
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True,
  formatting_func=create_prompt, # this will aplly the create_prompt mapping to all training and test dataset
  args=args,
  train_dataset=instruct_tune_dataset["train"],
  eval_dataset=instruct_tune_dataset["test"]
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("Mixtral_Alpace_v2")

# Save Model and Push to Hub

In [None]:
# !pip install huggingface-hub -qU

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# trainer.push_to_hub("Promptengineering/mistral-instruct-generation")

In [None]:
merged_model = model.merge_and_unload()

In [None]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=150,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0]

In [None]:
prompt = "[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM.\nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[/INST]"


In [None]:
generate_response(prompt, merged_model)

In [None]:
250*32