In [None]:
### Let's check the GPU ###
# Shell escape: prints the attached GPU, the driver/CUDA versions and the
# current memory usage, so the available accelerator is visible before any
# model weights are loaded.
!nvidia-smi

Tue Nov 21 10:11:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
################################################################################
# Install Prerequisites
################################################################################

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# For running models
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

!pip install -q torch
!pip install -q git+https://github.com/huggingface/transformers # huggingface transformers for downloading models weights
!pip install -q bitsandbytes # For Model weights quantisation
!pip install -q accelerate

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# For fine tuning models
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

#!pip install -q datasets #huggingface datasets to download and manipulate datasets
#!pip install -q peft #Parameter efficient finetuning - for qLora Finetuning
#!pip install -q trl #Transformer Reinforcement Learning - For Finetuning using Supervised Fine-tuning
#!pip install -q wandb -U #Used to monitor the model score during training
#
#pip install -U tokenizers ?

In [None]:
################################################################################
# Load Required Libraries
################################################################################

import torch
#from huggingface_hub import notebook_login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

#from transformers import (
#    TrainingArguments,
#    pipeline,
#    logging,
#)
#import json
#import pandas as pd
#from datasets import Dataset, load_dataset
#from trl import SFTTrainer
#from peft import LoraConfig, PeftModel

################################################################################
# Setting Model Parameters
################################################################################

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# AutoModelForCausalLM Parameters
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Hub repository id of the checkpoint to load. The dict maps short aliases to
# repository ids; the subscript at the end picks the alias that is actually used.
model_name = {
    'mistral-7b-instruct-v01': "mistralai/Mistral-7B-Instruct-v0.1",
    'mistral-7b-v01-sharded': "ybelkada/Mistral-7B-v0.1-bf16-sharded",
}['mistral-7b-v01-sharded']

# Load the entire model on the GPU 0
# (passed as `device_map` to `AutoModelForCausalLM.from_pretrained` below).
device_map = {"": 0}

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# BitsAndBytesConfig Parameters
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# The four values below are forwarded unchanged to `BitsAndBytesConfig`.

# Activate 4-bit precision base model loading (-> load_in_4bit)
use_4bit = True

# Compute dtype for 4-bit base models (-> bnb_4bit_compute_dtype)
bnb_4bit_compute_dtype = torch.float16

# Quantization type (fp4 or nf4) (-> bnb_4bit_quant_type)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
# (-> bnb_4bit_use_double_quant)
use_nested_quant = False

################################################################################
# Loading the Base Model
################################################################################

# Load the base model with QLoRA configuration
# Quantization settings are assembled from the parameters defined above.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Downloads (or reuses the cached) checkpoint named by `model_name` and places
# it according to `device_map`.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=device_map
)

# NOTE(review): presumably disabled with fine-tuning in mind; the same
# `base_model` is also used by the inference cell - confirm this is intended.
base_model.config.use_cache = False

# Load the Mistral AI tokenizer
# NOTE(review): `trust_remote_code=True` allows code shipped in the model
# repository to run locally - confirm it is actually required for this repo.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

pytorch_model-00001-of-00008.bin:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

pytorch_model-00002-of-00008.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

pytorch_model-00003-of-00008.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00008.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

pytorch_model-00005-of-00008.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00006-of-00008.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

pytorch_model-00007-of-00008.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00008-of-00008.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

In [None]:
################################################################################
# Inference
################################################################################

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Enable text wrapping so we don't have to scroll horizontally.
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule that makes `<pre>` output wrap instead of scrolling.

  Registered below as a `pre_run_cell` callback. IPython documents that event
  as being triggered with an `info` argument, so any positional arguments are
  accepted and ignored; the function can still be called with none.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Re-apply the style before every cell execution.
get_ipython().events.register('pre_run_cell', set_css)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Inference Functions
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation
# https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration

from transformers import TextStreamer

# Device that the tokenized inputs are moved to before calling `generate`.
runtimeFlag = "cuda:0" # alternatively set torch.set_default_device('cuda') and omit .to('cuda') ?

# Extra keyword arguments unpacked into every `model.generate(...)` call made
# by the inference functions in this cell. Empty by default; the entries below
# are disabled examples (their keys would need to be quoted strings if enabled).
generate_global_config = {
    #pad_token_id: 2,
    #do_sample: True,
}

def build_prompt(system_prompt, user_prompt):
  """Assemble the text sent to the model.

  The system prompt is emitted verbatim, followed by the user prompt (with
  surrounding whitespace removed) wrapped in `[INST]` ... `[/INST]` markers.
  """
  instruction = user_prompt.strip()
  return "{}[INST]{}\n[/INST]".format(system_prompt, instruction)

# stream
def inference_stream(
    model,
    user_prompt,
    system_prompt="",
    max_new_tokens=512,
    temperature=1.0):
    """Generate a completion and stream it to stdout token by token.

    Args:
        model: Model exposing `generate(...)`.
        user_prompt: User request; wrapped by `build_prompt`.
        system_prompt: Text placed verbatim before the instruction block.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Forwarded to `generate`.
            NOTE(review): no `do_sample` flag is passed here, so unless
            `generate_global_config` enables sampling this value presumably
            has no effect - confirm against the transformers generation docs.

    Returns:
        None. The text is printed by the `TextStreamer` (prompt and special
        tokens are skipped).
    """
    prompt = build_prompt(system_prompt, user_prompt)

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Passing pad_token_id explicitly silences the per-call
    # "Setting `pad_token_id` to `eos_token_id`" warning. The global config is
    # unpacked afterwards so it can still override this default.
    shared_kwargs = {"pad_token_id": tokenizer.eos_token_id, **generate_global_config}

    # Same eval / no-grad setup as `inference_batched`, so both paths behave
    # alike even if the model was left in training mode.
    model.eval()
    with torch.no_grad():
        model.generate(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            **shared_kwargs
        )

def inference_batched(
    model,
    user_prompt,
    system_prompt="",
    max_new_tokens=256,
    temperature=1.0):
    """Generate a completion in one go and print the decoded text.

    Args:
        model: Model exposing `generate(...)`.
        user_prompt: User request; wrapped by `build_prompt`.
        system_prompt: Text placed verbatim before the instruction block.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Forwarded to `generate`.
            NOTE(review): no `do_sample` flag is passed here, so unless
            `generate_global_config` enables sampling this value presumably
            has no effect - confirm against the transformers generation docs.

    Returns:
        None. The first decoded sequence (special tokens skipped) is printed;
        it is decoded from the full output of `generate`.
    """
    prompt = build_prompt(system_prompt, user_prompt)

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    # Passing pad_token_id explicitly silences the per-call
    # "Setting `pad_token_id` to `eos_token_id`" warning. The global config is
    # unpacked afterwards so it can still override this default.
    shared_kwargs = {"pad_token_id": tokenizer.eos_token_id, **generate_global_config}

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            **shared_kwargs
        )
    text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    print(text)

def inference(*args, stream=False, **kwargs):
  """Run generation, streaming the output when `stream` is true.

  All other positional and keyword arguments are passed through unchanged to
  `inference_stream` or `inference_batched`. Returns None.
  """
  runner = inference_stream if stream else inference_batched
  runner(*args, **kwargs)

# Smoke test: stream a completion from the base model for a sample coding request.
prompt = "code in python and c detecting prime numbers"
inference(base_model, prompt, stream=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




I'm trying to write a program that will detect prime numbers. I'm using the Sieve of Eratosthenes. I'm using the python code below.

```
def sieve(n):
   primes = [True] * (n+1)
   primes[0] = primes[1] = False
   for i in range(2, int(n**0.5)+1):
       if primes[i]:
           for j in range(i*i, n+1, i):
               primes[j] = False
   return [i for i in range(n+1) if primes[i]]
```

I'm trying to convert this to C. I'm not sure how to do the range(2, int(n**0.5)+1) part. I'm also not sure how to do the [i for i in range(n+1) if primes[i]] part.


In [None]:
################################################################################
# Fine Tuning
################################################################################

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Fine-Tuning with qLora and Supervised Finetuning
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Set LoRA configuration
# NOTE(review): the `LoraConfig` import is commented out in the loading cell and
# `lora_alpha`, `lora_dropout` and `lora_r` are not defined anywhere in the
# visible notebook - they must be provided before this cell can run.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    # Names of the modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
# NOTE(review): the `TrainingArguments` import is commented out in the loading
# cell, and none of the hyperparameter variables referenced below (output_dir,
# num_train_epochs, optim, ...) are defined in the visible notebook.
# NOTE(review): both `num_train_epochs` and a fixed `max_steps=100` are passed;
# presumably the step limit takes precedence - confirm against the
# TrainingArguments documentation.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=100, # the total number of training steps to perform
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Initialize the SFTTrainer for fine-tuning
# Wires together the quantized base model, the LoRA configuration, the
# tokenizer and the training arguments defined above.
# NOTE(review): the `SFTTrainer` import is commented out in the loading cell,
# and `train_dataset`, `max_seq_length` and `packing` are not defined in the
# visible notebook.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",  # dataset column passed as the training text field
    max_seq_length=max_seq_length,  # You can specify the maximum sequence length here
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Let's start the training process
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Start the training process
trainer.train()

# Save the fine-tuned model
# `new_model` is the save location; NOTE(review): it is not defined in the
# visible notebook and must be set before this cell runs.
trainer.model.save_pretrained(new_model)

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Merge and Share
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# Merge the model with LoRA weights
# The base checkpoint is reloaded in float16 (without the 4-bit quantization
# config used earlier) and rebinds `base_model`.
# NOTE(review): the `PeftModel` import is commented out in the loading cell and
# must be enabled before this cell can run.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
# Attach the adapter saved at `new_model`, then merge it into the base model.
# The merge must be called on the PeftModel created here; the original code
# called it on an undefined name `model`.
merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Test the merged model
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

from random import randrange

# Pick one random training example to probe the merged model with.
# NOTE(review): `train_dataset` is not defined in the visible notebook.
sample = train_dataset[randrange(len(train_dataset))]

prompt = f"""<s>
{sample['instruction']}
{sample['input']}
[INST]

"""

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# The returned sequence begins with the prompt tokens, so drop them by token
# count. Slicing the decoded string with len(prompt) is unreliable because
# decoding with skip_special_tokens=True does not reproduce the prompt text
# character-for-character.
generated_ids = outputs[0][input_ids.shape[1]:]
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

print(f"Prompt:\n{prompt}\n")
print(f"\nGenerated instruction:\n{generated_text}")
print(f"\nGround truth:\n{sample['output']}")

In [None]:
################################################################################
# Troubleshooting
################################################################################

'''
Challenges are part and parcel of model training. Let’s discuss some common issues and their resolutions.
'''

# OOM
'''
If you encounter an Out of Memory (OOM) error:

- Consider reducing the batch size.
- Shorten training samples by cutting down on context length (max_length in tokenize()).
'''

# Training too Slow
'''
If training seems sluggish:

- Increase batch size.
- Use multiple GPUs, bought or rented (on RunPod, for example). The code provided here is ready for accelerate and can be used to train in multi-GPU settings: simply launch with `accelerate launch qlora.py` instead of `python qlora.py`.
'''

# Bad Quality of the Final Model
'''
The quality of your model is a reflection of your dataset’s quality. To improve model quality:

- Ensure your dataset is rich and relevant.
- Tune hyperparameters: learning_rate, epochs, rank r, lora_alpha
'''
