<a href="https://colab.research.google.com/github/kmalhotra18/HuggingFace/blob/main/Models_(Llama%2C_Phi3%2C_Gemma2%2C_Qwen2%2C_Mixtral).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Models**

*   A look at the lower-level API of Transformers: the model classes that wrap the PyTorch code for the transformers themselves.




In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

In [None]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

Sign in to Hugging Face

In [None]:
# Fetch the Hugging Face token from the Colab secret named HF_TOKEN and log in with it.
# add_to_git_credential=True is passed through to login() unchanged.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Hugging Face Hub ids of the instruct (chat-tuned) models used in this notebook.

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"   # Meta
PHI3 = "microsoft/Phi-3-mini-4k-instruct"         # Microsoft
GEMMA2 = "google/gemma-2-2b-it"                   # Google
QWEN2 = "Qwen/Qwen2-7B-Instruct"                  # Alibaba
# If Mixtral doesn't fit in your GPU memory, try other models from the Hub
# with 8B parameters or fewer.
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [None]:
# Chat prompt used for the first models: a system instruction followed by the user's request.
messages = [
    dict(role="system", content="You are a helpful assistant"),
    dict(role="user", content="Tell a light-hearted joke for a room of Data Scientists"),
]

Accessing Llama 3.1 from Meta

In [None]:
# Quantization config (bitsandbytes): load the weights at reduced precision so the model
# needs far less memory, at the cost of a slight drop in accuracy. This often helps when
# fine-tuning too. Here the weights are loaded in 4 bits (8 bits is also possible).

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                        # store the weights in 4 bits
    bnb_4bit_quant_type="nf4",                # NF4 = 4-bit NormalFloat data type
    bnb_4bit_use_double_quant=True,           # double (nested) quantization
    bnb_4bit_compute_dtype=torch.bfloat16,    # compute in bfloat16
)

In [None]:
# Tokenizer for Llama: convert the chat messages into prompt token ids on the GPU.

tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# The pad token fills prompts up to a common length; reuse the end-of-sequence token for it.
tokenizer.pad_token = tokenizer.eos_token
# apply_chat_template takes the messages (a list of role/content dicts) and returns tokens.
# add_generation_prompt=True ends the prompt with the assistant header, so the model writes
# a reply instead of continuing the user's turn (same setting generate() uses further down).
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

In [None]:
# Load the model. AutoModelForCausalLM is the general class for generative LLMs: given the
# tokens so far, it predicts the tokens that follow.
# Running this cell downloads the model weights to the Colab machine (temporarily) and loads
# them into memory, ready to use.

model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# See how much memory the loaded (quantized) model is using, reported in megabytes

memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory:,.1f} MB")

Looking under the hood at the Transformer model

In [None]:
# Look at the model itself: the output describes the layers of the deep neural network. It begins with the embedding layer (how tokens are embedded); SiLU is the Sigmoid Linear Unit activation function.
# In Embedding(128256, 4096) the numbers are the dimensions. You can see that the output layer has similar dimensionality.

model

In [None]:
# Now let's run the model!
# model.generate takes the input tokens already sitting on the GPU and produces at most 80 new tokens;
# the first returned sequence is then decoded back into text.

outputs = model.generate(inputs, max_new_tokens=80)
print(tokenizer.decode(outputs[0]))

In [None]:
# Free memory before loading the next model.
# Note: "Show Resources" (top right in Colab) may not show GPU memory dropping right away,
# but the memory does seem to be available to the models loaded later in this notebook.

del model
del inputs
del tokenizer
del outputs
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Wrapping everything in a function - and adding Streaming and generation prompts

def generate(model, messages, max_new_tokens=80):
  """Load a quantized chat model, stream its reply to `messages`, then release memory.

  Args:
    model: Hugging Face Hub id of the model (e.g. PHI3); used for both the tokenizer
      and the weights.
    messages: Chat messages as a list of {"role": ..., "content": ...} dicts.
    max_new_tokens: Maximum number of tokens to generate (default 80).

  Returns:
    None. The reply is printed by TextStreamer while it is being generated.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)        # Tokenizer matching the chosen model
  tokenizer.pad_token = tokenizer.eos_token
  # Pre-bind every name released in `finally` so the `del` there cannot raise NameError.
  inputs = streamer = llm = outputs = None
  try:
    # Apply the model's chat template (ending with the generation prompt) and move the tokens to the GPU
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
    streamer = TextStreamer(tokenizer)                    # Converts tokens to text and prints them as they arrive
    # device_map="auto": use the GPU if there is one. Kept in `llm` so the `model` id is not overwritten.
    llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)
    outputs = llm.generate(inputs, max_new_tokens=max_new_tokens, streamer=streamer)
  finally:
    # Always drop the references, even when loading or generation fails: after an exception
    # the traceback kept by the notebook can keep this frame's locals (and the GPU weights) alive.
    del llm, inputs, tokenizer, outputs, streamer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Run the Phi-3 model through the generate() helper defined above

generate(model=PHI3, messages=messages)

Accessing **Gemma from Google**

In [None]:
# Run Google's Gemma2 (a 2B model, quantized here as well) through the generate() helper above.
# Gemma doesn't support a system prompt, so only the user prompt is passed.

messages = [
    dict(role="user", content="Tell a light-hearted joke for a room of Data Scientists"),
]
generate(model=GEMMA2, messages=messages)

**Qwen2 model**

In [None]:
# Prompt for the remaining models: a single user question, no system prompt
messages = [
    dict(role="user", content="What's the difference between a list and a tuple in Python?"),
]

In [None]:
# Run Qwen2 through the generate() helper with the current messages
generate(model=QWEN2, messages=messages)

**Mixtral model**

In [None]:
# Run Mixtral through the generate() helper with the current messages
generate(model=MIXTRAL, messages=messages)