In [None]:
# libraires needed
%pip install torch transformers bitsandbytes accelerate
%pip install protobuf
%pip install transformers torch accelerate sentencepiece

In [30]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Define model name
model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the SentencePiece-based Llama tokenizer.
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so generate() has a pad id.
tokenizer.pad_token = tokenizer.eos_token

# Use FP16 only when a CUDA device is available; otherwise let transformers
# pick the dtype stored in the checkpoint ("auto"), as half precision is
# poorly supported on CPU.
model_dtype = torch.float16 if torch.cuda.is_available() else "auto"

# device_map="auto" lets accelerate place the weights on the available devices.
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, device_map="auto")

print("Model and Tokenizer Loaded Successfully!")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model and Tokenizer Loaded Successfully!


In [18]:
# Quick sanity check: let generate() continue a short arithmetic prompt.
input_text = "3 + 6 ="

# The model was loaded with device_map="auto", so the encoded prompt has to be
# moved to the device the model expects its inputs on.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Passing pad_token_id explicitly avoids the "Setting `pad_token_id` to
# `eos_token_id`" notice; max_length counts prompt tokens plus new tokens.
outputs = model.generate(**inputs, max_length=50, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


3 + 6 = 0. Let x be (-1 - -1) + 2 + 1. Suppose -x*q + 2*q = -2. Solve -q*v + 3*v =


In [13]:
# Input Prompt
prompt = "3 + 6 ="

# Tokenize the prompt and move the ids to the model's device
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Print Token IDs
print("Input Token IDs:", input_ids.tolist())

# Generate text token-by-token with greedy decoding (at most 20 new tokens).
# A named loop variable is used instead of `_`, because assigning to `_`
# overrides IPython's "last output" variable for the rest of the session.
output_ids = input_ids
with torch.no_grad():  # inference only, no gradients needed
    for step in range(1, 21):
        outputs = model(output_ids)  # Forward pass over the whole sequence so far
        logits = outputs.logits[:, -1, :]  # Logits for the next-token position

        # Greedy choice: the highest-scoring token
        next_token = torch.argmax(logits, dim=-1)
        output_ids = torch.cat([output_ids, next_token.unsqueeze(-1)], dim=-1)  # Append new token

        # Print Decoding Step
        print(f"Step {step}: {tokenizer.decode(next_token.item())}")

        # Stop early once the model emits end-of-sequence
        if next_token.item() == tokenizer.eos_token_id:
            break

# Convert Final Tokens to Text
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Text:", generated_text)

Input Token IDs: [[63619, 6, 969, 63619, 6, 694]]
Step 1: 
Step 2: 0
Step 3: .
Step 4: Let
Step 5: x
Step 6: be
Step 7: (-
Step 8: 1
Step 9: -
Step 10: -
Step 11: 1
Step 12: )
Step 13: +
Step 14: 
Step 15: 1
Step 16: +
Step 17: 
Step 18: 1
Step 19: .
Step 20: Let

Generated Text: 3 + 3 = 0. Let x be (-1 - -1) + 1 + 1. Let


In [1]:


# Input promp
def test(prompt, model, tokenizer):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    print("Input prompt:", prompt)
    print("Input Token IDs:", input_ids.tolist())
    print("Input tokens:", [tokenizer.decode([id]) for id in input_ids[0].tolist()])

    # 1. Greedy Decoding
    print("\n==== GREEDY DECODING ====")
    output_ids = input_ids.clone()
    for i in range(20):  # Generate up to 20 tokens
        with torch.no_grad():
            outputs = model(output_ids)  # Forward pass
            logits = outputs.logits[:, -1, :]  # Get last token's logits
            
            # Get top 5 candidates before selection (for logging)
            top5_values, top5_indices = torch.topk(logits, 5)
            print(f"\nStep {i+1} candidates:")
            for j, (value, index) in enumerate(zip(top5_values[0], top5_indices[0])):
                token = tokenizer.decode([index])
                prob = torch.softmax(logits, dim=-1)[0, index].item()
                print(f"  {j+1}. '{token}' (ID: {index}) - prob: {prob:.4f}, logit: {value.item():.4f}")
            
            # Greedy selection (argmax)
            next_token = torch.argmax(logits, dim=-1)
            output_ids = torch.cat([output_ids, next_token.unsqueeze(-1)], dim=-1)
            
            chosen_token = tokenizer.decode([next_token.item()])
            print(f"  → Selected: '{chosen_token}' (ID: {next_token.item()})")
            
            # Optional: stop if we generate an EOS token
            if next_token.item() == tokenizer.eos_token_id:
                print("  Reached EOS token, stopping generation.")
                break

    # Print final generated text
    greedy_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("\nGreedy decoding result:", greedy_text)

    # 2. Beam Search Decoding
    print("\n==== BEAM SEARCH DECODING ====")
    num_beams = 3
    output_beam = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + 20,
        num_beams=num_beams,
        num_return_sequences=1,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Log the beam search process
    beam_tokens = output_beam.sequences[0, input_ids.shape[1]:]
    beam_scores = output_beam.scores

    print(f"Using beam search with {num_beams} beams:")
    for i, (token_id, score_set) in enumerate(zip(beam_tokens, beam_scores)):
        # Get the top-k tokens for this step (k=num_beams)
        step_scores = score_set[0]  # Get scores for the first beam
        top_tokens = torch.topk(step_scores, num_beams)
        
        print(f"\nStep {i+1} candidates:")
        for j, (score, token_idx) in enumerate(zip(top_tokens.values, top_tokens.indices)):
            token = tokenizer.decode([token_idx])
            prob = torch.softmax(step_scores, dim=-1)[token_idx].item()
            print(f"  {j+1}. '{token}' (ID: {token_idx.item()}) - prob: {prob:.4f}, score: {score.item():.4f}")
        
        chosen_token = tokenizer.decode([token_id])
        print(f"  → Selected: '{chosen_token}' (ID: {token_id.item()})")

    # Print final beam search result
    beam_text = tokenizer.decode(output_beam.sequences[0], skip_special_tokens=True)
    print("\nBeam search result:", beam_text)

    # 3. Top-p (Nucleus) Sampling
    print("\n==== TOP-P SAMPLING ====")
    top_p = 0.9
    temperature = 0.7
    output_ids = input_ids.clone()

    for i in range(20):  # Generate up to 20 tokens
        with torch.no_grad():
            outputs = model(output_ids)  # Forward pass
            logits = outputs.logits[:, -1, :]  # Get last token's logits
            
            # Apply temperature
            logits = logits / temperature
            
            # Get top candidates before applying top-p (for logging)
            top5_values, top5_indices = torch.topk(logits, 5)
            print(f"\nStep {i+1} candidates (before top-p):")
            for j, (value, index) in enumerate(zip(top5_values[0], top5_indices[0])):
                token = tokenizer.decode([index])
                prob = torch.softmax(logits, dim=-1)[0, index].item()
                print(f"  {j+1}. '{token}' (ID: {index}) - prob: {prob:.4f}, logit: {value.item():.4f}")
            
            # Apply top-p filtering
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
            
            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift indices to the right to keep first token above threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            
            # Create mask for indices to remove
            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
            filtered_logits = logits.masked_fill(indices_to_remove, -float('inf'))
            
            # Count tokens in sampling pool
            valid_indices = torch.softmax(filtered_logits, dim=-1) > 0
            num_valid = valid_indices.sum().item()
            print(f"  After top-p ({top_p}): {num_valid} tokens in sampling pool")
            
            # Sample from the filtered distribution
            probs = torch.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).squeeze(-1)
            output_ids = torch.cat([output_ids, next_token.unsqueeze(-1)], dim=-1)
            
            chosen_token = tokenizer.decode([next_token.item()])
            chosen_prob = probs[0, next_token.item()].item()
            print(f"  → Sampled: '{chosen_token}' (ID: {next_token.item()}) - prob: {chosen_prob:.4f}")
            
            # Optional: stop if we generate an EOS token
            if next_token.item() == tokenizer.eos_token_id:
                print("  Reached EOS token, stopping generation.")
                break

    # Print final sampled text
    sampled_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("\nTop-p sampling result:", sampled_text)

    # Summary of different decoding methods
    print("\n==== DECODING COMPARISON ====")
    print("Original prompt:", prompt)
    print("1. Greedy decoding:", greedy_text)
    print("2. Beam search:", beam_text)
    print("3. Top-p sampling:", sampled_text)

In [1]:
import torch

# Environment check: report the installed torch build and whether CUDA is usable.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


Torch version: 2.6.0+cu118
CUDA available: True


In [1]:
# Authenticate this session with the Hugging Face Hub.
# login() is called without arguments, so the access token is entered
# interactively (rendered as a widget in this notebook's output).
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# ALLAM 7B

In [2]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Define model name
model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the SentencePiece-based Llama tokenizer.
allam_tokenizer = LlamaTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so generate() has a pad id.
allam_tokenizer.pad_token = allam_tokenizer.eos_token

# Use FP16 only when a CUDA device is available; otherwise let transformers
# pick the dtype stored in the checkpoint ("auto"), as half precision is
# poorly supported on CPU.
allam_dtype = torch.float16 if torch.cuda.is_available() else "auto"

# device_map="auto" lets accelerate place the weights on the available devices.
allam_model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=allam_dtype, device_map="auto")

print("Model and Tokenizer Loaded Successfully!")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Model and Tokenizer Loaded Successfully!


# Command R7B Arabic, Yehia 7B and Llama 3.1 8B

In [3]:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define model name
model_name = "CohereForAI/c4ai-command-r7b-arabic-02-2025"

# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset (None), transformers falls
# back to the credentials stored by `huggingface_hub.login()`.
# NOTE: the token that used to be embedded here has been exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Load Tokenizer
R7B_tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Reuse EOS as the padding token so generate() has a pad id.
R7B_tokenizer.pad_token = R7B_tokenizer.eos_token

# Use FP16 only when a CUDA device is available; otherwise let transformers
# pick the dtype stored in the checkpoint ("auto").
R7B_dtype = torch.float16 if torch.cuda.is_available() else "auto"

# Load Model with the same credentials as the tokenizer.
R7B_model = AutoModelForCausalLM.from_pretrained(
    model_name, token=hf_token, torch_dtype=R7B_dtype, device_map="auto"
)

print("Model and Tokenizer Loaded Successfully!")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model and Tokenizer Loaded Successfully!


In [4]:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define model name
model_name = "Navid-AI/Yehia-7B-preview"

# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset (None), transformers falls
# back to the credentials stored by `huggingface_hub.login()`.
# NOTE: the token that used to be embedded here has been exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Load Tokenizer
Yehia_tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Reuse EOS as the padding token so generate() has a pad id.
Yehia_tokenizer.pad_token = Yehia_tokenizer.eos_token

# Use FP16 only when a CUDA device is available; otherwise let transformers
# pick the dtype stored in the checkpoint ("auto").
Yehia_dtype = torch.float16 if torch.cuda.is_available() else "auto"

# Load Model with the same credentials as the tokenizer.
Yehia_model = AutoModelForCausalLM.from_pretrained(
    model_name, token=hf_token, torch_dtype=Yehia_dtype, device_map="auto"
)

print("Model and Tokenizer Loaded Successfully!")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


Model and Tokenizer Loaded Successfully!


In [35]:
import os

import torch
# AutoTokenizer is imported here explicitly: the cell uses it, and relying on an
# import from another cell breaks on a fresh kernel.
from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer

# Define model name
model_name = "meta-llama/Llama-3.1-8B"

# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset (None), transformers falls
# back to the credentials stored by `huggingface_hub.login()`.
# NOTE: the token that used to be embedded here has been exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Load Tokenizer
llam_tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Reuse EOS as the padding token so generate() has a pad id.
llam_tokenizer.pad_token = llam_tokenizer.eos_token

# Use FP16 only when a CUDA device is available; otherwise let transformers
# pick the dtype stored in the checkpoint ("auto").
llam_dtype = torch.float16 if torch.cuda.is_available() else "auto"

# Load Model with the same credentials as the tokenizer.
llam_model = LlamaForCausalLM.from_pretrained(
    model_name, token=hf_token, torch_dtype=llam_dtype, device_map="auto"
)

print("Model and Tokenizer Loaded Successfully!")

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  52%|#####1    | 2.57G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model and Tokenizer Loaded Successfully!


In [36]:
from transformers import AutoConfig

import pandas as pd
def get_model_config_info(model, model_name):
    """Return the model's configuration as a flat dict tagged with its name.

    The dict is ``model.config.to_dict()`` plus a ``"Model Name"`` entry.
    If reading the configuration raises, a two-key dict with ``"Model Name"``
    and the stringified ``"Error"`` is returned instead of propagating.
    """
    try:
        info = {**model.config.to_dict(), "Model Name": model_name}
    except Exception as err:
        info = {"Model Name": model_name, "Error": str(err)}
    return info


# Loaded models to compare, each paired with the hub id used as its row label
models = [
    (R7B_model, "CohereForAI/c4ai-command-r7b-arabic-02-2025"),
    (Yehia_model, "Navid-AI/Yehia-7B-preview"),
    (allam_model, "ALLaM-AI/ALLaM-7B-Instruct-preview"),
    (llam_model, "meta-llama/Llama-3.1-8B")
]

# One config dict per model, in the order listed above
model_info_list = [get_model_config_info(loaded, label) for loaded, label in models]

# One row per model; columns are the union of all configuration keys
df = pd.DataFrame(model_info_list)

In [37]:
# Show the DataFrame `df` as this cell's output.
df

Unnamed: 0,vocab_size,max_position_embeddings,hidden_size,logit_scale,intermediate_size,num_hidden_layers,num_attention_heads,num_key_value_heads,hidden_act,initializer_range,...,rotary_pct,use_embedding_sharing,use_gated_activation,use_parallel_block,use_parallel_embedding,Model Name,rms_norm_eps,pretraining_tp,mlp_bias,internal_version
0,256000,16384,4096,0.25,14336,32,32,8,silu,0.02,...,1.0,True,True,True,True,CohereForAI/c4ai-command-r7b-arabic-02-2025,,,,
1,64000,4096,4096,,11008,32,32,32,silu,0.006,...,,,,,,Navid-AI/Yehia-7B-preview,1e-05,1.0,False,7b-alpha-v1.27.2.25
2,64000,4096,4096,,11008,32,32,32,silu,0.006,...,,,,,,ALLaM-AI/ALLaM-7B-Instruct-preview,1e-05,1.0,False,7b-alpha-v1.27.2.25
3,128256,131072,4096,,14336,32,32,8,silu,0.02,...,,,,,,meta-llama/Llama-3.1-8B,1e-05,1.0,False,


In [1]:
from transformers import AutoTokenizer, OPTForCausalLM

# Hub id shared by the tokenizer and the model
galactica_checkpoint = "facebook/galactica-6.7b"

# Note: this cell assigns the generic names `tokenizer` and `model`,
# which other cells in this notebook also assign.
tokenizer = AutoTokenizer.from_pretrained(galactica_checkpoint)
model = OPTForCausalLM.from_pretrained(galactica_checkpoint)

# Encode a citation-style prompt as PyTorch tensors and keep only the ids
input_text = "The Transformer architecture [START_REF]"
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [21]:
# Tokenize a short arithmetic prompt and show how it is split
prompt = "3+6="
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
token_id_rows = input_ids.tolist()

print("Input prompt:", prompt)
print("Input Token IDs:", token_id_rows)
# Decode each id on its own to show the individual token strings
print("Input tokens:", [tokenizer.decode([token_id]) for token_id in token_id_rows[0]])

Input prompt: 3+6=
Input Token IDs: [[41, 33, 44, 51]]
Input tokens: ['3', '+', '6', '=']


In [22]:
# Continue the prompt with the model's default generation settings,
# then decode the first (only) returned sequence, special tokens included.
outputs = model.generate(input_ids)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence))

3+6=10\).

The \(\mathbb{Z}_{2}


: 

In [9]:
# NOTE(review): no visible cell imports ace_tools — confirm this install is still needed
# and that the PyPI package of this name is the intended one before keeping it.
%pip install ace-tools

Note: you may need to restart the kernel to use updated packages.
