<a href="https://colab.research.google.com/github/harshbopaliya/LLM/blob/main/deepseek_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

In [5]:
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU name: Tesla T4


In [12]:
MODEL_NAME = "gpt2"  # Small model that works well on T4 GPU
DATASET_NAME = "gsm8k"  # Grade School Math dataset from Hugging Face
DATASET_CONFIG = "main"  # The main configuration of GSM8K
MAX_LENGTH = 512
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
OUTPUT_DIR = "./math_model_finetuned"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
pip install dataset

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m

In [13]:
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)
print(f"Dataset loaded: {dataset}")


Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})


In [14]:
def preprocess_function(examples):
    # For GSM8K dataset, we'll format questions and answers
    texts = []
    for question, answer in zip(examples["question"], examples["answer"]):
        # Format: Question followed by answer including reasoning
        text = f"Question: {question}\nAnswer: {answer}"
        texts.append(text)

    return tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )

In [15]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal language modeling, not masked
)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [16]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,  # Use mixed precision training to reduce memory usage
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

In [19]:
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mharshalbopaliya597[0m ([33mharshalbopaliya597-ahmedabad-institute-of-technology-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.221
20,3.2853
30,3.2514
40,3.1162
50,3.1128
60,2.969
70,2.8929
80,2.795
90,2.7433
100,2.6147


TrainOutput(global_step=1401, training_loss=1.8822505349861052, metrics={'train_runtime': 1857.4022, 'train_samples_per_second': 12.07, 'train_steps_per_second': 0.754, 'total_flos': 5849283428352000.0, 'train_loss': 1.8822505349861052, 'epoch': 2.995184590690209})

In [20]:
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


Model saved to ./math_model_finetuned


In [21]:
def generate_answer(question, max_length=200):
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [22]:
test_question = "If there are 5 apples and 3 baskets, and we want to distribute the apples equally among the baskets, how many apples will be left over?"
generated_solution = generate_answer(test_question)
print(f"Test question: {test_question}")
print(f"Generated solution: {generated_solution}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Test question: If there are 5 apples and 3 baskets, and we want to distribute the apples equally among the baskets, how many apples will be left over?
Generated solution: Question: If there are 5 apples and 3 baskets, and we want to distribute the apples equally among the baskets, how many apples will be left over?
Answer: If there are 5*3 = <<5*3=10>>10 apples, then there are 10*3 = <<10*3=30>>30 apples.
We have 30*10 = <<30*10=100>>100 apples.
#### 100 apples = <<100*10=100>>100 apples are left over.
#### 100-100 = <<100-100=100>>100 apples are left.
#### 100/100 = <<100/100=10>>10 apples are left over.
#### 10 apples = <<10=10>>10 apples are left over.
#### 10/10 = <<10/10=3>>3 apples are left over.
#### 3 apples = <<3=3>>3 apples are left over.
#### 3 apples = <<3=4


In [25]:
test_question = "A bookstore received a shipment of 48 new books. The owner placed them equally on 5 display shelves, with any remaining books going into storage. How many books went into storage?"
generated_solution = generate_answer(test_question)
print(f"Test question: {test_question}")
print(f"Generated solution: {generated_solution}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Test question: A bookstore received a shipment of 48 new books. The owner placed them equally on 5 display shelves, with any remaining books going into storage. How many books went into storage?
Generated solution: Question: A bookstore received a shipment of 48 new books. The owner placed them equally on 5 display shelves, with any remaining books going into storage. How many books went into storage?
Answer: The seller received 48 new books on the 5 display shelves, so 48 * 5 = <<48*5=48>>48 books went into storage.
The remaining books were left in storage with 48 * 5 = <<48*5=80>>80 books.
#### 80 books were left in storage, so 80 - 48 = <<80-48=80>>80 books were left in storage.
#### 80 books were left in storage with 48, so 48 - 48 = <<48-48=80>>80 books were left in storage.
#### 80 books were left in storage with 80, so 80 - 80 = <<80-80=80>>80 books were left in storage.
#### 80 books were left in storage with 48, so 48 - 48 = <<48-


In [26]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [27]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)

In [28]:
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


GPU available: True
GPU name: Tesla T4
GPU memory: 15.83 GB


In [29]:
!pip install -q transformers>=4.34.0 datasets>=2.14.0 peft>=0.5.0 bitsandbytes>=0.41.0 accelerate>=0.21.0


In [33]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Smaller DeepSeek model suitable for T4
DATASET_NAME = "gsm8k"
DATASET_CONFIG = "main"
MAX_LENGTH = 512
OUTPUT_DIR = "./deepseek_math_finetuned"

In [34]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

In [35]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [36]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [37]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # Rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
)

In [38]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {model.print_trainable_parameters()}")


trainable params: 18,464,768 || all params: 1,795,552,768 || trainable%: 1.0284
Trainable parameters: None


In [39]:
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)
print(f"Dataset loaded: {dataset}")


Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})


In [40]:
def format_math_example(example):
    question = example["question"]
    answer = example["answer"]

    # Format template for DeepSeek models
    formatted_text = f"Question: {question}\n\nAnswer: {answer}"
    return formatted_text


In [42]:
def preprocess_function(examples):
    formatted_texts = [format_math_example({"question": q, "answer": a})
                       for q, a in zip(examples["question"], examples["answer"])]

    # Tokenize the texts
    tokenized_inputs = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Create labels (for causal language modeling, labels are the same as input_ids)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()

    return tokenized_inputs


In [43]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [44]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [45]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,  # Small batch size for T4
    gradient_accumulation_steps=8,  # Accumulate gradients to compensate for small batch size
    gradient_checkpointing=True,    # Save memory by using gradient checkpointing
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,                      # Use mixed precision training
    report_to="none",               # Disable wandb/tensorboard to save memory
    save_total_limit=1,             # Only keep the latest checkpoint
)

In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [47]:
print("Starting training...")
trainer.train()

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.636
20,1.4257
30,0.9752
40,0.7999
50,0.7603
60,0.7465
70,0.689
80,0.7043
90,0.7113
100,0.6895


Step,Training Loss
10,1.636
20,1.4257
30,0.9752
40,0.7999
50,0.7603
60,0.7465
70,0.689
80,0.7043
90,0.7113
100,0.6895


TrainOutput(global_step=934, training_loss=0.6039011123604131, metrics={'train_runtime': 9353.7997, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.1, 'total_flos': 7.164448163600794e+16, 'train_loss': 0.6039011123604131, 'epoch': 1.9975916510569975})

In [48]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model adapter saved to {OUTPUT_DIR}")

Model adapter saved to ./deepseek_math_finetuned


In [74]:
import shutil
shutil.make_archive("model_files", 'zip', OUTPUT_DIR)


'/content/model_files.zip'

In [79]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnHash(StoppingCriteria):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.hash_token_id = tokenizer.encode("#", add_special_tokens=False)[0]

    def __call__(self, input_ids, scores, **kwargs):
        return self.hash_token_id in input_ids[0].tolist()


In [75]:
def generate_answer(question, max_new_tokens=300):
    input_text = f"Question: {question}\n\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, return_attention_mask=True).to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [77]:
def generate_answer(question, max_new_tokens=150):
    """
    Generates an answer to the given question using the loaded language model.

    Args:
        question (str): The question to answer.
        max_new_tokens (int, optional): The maximum number of new tokens to generate.
                                         Defaults to 150.

    Returns:
        str: The generated answer.
    """
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, return_attention_mask=True).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.9,
            top_p=0.8,
            top_k=40,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [1]:
def generate_answer(question, max_new_tokens=150):
    '''
    Generates an answer to the given question using the loaded language model.

    Args:
        question (str): The question to answer.
        max_new_tokens (int, optional): The maximum number of new tokens to generate.
                                         Defaults to 150.

    Returns:
        str: The generated answer.
    '''
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, return_attention_mask=True).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.8,
            top_k=40,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [2]:
test_question = "A store sells 20 chocolates. How many **combinations (order doesn't matter)** of 3 chocolates can be selected?"
print("\nTesting model with sample question:")
print(f"Question: {test_question}")
generated_answer = generate_answer(test_question)
print(f"Generated answer: {generated_answer}")



Testing model with sample question:
Question: A store sells 20 chocolates. How many **combinations (order doesn't matter)** of 3 chocolates can be selected?


NameError: name 'tokenizer' is not defined

In [78]:
test_question = '''A medical test detects a certain disease with 99% accuracy. That means:

If a person has the disease, the test is positive 99% of the time.

If a person does not have the disease, the test is positive 5% of the time (false positive rate).

Suppose 1% of the population actually has the disease.
A person takes the test and the result is positive.

What is the probability that the person actually has the disease?'''
print("\nTesting model with sample question:")
print(f"Question: {test_question}")
generated_answer = generate_answer(test_question)
print(f"Generated answer: {generated_answer}")



Testing model with sample question:
Question: A medical test detects a certain disease with 99% accuracy. That means:

If a person has the disease, the test is positive 99% of the time.

If a person does not have the disease, the test is positive 5% of the time (false positive rate).

Suppose 1% of the population actually has the disease.
A person takes the test and the result is positive.

What is the probability that the person actually has the disease?
Generated answer: Question: A medical test detects a certain disease with 99% accuracy. That means:

If a person has the disease, the test is positive 99% of the time.

If a person does not have the disease, the test is positive 5% of the time (false positive rate).

Suppose 1% of the population actually has the disease.
A person takes the test and the result is positive.

What is the probability that the person actually has the disease?
Answer: Let P be the probability that the person actually has the disease.
Then 1-P is the probab