<a href="https://colab.research.google.com/github/konmavedant/Docker/blob/main/deepseek_R1_Latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sun Mar  2 19:40:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [3]:
# Install dependencies
!pip install transformers torch accelerate bitsandbytes pandas -q

# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import psutil
import pandas as pd
import time

# Check GPU
!nvidia-smi

# Quantization config for 4-bit loading
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Load the latest DeepSeek model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Latest model as per your link
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # Use T4 GPU
    trust_remote_code=True
)
print(f"Loaded model: {model_name}")

# Memory usage function
def check_memory_usage():
    ram = psutil.virtual_memory()
    cpu_ram_used = ram.used / 1024**3
    cpu_ram_total = ram.total / 1024**3
    gpu_mem_used = torch.cuda.memory_allocated() / 1024**3
    gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    return {
        "CPU RAM (GB)": f"{cpu_ram_used:.2f}/{cpu_ram_total:.2f}",
        "CPU RAM (%)": ram.percent,
        "GPU VRAM (GB)": f"{gpu_mem_used:.2f}/{gpu_total:.2f}",
        "GPU VRAM (%)": (gpu_mem_used / gpu_total) * 100
    }

# Inference function with context length parameter
def run_deepseek(prompt, context_length, max_new_tokens=100):
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_length)
    inputs = inputs.to("cuda")

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        elapsed_time = time.time() - start_time
        return response, elapsed_time, True
    except Exception as e:
        print(f"Error with context length {context_length}: {str(e)}")
        return None, 0, False

# Comparison function across context lengths
def compare_context_lengths(prompt, context_lengths=[2048, 4096, 8192, 16384]):
    results = []

    print("Initial memory status:")
    initial_mem = check_memory_usage()
    for k, v in initial_mem.items():
        print(f"{k}: {v}")

    for ctx_len in context_lengths:
        print(f"\nTesting context length: {ctx_len}")
        response, elapsed_time, success = run_deepseek(prompt, ctx_len)

        # Measure memory after inference
        mem_usage = check_memory_usage()

        # Store results
        result = {
            "Context Length": ctx_len,
            "Success": "Yes" if success else "No",
            "Response": response[:50] + "..." if response else "N/A",
            "Time (s)": f"{elapsed_time:.2f}",
            "CPU RAM (GB)": mem_usage["CPU RAM (GB)"],
            "CPU RAM (%)": f"{mem_usage['CPU RAM (%)']:.1f}",
            "GPU VRAM (GB)": mem_usage["GPU VRAM (GB)"],
            "GPU VRAM (%)": f"{mem_usage['GPU VRAM (%)']:.1f}"
        }
        results.append(result)

        if response:
            print(f"Response: {response}")

    # Create comparison table
    df = pd.DataFrame(results)
    print("\nComparison Table:")
    print(df)
    return df

# Test with long prompt
long_prompt = "Summarize this: " + " ".join(["The quick brown fox jumps over the lazy dog."] * 300)  # ~13,200 characters

# Run comparison
context_lengths = [2048, 4096, 8192, 16384]  # Extended range for 1.5B model
df = compare_context_lengths(long_prompt)

# Save results
df.to_csv("deepseek_context_comparison.csv", index=False)
with open("deepseek_full_response.txt", "w") as f:
    for ctx, resp in zip(df["Context Length"], df["Response"]):
        f.write(f"Context Length {ctx}:\n{resp}\n\n")

Sun Mar  2 19:43:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Loaded model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Initial memory status:
CPU RAM (GB): 3.81/12.67
CPU RAM (%): 32.6
GPU VRAM (GB): 1.57/14.74
GPU VRAM (%): 10.619557279698245

Testing context length: 2048


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

In [4]:
# Install dependencies
!pip install transformers torch accelerate bitsandbytes pandas -q

# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import psutil
import pandas as pd
import time

# Check GPU
!nvidia-smi

# Quantization config for 4-bit loading
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Load the latest DeepSeek model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # Use T4 GPU
    trust_remote_code=True
)
print(f"Loaded model: {model_name}")

# Memory usage function
def check_memory_usage():
    ram = psutil.virtual_memory()
    cpu_ram_used = ram.used / 1024**3
    cpu_ram_total = ram.total / 1024**3
    gpu_mem_used = torch.cuda.memory_allocated() / 1024**3
    gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    return {
        "CPU RAM (GB)": f"{cpu_ram_used:.2f}/{cpu_ram_total:.2f}",
        "CPU RAM (%)": ram.percent,
        "GPU VRAM (GB)": f"{gpu_mem_used:.2f}/{gpu_total:.2f}",
        "GPU VRAM (%)": (gpu_mem_used / gpu_total) * 100
    }

# Updated inference function with batch processing
def run_deepseek(prompt, context_length, max_new_tokens=100, batch_size=4):
    start_time = time.time()
    # Batch processing: repeat prompt for batch_size
    inputs = tokenizer([prompt] * batch_size, return_tensors="pt", truncation=True, max_length=context_length)
    inputs = inputs.to("cuda")

    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        # Decode only the first response (others are identical due to same prompt)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        elapsed_time = time.time() - start_time
        return response, elapsed_time, True
    except Exception as e:
        print(f"Error with context length {context_length}: {str(e)}")
        return None, 0, False

# Comparison function across context lengths
def compare_context_lengths(prompt, context_lengths=[2048, 4096, 8192, 16384, 32768]):
    results = []

    print("Initial memory status:")
    initial_mem = check_memory_usage()
    for k, v in initial_mem.items():
        print(f"{k}: {v}")

    for ctx_len in context_lengths:
        print(f"\nTesting context length: {ctx_len}")
        response, elapsed_time, success = run_deepseek(prompt, ctx_len)

        # Measure memory after inference
        mem_usage = check_memory_usage()

        # Store results
        result = {
            "Context Length": ctx_len,
            "Success": "Yes" if success else "No",
            "Response": response[:50] + "..." if response else "N/A",
            "Time (s)": f"{elapsed_time:.2f}",
            "CPU RAM (GB)": mem_usage["CPU RAM (GB)"],
            "CPU RAM (%)": f"{mem_usage['CPU RAM (%)']:.1f}",
            "GPU VRAM (GB)": mem_usage["GPU VRAM (GB)"],
            "GPU VRAM (%)": f"{mem_usage['GPU VRAM (%)']:.1f}"
        }
        results.append(result)

        if response:
            print(f"Response: {response}")

    # Create comparison table
    df = pd.DataFrame(results)
    print("\nComparison Table:")
    print(df)
    return df

# Test with very long prompt
long_prompt = "Summarize this: " + " ".join(["The quick brown fox jumps over the lazy dog."] * 1000)  # ~44,000 characters

# Run comparison
context_lengths = [2048, 4096, 8192, 16384, 32768]  # Extended range
df = compare_context_lengths(long_prompt)

# Save results
df.to_csv("deepseek_context_comparison.csv", index=False)
with open("deepseek_full_response.txt", "w") as f:
    for ctx, resp in zip(df["Context Length"], df["Response"]):
        f.write(f"Context Length {ctx}:\n{resp}\n\n")

Sun Mar  2 19:47:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P0             28W /   70W |    2162MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Loaded model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Initial memory status:
CPU RAM (GB): 5.03/12.67
CPU RAM (%): 42.3
GPU VRAM (GB): 3.14/14.74
GPU VRAM (%): 21.29873670405472

Testing context length: 2048


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

In [5]:
# Install dependencies
!pip install transformers torch accelerate bitsandbytes pandas -q

# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import psutil
import pandas as pd
import time

# Check GPU
!nvidia-smi

# Quantization config for 4-bit loading
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Load the latest DeepSeek model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # Use T4 GPU
    trust_remote_code=True
)
print(f"Loaded model: {model_name}")

# Memory usage function
def check_memory_usage():
    ram = psutil.virtual_memory()
    cpu_ram_used = ram.used / 1024**3
    cpu_ram_total = ram.total / 1024**3
    gpu_mem_used = torch.cuda.memory_allocated() / 1024**3
    gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    return {
        "CPU RAM (GB)": f"{cpu_ram_used:.2f}/{cpu_ram_total:.2f}",
        "CPU RAM (%)": ram.percent,
        "GPU VRAM (GB)": f"{gpu_mem_used:.2f}/{gpu_total:.2f}",
        "GPU VRAM (%)": (gpu_mem_used / gpu_total) * 100
    }

# Updated inference function with batch processing and enhanced error handling
def run_deepseek(prompt, context_length, max_new_tokens=200, batch_size=8):
    start_time = time.time()
    # Batch processing: repeat prompt for batch_size
    try:
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt", truncation=True, max_length=context_length)
        inputs = inputs.to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        # Decode only the first response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        elapsed_time = time.time() - start_time

        # Clear GPU memory
        torch.cuda.empty_cache()
        return response, elapsed_time, True

    except torch.cuda.OutOfMemoryError as e:
        print(f"Out of Memory Error at context length {context_length}: {str(e)}")
        return None, 0, False
    except Exception as e:
        print(f"Error with context length {context_length}: {str(e)}")
        return None, 0, False

# Comparison function across context lengths
def compare_context_lengths(prompt, context_lengths=[2048, 4096, 8192, 16384, 32768]):
    results = []

    print("Initial memory status:")
    initial_mem = check_memory_usage()
    for k, v in initial_mem.items():
        print(f"{k}: {v}")

    for ctx_len in context_lengths:
        print(f"\nTesting context length: {ctx_len}")
        response, elapsed_time, success = run_deepseek(prompt, ctx_len)

        # Measure memory after inference
        mem_usage = check_memory_usage()

        # Store results
        result = {
            "Context Length": ctx_len,
            "Success": "Yes" if success else "No",
            "Response": response[:50] + "..." if response else "N/A",
            "Time (s)": f"{elapsed_time:.2f}",
            "CPU RAM (GB)": mem_usage["CPU RAM (GB)"],
            "CPU RAM (%)": f"{mem_usage['CPU RAM (%)']:.1f}",
            "GPU VRAM (GB)": mem_usage["GPU VRAM (GB)"],
            "GPU VRAM (%)": f"{mem_usage['GPU VRAM (%)']:.1f}"
        }
        results.append(result)

        if response:
            print(f"Response: {response}")

    # Create comparison table
    df = pd.DataFrame(results)
    print("\nComparison Table:")
    print(df)
    return df

# Test with very long prompt
long_prompt = "Summarize this: " + " ".join(["The quick brown fox jumps over the lazy dog."] * 1000)  # ~44,000 characters

# Run comparison
context_lengths = [2048, 4096, 8192, 16384, 32768]  # Extended range
df = compare_context_lengths(long_prompt)

# Save results
df.to_csv("deepseek_context_comparison.csv", index=False)
with open("deepseek_full_response.txt", "w") as f:
    for ctx, resp in zip(df["Context Length"], df["Response"]):
        f.write(f"Context Length {ctx}:\n{resp}\n\n")

# Final memory check
print("\nFinal memory status after all runs:")
final_mem = check_memory_usage()
for k, v in final_mem.items():
    print(f"{k}: {v}")

Sun Mar  2 19:50:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   69C    P0             30W /   70W |    8218MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Response: Summarize this: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps ove