# Welcome to the Modal KoboldCpp notebook!

## This is an unofficial notebook for running local models on Modal. It's not perfect, but it works.

## Recommended Compute Profile:
- CPU: 2 cores
- RAM: 2 GB
- GPU: based on the model size (e.g. for Valkyrie v2 with Q6_K_L quants and 16K context, you can use an Nvidia L40S)
- You can use less CPU and RAM (e.g. 1 core and 512 MB) if you really want to save credits

## What models can I run?
- Almost anything.

## So I can run DeepSeek?
- Technically, yes. But it would be expensive, and I don't think Modal's free credits would last long enough for a roleplay session with DeepSeek (most of your compute time would be spent downloading and loading the model anyway).

## So what *exactly* can I run?
- With free credits, any model below 100B parameters should run comfortably for a moderate amount of time here.
- You can use this [Hugging Face VRAM calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator) to see how much VRAM a model needs at a given context size and quantization. If you're very unsure, just use an H100 or H200.

In [None]:
# Install the necessary dependencies and KoboldCpp.
# json is used further down in this cell to serialize the selected instruct preset.
import json
!echo Downloading KoboldCpp, please wait...
# Download to a temporary name and rename only if wget exits successfully,
# so a failed download does not leave a file named koboldcpp_linux behind.
!wget -O dlfile.tmp https://kcpplinux.concedo.workers.dev && mv dlfile.tmp koboldcpp_linux
# Report whether the renamed binary exists.
!test -f koboldcpp_linux && echo Download Successful || echo Download Failed
# Mark the binary executable so the run cells can start it as ./koboldcpp_linux.
!chmod +x ./koboldcpp_linux
# aria2 provides the aria2c command used by the model download cells.
!apt update
!apt install aria2 -y

# Change the model link to be any model link from huggingface (.gguf only)
# NOTE: the download cell assigns `modellink` again, so the value set there is
# the one that actually gets downloaded.
modellink = "https://huggingface.co/bartowski/invisietch_L3.3-Ignition-v0.1-70B-GGUF/resolve/main/invisietch_L3.3-Ignition-v0.1-70B-Q6_K/invisietch_L3.3-Ignition-v0.1-70B-Q6_K-00001-of-00002.gguf?download=true"
Context = "16384" # Change the model's maximum context size (passed to --contextsize in the run cells)
Layers = "99" # Change the number of layers to offload to GPU (0 for CPU only); passed to --gpulayers
Instruct_Preset = "llama-3" # Change this to match your model's instruction format
# Supported instructs:
# alpaca,
# vicuna,
# llama-3,
# chatml,
# command-r,
# mistral,
# metharme,
# gemma2
# use "custom" if you wish to use a custom one

# If your instruct preset is not present, you can make/paste a custom one.
# Edit the custom instruct preset here; it uses the same six fields as the
# built-in presets (start/end text for the system, user and assistant roles).
custom_instruct = {
    "custom": {
        "system_start": "",
        "system_end": "",
        "user_start": "",
        "user_end": "",
        "assistant_start": "",
        "assistant_end": "",
    }
}

# Built-in instruct presets, keyed by the names accepted in Instruct_Preset.
# Every preset defines the same six fields: the text emitted before
# ("*_start") and after ("*_end") a system, user or assistant message.
# The selected preset is written to instruct.json and handed to KoboldCpp
# through --chatcompletionsadapter.
premade_instruct = {
    "alpaca": {
        "system_start": "\n### Input: ",
        "system_end": "",
        "user_start": "\n### Instruction: ",
        "user_end": "",
        "assistant_start": "\n### Response: ",
        "assistant_end": "",
    },
    "vicuna": {
        "system_start": "\nSYSTEM: ",
        "system_end": "",
        "user_start": "\nUSER: ",
        "user_end": "",
        "assistant_start": "\nASSISTANT: ",
        "assistant_end": "",
    },
    "llama-3": {
        "system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
        "system_end": "<|eot_id|>",
        # No leading <|eot_id|> here: the preceding turn's "*_end" already
        # closes it, so a prefix would emit the token twice in a row (and a
        # stray one at the very start when there is no system message).
        "user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
        "user_end": "<|eot_id|>",
        "assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
        "assistant_end": "<|eot_id|>",
    },
    "chatml": {
        # ChatML separates the role header from the message body, and one
        # turn from the next, with a newline.
        "system_start": "<|im_start|>system\n",
        "system_end": "<|im_end|>\n",
        "user_start": "<|im_start|>user\n",
        "user_end": "<|im_end|>\n",
        "assistant_start": "<|im_start|>assistant\n",
        "assistant_end": "<|im_end|>\n",
    },
    "command-r": {
        "system_start": "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
        "system_end": "<|END_OF_TURN_TOKEN|>",
        "user_start": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
        "user_end": "<|END_OF_TURN_TOKEN|>",
        "assistant_start": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
        "assistant_end": "<|END_OF_TURN_TOKEN|>",
    },
    "mistral": {
        "system_start": "",
        "system_end": "",
        "user_start": "[INST] ",
        "user_end": "",
        "assistant_start": " [/INST]",
        "assistant_end": "</s> ",
    },
    "gemma2": {
        "system_start": "<start_of_turn>system\n",
        "system_end": "<end_of_turn>\n",
        "user_start": "<start_of_turn>user\n",
        "user_end": "<end_of_turn>\n",
        "assistant_start": "<start_of_turn>model\n",
        "assistant_end": "<end_of_turn>\n",
    },
    "metharme": {
        "system_start": "<|system|>",
        "system_end": "",
        "user_start": "<|user|>",
        "user_end": "",
        # The role tag is "<|model|>"; the closing "|" was missing.
        "assistant_start": "<|model|>",
        "assistant_end": "",
    },
}

# Create the instruct file that the "Run with instruct preset" cell passes to
# --chatcompletionsadapter.
# The custom preset is merged in after the built-in ones so that
# Instruct_Preset = "custom" resolves to custom_instruct["custom"] instead of
# failing with a KeyError on premade_instruct.
available_instruct = {**premade_instruct, **custom_instruct}
if Instruct_Preset not in available_instruct:
    raise ValueError(
        f"Unknown Instruct_Preset {Instruct_Preset!r}; "
        f"choose one of: {', '.join(sorted(available_instruct))}"
    )
with open("instruct.json", "w", encoding="utf-8") as f:
    # Compact separators keep the adapter file free of extra whitespace.
    f.write(json.dumps(available_instruct[Instruct_Preset], separators=(",", ":")))

In [None]:
# Download the model (or the first model part)
modellink = "https://huggingface.co/bartowski/Sao10K_Llama-3.3-70B-Vulpecula-r1-GGUF/resolve/main/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q6_K/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q6_K-00001-of-00002.gguf?download=true"
modelName = modellink.split('/')[-1].split('.')[0]
!aria2c -x 16 -s 16 -k 1M -o model-00001-of-00002.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $modellink


In [None]:
# Download the second part of the model (if any)
modellink_2 = "https://huggingface.co/bartowski/Sao10K_Llama-3.3-70B-Vulpecula-r1-GGUF/resolve/main/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q6_K/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q6_K-00002-of-00002.gguf?download=true"
!aria2c -x 16 -s 16 -k 1M -o model-00002-of-00002.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $modellink_2


In [None]:
# Run without instruct preset.
# Requires the earlier cells to have run: Layers and Context come from the
# configuration cell and modelName from the download cell. Only the first part
# file is named on the command line.
!./koboldcpp_linux model-00001-of-00002.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $Context --flashattention --hordemodelname $modelName --quiet --remotetunnel

In [None]:
# Run with instruct preset.
# Same command as the run without a preset, plus --chatcompletionsadapter
# pointing at the instruct.json file written by the first code cell.
# Requires Layers, Context and modelName to be defined by the earlier cells.
!./koboldcpp_linux model-00001-of-00002.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $Context --flashattention --hordemodelname $modelName --quiet --remotetunnel --chatcompletionsadapter instruct.json