In [None]:
from huggingface_hub import notebook_login,snapshot_download
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.2.tar.gz (65.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.2-cp310-cp310-linux_x86_64.whl size=3410028 sha256=9257f3c1f466c80589122

## Log in to Hugging Face


In [None]:
# Display the Hugging Face login widget so the session is authenticated
# before the Hub download and upload cells further down.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Fetch a full snapshot of the base 1B instruct checkpoint from the Hub
# into a local folder; the conversion cell reads it from there.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
path_model = "./downloaded_models"
snapshot_download(
    repo_id=model_name,
    local_dir=path_model,
)

## Clone the llama.cpp repository

In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 37919, done.[K
remote: Counting objects: 100% (10886/10886), done.[K
remote: Compressing objects: 100% (676/676), done.[K
remote: Total 37919 (delta 10592), reused 10251 (delta 10209), pack-reused 27033 (from 1)[K
Receiving objects: 100% (37919/37919), 61.46 MiB | 19.99 MiB/s, done.
Resolving deltas: 100% (27659/27659), done.


## Convert the Llama model to GGUF format

In [None]:
# Convert the Hugging Face checkpoint stored in ./downloaded_models into a single GGUF file
# under ./gguf_models; that file is the input of the llama-quantize cells.
!python llama.cpp/convert_hf_to_gguf.py ./downloaded_models --outfile ./gguf_models/Llama-3.2-1B-Instruct.gguf

INFO:hf-to-gguf:Loading model: downloaded_models
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {2048, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {8192, 2048}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = {2048, 512}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.bfloat16 --> F16, shape = {2048, 20

### Details of the long command below

1. **Create Build Directory**: `!mkdir llama.cpp/build` creates a separate directory for the build process to keep the source code folder clean.
2. **Navigate to Build Directory**: `cd llama.cpp/build` changes the current directory to the newly created build folder.
3. **Configure Build System**: `cmake ..` configures the build system by reading the `CMakeLists.txt` file from the parent directory (llama.cpp).
4. **Build the Project**: `cmake --build . --config Release` compiles the code in "Release" mode, optimizing it for performance.

In [None]:
!mkdir llama.cpp/build && cd llama.cpp/build && cmake .. && cmake --build . --config Release

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- OpenMP found
-- Using llamafile
-- x86 detected
-- Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels


1. **Changes directory**: The command navigates to the `llama.cpp/build/bin` folder where the `llama-quantize` tool is located.
2. **Quantizes the model**: It uses `llama-quantize` to reduce the precision of the LLaMA model (`Llama-3.2-1B-Instruct.gguf`) using the `Q4_K_M` quantization method.
3. **Saves quantized model**: The resulting quantized model is saved as `Llama-3.2-1B-Instruct-Q4_K_M.gguf`.

In [None]:
!cd llama.cpp/build/bin && ./llama-quantize /content/gguf_models/Llama-3.2-1B-Instruct.gguf /content/gguf_models/Llama-3.2-1B-Instruct-Q4_K_M.gguf Q4_K_M

main: build = 4132 (3ee6382d)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/content/gguf_models/Llama-3.2-1B-Instruct.gguf' to '/content/gguf_models/Llama-3.2-1B-Instruct-Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from /content/gguf_models/Llama-3.2-1B-Instruct.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Downloaded_Models
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                            general.license str              = llam

In [None]:
from llama_cpp import Llama

# Load the Q4_K_M GGUF written by the quantization step into a llama-cpp-python model.
quantized_model_path = "./gguf_models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
llama = Llama(model_path=quantized_model_path)

llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from ./gguf_models/Llama-3.2-1B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Downloaded_Models
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                            general.license str              = llama3.2
llama_model_loader: - kv   5:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   6:                          general.languages arr[str,8]       = ["en", "de", "fr

# Testing the quantized model

In [None]:
import contextlib
import os

prompt = "explain what is docker"

# top_k=1 restricts sampling to the single most likely token;
# stream=True makes the call yield the completion chunk by chunk.
generation_kwargs = {
    "max_tokens": 10000,
    "top_k": 1,
    "stream": True,
}

# The recorded output of this cell starts with the continuation rather than the prompt,
# i.e. "echo" has no visible effect while streaming. Print the prompt explicitly instead.
print(prompt, end="", flush=True)

# Silence the library's stderr logging only while generating. The redirect is scoped to
# this block, so the devnull handle is closed afterwards and stderr keeps working in later
# cells (assigning to sys.stderr would hide every later message for the whole session).
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    response = llama(prompt, **generation_kwargs)
    for token in response:
        print(token["choices"][0]["text"], end="", flush=True)

ize and how it is used in the industry?
Dockerize is a process of packaging and deploying applications in a containerized environment. It involves creating a container that can run an application, and then deploying it to a production environment. Docker is a popular containerization platform that allows developers to create, manage, and orchestrate containers.

Here's a step-by-step explanation of the Dockerize process:

1. **Create a Dockerfile**: A Dockerfile is a text file that contains instructions for creating a Docker image. It specifies the base image, dependencies, and commands to build the application.
2. **Build the Docker image**: The Dockerfile is used to build a Docker image by running the instructions in the file. The image is created based on the specifications in the Dockerfile.
3. **Tag the Docker image**: A Docker image is a snapshot of the application at a specific point in time. It's typically tagged with a version number, such as `1.0.0`.
4. **Push the Docker imag

In [None]:
from huggingface_hub import HfApi

api = HfApi()

model_id = "kalyan99/Llama-3.2-1B-Instruct-Q4_K_M.gguf"

# exist_ok=True keeps the cell re-runnable: with exist_ok=False a second run stops with an
# error because the repository was already created by the first run.
api.create_repo(model_id, exist_ok=True, repo_type="model")

# Upload the quantized file from the same relative ./gguf_models folder the loading cell uses.
api.upload_file(
    path_or_fileobj="./gguf_models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    path_in_repo="Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    repo_id=model_id,
)

Llama-3.2-1B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kalyan99/Llama-3.2-1B-Instruct-Q4_K_M.gguf/commit/13259e7bc12f05c5104984d3611a32b6af1ef301', commit_message='Upload Llama-3.2-1B-Instruct-Q4_K_M.gguf with huggingface_hub', commit_description='', oid='13259e7bc12f05c5104984d3611a32b6af1ef301', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kalyan99/Llama-3.2-1B-Instruct-Q4_K_M.gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='kalyan99/Llama-3.2-1B-Instruct-Q4_K_M.gguf'), pr_revision=None, pr_num=None)

## Repeating the same steps to quantize to 2-bit (Q2_K)

In [None]:
!cd llama.cpp/build/bin && ./llama-quantize /content/gguf_models/Llama-3.2-1B-Instruct.gguf /content/gguf_models/Llama-3.2-1B-Instruct-Q2_K.gguf Q2_K

main: build = 4132 (3ee6382d)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/content/gguf_models/Llama-3.2-1B-Instruct.gguf' to '/content/gguf_models/Llama-3.2-1B-Instruct-Q2_K.gguf' as Q2_K
llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from /content/gguf_models/Llama-3.2-1B-Instruct.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Downloaded_Models
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                            general.license str              = llama3.2

In [None]:
# Load the Q2_K GGUF under its own name (`llama_q2`), separate from the Q4_K_M model.
# `Llama` is the class imported in the earlier Q4_K_M loading cell.
quantized_model_path2 = "./gguf_models/Llama-3.2-1B-Instruct-Q2_K.gguf"
llama_q2 = Llama(model_path=quantized_model_path2)

In [None]:
import contextlib
import os

prompt = "explain what is docker"

# top_k=1 restricts sampling to the single most likely token;
# stream=True makes the call yield the completion chunk by chunk.
generation_kwargs = {
    "max_tokens": 1500,
    "top_k": 1,
    "stream": True,
}

# The recorded output of this cell starts with the continuation rather than the prompt,
# i.e. "echo" has no visible effect while streaming. Print the prompt explicitly instead.
print(prompt, end="", flush=True)

# Silence the library's stderr logging only while generating. The redirect is scoped to
# this block, so the devnull handle is closed afterwards and stderr keeps working in later
# cells (assigning to sys.stderr would hide every later message for the whole session).
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    response = llama_q2(prompt, **generation_kwargs)
    for token in response:
        print(token["choices"][0]["text"], end="", flush=True)

, Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allows you to package and deploy your applications and services in the cloud. Docker is a containerization system that allow

# Uploading to the repo

In [None]:
from huggingface_hub import HfApi

api2 = HfApi()

model_id = "kalyan99/Llama-3.2-1B-Instruct-Q2_K.gguf"

# exist_ok=True keeps the cell re-runnable: with exist_ok=False a second run stops with an
# error because the repository was already created by the first run.
api2.create_repo(model_id, exist_ok=True, repo_type="model")

# Upload the quantized file from the same relative ./gguf_models folder the loading cell uses.
api2.upload_file(
    path_or_fileobj="./gguf_models/Llama-3.2-1B-Instruct-Q2_K.gguf",
    path_in_repo="Llama-3.2-1B-Instruct-Q2_K.gguf",
    repo_id=model_id,
)

Llama-3.2-1B-Instruct-Q2_K.gguf:   0%|          | 0.00/581M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kalyan99/Llama-3.2-1B-Instruct-Q2_K.gguf/commit/97f5c77ae242995250cf7ace689034a2ce11b0b3', commit_message='Upload Llama-3.2-1B-Instruct-Q2_K.gguf with huggingface_hub', commit_description='', oid='97f5c77ae242995250cf7ace689034a2ce11b0b3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kalyan99/Llama-3.2-1B-Instruct-Q2_K.gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='kalyan99/Llama-3.2-1B-Instruct-Q2_K.gguf'), pr_revision=None, pr_num=None)

# Repeating the same steps for the Llama-3.2-3B model

In [None]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
path_model = "./downloaded_models"
snapshot_download(repo_id=model_name, local_dir=path_model)


!python llama.cpp/convert_hf_to_gguf.py ./downloaded_models --outfile ./gguf_models/Llama-3.2-3B-Instruct.gguf

!cd llama.cpp/build/bin && ./llama-quantize /content/gguf_models/Llama-3.2-3B-Instruct.gguf /content/gguf_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf Q4_K_M



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/41.7k [00:00<?, ?B/s]

LICENSE.txt:   0%|          | 0.00/7.71k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

consolidated.00.pth:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

original/params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

original/orig_params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

INFO:hf-to-gguf:Loading model: downloaded_models
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = 

In [None]:
!python llama.cpp/convert_hf_to_gguf.py ./downloaded_models --outfile ./gguf_models/Llama-3.2-3B-Instruct.gguf

!cd llama.cpp/build/bin && ./llama-quantize /content/gguf_models/Llama-3.2-3B-Instruct.gguf /content/gguf_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf Q4_K_M


INFO:hf-to-gguf:Loading model: downloaded_models
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = 

In [None]:
import contextlib
import os

from llama_cpp import Llama

# Load the quantized 3B model; this rebinds `llama` and `quantized_model_path`.
quantized_model_path = "./gguf_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
llama = Llama(model_path=quantized_model_path)

prompt = "explain what is docker"

# top_k=1 restricts sampling to the single most likely token;
# stream=True makes the call yield the completion chunk by chunk.
generation_kwargs = {
    "max_tokens": 50,
    "top_k": 1,
    "stream": True,
}

# The streamed outputs recorded earlier in this notebook do not start with the prompt,
# i.e. "echo" has no visible effect while streaming. Print the prompt explicitly instead.
print(prompt, end="", flush=True)

# Silence the library's stderr logging only while generating. The redirect is scoped to
# this block, so the devnull handle is closed afterwards and stderr keeps working in later
# cells (assigning to sys.stderr would hide every later message for the whole session).
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    response = llama(prompt, **generation_kwargs)
    for token in response:
        print(token["choices"][0]["text"], end="", flush=True)


In [None]:
from huggingface_hub import HfApi

api2 = HfApi()

model_id = "kalyan99/Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# exist_ok=True keeps the cell re-runnable: with exist_ok=False a second run stops with an
# error because the repository was already created by the first run.
api2.create_repo(model_id, exist_ok=True, repo_type="model")

# The file name inside the repo must name the 3B model. It previously said "1B" (copied from
# the 1B upload cell), which stored the 3B weights under a misleading name.
api2.upload_file(
    path_or_fileobj="./gguf_models/Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    path_in_repo="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    repo_id=model_id,
)

Llama-3.2-3B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kalyan99/Llama-3.2-3B-Instruct-Q4_K_M.gguf/commit/7f311ac4401b5770f6ddc025515e36af1b7a2b45', commit_message='Upload Llama-3.2-1B-Instruct-Q4_K_M.gguf with huggingface_hub', commit_description='', oid='7f311ac4401b5770f6ddc025515e36af1b7a2b45', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kalyan99/Llama-3.2-3B-Instruct-Q4_K_M.gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='kalyan99/Llama-3.2-3B-Instruct-Q4_K_M.gguf'), pr_revision=None, pr_num=None)

# Llama 3.1 8B


In [None]:
# Fetch a snapshot of the 8B instruct checkpoint from the Hub into its own local folder.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
path_model = "./d9"
snapshot_download(
    repo_id=model_name,
    local_dir=path_model,
)


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

consolidated.00.pth:  39%|###8      | 6.25G/16.1G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
!python llama.cpp/convert_hf_to_gguf.py ./downloaded_models1 --outfile ./gguf_models/Llama-3.1-8B-Instruct.gguf

!cd llama.cpp/build/bin && ./llama-quantize /content/gguf_models/Llama-3.1-8B-Instruct.gguf /content/gguf_models/Llama-3.1-8B-Instruct-Q4_K_M.gguf Q4_K_M


INFO:hf-to-gguf:Loading model: downloaded_models1
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:Set meta model
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 131072
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 500000.0
INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05
INFO:hf-to-gguf:gguf: file type = 1
INFO:hf-to-gguf:Set model tokenizer
Traceback (most recent call last):
  File "/content/llama.cpp/convert_hf_to_gguf.py", line 1525, in set_vocab
    self._set_vocab_sentencepiece()
  File "/content/llama.cpp/convert_hf_to_gguf.py", line 748, in _set_vocab_sentencepiece
    tokens, scores, toktypes = self._create_vocab_sentencepiece()
  File

In [None]:
import contextlib
import os

from llama_cpp import Llama

# Load the quantized 8B model; this rebinds `llama` and `quantized_model_path`.
# The file only exists if the conversion and quantization cell above completed successfully.
quantized_model_path = "./gguf_models/Llama-3.1-8B-Instruct-Q4_K_M.gguf"
llama = Llama(model_path=quantized_model_path)

prompt = "explain what is docker"

# top_k=1 restricts sampling to the single most likely token;
# stream=True makes the call yield the completion chunk by chunk.
generation_kwargs = {
    "max_tokens": 50,
    "top_k": 1,
    "stream": True,
}

# The streamed outputs recorded earlier in this notebook do not start with the prompt,
# i.e. "echo" has no visible effect while streaming. Print the prompt explicitly instead.
print(prompt, end="", flush=True)

# Silence the library's stderr logging only while generating. The redirect is scoped to
# this block, so the devnull handle is closed afterwards and stderr keeps working in later
# cells (assigning to sys.stderr would hide every later message for the whole session).
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    response = llama(prompt, **generation_kwargs)
    for token in response:
        print(token["choices"][0]["text"], end="", flush=True)

ValueError: Model path does not exist: ./gguf_models/Llama-3.1-8B-Instruct-Q4_K_M.gguf

In [None]:
from huggingface_hub import HfApi

api2 = HfApi()

model_id = "kalyan99/Llama-3.1-8B-Instruct-Q4_K_M.gguf"

# exist_ok=True keeps the cell re-runnable: with exist_ok=False a second run stops with an
# error once the repository has been created.
api2.create_repo(model_id, exist_ok=True, repo_type="model")

# Upload the quantized file from the same relative ./gguf_models folder the loading cell uses.
api2.upload_file(
    path_or_fileobj="./gguf_models/Llama-3.1-8B-Instruct-Q4_K_M.gguf",
    path_in_repo="Llama-3.1-8B-Instruct-Q4_K_M.gguf",
    repo_id=model_id,
)