# Run an experiment mixing StreamingLLM and Quantization
- [x] Run StreamingLLM
- [ ] Add Quantization to it

In [1]:
from dataclasses import dataclass
from contextlib import contextmanager

import torch
from torch import nn
from transformers import pipeline

from kvpress import BasePress, KnormPress, ScorerPress

In [2]:
# Load pipeline

device = "cuda:0"
ckpt = "Qwen/Qwen2.5-1.5B-Instruct"
attn_implementation = "flash_attention_2"
pipe = pipeline("kv-press-text-generation", model=ckpt, device=device, torch_dtype="auto", model_kwargs={"attn_implementation":attn_implementation})

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Device set to use cuda:0


In [3]:
# Load data

context = "In this step-by-step guide, you will learn how to create a new press in kvpress !"
question = "\nWhat is the purpose of this guide?"
tokens = pipe.tokenizer(context, return_tensors="pt").to(device)

In [4]:
compression_ratio = 0.6
press = KnormPress(compression_ratio)

with torch.no_grad():
    outputs_without_press = pipe.model(**tokens, output_hidden_states=True)

with torch.no_grad(), press(pipe.model):
    output_with_press = pipe.model(**tokens)

print(f"Cache shape w/o press: {outputs_without_press.past_key_values[0][0].shape}")
print(f"Cache shape w/ press:  {output_with_press.past_key_values[0][0].shape}\n")

# The `KVPressTextGenerationPipeline` simply applies the `press` as above on the context tokens (see `_forward` method for more details).
print(pipe(context, question=question, press=press)["answer"])

Cache shape w/o press: torch.Size([1, 2, 20, 128])
Cache shape w/ press:  torch.Size([1, 2, 8, 128])

The purpose of this guide is to provide step-by-step instructions on how to create a new press in KV (Kubernetes Versioning) version 1.18. The guide covers the necessary steps to set up a new press, including creating a


In [5]:
from transformers import QuantizedCacheConfig, QuantoQuantizedCache

config = QuantizedCacheConfig(nbits=4)
cache = QuantoQuantizedCache(config)

In [7]:
print(pipe(context, question=question, press=press, cache=cache)["answer"])



!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Your compiler (/opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++) is not compatible with the compiler Pytorch was
built with for this platform, which is g++ on linux. Please
use g++ to to compile your extension. Alternatively, you may
compile PyTorch from source using /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++, and then you can also use
/opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++ to compile your extension.

See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
with compiling PyTorch from source.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


RuntimeError: Error building extension 'quanto_cuda': [1/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gptq_marlin_repack.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cu -o gptq_marlin_repack.cuda.o 
[31mFAILED: [0mgptq_marlin_repack.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gptq_marlin_repack.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cu -o gptq_marlin_repack.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[2/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gemm_cuda.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.cu -o gemm_cuda.cuda.o 
[31mFAILED: [0mgemm_cuda.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gemm_cuda.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.cu -o gemm_cuda.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[3/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gemv_cuda.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.cu -o gemv_cuda.cuda.o 
[31mFAILED: [0mgemv_cuda.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output gemv_cuda.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.cu -o gemv_cuda.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[4/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output fp8_marlin.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cu -o fp8_marlin.cuda.o 
[31mFAILED: [0mfp8_marlin.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output fp8_marlin.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cu -o fp8_marlin.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[5/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output marlin_cuda_kernel.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu -o marlin_cuda_kernel.cuda.o 
[31mFAILED: [0mmarlin_cuda_kernel.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output marlin_cuda_kernel.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu -o marlin_cuda_kernel.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[6/9] /opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output unpack.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/unpack.cu -o unpack.cuda.o 
[31mFAILED: [0munpack.cuda.o 
/opt/packages/cuda/v12.6.1/bin/nvcc --generate-dependencies-with-compile --dependency-output unpack.cuda.o.d -ccbin /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' --expt-extended-lambda --use_fast_math -DQUANTO_CUDA_ARCH=900 -std=c++17 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/unpack.cu -o unpack.cuda.o 
nvc-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
nvcc fatal   : Unsupported NVHPC compiler found. nvc++ is the only NVHPC compiler that is supported.
[7/9] /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++ -MMD -MF marlin_cuda.o.d -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -g -O3 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp -o marlin_cuda.o 
[31mFAILED: [0mmarlin_cuda.o 
/opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++ -MMD -MF marlin_cuda.o.d -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -g -O3 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp -o marlin_cuda.o 
nvc++-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
"/ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/c10/util/C++17.h", line 13: catastrophic error: #error directive: "You're trying to build PyTorch with a too old version of GCC. We need GCC 9 or later."
  #error \
   ^

1 catastrophic error detected in the compilation of "/ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp".
Compilation terminated.
[8/9] /opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++ -MMD -MF pybind_module.o.d -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -g -O3 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/pybind_module.cpp -o pybind_module.o 
[31mFAILED: [0mpybind_module.o 
/opt/packages/nvidia/hpc_sdk//Linux_x86_64/22.9/compilers/bin/nvc++ -MMD -MF pybind_module.o.d -DTORCH_EXTENSION_NAME=quanto_cuda -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/torch/csrc/api/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/TH -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/THC -isystem /opt/packages/cuda/v12.6.1/include -isystem /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/include/python3.13 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -g -O3 -c /ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/pybind_module.cpp -o pybind_module.o 
nvc++-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
"/ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/torch/include/c10/util/C++17.h", line 13: catastrophic error: #error directive: "You're trying to build PyTorch with a too old version of GCC. We need GCC 9 or later."
  #error \
   ^

1 catastrophic error detected in the compilation of "/ocean/projects/cis240042p/hhirairi/.conda/envs/kvpress/lib/python3.13/site-packages/optimum/quanto/library/extensions/cuda/pybind_module.cpp".
Compilation terminated.
ninja: build stopped: subcommand failed.
