In [1]:
import os 
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, pipeline
from transformers.cache_utils import DynamicCache
import time
import json 
import gc
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
import random
from sentence_transformers import SentenceTransformer, util
#from bert_score import BERTScorer
import evaluate
score = evaluate.load("bertscore", config="roberta-base")

In [2]:
"""
# 4-bit quantization to speed up inference and reduce memory
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# tokenizer and model
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name) #, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map='auto'
)
"""

'\n# Configure 4-bit quantization to speed up inference and reduce memory usage.\nquant_config = BitsAndBytesConfig(\n    load_in_4bit=True,\n    bnb_4bit_use_double_quant=True,\n    bnb_4bit_quant_type="nf4",\n    bnb_4bit_compute_dtype=torch.bfloat16\n)\n\n# Load the tokenizer and model with the optimized quantization configuration.\nmodel_name = "microsoft/Phi-3.5-mini-instruct"\ntokenizer = AutoTokenizer.from_pretrained(model_name) #, token=access_token)\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_name,\n    quantization_config=quant_config,\n    device_map=\'auto\'\n)\n'

In [3]:
"""content = []
with open("json_input_data.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

for page in data:
    for sec in page.get('sections', []):  # Avoids errors if 'sections' key is missing
        content.append(sec.get("section_content", "").replace("\r\n", " ").replace("\n", " ").strip())  # Strip whitespace

print(f"Total sections collected: {len(content)}")

# Write to file
with open('all_document.txt', 'w', encoding="utf-8") as f:
    for i, line in enumerate(content, 1):
        f.write("%s\n" % line)
        if line.strip() == "":
            print(f"Warning: Empty content at line {i}")  # Debug empty lines

# Count actual written lines
with open('all_document.txt', 'r', encoding="utf-8") as f:
    written_lines = sum(1 for _ in f)

print(f"Lines written in all_document.txt: {written_lines}")"""



In [4]:
"""section_text = []
with open("all_document.txt", "r", encoding="utf-8") as file:
    section_text = file.read().split('\n')
for x, y in zip(section_text,content):
    if x!=y:
        print(x)
        print(y)
        break"""

'section_text = []\nwith open("all_document.txt", "r", encoding="utf-8") as file:\n    section_text = file.read().split(\'\n\')\nfor x, y in zip(section_text,content):\n    if x!=y:\n        print(x)\n        print(y)\n        break'

In [5]:
"""
with open("document.txt", "r", encoding="utf-8") as file:
    document_text = file.read()"""

'# Read the context document used for knowledge.\nwith open("document.txt", "r", encoding="utf-8") as file:\n    document_text = file.read()'

In [6]:
def prepare_system_prompt(doc_text, instruction=None):  
    # delimiters and headers for PHI.
    prompt = f"""
    <|system|>
    You are a concise assistant. Provide a single, clear answer to the User Query below using only the provided OGS Website Content. 
    Do not include extra commentary, repeated phrases, or any information beyond what is in the OGS Website Content. 
    **If the answer is not available in the OGS Website Content, reply with "Information not available." **
    Do not include this language if the information is available. 
    When your answer is complete, immediately append <|endoftext|> with no additional text.
    OGS Website Content:
    {"\n".join(doc_text)}
    <|end|>
    <|user|>
    """.strip()
    return prompt

In [7]:
def build_kv_cache(prompt):
    """
    generates a key-value cache  
    """
    # device assignment
    device = model.model.embed_tokens.weight.device
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    # initialize dynamic cache to store key-value pairs.
    kv_cache = DynamicCache()
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            past_key_values=kv_cache,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=False
        )
    # get the sequence length of the cached keys.
    cache_length = outputs.past_key_values.key_cache[0].shape[-2]
    return outputs.past_key_values, cache_length

In [8]:
def trim_kv_cache(cache, target_length):
    """
    trims kv cache so that only the original doc sequence remains.
    """
    for idx in range(len(cache.key_cache)):
        cache.key_cache[idx] = cache.key_cache[idx][:, :, :target_length, :]
        cache.value_cache[idx] = cache.value_cache[idx][:, :, :target_length, :]

In [9]:
def generate_response(input_ids, kv_cache, max_tokens=200):
    """
    greedy decoding with the provided KV cache to generate a response.
    """
    # device assignment
    device = model.model.embed_tokens.weight.device
    input_ids = input_ids.to(device)
    generated_tokens = input_ids.clone()
    current_token = input_ids

    with torch.no_grad():
        for _ in range(max_tokens):
            outputs = model(
                input_ids=current_token,
                past_key_values=kv_cache,
                use_cache=True
            )
            # get logits for the last token and select the most probable next token.
            next_logits = outputs.logits[:, -1, :]
            current_token = torch.argmax(next_logits, dim=-1, keepdim=True)
            # update cache for the next iteration.
            kv_cache = outputs.past_key_values
            # append new token to the generated sequence.
            generated_tokens = torch.cat([generated_tokens, current_token], dim=1)
            # if end-of-sequence token, stop generation.
            if model.config.eos_token_id is not None and current_token.item() == model.config.eos_token_id:
                break

    # Return only the tokens generated after the initial query.
    return generated_tokens[:, input_ids.shape[-1]:]

In [10]:
#counts = [len(doc) for doc in content]
#sum(counts)/len(counts)

In [11]:
#print(len(content))
#len(content[0])

In [12]:
#document_text = content[:20]
# Build the system prompt and create the knowledge cache.
#system_prompt = prepare_system_prompt(document_text)
#knowledge_cache, orig_cache_len = build_kv_cache(system_prompt)
#print("Initial KV cache length:", orig_cache_len)

In [13]:
def query_response(question, tokenizer, model, knowledge_cache, orig_cache_len):
    # append end of prompt tokens and encode query
    query = question + "<|end|>\n<|assistant|>\n" 
    query_ids = tokenizer.encode(query, return_tensors="pt").to(model.device)
    # call generate response and decode
    response_ids = generate_response(query_ids, knowledge_cache)
    response_text = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    # trim cache back to original length
    trim_kv_cache(knowledge_cache, orig_cache_len) 
    return response_text

In [14]:
#question = 'Does Northeastern have coop opportunities in the healthcare sector?'
#generated_text = query_response(question, tokenizer, model, knowledge_cache, orig_cache_len)
#print(f"Response of the model:\n {generated_text}")

In [15]:
#end = time.time()
#length = end - start 
# Show the results : this can be altered however you like
#print("It took", length, "seconds!")

In [16]:
#question = 'Does Northeastern have coop opportunities in the tech sector?'
#generated_text = query_response(question, tokenizer, model, knowledge_cache, orig_cache_len)
#print(f"Response of the model:\n {generated_text}")

In [17]:
with open('all_test_data.txt', "r") as file:
    all_test_data = json.loads(file.read())

In [18]:
def select_random_n(items, n): 
    """
    select n random docs and their associated test QAs from items dict
    """
    if len(items) < n:
        # return a copy of the original list if there are fewer than 20 items
        return items.copy()
    # Randomly sample 20 unique items from the list
    randos = random.sample(items, n)
    docs = [rand['document'] for rand in randos]
    tests = [rand['tests'] for rand in randos]
    return docs, tests

In [19]:
#docs20, tests20 = select_random_n(all_test_data,15)

In [20]:
#len("\n".join(docs20))

In [21]:
#docs20[0]

In [22]:
def clear_cuda_memory():
    """
    manual memory management to avoid memory errors with repeated model use
    """
    try:
        # sync/wait for all GPU operations to finish
        torch.cuda.synchronize()
    except Exception as e:
        print("Error during synchronization:", e)
    
    # garbage collection
    gc.collect()
    time.sleep(1)  # small delay
    
    # clears the CUDA cache and reset
    try:
        torch.cuda.empty_cache()
    except Exception as e:
        print("Error during empty_cache:", e) 
    try:
        torch.cuda.reset_peak_memory_stats()
    except Exception as e:
        print("Error during reset_peak_memory_stats:", e)
    time.sleep(1)

In [23]:
def evaluate_generated_answer(generated_answer, true_answer, threshold=0.7, max_retries=5):
    """
    evaluates whether a generated answer matches a true answer using multiple modern metrics:
      - Token-level F1 score: measures word-overlap after normalization.
      - Embedding similarity: cosine similarity between mean-pooled embeddings.
      - Zero-shot NLI: uses a natural language inference pipeline to assess entailment.
      - Text classification: uses a classification pipeline (as a proxy for relevance). 
    """
    
    # normalize and compute token-level F1
    def normalize(text: str) -> str:
        return text.strip().lower()
    
    norm_gen = normalize(generated_answer)
    norm_true = normalize(true_answer)
    
    # tokenize on whitespace.
    gen_tokens = norm_gen.split()
    true_tokens = norm_true.split()
    common_tokens = set(gen_tokens) & set(true_tokens)
    
    if not common_tokens:
        token_f1 = 0.0
    else:
        precision = len(common_tokens) / len(gen_tokens)
        recall = len(common_tokens) / len(true_tokens)
        token_f1 = 2 * (precision * recall) / (precision + recall)
    
    scores = {"token_f1": token_f1}
    
    # model names for the evaluation pipelines
    model_names = [
        "sentence-transformers/all-MiniLM-L6-v2", # embedding-based similarity.
        "cross-encoder/nli-roberta-base", # zero-shot NLI.
        "distilbert/distilbert-base-uncased-finetuned-sst-2-english" # text classification (relevance).
    ]
    
    # embedding cosine_similarity
    try:
        goal = "feature-extraction"
        # safe_generate_fit is expected to return embeddings for both texts. 
        embeddings = safe_generate_fit(generated_answer, true_answer, goal, model_names, max_retries=max_retries)
        # mean-pool embeddings for each text.
        pooled_gen = np.mean(np.array(embeddings[0]), axis=1).flatten()
        pooled_true = np.mean(np.array(embeddings[1]), axis=1).flatten()
        embedding_score = cosine_similarity([pooled_gen], [pooled_true])[0][0]
        scores["embedding_similarity"] = embedding_score
    except Exception as e:
        scores["embedding_similarity"] = 0.0
        print(f"Embedding similarity evaluation failed: {e}")
    
    # zero-shot NLI eval
    try:
        goal = "zero-shot-classification"
        # compares generated_answer to the true_answer as candidate.
        nli_result = safe_generate_fit(generated_answer, true_answer, goal, model_names, max_retries=max_retries)
        # s/b a dictionary with a "scores" field.
        nli_score = nli_result["scores"][0]
        scores["nli_score"] = nli_score
    except Exception as e:
        scores["nli_score"] = 0.0
        print(f"NLI evaluation failed: {e}")
    
    # text classification eval
    try:
        goal = "text-classification"
        # pipeline takes a concatenation of the texts.
        text_class_result = safe_generate_fit(generated_answer, true_answer, goal, model_names, max_retries=max_retries)
        text_class_score = text_class_result[0]["score"]
        scores["text_classification_score"] = text_class_score
    except Exception as e:
        scores["text_classification_score"] = 0.0
        print(f"Text classification evaluation failed: {e}")
    
    # aggregate threshold.
    num_metrics_good = sum(1 for score in scores.values() if isinstance(score, float) and score > threshold)
    scores["is_good_enough"] = (num_metrics_good >= 2)
    
    return scores 
    
def safe_generate_fit(response, keywords, goal, model_names, max_retries=5): 
    retries = 0
    while retries < max_retries:
        try:
            if goal == "feature-extraction":
                pipe = pipeline(goal, model=model_names[0])
                result = pipe([response, keywords])
                # clean up and return
                del pipe
                torch.cuda.empty_cache()
                return result
            
            elif goal == "zero-shot-classification":
                pipe = pipeline(goal, model=model_names[1])
                # use true answer as a candidate label
                result = pipe(response, candidate_labels=[keywords])
                del pipe
                torch.cuda.empty_cache()
                return result
            
            elif goal == "text-classification":
                pipe = pipeline(goal, model=model_names[2]) 
                result = pipe(response + " " + keywords)
                del pipe
                torch.cuda.empty_cache()
                return result
            
        except torch.cuda.OutOfMemoryError:
            print(f"OutOfMemoryError. Attempt {retries + 1} of {max_retries}")
            # clean up GPU memory.
            try:
                del pipe
            except Exception:
                pass
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            gc.collect()
            time.sleep(5)
            retries += 1
    
    raise RuntimeError("Maximum retries reached in safe_generate_fit.")
 

In [24]:
#clear_cuda_memory()
#system_prompt = prepare_system_prompt(docs20)
#knowledge_cache, orig_cache_len = build_kv_cache(system_prompt)
#print("Initial KV cache length:", orig_cache_len)

In [25]:
#docs20[2]

In [26]:
#tests20[2]['What storage options are available in Boston for summer storage?']

In [27]:
#question = 'What storage options are available in Boston for summer storage?'
#generated_text = query_response(question, tokenizer, model, knowledge_cache, orig_cache_len)
#generated_text

In [28]:
def check_answer_against_full_context(generated_answer, full_documents, threshold=0.7):
    full_context = " ".join(full_documents)
    
    # cosine similarity  
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    answer_embedding = embedder.encode(generated_answer, convert_to_tensor=True)
    context_embedding = embedder.encode(full_context, convert_to_tensor=True)
    
    cosine_sim = util.pytorch_cos_sim(answer_embedding, context_embedding).item() 
    
    # BERTScore
    try: 
        results =  score.compute(
                    predictions=[generated_answer],
                    references=[full_context],
                    lang="en",
                    verbose=False
                ) 
        bert_f1 = sum(results["f1"]) / len(results["f1"]) 
    except Exception as e:
        print("BERTScore evaluation failed:", e)
        bert_f1 = 0.0

    # If either metric meets or exceeds the threshold, consider the answer supported
    return (cosine_sim >= threshold) or (bert_f1 >= threshold)
 

In [29]:
torch.cuda.memory_allocated()

0

In [30]:
def load_model_with_cleanup(model_name, quant_config, overall_i, n, max_retries=3):
    retries = 0
    model = None
    tokenizer = None
    while retries < max_retries:
        try:
            print(f"Attempt {retries+1} to load the model...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quant_config,
                device_map='auto'
            )
            return model, tokenizer 
        except torch.cuda.OutOfMemoryError as e:
            print("CUDA OutOfMemoryError caught during model loading:", e)
        except Exception as e:
            print("Error during model loading:", e)
        # cleanup and prepare for the next attempt.
        try:
            del model, tokenizer, knowledge_cache, quant_config
        except Exception:
            pass
        gb_value = torch.cuda.memory_allocated() / (1024**3)
        print(f"Memory Fail Before {retries} o-{overall_i},i-{n}: {gb_value:.2f} GB") 
        clear_cuda_memory()
        time.sleep(5)
        gb_value = torch.cuda.memory_allocated() / (1024**3)
        print(f"Memory Fail After {retries} o-{overall_i},i-{n}: {gb_value:.2f} GB") 
        retries += 1
    raise RuntimeError("Failed to load the model after maximum retries.")

In [31]:
results = []
# iterave test 10x 5, 10, and 20 docs
for overall_i in range(1, 11):
    for n in [5,10,20]: #5,10,20
        # generate samples
        docs_n, tests_n = select_random_n(all_test_data,n)
        #clear mem
        clear_cuda_memory()
        try:
            del model, tokenizer, knowledge_cache, quant_config
        except Exception:
            pass
        time.sleep(1)
        gb_value = torch.cuda.memory_allocated() / (1024**3)
        print(f"Memory o-{overall_i},i-{n}: {gb_value:.2f} GB") 

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model_name = "microsoft/Phi-3.5-mini-instruct"
        
        # attempt to load the model up to 3 times,
        # cleaning GPU memory if an error occurs.
        model, tokenizer = load_model_with_cleanup(model_name, quant_config, overall_i, n)
    
        #prepare model
        system_prompt = prepare_system_prompt(docs_n)
        knowledge_cache, orig_cache_len = build_kv_cache(system_prompt)
        print(f"Initial KV cache length for n={n}:", orig_cache_len)
    
        n_results = []
        n_pass = []
        for idx, doc in enumerate(docs_n):
            rand_question = random.sample(sorted(tests_n[idx]), 1)[0]
            rand_answer = tests_n[idx][rand_question]
            generated_answer = query_response(rand_question, tokenizer, model, knowledge_cache, orig_cache_len)
    
            this_results = evaluate_generated_answer(generated_answer, rand_answer)
            validate_overall = this_results['is_good_enough'] if this_results['is_good_enough'] else check_answer_against_full_context(generated_answer, docs_n, threshold=0.7)
            n_results.append({
                'doc':doc,
                'query':rand_question,
                'answer_true':rand_answer,
                'answer_gen':generated_answer,
                'results':this_results,
                'is_good_enough': this_results['is_good_enough'],
                'validate_overall_doc': validate_overall
            }
            )
            n_pass.append(validate_overall)
        
        # save results
        file_name = f"iteration_results_n_{n}.json"
        overall_file_name = f"iteration_passfail_n_{n}.json" 
        with open(file_name, "w", encoding="utf-8") as f:
            json.dump({f"{n}_all_result2": n_results}, f, indent=4)
        with open(overall_file_name, "w", encoding="utf-8") as f:
            json.dump({f"{n}_all_passfail2": n_pass}, f, indent=4)
            
        label1 = str(n)+"_"+str(overall_i)+'_all_result_r3'
        label2 = str(n)+"_"+str(overall_i)+'_all_passfail_r3'
        results.append({label1: n_results,
                        label2: n_pass})
with open("overall_results_r3.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

Memory o-1,i-5: 0.00 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 1347


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory o-1,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 2770


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-1,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-1,i-20: 2.75 GB
Memory Fail After 0 o-1,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4277


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-2,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-2,i-5: 2.75 GB
Memory Fail After 0 o-2,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 2530


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-2,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 2071


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-2,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-2,i-20: 2.75 GB
Memory Fail After 0 o-2,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4676


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-3,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-3,i-5: 2.75 GB
Memory Fail After 0 o-3,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 1019


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-3,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 2482


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-3,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-3,i-20: 2.75 GB
Memory Fail After 0 o-3,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 5534


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-4,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-4,i-5: 2.75 GB
Memory Fail After 0 o-4,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 506


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-4,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 3809


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-4,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-4,i-20: 2.75 GB
Memory Fail After 0 o-4,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4325


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-5,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-5,i-5: 2.75 GB
Memory Fail After 0 o-5,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 1104


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-5,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 1700


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-5,i-20: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 5650


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-6,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-6,i-5: 2.75 GB
Memory Fail After 0 o-6,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 2843


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-6,i-10: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-6,i-10: 2.75 GB
Memory Fail After 0 o-6,i-10: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 1309


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-6,i-20: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4007


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-7,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-7,i-5: 2.75 GB
Memory Fail After 0 o-7,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 1340


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-7,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 2553


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-7,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-7,i-20: 2.75 GB
Memory Fail After 0 o-7,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4713


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-8,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-8,i-5: 2.75 GB
Memory Fail After 0 o-8,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 1333


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-8,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 2139


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-8,i-20: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4838


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-9,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-9,i-5: 2.75 GB
Memory Fail After 0 o-9,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 7791


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-9,i-10: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-9,i-10: 2.75 GB
Memory Fail After 0 o-9,i-10: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 1235


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-9,i-20: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 4943


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-10,i-5: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-10,i-5: 2.75 GB
Memory Fail After 0 o-10,i-5: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=5: 736


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-10,i-10: 2.75 GB
Attempt 1 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=10: 3851


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


Memory o-10,i-20: 2.75 GB
Attempt 1 to load the model...
Error during model loading: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Memory Fail Before 0 o-10,i-20: 2.75 GB
Memory Fail After 0 o-10,i-20: 1.00 GB
Attempt 2 to load the model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initial KV cache length for n=20: 3350


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [None]:
docs_n[3]

In [None]:
n_results[2]

In [None]:
n_results[2]['answer_gen']