In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import torch

# --- Environment diagnostics --------------------------------------------------
# Report how PyTorch was built and which accelerator (if any) it can see.
cuda_ok = torch.cuda.is_available()          # queried once, reused below
print("PyTorch built with CUDA:", torch.version.cuda is not None)
print("CUDA available:", cuda_ok)
if cuda_ok:
    current_dev = torch.cuda.current_device()
    print("  GPU count:   ", torch.cuda.device_count())
    print("  Current dev: ", current_dev)
    # Name the device that is actually current instead of hard-coding index 0,
    # so the two lines cannot disagree on multi-GPU hosts.
    print("  Device name: ", torch.cuda.get_device_name(current_dev))
    print("  cuDNN ver:   ", torch.backends.cudnn.version())


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if cuda_ok else "cpu")
print(f"Using device: {device}")

# Smaller checkpoints tried earlier:
#   "Snowflake/snowflake-arctic-embed-xs"
#   "Snowflake/snowflake-arctic-embed-m"
model_id = "Snowflake/snowflake-arctic-embed-m-long"
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# checkpoint run locally — only keep it for repositories you trust.
model = SentenceTransformer(model_id, device=device, trust_remote_code=True)
# Separate tokenizer handle for the same checkpoint; chunk() uses it to window
# documents by token count.
tokenizer = AutoTokenizer.from_pretrained(model_id)

MAX_TOK = 2048            # tokens per window; stay inside the hard limit
STRIDE  = 256             # tokens shared by consecutive windows, to avoid boundary loss


def chunk(text, max_tok=MAX_TOK, stride=STRIDE):
    """Split ``text`` into overlapping token windows and yield each as a string.

    The text is tokenized once with the module-level ``tokenizer`` (without
    special tokens), cut into windows of at most ``max_tok`` tokens where
    consecutive windows share ``stride`` tokens, and each window is decoded
    back to text.

    Args:
        text: The document to split.
        max_tok: Maximum number of tokens per window (must be positive).
        stride: Number of tokens consecutive windows overlap by
            (``0 <= stride < max_tok``).

    Yields:
        str: The decoded text of each window, in document order. Nothing is
        yielded for a text that tokenizes to zero tokens.

    Raises:
        ValueError: If ``max_tok`` is not positive or ``stride`` is outside
            ``[0, max_tok)``. Because this is a generator, the error surfaces
            when iteration starts.

    NOTE(review): a caller that re-encodes the yielded text presumably adds its
    own special tokens, so a full window may exceed ``max_tok`` by a few tokens
    there — confirm against the model's sequence limit.
    """
    if max_tok <= 0:
        raise ValueError(f"max_tok must be positive, got {max_tok}")
    if not 0 <= stride < max_tok:
        # stride == max_tok would make the step zero; stride > max_tok would
        # make it negative (silently yielding nothing); stride < 0 would leave
        # gaps between windows.
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_tok, got stride={stride}, max_tok={max_tok}"
        )

    # The tokenizer may warn that the sequence is longer than the model maximum;
    # the ids are only sliced here, never fed to the model as one sequence.
    ids = tokenizer.encode(text, add_special_tokens=False)
    step = max_tok - stride
    for start in range(0, len(ids), step):
        yield tokenizer.decode(ids[start:start + max_tok])
        if start + max_tok >= len(ids):
            # This window already reached the end of the document. Any further
            # window would be fully contained in it, so stop instead of
            # emitting a redundant tail fragment.
            break


PyTorch built with CUDA: True
CUDA available: True
  GPU count:    1
  Current dev:  0
  Device name:  NVIDIA GeForce RTX 4070 SUPER
  cuDNN ver:    90100
Using device: cuda


<All keys matched successfully>


In [3]:
import json
# --- load corpus ------------------------------------------------------------
with open('./daten/data/24_5/prog_lang.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

queries  = ['What is a memory-safe programming language', 'What is not a memory-safe programming language']
docs     = [item['document'] for item in data]        # original JSON
titles = [item['title'] for item in data]

# --- encode ---------------------------------------------------------------
# Queries use the model's "query" prompt; embeddings are L2-normalised so a
# plain dot product between them is the cosine similarity.
q_emb = model.encode(queries,
                     prompt_name="query",
                     normalize_embeddings=True,        # cosine!
                     convert_to_tensor=True)

# Chunk each doc into slices first, then embed all slices in one batched call
# instead of one model.encode() call per slice.
ENCODE_BATCH_SIZE = 4      # slices can be long, so keep batches small — NOTE(review): tune for your GPU memory

meta = []                  # meta[row] = (doc_id, slice_no) for row `row` of doc_embs
slice_texts = []           # slice_texts[row] = text that produced row `row` of doc_embs
for i, d in enumerate(docs):
    for j, slice_text in enumerate(chunk(d)):
        meta.append((i, j))
        slice_texts.append(slice_text)

if not slice_texts:
    raise ValueError("No document slices to embed: the corpus is empty or every document is empty")

# No prompt_name here: the "passage" prompt was left disabled in this notebook.
# NOTE(review): confirm the checkpoint defines only a "query" prompt.
doc_embs = model.encode(slice_texts,
                        batch_size=ENCODE_BATCH_SIZE,
                        normalize_embeddings=True,
                        convert_to_tensor=True)

# One embedding row per slice, in the same order as `meta`.
assert doc_embs.shape[0] == len(meta), "embedding rows and meta entries are out of sync"

Token indices sequence length is longer than the specified maximum sequence length for this model (18124 > 8192). Running this sequence through the model will result in indexing errors


In [4]:
# Rank the document slices for every query and print, per query, the best
# slices followed by a per-title summary.
TOP_K = 20             # slices listed per query
PREVIEW_CHARS = 200    # characters of each slice shown in the listing

# torch.topk raises if k exceeds the number of candidates, so clamp it.
k = min(TOP_K, doc_embs.shape[0])

# Slices of each document, materialised at most once. Previously every printed
# hit re-tokenized and re-chunked its whole document.
slices_by_doc = {}

for query_idx, query in enumerate(queries):
    print(f"\n{'='*80}\nQUERY {query_idx+1}: {query}\n{'='*80}\n")

    # Get the embedding for this specific query
    query_embedding = q_emb[query_idx].unsqueeze(0)  # Add batch dimension back

    # Dot product of the query with every slice embedding (both were encoded
    # with normalize_embeddings=True).
    dense_scores = query_embedding @ doc_embs.T
    topk = torch.topk(dense_scores, k=k)
    top_scores = topk.values[0].tolist()     # plain floats, best first
    top_rows = topk.indices[0].tolist()      # row indices into doc_embs / meta

    # Print results and, in the same pass, collect the best score per title.
    print("Top results:")
    language_scores = {}
    for rank, (score, row) in enumerate(zip(top_scores, top_rows), start=1):
        doc_id, slice_no = meta[row]
        title = titles[doc_id]
        print(f"{rank:2d}: {score:.4f} - {title} (doc {doc_id}, slice {slice_no})")

        # Get the document slice - only show the first PREVIEW_CHARS chars for readability
        if doc_id not in slices_by_doc:
            slices_by_doc[doc_id] = list(chunk(docs[doc_id]))
        doc_slice = slices_by_doc[doc_id][slice_no]
        preview = doc_slice[:PREVIEW_CHARS] + "..." if len(doc_slice) > PREVIEW_CHARS else doc_slice
        print(f"   Preview: {preview}\n")

        # Keep the highest score seen for each title.
        language_scores[title] = max(language_scores.get(title, score), score)

    # Show summary of languages ranked by top score
    print("\nLANGUAGE RANKING SUMMARY:")
    ranked = sorted(language_scores.items(), key=lambda x: x[1], reverse=True)
    for i, (lang, score) in enumerate(ranked):
        print(f"{i+1:2d}: {score:.4f} - {lang}")


QUERY 1: What is a memory-safe programming language

Top results:
 1: 0.5324 - C (programming language) (doc 14, slice 7)
   Preview: ^ schultz, thomas ( 2004 ). c and the 8051 ( 3rd ed. ). otsego, mi : pagefree publishing inc. p. 20. isbn 978 - 1 - 58961 - 237 - 2. retrieved february 10, 2012. ^ kernighan & ritchie ( 1978 ), p. 6. ...

 2: 0.5255 - C++ (doc 23, slice 3)
   Preview: from the original on 21 march 2015. retrieved 31 march 2015. ^ andrew binstock ( 18 may 2011 ). dr. dobb's : interview with ken thompson. dr. dobb's. archived from the original on 13 march 2014. retri...

 3: 0.5243 - C Sharp (programming language) (doc 13, slice 2)
   Preview: exclusive locks via the keyword lock. c # supports classes with properties. the properties can be simple accesor functions with a backing field, or implement arbitrary getter and setter functions. a p...

 4: 0.5181 - C (programming language) (doc 14, slice 5)
   Preview: data structures, even file systems. the language supports a r