In [47]:
import pysolr
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from IPython.display import display, HTML
import json

# Check device and CUDA availability
print("PyTorch built with CUDA:", torch.version.cuda is not None)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("  GPU count:   ", torch.cuda.device_count())
    print("  Current dev: ", torch.cuda.current_device())
    print("  Device name: ", torch.cuda.get_device_name(0))
    print("  cuDNN ver:   ", torch.backends.cudnn.version())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize Solr connection
solr = pysolr.Solr('http://localhost:8983/solr/DenseIndex', always_commit=True)

# Initialize the embedding model
model_id = "Snowflake/snowflake-arctic-embed-xs"  # Use smaller model for faster processing
model = SentenceTransformer(model_id, device=device, trust_remote_code=True)
solr = pysolr.Solr('http://10.248.16.116:8983/solr/ProgLang24_7_emb', always_commit=True)
# tokenizer = AutoTokenizer.from_pretrained(model_id)

# snowflake-arctic-embed-xs: 384 dimensions
# snowflake-arctic-embed-s: 512 dimensions
# snowflake-arctic-embed-m: 768 dimensions
# snowflake-arctic-embed-l: 1024 dimensions

# Define chunking parameters
# MAX_TOK = 384
# STRIDE = 256

# def chunk(text, max_tok=MAX_TOK, stride=STRIDE):
#     ids = tokenizer.encode(text, add_special_tokens=False)
#     for i in range(0, len(ids), max_tok - stride):
#         yield tokenizer.decode(ids[i:i+max_tok])

PyTorch built with CUDA: True
CUDA available: True
  GPU count:    1
  Current dev:  0
  Device name:  NVIDIA GeForce RTX 4070 SUPER
  cuDNN ver:    90100
Using device: cuda


In [48]:
# Function for hybrid search
def hybrid_search(semantic_query, keyword_query, rows=30):
    query_embedding = list(model.encode(semantic_query, prompt_name="query", normalize_embeddings=True))
    
    solr_response = solr.search(
        fl=['id', 'title', 'document', 'score'],
        df="document",                    
        q="t{!bool should=$lexicalQuery should=$vectorQuery}",
        lexicalQuery=f"{{!type=edismax qf=document}}{keyword_query}",
        vectorQuery="{!knn f=document_vector topK=30}" + str([float(w) for w in query_embedding]),                        
        rows=rows
    )
    
    return solr_response

In [49]:
# Function to display search results
def display_results(solr_response, title="Search Results"):
    print(f"\n{title}")
    print(f"Found {len(solr_response)} results\n")
    
    for doc in solr_response:
        print(f"Title: {doc.get('title', 'No Title')} - Score: {doc.get('score', 'N/A')}")
        # Only display first 300 characters of description for readability
        description = doc.get('description', 'No description available')
        preview = description[:300] + "..." if len(description) > 300 else description
        print(f"Description: {preview}\n")


In [50]:
# Example queries
semantic_query = "What are memory-safe programming languages"
# keyword_query = "memory safe"
keyword_query = "memory-safe"

# Run different search methods
print("\n=== Running different search methods ===\n")

# Hybrid search (OR)
hybrid_should_results = hybrid_search(semantic_query, keyword_query)
display_results(hybrid_should_results, "Hybrid Search (OR) Results")



=== Running different search methods ===


Hybrid Search (OR) Results
Found 22 results

Title: Rust (programming language) - Score: 0.9872973
Description: No description available

Title: Go (programming language) - Score: 0.94943047
Description: No description available

Title: C Sharp (programming language) - Score: 0.8846942
Description: No description available

Title: Swift (programming language) - Score: 0.8564925
Description: No description available

Title: C++ - Score: 0.73008794
Description: No description available

Title: Haskell - Score: 0.70074666
Description: No description available

Title: Java (programming language) - Score: 0.6978935
Description: No description available

Title: Scala (programming language) - Score: 0.6693476
Description: No description available

Title: C (programming language) - Score: 0.6532965
Description: No description available

Title: R (programming language) - Score: 0.61871916
Description: No description available

Title: Dart (programming