In [1]:
!pip install qdrant-client pandas numpy sentence-transformers





[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import json
import pandas as pd
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import SearchRequest, Filter
from sentence_transformers import SentenceTransformer


In [24]:
QDRANT_URL = "http://192.68.10.50:6333"
client = QdrantClient(url=QDRANT_URL,timeout=30)

In [33]:
# queries = {q["_id"]: q["text"] for q in map(json.loads, open("queries.jsonl"))}
# corpus = {c["_id"]: c["text"] for c in map(json.loads, open("SciFact.jsonl"))} # corpus.jsonl
# qrels = pd.read_csv("test.tsv", sep="\t", names=["query-id", "corpus-id", "score"]) #qrels/test.tsv
# Load test queries from test.tsv
import json

# Load queries.jsonl into a dictionary
queries = {q["_id"]: q["text"] for q in map(json.loads, open("queries.jsonl"))}

test_queries = pd.read_csv("train.tsv", sep="\t", names=["query-id", "corpus-id", "score"])
query_ids = test_queries["query-id"].unique()


In [34]:
def retrieve_top_k(query_text, k=10):
    """Retrieve top-k documents for a given query using Qdrant"""
    query_vector = model.encode(query_text).tolist()  # Convert query to embedding
    
    # Qdrant Search
    results = client.search(
        collection_name="scifact",
        query_vector=query_vector,
        limit=k
    )
    
    # Extract document IDs
    retrieved_docs = [hit.id for hit in results]
    return retrieved_docs


In [35]:
def get_relevant_answer_rank(query_id, retrieved_docs):
    """Determine the rank of the relevant document in top-k results"""
    relevant_doc_ids = test_queries[test_queries["query-id"] == query_id]["corpus-id"].tolist()
    
    # Find the rank (position) of relevant doc in retrieved list (if present)
    ranks = [retrieved_docs.index(doc) if doc in retrieved_docs else -1 for doc in relevant_doc_ids]
    
    # Keep only valid ranks (0-9), ignore missing (-1)
    valid_ranks = [r for r in ranks if r != -1]
    
    return min(valid_ranks) if valid_ranks else 10  # Return 10 if no relevant doc found


In [36]:
# 1. Test server availability first
import requests
import time
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import UnexpectedResponse

def test_qdrant_connection():
    try:
        response = requests.get("http://192.168.10.50:6333/collections")
        print(f"Server connection test: {response.status_code}")
        return True
    except:
        return False

# 2. Initialize client with better settings
if test_qdrant_connection():
    client = QdrantClient(
        url="http://192.168.10.50:6333",
        timeout=60.0,  # Increased timeout
        prefer_grpc=False
    )
    
    # 3. Try scrolling with error handling
    try:
        result = client.scroll(
            collection_name="scifact",
            limit=1,
            timeout=30
        )
        print(result)
    except Exception as e:
        print(f"Error during scroll: {e}")
        # print("Try using IP 127.0.0.1 instead of localhost")
else:
    print("Could not connect to Qdrant server")

Server connection test: 200
([Record(id=0, payload={'id': '4983', 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.', 'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. 

In [37]:
print(client.get_collections())

collections=[CollectionDescription(name='Arguana'), CollectionDescription(name='scidocs'), CollectionDescription(name='eyJ1c2VyX2lkIjogImpvaG5fZG9lIiwgImRvY3VtZW50X25hbWUiOiAiam9obl9rYV9kb2N1bWVudCIsICJ0aW1lc3RhbXAiOiAiMjAyNS0wMi0wNFQxMTowMjo1NS42NTgyMzIrMDE6MDAifQ=='), CollectionDescription(name='eyJ1c2VyX2lkIjogImpvaG5fZG9lIiwgImRvY3VtZW50X25hbWUiOiAiam9obl9rYV9kb2N1bWVudCIsICJ0aW1lc3RhbXAiOiAiMjAyNS0wMi0wNFQxMDoyODoyOS4xNzQwNjcrMDE6MDAifQ=='), CollectionDescription(name='eyJ1c2VyX2lkIjogImhlbGxvd29ybGQiLCAiZG9jdW1lbnRfbmFtZSI6ICJtYWluIiwgInRpbWVzdGFtcCI6ICIyMDI1LTAyLTAyVDE4OjQ5OjA2Ljg0NjM1OSswMTowMCJ9'), CollectionDescription(name='akshat.gupta_main'), CollectionDescription(name='trec-covid-beir'), CollectionDescription(name='eyJ1c2VyX2lkIjogImhlbGxvd29ybGQiLCAiZG9jdW1lbnRfbmFtZSI6ICJtYWluIiwgInRpbWVzdGFtcCI6ICIyMDI1LTAyLTAyVDE4OjM2OjE5LjI5NDEwMCswMTowMCJ9'), CollectionDescription(name='eyJ1c2VyX2lkIjogImpvaG5fZG9lIiwgImRvY3VtZW50X25hbWUiOiAiam9obl9rYV9kb2N1bWVudCIsICJ0aW1lc3RhbXAiO

In [38]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

data = []
for query_id in query_ids:
    query_text = queries.get(str(query_id), None)
    if query_text is None:
        continue  # Skip missing queries
    
    # Retrieve results for 3 iterations with error handling
    try:
        from time import sleep
        max_retries = 3
        delay = 1  # seconds between retries
        
        def safe_retrieve(query):
            for attempt in range(max_retries):
                try:
                    return retrieve_top_k(query, k=10)
                except Exception as e:
                    if attempt == max_retries - 1:
                        print(f"Failed to retrieve for query {query_id}: {str(e)}")
                        return []
                    sleep(delay)
            return []
            
        retrieved_v1 = safe_retrieve(query_text)
        sleep(0.5)  # Add small delay between calls
        retrieved_v2 = safe_retrieve(query_text)
        sleep(0.5)
        retrieved_v3 = safe_retrieve(query_text)
        
    except Exception as e:
        print(f"Error processing query {query_id}: {str(e)}")
        retrieved_v1 = retrieved_v2 = retrieved_v3 = []
    
    # Compute relevant answer rank for each iteration
    rank_v1 = get_relevant_answer_rank(query_id, retrieved_v1)
    rank_v2 = get_relevant_answer_rank(query_id, retrieved_v2)
    rank_v3 = get_relevant_answer_rank(query_id, retrieved_v3)
    
    # Check if deterministic (same ranks across iterations)
    deterministic = (rank_v1 == rank_v2 == rank_v3)
    
    data.append([query_id, rank_v1, rank_v2, rank_v3, deterministic])

# Create DataFrame
df_results = pd.DataFrame(data, columns=["query-id", "relevant_answer_rank_v1", "relevant_answer_rank_v2", "relevant_answer_rank_v3", "deterministic"])

# Save Results
df_results.to_csv("retrieval_results.csv", index=False)

# Display Results
df_results.head()


  results = client.search(


Unnamed: 0,query-id,relevant_answer_rank_v1,relevant_answer_rank_v2,relevant_answer_rank_v3,deterministic
0,0,10,10,10,True
1,2,10,10,10,True
2,4,10,10,10,True
3,6,10,10,10,True
4,9,10,10,10,True


In [39]:
df_results

Unnamed: 0,query-id,relevant_answer_rank_v1,relevant_answer_rank_v2,relevant_answer_rank_v3,deterministic
0,0,10,10,10,True
1,2,10,10,10,True
2,4,10,10,10,True
3,6,10,10,10,True
4,9,10,10,10,True
...,...,...,...,...,...
804,1403,10,10,10,True
805,1404,10,10,10,True
806,1405,10,10,10,True
807,1406,10,10,10,True
