In [1]:
# ollama serve
# ollama pull nomic-embed-text

from pymilvus import MilvusClient, FieldSchema, DataType, Collection, connections
import ollama

conversation = [
    "Alice: Hey, how was your weekend?",
    "Bob: Pretty good! I went hiking. You?",
    "Alice: Nice! I just relaxed and watched a few movies.",
    "Bob: That sounds great. Anything worth recommending?",
    "Alice: Yeah, 'The Secret Garden' was surprisingly good!",
    "Bob: I'll check it out. Thanks!"
]

records_dict = []
for i, line in enumerate(conversation):
    records_dict.append(
        {
            "id": i+1,
            "original_text": line,
            "vector": ollama.embeddings(model='nomic-embed-text', prompt=line)['embedding']
        }
    )
    
# print(records_dict)
print("part 1 done")

part 1 done


In [2]:
# create a milvus collection
client = MilvusClient(
    uri="http://localhost:19530",
    token="root:Milvus"
)
print(client)

collection_name = "test_textvector_storage"
exists = client.has_collection(collection_name=collection_name)
print(f"exists:{exists}")

def create_collection(client):
    # 3.1. Create schema
    schema = MilvusClient.create_schema(
        enable_dynamic_field=True,
    )
    
    # 3.2. Add fields to schema
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="original_text", datatype=DataType.VARCHAR, max_length=512)
    schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=768)
    
    # 3.3. Prepare index parameters
    index_params = client.prepare_index_params()
    
    # 3.4. Add indexes
    index_params.add_index(
        field_name="id",
        index_type="AUTOINDEX"
    )
    
    index_params.add_index(
        field_name="vector", 
        index_type="AUTOINDEX",
        metric_type="COSINE"
    )
    
    return client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params
    )
        
if not exists:
    print(create_collection(client))

connections.connect(alias="default", host="127.0.0.1", port="19530")

collection = Collection(collection_name)
collection.load()
print(f"collection: {collection}")

<pymilvus.milvus_client.milvus_client.MilvusClient object at 0x117ed6ba0>
exists:True
collection: <Collection>:
-------------
<name>: test_textvector_storage
<description>: 
<schema>: {'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'original_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 512}}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': True}



In [4]:
# now to insert
res = client.insert(
    collection_name=collection_name,
    data=records_dict
)
print(res)

{'insert_count': 6, 'ids': [1, 2, 3, 4, 5, 6]}


In [15]:
# now retrieve the documents based on prompt
# question = "what did everyone do in their weekend?"
question = "Who did what in weekend"
prompt_vector = ollama.embeddings(model='nomic-embed-text', prompt=question)['embedding']

res = collection.search(
    anns_field="vector",
    data=[prompt_vector],
    param={"nprobe": 16},
    limit=3,
    output_fields=["original_text"]
)


# fusing the gathered information
retrieved_documents = ""
milvus_results_array = []
for hits in res:
    for hit in hits:
        milvus_results_array.append(hit)
        retrieved_documents = retrieved_documents + hit['entity']['original_text'] + ";"

print(retrieved_documents)


Alice: Hey, how was your weekend?;Alice: Nice! I just relaxed and watched a few movies.;Bob: Pretty good! I went hiking. You?;


In [5]:
response = ollama.chat(model='gemma:7b', messages=[{
    'role': 'user', 
    'content': f"Answer this question: {question} Based on the information here: {retrieved_documents}",
}])
print(f"Answer this question: {question} Based on the information here: {retrieved_documents}")
print("-----------------")
print(response['message']['content'])

Answer this question: Who did what in weekend Based on the information here: Alice: Hey, how was your weekend?;Alice: Nice! I just relaxed and watched a few movies.;Bob: Pretty good! I went hiking. You?;
-----------------
Based on the information provided, Alice relaxed and watched movies, while Bob went hiking.


In [21]:
# Requires transformers>=4.51.0
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

def format_instruction(instruction, query, doc):
    if instruction is None:
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
    output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(instruction=instruction,query=query, doc=doc)
    return output

def process_inputs(pairs):
    inputs = tokenizer(
        pairs, padding=False, truncation='longest_first',
        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)
    )
    for i, ele in enumerate(inputs['input_ids']):
        inputs['input_ids'][i] = prefix_tokens + ele + suffix_tokens
    inputs = tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)
    for key in inputs:
        inputs[key] = inputs[key].to(model.device)
    return inputs

@torch.no_grad()
def compute_logits(inputs, **kwargs):
    batch_scores = model(**inputs).logits[:, -1, :]
    true_vector = batch_scores[:, token_true_id]
    false_vector = batch_scores[:, token_false_id]
    batch_scores = torch.stack([false_vector, true_vector], dim=1)
    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
    scores = batch_scores[:, 1].exp().tolist()
    return scores

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B", torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda().eval()
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
max_length = 8192

prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. 
                Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

In [27]:
task = 'Retrieve passages where someone said they did something'

queries = [question]

# print(res)
# print([hit for hits in res])

print(milvus_results_array)

documents = [r['entity']['original_text'] for r in milvus_results_array]
print(documents)

pairs = [format_instruction(task, queries[0], doc) for doc in (documents)]

# # Tokenize the input texts
inputs = process_inputs(pairs)
scores = compute_logits(inputs)

print("scores: ", scores)

[{'id': 1, 'distance': 0.6612014770507812, 'entity': {'original_text': 'Alice: Hey, how was your weekend?'}}, {'id': 3, 'distance': 0.42717790603637695, 'entity': {'original_text': 'Alice: Nice! I just relaxed and watched a few movies.'}}, {'id': 2, 'distance': 0.4147673547267914, 'entity': {'original_text': 'Bob: Pretty good! I went hiking. You?'}}]
['Alice: Hey, how was your weekend?', 'Alice: Nice! I just relaxed and watched a few movies.', 'Bob: Pretty good! I went hiking. You?']
scores:  [0.08847928047180176, 0.06078891456127167, 0.03812427073717117]
