# Milestone 3

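First, we re-save the dataset as UTF-8 JSON with non-ASCII characters written literally (`ensure_ascii=False`) instead of as `\uXXXX` escapes, so that the text passed to the SentenceTransformer model in the next step is stored in readable form.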

In [3]:
import json

# Input dataset produced in milestone 1 and the re-encoded copy written for
# this milestone.
json_file_path = "../../milestone_1/datasets/06_University_documents.json"
new_json_file_path = "../datasets/07_University_documents.json"

# Read inside a context manager so the handle is closed even when json.load()
# raises on malformed input.
# NOTE(review): the input is still read with the platform default encoding, as
# in the original cell -- confirm whether encoding="utf-8" should be explicit.
with open(json_file_path) as json_file:
    data = json.load(json_file)

# Re-serialize as UTF-8; ensure_ascii=False keeps non-ASCII characters literal
# instead of emitting \uXXXX escapes.
with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [10]:
import json
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model once for this cell; get_embedding()
# below reads it as a module-level global.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def get_embedding(text):
    """Embed ``text`` with the module-level ``model`` and return it as a list.

    The model is asked for a non-tensor result (``convert_to_tensor=False``);
    the ``.tolist()`` call is what turns that result into a plain Python list
    so it can be serialized with ``json``.
    """
    encoded = model.encode(text, convert_to_tensor=False)
    return encoded.tolist()

# Input: the UTF-8 dataset written by the previous cell.
# Output: the same documents with an added "university_vector" field.
json_file_path = "../datasets/07_University_documents.json"
new_json_file_path = "../datasets/08_Vectorized_university_documents.json"

# Context manager guarantees the handle is closed even if parsing fails.
with open(json_file_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Update each document in the JSON data
for document in data:
    # `or ""` covers both a missing key and an explicit JSON null; a plain
    # .get(key, "") would return None for null and break the concatenation.
    city_wikipedia_text = document.get("city_wikipedia_text") or ""
    wikipedia_text = document.get("wikipedia_text") or ""
    combined_text = city_wikipedia_text + " " + wikipedia_text

    # One embedding per document, computed over city text + university text.
    document["university_vector"] = get_embedding(combined_text)

with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

## Query example

In [13]:
import requests
from sentence_transformers import SentenceTransformer

# Models already loaded by text_to_embedding(), keyed by model name, so that
# repeated queries do not re-instantiate the SentenceTransformer each time.
_EMBEDDING_MODELS = {}


def _load_embedding_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


def text_to_embedding(text, model_name='all-MiniLM-L6-v2'):
    """Encode ``text`` and format the vector as a bracketed, comma-separated string.

    Args:
        text: The query text to embed.
        model_name: SentenceTransformer model to use. Defaults to the model
            name the original code hard-coded.

    Returns:
        A string such as ``"[0.1,-0.2,...]"`` built from ``str()`` of each
        component, which ``solr_knn_query`` appends to its ``{!knn ...}`` query.
    """
    model = _load_embedding_model(model_name)
    embedding = model.encode(text, convert_to_tensor=False).tolist()

    # Convert the embedding to the expected format
    embedding_str = "[" + ",".join(map(str, embedding)) + "]"
    return embedding_str

def solr_knn_query(endpoint, collection, embedding, top_k=10, timeout=None):
    """POST a ``{!knn}`` query on ``university_vector`` to Solr and return the JSON reply.

    Args:
        endpoint: Base Solr URL, e.g. ``"http://localhost:8983/solr"``. A
            trailing slash is tolerated.
        collection: Name of the collection/core to query.
        embedding: Query vector already formatted as a string such as
            ``"[0.1,0.2,...]"`` (see ``text_to_embedding``).
        top_k: Value used for both the ``topK`` local parameter and ``rows``.
            Defaults to 10, the value that was previously hard-coded.
        timeout: Passed through to ``requests.post``. ``None`` (the default)
            keeps the previous behavior of waiting indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If Solr answers with an error status
            (via ``raise_for_status``).
    """
    url = f"{endpoint.rstrip('/')}/{collection}/select"

    # NOTE(review): the output recorded in this notebook shows None for
    # 2024_rank on every hit -- confirm that the field name in "fl" matches
    # the collection schema.
    data = {
        "q": f"{{!knn f=university_vector topK={top_k}}}{embedding}",
        "fl": "2024_rank,institution_name",
        "rows": top_k,
        "wt": "json"
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    # POST rather than GET: the embedding string is sent in the request body.
    response = requests.post(url, data=data, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def display_results(results):
    """Print one bullet line per document found in a Solr JSON response.

    Documents are read from ``results["response"]["docs"]``; either level may
    be absent. Each line shows the document's ``2024_rank`` and
    ``institution_name`` (``None`` is printed for a field the document lacks).
    When there are no documents, a single "No results found." line is printed.
    """
    hits = results.get("response", {}).get("docs", [])

    if hits:
        for hit in hits:
            rank = hit.get('2024_rank')
            name = hit.get('institution_name')
            print(f"* {rank} {name}")
    else:
        print("No results found.")

def main(query_text="Porto"):
    """Embed ``query_text``, run a KNN search against the local Solr, and print the hits.

    Args:
        query_text: Free-text query to embed. Defaults to ``"Porto"``, the
            example query that was previously hard-coded.

    HTTP error responses and other request failures (for example, Solr not
    being reachable) are reported on stdout instead of raising.
    """
    solr_endpoint = "http://localhost:8983/solr"
    collection = "universities"

    embedding = text_to_embedding(query_text)

    try:
        results = solr_knn_query(solr_endpoint, collection, embedding)
        display_results(results)
    except requests.HTTPError as e:
        print(f"Error {e.response.status_code}: {e.response.text}")
    except requests.RequestException as e:
        # Connection failures carry no response object, so print the
        # exception itself rather than a status code.
        print(f"Request to Solr failed: {e}")

# Entry point: call main() when this code runs as the top-level module
# (the output recorded below this cell shows that it did run here).
if __name__ == "__main__":
    main()


* None University of Porto
* None University of Minho
* None University of Aveiro
* None University of Castilla–La Mancha
* None University of Lisbon
* None University of Coimbra
* None University of Seville
* None Catholic University of Portugal
* None University of Santiago de Compostela
* None University of Valladolid
