# Milestone 3

First we have to encode the file so that the SentenceTransformer module can work with it.

In [3]:
import json

json_file_path = "../../milestone_1/datasets/06_University_documents.json"
new_json_file_path = "../datasets/07_University_documents.json"

json_file = open(json_file_path)
data = json.load(json_file)
json_file.close()

with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [10]:
import json
from sentence_transformers import SentenceTransformer

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    # The model.encode() method already returns a list of floats
    return model.encode(text, convert_to_tensor=False).tolist()

json_file_path = "../datasets/07_University_documents.json"
new_json_file_path = "../datasets/08_Vectorized_university_documents.json"

json_file = open(json_file_path, encoding="utf-8")
data = json.load(json_file)
json_file.close()

# Update each document in the JSON data

combined_text=""

for document in data:
    # Extract fields if they exist, otherwise default to empty strings
    city_wikipedia_text = document.get("city_wikipedia_text", "")
    wikipedia_text = document.get("wikipedia_text", "")
    combined_text = city_wikipedia_text + " " + wikipedia_text 
    
    document["university_vector"] = get_embedding(combined_text)
        
with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

## Query example

In [28]:
import requests
from sentence_transformers import SentenceTransformer

semantic_res = []
query_res = []

def text_to_embedding(text):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding = model.encode(text, convert_to_tensor=False).tolist()
    
    # Convert the embedding to the expected format
    embedding_str = "[" + ",".join(map(str, embedding)) + "]"
    return embedding_str

def solr_knn_query(endpoint, collection, embedding):
    url = f"{endpoint}/{collection}/select"

    data = {
        "q": "country: \"united kingdom\" country:biology country:top country:universities country:in country:biology country:courses\nwikipedia_text:\"united kingdom\" wikipedia_text:\"biology\" wikipedia_text:top wikipedia_text:universities wikipedia_text:in wikipedia_text:biology country:courses\ncity_wikipedia_text:\"united kingdom\" city_wikipedia_text:\"biology\" city_wikipedia_text:top city_wikipedia_text:universities city_wikipedia_text:in city_wikipedia_text:biology country:courses\nsize:\"united kingdom\" size:biology size:top size:universities size:in size:biology size:courses",
        "rqq": f"{{!knn f=university_vector topK=10}}{embedding}",
        "fl": "2024_rank,institution_name",
        "rows": 10,
        "wt": "json",
        "defType": "edismax",
        "indent": "true",
        "qf": "country^4 wikipedia_text^3 city_wikipedia_text^2 size",
        "q.op":"OR",
    }
    
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    response = requests.post(url, data=data, headers=headers)
    response.raise_for_status()
    return response.json()

def display_results(results):
    docs = results.get("response", {}).get("docs", [])
    if not docs:
        print("No results found.")
        return

    for doc in docs:
        print(f"* {doc.get('2024_rank')} {doc.get('institution_name')}")
        semantic_res.append({"2024_rank": doc.get('2024_rank'), "institution_name": doc.get('institution_name')})

def main():
    solr_endpoint = "http://localhost:8983/solr"
    collection = "universities"
    
    #query_text = input("Enter your query: ")
    embedding = text_to_embedding("top universites in united kingdom biology courses")

    try:
        results = solr_knn_query(solr_endpoint, collection, embedding)
        display_results(results)
    except requests.HTTPError as e:
        print(f"Error {e.response.status_code}: {e.response.text}")

def query_search():
    params = dict(
        origin='Chicago,IL',
        destination='Los+Angeles,CA',
        waypoints='Joplin,MO|Oklahoma+City,OK',
        sensor='false'
    )

    resp = requests.get(url="http://localhost:8983/solr/universities/select?indent=true&q.op=OR&q=country%3Afaculty%20country%3Aof%20country%3Aengineering%20country%3Ascience%20country%3Acity%20country%3Amediterranean%20country%3Aclimate%0Awikipedia_text%3Afaculty%20wikipedia_text%3Aof%20wikipedia_text%3Aengineering%20wikipedia_text%3A%20science%20wikipedia_text%3Acity%20wikipedia_text%3Amediterranean%20wikipedia_text%3Aclimate%0Acity_wikipedia_text%3Afaculty%20city_wikipedia_text%3Aof%20city_wikipedia_text%3Aengineering%20city_wikipedia_text%3Ascience%20city_wikipedia_text%3Acity%20city_wikipedia_text%3Amediterranean%20city_wikipedia_text%3Aclimate%0Asize%3Afaculty%20size%3Aof%20size%3Aengineering%20size%3Ascience%20size%3Acity%20size%3Amediterranean%20size%3Aclimate&useParams=", params=params)
    data = resp.json() 
    print(data)

if __name__ == "__main__":
    main()
    #query_search()
        


* None University of Plymouth
* None University of Manchester
* None University of Leicester
* None Aberystwyth University
* None University of Exeter
* None University of Sheffield
* None University of St Andrews
* None Newcastle University
* None Aston University
* None University of Liverpool
