# Milestone 3

First, we re-save the dataset as UTF-8 JSON (with non-ASCII characters left unescaped) so that its text can be passed to the SentenceTransformer model.

In [3]:
import json

# Input: documents produced in milestone 1. Output: the same data re-saved for this milestone.
json_file_path = "../../milestone_1/datasets/06_University_documents.json"
new_json_file_path = "../datasets/07_University_documents.json"

# Use a context manager so the input handle is closed even if json.load() raises.
# NOTE(review): the input is read with the platform's default encoding, as before;
# confirm the milestone 1 file's encoding before pinning it explicitly.
with open(json_file_path) as json_file:
    data = json.load(json_file)

# Re-save as UTF-8; ensure_ascii=False writes non-ASCII characters literally
# instead of as \uXXXX escapes.
with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [6]:
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load the SentenceTransformer model once at module level; get_embedding() below
# reads this `model` variable on every call instead of loading it again.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    """Embed `text` with the module-level `model` and return the result of `.tolist()`.

    Args:
        text: The text to embed.

    Returns:
        Whatever `.tolist()` yields for the encoded vector; the caller stores it
        directly in the JSON document.
    """
    # encode() is told not to return a tensor; its result still needs .tolist()
    # before it can be serialised with json.dump().
    vector = model.encode(text, convert_to_tensor=False)
    return vector.tolist()

json_file_path = "../datasets/07_University_documents.json"
new_json_file_path = "../datasets/08_Vectorized_university_documents.json"

# Read the UTF-8 documents; the context manager closes the file even if parsing fails.
with open(json_file_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Add a "university_vector" embedding to each document in the JSON data.
for document in tqdm(data):
    # `or ""` covers both a missing key and a key that is present but null;
    # a null value would otherwise raise TypeError in the concatenation below.
    city_wikipedia_text = document.get("city_wikipedia_text") or ""
    wikipedia_text = document.get("wikipedia_text") or ""
    combined_text = city_wikipedia_text + " " + wikipedia_text

    # NOTE(review): the city text comes first. If the model truncates long inputs
    # (presumably at a fixed token limit; confirm for all-MiniLM-L6-v2), the
    # university text may contribute little or nothing to the vector.
    document["university_vector"] = get_embedding(combined_text)

# Write the documents, now including their vectors, as UTF-8 JSON.
with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

100%|██████████| 519/519 [49:19<00:00,  5.70s/it]


## Query example

In [4]:
import requests
from sentence_transformers import SentenceTransformer

def text_to_embedding(text):
    """Embed `text` and format the vector as a bracketed, comma-separated string.

    The SentenceTransformer model is loaded on the first call only and cached on
    the function object, so repeated queries do not pay the load cost again.

    Args:
        text: The query text to embed.

    Returns:
        str: The embedding components joined by "," and wrapped in "[" and "]",
        which is the form interpolated into the Solr knn query.
    """
    model = getattr(text_to_embedding, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        text_to_embedding._model = model

    embedding = model.encode(text, convert_to_tensor=False).tolist()

    # Convert the embedding to the expected format
    embedding_str = "[" + ",".join(map(str, embedding)) + "]"
    return embedding_str

def solr_knn_query(endpoint, collection, embedding, top_k=30, timeout=30):
    """Run a knn query on the `university_vector` field of a Solr collection.

    Args:
        endpoint: Base Solr URL, e.g. "http://localhost:8983/solr". A trailing
            slash is tolerated.
        collection: Name of the Solr collection to query.
        embedding: Query vector already formatted as a string, e.g. "[0.1,0.2]".
        top_k: Number of nearest neighbours to request; also used for `rows`.
            Defaults to 30, the previously hard-coded value.
        timeout: Seconds to wait for Solr before `requests` raises a timeout
            error, so an unresponsive server cannot block forever.

    Returns:
        The decoded JSON body of the Solr response.

    Raises:
        requests.HTTPError: If Solr answers with an error status.
        requests.RequestException: For connection failures and timeouts.
    """
    # Strip a trailing slash so the URL never contains "//" before the collection.
    url = f"{endpoint.rstrip('/')}/{collection}/select"

    data = {
        # Doubled braces emit the literal {!knn ...} local-params syntax.
        "q": f"{{!knn f=university_vector topK={top_k}}}{embedding}",
        "fl": "2024_rank,institution_name",
        "rows": top_k,
        "wt": "json"
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    response = requests.post(url, data=data, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def display_results(results):
    """Print the `institution_name` of every document in a Solr response.

    Args:
        results: Decoded Solr response; documents are read from
            results["response"]["docs"], with missing keys treated as empty.

    Prints "No results found." when there are no documents. A document without
    an `institution_name` key is printed as "None".
    """
    response_section = results.get("response", {})
    hits = response_section.get("docs", [])

    if hits:
        for hit in hits:
            print(hit.get("institution_name"))
        return

    print("No results found.")

def main(query_text=None):
    """Embed a query, run it against the local Solr collection, and print the hits.

    Args:
        query_text: Text to search for. When omitted, the example query
            previously hard-coded in this function is used.

    HTTP error responses and other request failures (for example Solr not
    running) are reported with a printed message rather than a traceback.
    """
    solr_endpoint = "http://localhost:8983/solr"
    collection = "universities"

    if query_text is None:
        query_text = "top universities in computer science in city with rich cultural heritage"
    embedding = text_to_embedding(query_text)

    try:
        results = solr_knn_query(solr_endpoint, collection, embedding)
        display_results(results)
    except requests.HTTPError as e:
        print(f"Error {e.response.status_code}: {e.response.text}")
    except requests.RequestException as e:
        # Connection refused, timeout, etc.: there is no HTTP response to inspect.
        print(f"Request failed: {e}")

# Entry-point guard: run the example query only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


University of Southampton
De Montfort University
TU Dresden
Tallinn University
Polytechnic University of Catalonia
Technical University of Valencia
University of Strathclyde
University of Sarajevo
Moscow State University
Norwegian University of Science and Technology
National and Kapodistrian University of Athens
University of Pécs
University of Glasgow
University of Rovira i Virgili
University of Siena
National Technical University of Athens
Heidelberg University
University of Coimbra
University of Lisbon
Glasgow Caledonian University
University of Leeds
University of Bucharest
Politehnica University of Bucharest
Polytechnic University of Milan
University of Milan
University of Milano-Bicocca
Università Cattolica del Sacro Cuore
Vita-Salute San Raffaele University
University of Groningen
University of Bologna


- Looking for universities that are top-ranked in computer science and are located in cities that have a rich cultural heritage
- Looking for universities located in the United Kingdom that have courses in biology and are ranked in the top 150
- Looking for universities in Germany that have a dental medicine (dentistry) faculty and a large number of students
- Looking for universities that have a faculty of engineering or a faculty of science and are located in a city with a Mediterranean climate
- Looking for top-ranked universities in the north of Europe with a focus on the Computer Science field