# Milestone 3

## Add university URLs

In [20]:
import pandas as pd
import requests
import json

current_file_path = "../datasets/08_Vectorized_university_documents.json"
new_file_path = "../datasets/09_Vectorized_university_documents.json"
df = pd.read_json(current_file_path)

base_url = "https://en.wikipedia.org/wiki/"

urls = []

uni_names = df["institution_name"]

for idx, uni in enumerate(uni_names):
    try:
        print("Starting API call for: ", uni, ".")
        uni_url = base_url + uni.replace(" ", "_")
        
        r = requests.get(uni_url)
        
        if r.ok: 
            print("Finished API call for: ", idx, " - ", uni, " - OK")
            urls.append(uni_url)
        else:
            print("Finished API call for: ", idx, " - ", uni, " - NOT FOUND")
            urls.append("NOT_FOUND")
       
    except Exception as ex:
        print("Failed API call for: ", idx, " - ", uni, " with exception: ", ex)
        urls.append("NOT_FOUND")
    
    print(uni_url)
    
uni_names_df = pd.DataFrame(urls)

df["url"] = uni_names_df

print("Could not find URL for ", df["url"][df["url"] == "NOT_FOUND"].count(), " universities.")

df_json = df.to_json(orient="records")

new_file = open(new_file_path, "w")
new_file.writelines(df_json)
new_file.close()

df.head()


Starting API call for:  University of Cambridge .
Finished API call for:  0  -  University of Cambridge  - OK
https://en.wikipedia.org/wiki/University_of_Cambridge
Starting API call for:  University of Oxford .
Finished API call for:  1  -  University of Oxford  - OK
https://en.wikipedia.org/wiki/University_of_Oxford
Starting API call for:  Imperial College London .
Finished API call for:  2  -  Imperial College London  - OK
https://en.wikipedia.org/wiki/Imperial_College_London
Starting API call for:  ETH Zurich .
Finished API call for:  3  -  ETH Zurich  - OK
https://en.wikipedia.org/wiki/ETH_Zurich
Starting API call for:  UCL .
Finished API call for:  4  -  UCL  - OK
https://en.wikipedia.org/wiki/UCL
Starting API call for:  University of Edinburgh .
Finished API call for:  5  -  University of Edinburgh  - OK
https://en.wikipedia.org/wiki/University_of_Edinburgh
Starting API call for:  Paris Sciences et Lettres University .
Finished API call for:  6  -  Paris Sciences et Lettres Unive

Unnamed: 0,2024_rank,2023_rank,institution_name,country_code,country,size,focus,age,status,academic_reputation_score,...,overall_score,institution_name_-_wrong,wikidata,wikipedia_text,foundation_date,city_name,city_wikipedia_text,coordinates,university_vector,url
0,2,2,University of Cambridge,UK,United Kingdom,large,FC,historic,A,100.0,...,99.2,University of Cambridge,"{""entities"": {""Q35794"": {""type"": ""item"", ""id"":...",The University of Cambridge is a public colleg...,,Cambridgeshire,Cambridgeshire (abbreviated Cambs.) is a cerem...,"52.205355979757925,0.1131572696396882","[0.123701483011245, -0.10545004159212101, 0.03...",https://en.wikipedia.org/wiki/University_of_Ca...
1,3,4,University of Oxford,UK,United Kingdom,large,FC,historic,A,100.0,...,98.9,University of Oxford,"{""entities"": {""Q34433"": {""type"": ""item"", ""id"":...",The University of Oxford is a collegiate resea...,,Oxford,"Oxford ()Dictionary.com, ""oxford"" in Dictionar...","51.755,-1.255","[0.08868616819381701, -0.087829910218715, 0.04...",https://en.wikipedia.org/wiki/University_of_Ox...
2,6,6,Imperial College London,UK,United Kingdom,large,FC,historic,A,98.3,...,97.8,Imperial College London,"{""entities"": {""Q189022"": {""type"": ""item"", ""id""...",Imperial College London (sometimes known simpl...,,Royal Borough of Kensington and Chelsea,The Royal Borough of Kensington and Chelsea (o...,"51.498308,-0.176882","[0.159273698925971, -0.06799332797527301, 0.07...",https://en.wikipedia.org/wiki/Imperial_College...
3,7,9,ETH Zurich,CH,Switzerland,large,FO,historic,A,98.8,...,93.3,ETH Zurich - Swiss Federal Institute of Techno...,"{""entities"": {""Q11942"": {""type"": ""item"", ""id"":...",ETH Zurich (; ) is a public research universit...,,Zürich,"thumb|Logo of the city of Zürich\nZürich ( , ;...","47.37645,8.54785","[0.070952259004116, 0.023061515763401003, 0.03...",https://en.wikipedia.org/wiki/ETH_Zurich
4,9,8,UCL,UK,United Kingdom,extra large,FC,historic,A,99.5,...,92.4,UCL,"{""entities"": {""Q250211"": {""type"": ""item"", ""id""...",UCL may refer to:\r\n Education \r\n Universit...,,NOT FOUND,NOT FOUND,"0.0,0.0","[-0.07706703245639801, -0.047585718333721, -0....",https://en.wikipedia.org/wiki/UCL


First we have to encode the file so that the SentenceTransformer module can work with it.

In [3]:
import json

json_file_path = "../../milestone_1/datasets/06_University_documents.json"
new_json_file_path = "../datasets/07_University_documents.json"

json_file = open(json_file_path)
data = json.load(json_file)
json_file.close()

with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [6]:
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    # The model.encode() method already returns a list of floats
    return model.encode(text, convert_to_tensor=False).tolist()

json_file_path = "../datasets/07_University_documents.json"
new_json_file_path = "../datasets/08_Vectorized_university_documents.json"

json_file = open(json_file_path, encoding="utf-8")
data = json.load(json_file)
json_file.close()

# Update each document in the JSON data

combined_text=""

for document in tqdm(data):
    # Extract fields if they exist, otherwise default to empty strings
    institution_name = document.get("institution_name", "")
    size = document.get("size", "")
    country = document.get("country", "")
    age = document.get("age", "")
    city_wikipedia_text = document.get("city_wikipedia_text", "")
    wikipedia_text = document.get("wikipedia_text", "")
    combined_text = city_wikipedia_text + " " + wikipedia_text 
    
    document["university_vector"] = get_embedding(combined_text)
        
with open(new_json_file_path, 'w', encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

100%|██████████| 519/519 [49:19<00:00,  5.70s/it]


## Query example

In [28]:
import requests
from sentence_transformers import SentenceTransformer

semantic_res = []
query_res = []

def text_to_embedding(text):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding = model.encode(text, convert_to_tensor=False).tolist()
    
    # Convert the embedding to the expected format
    embedding_str = "[" + ",".join(map(str, embedding)) + "]"
    return embedding_str

def solr_knn_query(endpoint, collection, embedding):
    url = f"{endpoint}/{collection}/select"

    data = {
        "q": "country: \"united kingdom\" country:biology country:top country:universities country:in country:biology country:courses\nwikipedia_text:\"united kingdom\" wikipedia_text:\"biology\" wikipedia_text:top wikipedia_text:universities wikipedia_text:in wikipedia_text:biology country:courses\ncity_wikipedia_text:\"united kingdom\" city_wikipedia_text:\"biology\" city_wikipedia_text:top city_wikipedia_text:universities city_wikipedia_text:in city_wikipedia_text:biology country:courses\nsize:\"united kingdom\" size:biology size:top size:universities size:in size:biology size:courses",
        "rqq": f"{{!knn f=university_vector topK=10}}{embedding}",
        "fl": "2024_rank,institution_name",
        "rows": 10,
        "wt": "json",
        "defType": "edismax",
        "indent": "true",
        "qf": "country^4 wikipedia_text^3 city_wikipedia_text^2 size",
        "q.op":"OR",
    }
    
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    response = requests.post(url, data=data, headers=headers)
    response.raise_for_status()
    return response.json()

def display_results(results):
    docs = results.get("response", {}).get("docs", [])
    if not docs:
        print("No results found.")
        return

    for doc in docs:
        print(f"* {doc.get('2024_rank')} {doc.get('institution_name')}")
        semantic_res.append({"2024_rank": doc.get('2024_rank'), "institution_name": doc.get('institution_name')})

def main():
    solr_endpoint = "http://localhost:8983/solr"
    collection = "universities"
    
    #query_text = input("Enter your query: ")
    embedding = text_to_embedding("top universites in united kingdom biology courses")

    try:
        results = solr_knn_query(solr_endpoint, collection, embedding)
        display_results(results)
    except requests.HTTPError as e:
        print(f"Error {e.response.status_code}: {e.response.text}")

def query_search():
    params = dict(
        origin='Chicago,IL',
        destination='Los+Angeles,CA',
        waypoints='Joplin,MO|Oklahoma+City,OK',
        sensor='false'
    )

    resp = requests.get(url="http://localhost:8983/solr/universities/select?indent=true&q.op=OR&q=country%3Afaculty%20country%3Aof%20country%3Aengineering%20country%3Ascience%20country%3Acity%20country%3Amediterranean%20country%3Aclimate%0Awikipedia_text%3Afaculty%20wikipedia_text%3Aof%20wikipedia_text%3Aengineering%20wikipedia_text%3A%20science%20wikipedia_text%3Acity%20wikipedia_text%3Amediterranean%20wikipedia_text%3Aclimate%0Acity_wikipedia_text%3Afaculty%20city_wikipedia_text%3Aof%20city_wikipedia_text%3Aengineering%20city_wikipedia_text%3Ascience%20city_wikipedia_text%3Acity%20city_wikipedia_text%3Amediterranean%20city_wikipedia_text%3Aclimate%0Asize%3Afaculty%20size%3Aof%20size%3Aengineering%20size%3Ascience%20size%3Acity%20size%3Amediterranean%20size%3Aclimate&useParams=", params=params)
    data = resp.json() 
    print(data)

if __name__ == "__main__":
    main()
    #query_search()
        


* None University of Plymouth
* None University of Manchester
* None University of Leicester
* None Aberystwyth University
* None University of Exeter
* None University of Sheffield
* None University of St Andrews
* None Newcastle University
* None Aston University
* None University of Liverpool


    \item Looking for universities that are top-ranked in computer science and are located in cities that have rich cultural heritage
    \item Looking for universities located in the United Kingdom that have courses in biology and are ranked in the top 150
    \item Looking for universities in Germany that have a dental medicine faculty/dentistry and a large number of students
    \item Looking for universities that have a faculty of engineering or a faculty of science and are located in a city with a Mediterranean climate
    \item Looking for top-ranked universities in the north of Europe with a focus on the Computer Science field