## Inference

In [36]:
import pandas as pd
import ast
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD

# Load the author and course tables.
df_authors = pd.read_csv('../data/authors/authors_final.csv')
df_courses = pd.read_csv('../data/courses/courses.csv')

# Create an empty RDF graph and register the prefixes used when serialising.
DS = Namespace("http://example.org/ds#")
g = Graph()
for prefix, namespace in (("ds", DS), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

# Index the author rows by their stripped ORCID string; when two rows share
# an ORCID the later row wins.
author_info = {}
for _, author_row in df_authors.iterrows():
    author_info[str(author_row['orcid']).strip()] = author_row

# Add every course and link it to the authors who teach it.
for _, row in df_courses.iterrows():
    course_name = row['Course']
    course_id = course_name.replace(' ', '_')
    course_uri = URIRef(f"http://example.org/ds/{course_id}")

    g.add((course_uri, RDF.type, DS.Course))
    g.add((course_uri, DS.courseName, Literal(course_name)))
    g.add((course_uri, RDFS.label, Literal(course_name)))

    # The 'orcid' cell is parsed as a Python literal (a list of ORCID strings).
    try:
        orcid_list = ast.literal_eval(row['orcid'])
    except Exception as e:
        print(f"‚ö†Ô∏è Errore parsing ORCID per '{course_name}': {e}")
        continue

    for orcid in orcid_list:
        orcid = orcid.strip().replace('"', '').replace("'", "")
        author_uri = URIRef(f"http://example.org/ds/{orcid}")

        # Minimal author description; details are added in the pass below.
        g.add((author_uri, RDF.type, DS.Author))
        g.add((author_uri, DS.hasORCID, Literal(orcid)))
        g.add((author_uri, DS.teaches, course_uri))

        if orcid not in author_info:
            print(f"‚ö†Ô∏è ORCID non trovato in df_authors: {orcid}")

# Describe every author listed in df_authors, whether or not they teach.
# This single pass runs once, after all courses have been processed. It
# replaces two identical enrichment blocks, one of which was nested inside the
# course loop and therefore re-ran for every course (and never ran at all when
# a course's ORCID list failed to parse). The resulting triples are the same.
AUTHOR_LITERAL_COLUMNS = (
    ('SSD', DS.hasSSD),
    ('openalex id', DS.hasOpenAlexID),
    ('topics', DS.hasTopic),
    ('past_institutions_id', DS.pastInstitutionsID),
    ('DS Department', DS.hasDSDepartment),
    ('ins_id', DS.hasInstitutionID),
)

for orcid, info in author_info.items():
    author_uri = URIRef(f"http://example.org/ds/{orcid}")

    g.add((author_uri, RDF.type, DS.Author))
    g.add((author_uri, DS.hasORCID, Literal(orcid)))

    name = str(info['Name'])
    g.add((author_uri, DS.fullName, Literal(name)))
    g.add((author_uri, RDFS.label, Literal(name)))  # label shown by RDF tools

    # The h-index is stored as a typed integer.
    if pd.notna(info['hindex']):
        g.add((author_uri, DS.hasHIndex, Literal(int(info['hindex']), datatype=XSD.integer)))

    # Remaining columns are copied as plain literals when the cell is filled.
    for column, predicate in AUTHOR_LITERAL_COLUMNS:
        if pd.notna(info[column]):
            g.add((author_uri, predicate, Literal(info[column])))
            
# --- Add the papers to the graph ---
df_papers = pd.read_csv('../data/papers/papers.csv')  # adjust the path if needed

for _, row in df_papers.iterrows():
    # A paper without a DOI cannot be given a URI, so it is skipped.
    if pd.isna(row['doi']):
        continue

    doi = row['doi'].strip()
    paper_uri = URIRef(f"http://example.org/ds/paper/{doi.replace('/', '_')}")

    g.add((paper_uri, RDF.type, DS.Paper))
    g.add((paper_uri, DS.hasDOI, Literal(doi)))

    # Optional descriptive fields: only emit a triple when the cell is filled,
    # so missing values do not end up in the graph as NaN literals.
    for column, predicate in (
        ('title', DS.hasTitle),
        ('type', DS.hasType),
        ('topics', DS.hasTopic),
    ):
        if pd.notna(row[column]):
            g.add((paper_uri, predicate, Literal(row[column])))

    # int() raises on NaN, so a missing year must be skipped explicitly.
    if pd.notna(row['year']):
        g.add((paper_uri, DS.hasYear, Literal(int(row['year']), datatype=XSD.gYear)))

    # Link the paper to its authors; 'author_orcids' is parsed as a Python literal.
    try:
        orcid_list = ast.literal_eval(row['author_orcids'])
    except Exception as e:
        print(f"‚ö†Ô∏è Errore parsing ORCID in paper {doi}: {e}")
        continue

    for orcid in orcid_list:
        if orcid is None or pd.isna(orcid):
            continue
        orcid = orcid.strip()
        author_uri = URIRef(f"http://example.org/ds/{orcid}")
        g.add((paper_uri, DS.hasAuthor, author_uri))
        g.add((author_uri, DS.authored, paper_uri))  # explicit inverse relation

# --- Add the institutions and link them to the authors ---
df_institutions = pd.read_csv('../data/institution/institutions.csv')  # adjust the path if needed

# Index institution rows by their id. Rows without an id are dropped:
# str(NaN) would otherwise create a bogus "nan" key, and with it a bogus
# <nan> institution that every author lacking an institution would link to.
institution_info = {
    str(row['ins_id']).strip(): row
    for _, row in df_institutions.iterrows()
    if pd.notna(row['ins_id'])
}

# Add the institutions to the graph.
for ins_id, row in institution_info.items():
    inst_uri = URIRef(ins_id)  # the OpenAlex URI is used directly as the node URI
    g.add((inst_uri, RDF.type, DS.Institution))
    # Emit descriptive triples only for filled cells (no NaN literals).
    for column, predicate in (
        ('ins_name', DS.institutionName),
        ('ins_type', DS.institutionType),
        ('ins_country', DS.institutionCountry),
        ('ins_name', RDFS.label),
    ):
        if pd.notna(row[column]):
            g.add((inst_uri, predicate, Literal(row[column])))

# Link each author to their institution when that institution is known.
# Authors with a missing ins_id yield the string "nan", which is no longer a
# key of institution_info, so they are skipped.
for orcid, info in author_info.items():
    orcid = orcid.strip()
    author_uri = URIRef(f"http://example.org/ds/{orcid}")
    ins_id = str(info['ins_id']).strip()

    if ins_id in institution_info:
        g.add((author_uri, DS.hasInstitution, URIRef(ins_id)))

from rdflib.namespace import RDF, RDFS, XSD, OWL

# Schema-level triples, added in one loop.
schema_triples = [
    # Class hierarchy
    (DS.Author, RDFS.subClassOf, DS.Person),
    # Property domains and ranges
    (DS.authored, RDFS.domain, DS.Author),
    (DS.authored, RDFS.range, DS.Paper),
    (DS.hasAuthor, RDFS.domain, DS.Paper),
    (DS.hasAuthor, RDFS.range, DS.Author),
    # Inverse properties (RDFS alone does not derive them)
    (DS.authored, OWL.inverseOf, DS.hasAuthor),
]
for schema_triple in schema_triples:
    g.add(schema_triple)

from rdflib import Graph, URIRef
from rdflib.namespace import OWL
import pandas as pd

# Load the institutions table that carries the Wikidata URIs
# (change the path if necessary).
df = pd.read_csv("../data/institution/institutions_wiki.csv")

# The triples are added to the graph `g` built earlier in this cell; no new
# graph is created here.

# Add an owl:sameAs triple only when both the institution id and the Wikidata
# URI are present (a row may have neither).
for _, row in df.iterrows():
    ins_id = row.get("ins_id")
    wikidata_id = row.get("wikidata_id")
    if pd.isna(ins_id) or pd.isna(wikidata_id):
        continue
    wikidata_id = wikidata_id.strip()
    if wikidata_id == "":
        continue
    g.add((URIRef(ins_id.strip()), OWL.sameAs, URIRef(wikidata_id)))

# Serialise to Turtle and print only the first 2000 characters as a preview.
ttl_output = g.serialize(format="turtle")
print(ttl_output[:2000])


@prefix ds: <http://example.org/ds#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ds:authored rdfs:domain ds:Author ;
    rdfs:range ds:Paper ;
    owl:inverseOf ds:hasAuthor .

<http://example.org/ds/0000-0001-9601-0403> a ds:Author ;
    rdfs:label "Mirko Cesarini" ;
    ds:fullName "Mirko Cesarini" ;
    ds:hasDSDepartment 1 ;
    ds:hasHIndex 15 ;
    ds:hasInstitution <https://openalex.org/I66752286> ;
    ds:hasInstitutionID "https://openalex.org/I66752286" ;
    ds:hasORCID "0000-0001-9601-0403" ;
    ds:hasOpenAlexID "https://openalex.org/A5049259722" ;
    ds:hasSSD "ING-INF/05" ;
    ds:hasTopic "['Data Quality and Management', 'Semantic Web and Ontologies', 'Data Mining Algorithms and Applications', 'Privacy-Preserving Technologies in Data', 'Advanced Database Systems and Queries', 'Service-Oriented Architecture and Web Services', 'Big Data and Business Intelligen

In [38]:
import pandas as pd
import ast
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, OWL

# === Load the input tables ===
df_authors, df_courses, df_papers, df_wiki = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/authors/authors_final.csv',
        '../data/courses/courses.csv',
        '../data/papers/papers.csv',
        '../data/institution/institutions_wiki.csv',
    )
)

# === Create the graph and register the serialisation prefixes ===
DS = Namespace("http://example.org/ds#")
g = Graph()
for prefix, namespace in (("ds", DS), ("rdfs", RDFS), ("owl", OWL)):
    g.bind(prefix, namespace)

# === Index author rows by stripped ORCID string (later duplicates win) ===
author_info = {}
for _, author_row in df_authors.iterrows():
    author_info[str(author_row['orcid']).strip()] = author_row

# === Add courses and link them to the authors who teach them ===
for _, row in df_courses.iterrows():
    course_name = row['Course']
    course_id = course_name.replace(' ', '_')
    course_uri = URIRef(f"http://example.org/ds/{course_id}")
    g.add((course_uri, RDF.type, DS.Course))
    g.add((course_uri, DS.courseName, Literal(course_name)))
    g.add((course_uri, RDFS.label, Literal(course_name)))

    # The 'orcid' cell is parsed as a Python literal (a list of ORCID strings).
    try:
        orcid_list = ast.literal_eval(row['orcid'])
    except Exception as e:
        print(f"‚ö†Ô∏è Errore parsing ORCID per '{course_name}': {e}")
        continue

    for orcid in orcid_list:
        orcid = orcid.strip().replace('"', '').replace("'", "")
        author_uri = URIRef(f"http://example.org/ds/{orcid}")
        # Minimal description; details are filled in by the pass below.
        g.add((author_uri, RDF.type, DS.Author))
        g.add((author_uri, DS.hasORCID, Literal(orcid)))
        g.add((author_uri, DS.teaches, course_uri))

# === Describe every author in df_authors (teaching or not) ===
# The same enrichment block used to be written twice, once for teaching
# authors and once for the rest. Triples are idempotent, so one pass over
# author_info yields exactly the same graph.
AUTHOR_LITERAL_COLUMNS = (
    ('SSD', DS.hasSSD),
    ('openalex id', DS.hasOpenAlexID),
    ('topics', DS.hasTopic),
    ('past_institutions_id', DS.pastInstitutionsID),
    ('DS Department', DS.hasDSDepartment),
    ('ins_id', DS.hasInstitutionID),
)

for orcid, info in author_info.items():
    author_uri = URIRef(f"http://example.org/ds/{orcid}")
    g.add((author_uri, RDF.type, DS.Author))
    g.add((author_uri, DS.hasORCID, Literal(orcid)))
    g.add((author_uri, DS.fullName, Literal(info['Name'])))
    g.add((author_uri, RDFS.label, Literal(info['Name'])))

    # The h-index is stored as a typed integer.
    if pd.notna(info['hindex']):
        g.add((author_uri, DS.hasHIndex, Literal(int(info['hindex']), datatype=XSD.integer)))

    # Remaining columns are copied as plain literals when the cell is filled.
    for column, predicate in AUTHOR_LITERAL_COLUMNS:
        if pd.notna(info[column]):
            g.add((author_uri, predicate, Literal(info[column])))

# === Add papers ===
for _, row in df_papers.iterrows():
    # A paper without a DOI cannot be given a URI, so it is skipped.
    if pd.isna(row['doi']):
        continue
    doi = row['doi'].strip()
    paper_uri = URIRef(f"http://example.org/ds/paper/{doi.replace('/', '_')}")
    g.add((paper_uri, RDF.type, DS.Paper))
    g.add((paper_uri, DS.hasDOI, Literal(doi)))

    # Optional descriptive fields: emit a triple only when the cell is filled,
    # so missing values do not end up in the graph as NaN literals.
    for column, predicate in (
        ('title', DS.hasTitle),
        ('type', DS.hasType),
        ('topics', DS.hasTopic),
    ):
        if pd.notna(row[column]):
            g.add((paper_uri, predicate, Literal(row[column])))

    # int() raises on NaN, so a missing year must be skipped explicitly.
    if pd.notna(row['year']):
        g.add((paper_uri, DS.hasYear, Literal(int(row['year']), datatype=XSD.gYear)))

    # 'author_orcids' is parsed as a Python literal (a list of ORCID strings).
    try:
        orcid_list = ast.literal_eval(row['author_orcids'])
    except Exception as e:
        print(f"‚ö†Ô∏è Errore parsing ORCID in paper {doi}: {e}")
        continue

    for orcid in orcid_list:
        if orcid and pd.notna(orcid):
            orcid = orcid.strip()
            author_uri = URIRef(f"http://example.org/ds/{orcid}")
            g.add((paper_uri, DS.hasAuthor, author_uri))
            g.add((author_uri, DS.authored, paper_uri))

# === Add institutions from df_wiki ===
# Rows without an id are dropped: str(NaN) would otherwise create a bogus
# "nan" key, and with it a bogus <nan> institution that every author lacking
# an institution would be linked to.
institution_info = {
    str(row['ins_id']).strip(): row
    for _, row in df_wiki.iterrows()
    if pd.notna(row['ins_id'])
}

for ins_id, row in institution_info.items():
    inst_uri = URIRef(ins_id)
    g.add((inst_uri, RDF.type, DS.Institution))
    # Emit descriptive triples only for filled cells (no NaN literals).
    for column, predicate in (
        ('ins_name', DS.institutionName),
        ('ins_type', DS.institutionType),
        ('ins_country', DS.institutionCountry),
        ('ins_name', RDFS.label),
    ):
        if pd.notna(row[column]):
            g.add((inst_uri, predicate, Literal(row[column])))

    # Link to the matching Wikidata entity when one was resolved.
    wikidata_id = row['wikidata_id']
    if pd.notna(wikidata_id) and wikidata_id.strip():
        g.add((inst_uri, OWL.sameAs, URIRef(wikidata_id.strip())))

# === Link authors to institutions (only when the ins_id is known) ===
# A missing ins_id becomes the string "nan", which is not a key of
# institution_info, so such authors are skipped.
for orcid, info in author_info.items():
    author_uri = URIRef(f"http://example.org/ds/{orcid}")
    ins_id = str(info['ins_id']).strip()
    if ins_id in institution_info:
        g.add((author_uri, DS.hasInstitution, URIRef(ins_id)))

# === Base ontology: class hierarchy, domains/ranges and inverse property ===
schema_triples = (
    (DS.Author, RDFS.subClassOf, DS.Person),
    (DS.authored, RDFS.domain, DS.Author),
    (DS.authored, RDFS.range, DS.Paper),
    (DS.hasAuthor, RDFS.domain, DS.Paper),
    (DS.hasAuthor, RDFS.range, DS.Author),
    (DS.authored, OWL.inverseOf, DS.hasAuthor),
)
for schema_triple in schema_triples:
    g.add(schema_triple)

# === Serialise to Turtle and show the first 2000 characters ===
ttl_output = g.serialize(format="turtle")
preview_end = 2000
print(ttl_output[:preview_end])


@prefix ds: <http://example.org/ds#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ds:authored rdfs:domain ds:Author ;
    rdfs:range ds:Paper ;
    owl:inverseOf ds:hasAuthor .

<http://example.org/ds/0000-0001-9601-0403> a ds:Author ;
    rdfs:label "Mirko Cesarini" ;
    ds:fullName "Mirko Cesarini" ;
    ds:hasDSDepartment 1 ;
    ds:hasHIndex 15 ;
    ds:hasInstitution <https://openalex.org/I66752286> ;
    ds:hasInstitutionID "https://openalex.org/I66752286" ;
    ds:hasORCID "0000-0001-9601-0403" ;
    ds:hasOpenAlexID "https://openalex.org/A5049259722" ;
    ds:hasSSD "ING-INF/05" ;
    ds:hasTopic "['Data Quality and Management', 'Semantic Web and Ontologies', 'Data Mining Algorithms and Applications', 'Privacy-Preserving Technologies in Data', 'Advanced Database Systems and Queries', 'Service-Oriented Architecture and Web Services', 'Big Data and Business Intelligen

In [39]:
# Write the graph to disk in Turtle format. By execution count this cell ran
# before the OWL-RL closure cell, so the saved file holds the graph as it was
# before that expansion.
ttl_path = "../data/knowledge_base2.ttl"
g.serialize(destination=ttl_path, format="turtle")

# Echo the output path as the cell result.
ttl_path

'../data/knowledge_base2.ttl'

In [40]:
from owlrl import DeductiveClosure, OWLRL_Semantics
from rdflib.namespace import OWL

# Run the owlrl reasoner with OWL-RL semantics over `g`.
# NOTE(review): the later query for `?person a ds:Person` presumably relies on
# this step, since the graph only asserts ds:Author types plus
# ds:Author rdfs:subClassOf ds:Person — confirm.
DeductiveClosure(OWLRL_Semantics).expand(g)



In [41]:
# Distinct full names of every ds:Person that teaches at least one course.
teachers_query = """
PREFIX ds: <http://example.org/ds#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?personName
WHERE {
  ?person a ds:Person ;
          ds:teaches ?course ;
          ds:fullName ?personName .
}
"""
results = g.query(teachers_query)

# One name per line.
for row in results:
    print(row.personName)


Paola Rebora
Davide Chicco
Michele Ciavotta
Andrea Maurino
Elisabetta Fersini
Enrico Moretto
Paolo Napoletano
Marco Viviani
Marco Guerzoni
Dario Pescini
Matteo Palmonari
Gianfranco Forte
Enza Messina
Fabio Antonio Stella
Claudio Ferretti
Marco Fattore
Fulvia Pennoni
Gianna Monti
Fabio Mercorio
Gabriele Gianini
Davide Paolo Bernasconi
Pier Giovanni Bissiri
Marco Paganoni
Simone Bianco
Gianluca Della Vedova
Mirko Cesarini
Matteo Pelagatti
Luca Presotto
Gabriella Pasi


In [42]:
# First ten (paper, author name) pairs for authors that have a ds:fullName.
paper_authors_query = """
PREFIX ds: <http://example.org/ds#>
SELECT DISTINCT ?paper ?authorName
WHERE {
  ?paper a ds:Paper ;
         ds:hasAuthor ?author .
  ?author ds:fullName ?authorName .
}
LIMIT 10
"""
results = g.query(paper_authors_query)

# One line per pair: paper URI followed by the author's name.
for row in results:
    print(f"üìÑ {row.paper} ‚Äî üë®‚Äçüíº {row.authorName}")


üìÑ http://example.org/ds/paper/10.1016_j.jhep.2023.09.008 ‚Äî üë®‚Äçüíº Riccardo De Carlis
üìÑ http://example.org/ds/paper/10.1016_j.jhep.2023.09.008 ‚Äî üë®‚Äçüíº Andrea Lauterio
üìÑ http://example.org/ds/paper/10.1016_j.jhep.2023.09.008 ‚Äî üë®‚Äçüíº Davide Paolo Bernasconi
üìÑ http://example.org/ds/paper/10.1016_j.jhep.2023.09.008 ‚Äî üë®‚Äçüíº C. Burcin Taner
üìÑ http://example.org/ds/paper/10.3390_curroncol28060391 ‚Äî üë®‚Äçüíº Nicol√≤ Tamini
üìÑ http://example.org/ds/paper/10.3390_curroncol28060391 ‚Äî üë®‚Äçüíº Luca Gianotti
üìÑ http://example.org/ds/paper/10.3390_curroncol28060391 ‚Äî üë®‚Äçüíº Davide Paolo Bernasconi
üìÑ http://example.org/ds/paper/10.3390_cancers13071745 ‚Äî üë®‚Äçüíº Nicol√≤ Tamini
üìÑ http://example.org/ds/paper/10.3390_cancers13071745 ‚Äî üë®‚Äçüíº Davide Paolo Bernasconi
üìÑ http://example.org/ds/paper/10.3390_cancers13071745 ‚Äî üë®‚Äçüíº Lorenzo Ripamonti


## Institution

In [13]:
import requests

# OpenAlex URI of the institution to resolve (Milano-Bicocca).
openalex_id = "https://openalex.org/I66752286"
REQUEST_TIMEOUT = 10  # seconds; keeps the cell from hanging on a stalled connection

# Step 1: fetch the OpenAlex record. The API URL is derived from openalex_id
# so the identifier is written in one place only.
openalex_key = openalex_id.rstrip("/").split("/")[-1]
openalex_resp = requests.get(
    f"https://api.openalex.org/institutions/{openalex_key}",
    timeout=REQUEST_TIMEOUT,
)
openalex_resp.raise_for_status()  # fail loudly instead of parsing an error body
data = openalex_resp.json()

# Step 2: take the ROR id from the OpenAlex record.
ror_uri = data.get("ror")
if not ror_uri:
    print("Nessun ROR ID trovato.")
else:
    ror_id = ror_uri.strip().split("/")[-1]
    ror_api_url = f"https://api.ror.org/organizations/{ror_id}"

    # Step 3: call the ROR API and read the Wikidata ids from its response.
    ror_resp = requests.get(ror_api_url, timeout=REQUEST_TIMEOUT)
    if ror_resp.status_code != 200:
        print("Errore nella chiamata ROR:", ror_resp.status_code)
    else:
        ror_data = ror_resp.json()
        wikidata_ids = ror_data.get("external_ids", {}).get("Wikidata", {}).get("all", [])
        if wikidata_ids:
            wikidata_id = wikidata_ids[0]
            print(f"Wikidata ID: {wikidata_id}")
            print("Tripla:")
            # The object IRI is now closed with '>' (it was missing before).
            print(f"<{openalex_id}> owl:sameAs <http://www.wikidata.org/entity/{wikidata_id}> .")
        else:
            print("Nessun Wikidata ID trovato.")


Wikidata ID: Q1073674
Tripla:
<https://openalex.org/I66752286> owl:sameAs <http://www.wikidata.org/entity/Q1073674 .


In [18]:
import pandas as pd
import requests


# Load the institutions table; its 'ins_id' column is what the lookup below
# is applied to.
df = pd.read_csv('../data/institution/institutions.csv')


# Resolve a Wikidata entity URI for an institution via OpenAlex -> ROR -> Wikidata.
def get_wikidata_id_from_openalex(openalex_url, timeout=10):
    """Return the Wikidata entity URI for an OpenAlex institution, or None.

    The lookup is best effort: it asks OpenAlex for the institution's ROR id,
    then asks ROR for the Wikidata ids and returns the first one as a
    ``http://www.wikidata.org/entity/<id>`` URI.

    Parameters
    ----------
    openalex_url : str
        OpenAlex institution URI or id; only the last path segment is used.
        Non-string or blank values (e.g. NaN from an empty CSV cell) return
        None without any network call.
    timeout : float, optional
        Seconds to wait for each HTTP request (default 10).

    Returns
    -------
    str or None
        The Wikidata entity URI, or None when it cannot be resolved
        (unknown institution, no ROR id, no Wikidata id, HTTP or parse error).
    """
    # Empty cells arrive as NaN (a float): nothing to look up.
    if not isinstance(openalex_url, str) or not openalex_url.strip():
        return None

    try:
        # Extract the OpenAlex id (last path segment).
        openalex_id = openalex_url.strip().rstrip("/").split("/")[-1]
        openalex_resp = requests.get(
            f"https://api.openalex.org/institutions/{openalex_id}", timeout=timeout
        )
        if openalex_resp.status_code != 200:
            return None
        ror_uri = openalex_resp.json().get("ror")
        if not ror_uri:
            return None

        ror_id = ror_uri.strip().split("/")[-1]
        ror_resp = requests.get(
            f"https://api.ror.org/organizations/{ror_id}", timeout=timeout
        )
        if ror_resp.status_code != 200:
            return None
        ror_data = ror_resp.json()
        wikidata_ids = ror_data.get("external_ids", {}).get("Wikidata", {}).get("all", [])
        if wikidata_ids:
            return f"http://www.wikidata.org/entity/{wikidata_ids[0]}"
        return None
    except Exception as exc:
        # Still best effort (one failing row must not abort the whole column),
        # but the failure is now reported instead of silently dropped.
        print(f"Wikidata lookup failed for {openalex_url!r}: {exc}")
        return None

# Resolve the Wikidata URI for every institution id (the lookup performs
# HTTP requests for each row).
wikidata_column = df["ins_id"].apply(get_wikidata_id_from_openalex)
df["wikidata_id"] = wikidata_column



In [21]:
# Save the enriched table. index=False keeps the row index out of the file;
# otherwise it is written as an unnamed first column that comes back as
# "Unnamed: 0" when the CSV is read again.
df.to_csv('../data/institution/institutions_wiki.csv', index=False)

In [20]:
# Show the institutions for which no Wikidata URI was resolved.
df[df["wikidata_id"].isna()]


Unnamed: 0,ins_id,ins_name,ins_type,ins_country,wikidata_id
3,https://openalex.org/I4210110840,Azienda Ospedaliera San Gerardo,healthcare,IT,
5,https://openalex.org/I4210153126,Istituti di Ricovero e Cura a Carattere Scient...,healthcare,IT,
7,https://openalex.org/I4210151645,Policlinico San Matteo Fondazione,healthcare,IT,
9,https://openalex.org/I2277624104,Fondazione Bruno Kessler,funder,IT,
13,https://openalex.org/I4210139705,Ingegneria dei Sistemi (Italy),company,IT,
21,https://openalex.org/I4210105192,United Institute of Informatics Problems,facility,BY,
22,https://openalex.org/I4210117802,Institute of Electronics,nonprofit,BG,
24,,,,,
37,https://openalex.org/I4210125301,Health Awareness (United States),company,US,
62,https://openalex.org/I4210095629,Institute of Molecular Bioimaging and Physiology,facility,IT,


In [22]:
# Display the institutions table with the new 'wikidata_id' column.
df

Unnamed: 0,ins_id,ins_name,ins_type,ins_country,wikidata_id
0,https://openalex.org/I138689650,University of Padua,funder,IT,http://www.wikidata.org/entity/Q193510
1,https://openalex.org/I4210094195,Azienda Socio Sanitaria Territoriale Grande Os...,healthcare,IT,http://www.wikidata.org/entity/Q3886620
2,https://openalex.org/I4210146710,Mayo Clinic in Florida,healthcare,US,http://www.wikidata.org/entity/Q6797499
3,https://openalex.org/I4210110840,Azienda Ospedaliera San Gerardo,healthcare,IT,
4,https://openalex.org/I66752286,University of Milano-Bicocca,funder,IT,http://www.wikidata.org/entity/Q1073674
...,...,...,...,...,...
122,https://openalex.org/I108290504,University of Pisa,funder,IT,http://www.wikidata.org/entity/Q645663
123,https://openalex.org/I4210156583,Laboratoire d'Informatique de Paris-Nord,facility,FR,http://www.wikidata.org/entity/Q3214424
124,https://openalex.org/I135117807,Universit√© de Sherbrooke,funder,CA,http://www.wikidata.org/entity/Q2579532
125,https://openalex.org/I186771145,Covenant University,funder,NG,http://www.wikidata.org/entity/Q742241


### Add the owl:sameAs links to the graph

In [37]:
from rdflib import Graph, URIRef
from rdflib.namespace import OWL
import pandas as pd

# Load the institutions table that carries the Wikidata URIs
# (change the path if necessary).
df = pd.read_csv("../data/institution/institutions_wiki.csv")

# The triples are added to the existing graph `g`, which must already have
# been built by the graph-construction cell; no new graph is created here.

# Add an owl:sameAs triple only when both the institution id and the Wikidata
# URI are present (a row may have neither).
for _, row in df.iterrows():
    ins_id = row.get("ins_id")
    wikidata_id = row.get("wikidata_id")
    if pd.isna(ins_id) or pd.isna(wikidata_id):
        continue
    wikidata_id = wikidata_id.strip()
    if wikidata_id == "":
        continue
    g.add((URIRef(ins_id.strip()), OWL.sameAs, URIRef(wikidata_id)))


In [31]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Ranking statements (P1352) of wd:Q1073674 whose ranking-method label
# contains "QS", oldest first.
qs_ranking_query = """
SELECT ?rank ?year ?rankingLabel
WHERE {
  wd:Q1073674 p:P1352 ?rankStatement .
  ?rankStatement ps:P1352 ?rank .
  ?rankStatement pq:P585 ?year .
  ?rankStatement pq:P459 ?rankingMethod .
  
  ?rankingMethod rdfs:label ?rankingLabel .
  FILTER(LANG(?rankingLabel) = "en")
  FILTER(CONTAINS(?rankingLabel, "QS"))
}
ORDER BY ?year
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(qs_ranking_query)
results = sparql.query().convert()

# One line per ranking statement.
bindings = results["results"]["bindings"]
for result in bindings:
    print(f"Year: {result['year']['value']}, Rank: {result['rank']['value']}, Method: {result['rankingLabel']['value']}")


Year: 2022-01-01T00:00:00Z, Rank: 450, Method: QS World University Rankings
Year: 2024-01-01T00:00:00Z, Rank: 481, Method: QS World University Rankings
Year: 2025-01-01T00:00:00Z, Rank: 513, Method: QS World University Rankings


In [35]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Optional location facts about wd:Q1073674: country, administrative area,
# coordinates, street address and postal code.
location_query = """
SELECT ?countryLabel ?adminLabel ?coord ?address ?cap
WHERE {
  OPTIONAL { wd:Q1073674 wdt:P17 ?country . }
  OPTIONAL { wd:Q1073674 wdt:P131 ?admin . }
  OPTIONAL { wd:Q1073674 wdt:P625 ?coord . }
  OPTIONAL { wd:Q1073674 wdt:P6375 ?address . }
  OPTIONAL { wd:Q1073674 wdt:P281 ?cap . }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(location_query)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    # Variables that are unbound in a row fall back to "N/A".
    values = {
        variable: result.get(variable, {}).get("value", "N/A")
        for variable in ("countryLabel", "adminLabel", "coord", "address", "cap")
    }
    country = values["countryLabel"]
    admin = values["adminLabel"]
    coord = values["coord"]
    address = values["address"]
    cap = values["cap"]

    print(f"Country: {country}")
    print(f"Administrative area: {admin}")
    print(f"Coordinates: {coord}")
    print(f"Address: {address}")
    print(f"Postal code: {cap}")


Country: Italy
Administrative area: Milan
Coordinates: Point(9.213344 45.518406)
Address: Piazza dell'Ateneo Nuovo 1
Postal code: N/A


## Query con Aggiunta di wikidata

In [49]:
# Papers (URI and title) authored by anyone whose full name contains
# "noemi gozzi", compared case-insensitively.
q2 = """
PREFIX ds: <http://example.org/ds#>
SELECT ?paper ?title
WHERE {
  ?author a ds:Author ;
          ds:fullName ?name ;
          ds:authored ?paper .
  ?paper ds:hasTitle ?title .
  FILTER(CONTAINS(LCASE(STR(?name)), "noemi gozzi"))
}
"""

matching_papers = g.query(q2)
for row in matching_papers:
    print(f"üìÑ {row.title} ‚Äî URI: {row.paper}")


üìÑ XAI for myo-controlled prosthesis: Explaining EMG data for hand gesture classification ‚Äî URI: http://example.org/ds/paper/10.1016_j.knosys.2021.108053


In [87]:
from rdflib.namespace import OWL

# SPARQL query: find the Wikidata URI (owl:sameAs) of the institution whose
# ds:institutionName contains "tu wien", compared case-insensitively.
# (The variable keeps its historical name `query_eth`; the search term is
# "tu wien", not ETH Zurich.)
query_eth = """
PREFIX ds: <http://example.org/ds#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?wikidata
WHERE {
  ?inst a ds:Institution ;
        ds:institutionName ?name ;
        owl:sameAs ?wikidata .
  FILTER(CONTAINS(LCASE(STR(?name)), "tu wien"))
}
"""

# Keep only the first match; wikidata_uri stays None when nothing matches.
first_match = next(iter(g.query(query_eth)), None)
wikidata_uri = None
if first_match is not None:
    wikidata_uri = str(first_match.wikidata)
    print(f"üîó Wikidata URI: {wikidata_uri}")




üîó Wikidata URI: http://www.wikidata.org/entity/Q689400


In [74]:
# Display the Wikidata URI found by the lookup cell.
# NOTE(review): the saved output below (execution count 74) shows a different
# entity from the one printed by the lookup cell above (execution count 87),
# so it comes from an earlier run — re-run to refresh.
wikidata_uri

'http://www.wikidata.org/entity/Q11942'

In [88]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Fetch the QS ranking history of the institution found by the lookup cell
# (`wikidata_uri`), newest year first.
if wikidata_uri:
    wikidata_id = wikidata_uri.split("/")[-1]
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(JSON)

    # The ranking method (qualifier P459) and its English label are bound
    # here, and results are restricted to methods whose label contains "QS".
    # Previously ?rankingLabel was selected but never bound, and the print
    # loop used an undefined `label`.
    sparql.setQuery(f"""
    SELECT ?rank ?year ?rankingLabel
    WHERE {{
      wd:{wikidata_id} p:P1352 ?rankStatement .
      ?rankStatement ps:P1352 ?rank .
      ?rankStatement pq:P585 ?year .
      ?rankStatement pq:P459 ?rankingMethod .

      ?rankingMethod rdfs:label ?rankingLabel .
      FILTER(LANG(?rankingLabel) = "en")
      FILTER(CONTAINS(?rankingLabel, "QS"))
    }}
    ORDER BY DESC(?year)
    """)

    print(f"üì° Interrogazione di Wikidata per {wikidata_id} (QS Ranking)...")
    results = sparql.query().convert()

    # The heading names the queried entity instead of a hard-coded
    # institution, since the lookup may target any institution.
    print(f"üìä QS World University Rankings ‚Äì {wikidata_id}:")
    for res in results["results"]["bindings"]:
        year = res["year"]["value"][:4]  # keep only the year of the timestamp
        rank = res["rank"]["value"]
        label = res["rankingLabel"]["value"]

        print(f"üóì {year} ‚Üí üèÜ Rank: {rank} ({label})")
else:
    print("‚ùå Nessun URI Wikidata trovato")


üì° Interrogazione di Wikidata per Q689400 (QS Ranking)...
üìä QS World University Rankings ‚Äì Bicocca:
üóì 2025 ‚Üí üèÜ Rank: 190 (QS World University Rankings)
üóì 2024 ‚Üí üèÜ Rank: 184 (QS World University Rankings)
üóì 2023 ‚Üí üèÜ Rank: 179 (QS World University Rankings)
üóì 2022 ‚Üí üèÜ Rank: 180 (QS World University Rankings)
üóì 2021 ‚Üí üèÜ Rank: 191 (QS World University Rankings)
üóì 2020 ‚Üí üèÜ Rank: 192 (QS World University Rankings)
üóì 2019 ‚Üí üèÜ Rank: 199 (QS World University Rankings)
üóì 2018 ‚Üí üèÜ Rank: 182 (QS World University Rankings)
üóì 2017 ‚Üí üèÜ Rank: 183 (QS World University Rankings)
üóì 2016 ‚Üí üèÜ Rank: 197 (QS World University Rankings)
üóì 2015 ‚Üí üèÜ Rank: 246 (QS World University Rankings)
üóì 2014 ‚Üí üèÜ Rank: 264 (QS World University Rankings)
üóì 2012 ‚Üí üèÜ Rank: 274 (QS World University Rankings)


In [45]:
# Display the raw JSON of the most recent SPARQL response.
# NOTE(review): the saved output below lists the variables 'typeLabel' and
# 'countryLabel', which no query visible in this notebook selects — it
# presumably comes from a cell that was later edited or removed.
results

{'head': {'vars': ['typeLabel', 'countryLabel']},
 'results': {'bindings': [{'typeLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'university'},
    'countryLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Italy'}}]}}