In [3]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD


df_authors = pd.read_csv('../data/authors/authors_internal_prova.csv')
df_courses = pd.read_csv('../data/courses/courses.csv')
# Define namespace
DS = Namespace("http://example.org/ds#")
g = Graph()
g.bind("ds", DS)

# Add authors and their properties
for _, row in df_authors.iterrows():
    author_uri = URIRef(f"http://example.org/ds/{row['orcid']}")
    course_uri = URIRef(f"http://example.org/ds/{row['Course'].replace(' ', '_')}")
    
    g.add((author_uri, RDF.type, DS.Author))
    g.add((author_uri, DS.hasORCID, Literal(row['orcid'])))
    g.add((author_uri, DS.fullName, Literal(f"{row['Name']} {row['Last name']}")))
    g.add((author_uri, DS.hasSSD, Literal(row['SSD'])))
    g.add((author_uri, DS.hasHIndex, Literal(row['hindex'], datatype=XSD.integer)))
    g.add((author_uri, DS.hasOpenAlexID, Literal(row['openalex id'])))
    g.add((author_uri, DS.teaches, course_uri))
    g.add((author_uri, DS.hasTopic, Literal(row['topics'])))

# Add courses (aggregated by name)
unique_courses = df_courses["Course"].unique()
for course in unique_courses:
    course_uri = URIRef(f"http://example.org/ds/{course.replace(' ', '_')}")
    g.add((course_uri, RDF.type, DS.Course))
    g.add((course_uri, DS.courseName, Literal(course)))

# Serialize to Turtle format
ttl_output = g.serialize(format="turtle")
ttl_output[:2000]  # Show a sample of the RDF output


'@prefix ds: <http://example.org/ds#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n\n<http://example.org/ds/0000-0001-5186-0199> a ds:Author ;\n    ds:fullName "Gabriele Gianini" ;\n    ds:hasHIndex 32.0 ;\n    ds:hasORCID "0000-0001-5186-0199" ;\n    ds:hasOpenAlexID "https://openalex.org/A5037738148" ;\n    ds:hasSSD "INF/01" ;\n    ds:hasTopic "[\'Particle physics theoretical and experimental studies\', \'Quantum Chromodynamics and Particle Interactions\', \'High-Energy Particle Collisions Research\', \'Particle Detector Development and Performance\', \'Radiation Detection and Scintillator Technologies\', \'Image Enhancement Techniques\', \'Neutrino Physics Research\', \'Peer-to-Peer Network Technologies\', \'Superconducting Materials and Applications\', \'Mobile Ad Hoc Networks\', \'Opportunistic and Delay-Tolerant Networks\', \'CCD and CMOS Imaging Sensors\', \'Energy Efficient Wireless Sensor Networks\', \'Atomic and Subatomic Physics Research\', \'Particle Accelerators

In [2]:
pip install rdflib

Collecting rdflib
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rdflib
Successfully installed rdflib-7.1.4
Note: you may need to restart the kernel to use updated packages.


In [5]:
ttl_path = "../data/knowledge_base.ttl"
g.serialize(destination=ttl_path, format="turtle")

ttl_path

'../data/knowledge_base.ttl'