In [1]:
import pandas as pd, pathlib, re, math, pathlib
from slugify import slugify
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, XSD


In [2]:
DATA = pathlib.Path("data")
PUB = Namespace("http://www.example.org/publication#")
RES = Namespace("http://www.example.org/resource/")
g   = Graph()
g.bind("pub", PUB)


aid  = lambda x: RES[f"author/{str(x).strip()}"]
pid  = lambda x: RES[f"paper/{str(x).strip()}"]
kid  = lambda x: RES[f"keyword/{slugify(str(x))}"]
jid  = lambda x: RES[f"journal/{slugify(str(x))}"]
vid  = lambda j,v: RES[f"journal/{slugify(str(j))}/vol/{int(v)}"]
eid  = lambda n: RES[f"event/{slugify(str(n))}"]
proc = lambda i: RES[f"proceeding/{str(i).strip()}"]
year_uri = lambda y: RES[f"year/{int(float(y))}"]
city_uri = lambda c: RES[f"city/{slugify(str(c))}"]

In [3]:
for _,r in pd.read_csv(DATA/"author_nodes.csv").iterrows():
    a = aid(r.authorId)
    g.add((a,RDF.type,PUB.Author))
    g.add((a,PUB.name,Literal(r.name)))
    g.add((a,PUB.email,Literal(r.email)))

In [4]:
for _,r in pd.read_csv(DATA/"author_nodes.csv").iterrows():
    a = aid(r.authorId)
    g.add((a,RDF.type,PUB.Author))
    g.add((a,PUB.name,Literal(r.name)))
    g.add((a,PUB.email,Literal(r.email)))

In [5]:
for _,r in pd.read_csv(DATA/"paper_nodes.csv").iterrows():
    p = pid(r.paperId)
    g.add((p,RDF.type,PUB.Paper))
    g.add((p,PUB.title,Literal(r.title)))
    absn = BNode()
    g.add((absn,RDF.type,PUB.Abstract))
    g.add((absn,PUB.text,Literal(r.abstract)))
    g.add((p,PUB.contains,absn))
    g.add((p,PUB.pages,Literal(int(r.pages))))
    g.add((p,PUB.doi,Literal(str(r.doi))))
    g.add((p,PUB.url,Literal(r.url,datatype=XSD.anyURI)))
    g.add((p,PUB.citationCount,Literal(int(r.citationCount))))

In [6]:
pd.read_csv(DATA/"keyword_nodes.csv").keyword.apply(
    lambda kw: (g.add((kid(kw),RDF.type,PUB.Keyword)),
                g.add((kid(kw),PUB.label,Literal(kw))))
)

0     (((http://www.example.org/resource/author/5413...
1     (((http://www.example.org/resource/author/5413...
2     (((http://www.example.org/resource/author/5413...
3     (((http://www.example.org/resource/author/5413...
4     (((http://www.example.org/resource/author/5413...
5     (((http://www.example.org/resource/author/5413...
6     (((http://www.example.org/resource/author/5413...
7     (((http://www.example.org/resource/author/5413...
8     (((http://www.example.org/resource/author/5413...
9     (((http://www.example.org/resource/author/5413...
10    (((http://www.example.org/resource/author/5413...
11    (((http://www.example.org/resource/author/5413...
12    (((http://www.example.org/resource/author/5413...
13    (((http://www.example.org/resource/author/5413...
14    (((http://www.example.org/resource/author/5413...
15    (((http://www.example.org/resource/author/5413...
16    (((http://www.example.org/resource/author/5413...
17    (((http://www.example.org/resource/author/

In [7]:

for _,r in pd.read_csv(DATA/"paper_has_keyword.csv").iterrows():
    g.add((pid(r.paperId),PUB.isAbout,kid(r.keyword)))


In [8]:

for _,r in pd.read_csv(DATA/"paper_cites_paper.csv").iterrows():
    g.add((pid(r.sourcePaperId),PUB.cites,pid(r.targetPaperId)))

In [9]:
for _,r in pd.read_csv(DATA/"author_writes_paper.csv").iterrows():
    a,p = aid(r.authorId), pid(r.paperId)
    g.add((a,PUB.writes,p))
    if str(r.corresponding_author).strip().lower()=="true":
        g.add((a,PUB.isCorrespondingAuthor,p))

In [10]:
for _,r in pd.read_csv(DATA/"author_reviews_paper.csv").iterrows():
    a,p = aid(r.authorId), pid(r.paperId)
    g.add((a,RDF.type,PUB.Reviewer))
    g.add((a,PUB.reviews,p))

In [11]:
for _, r in pd.read_csv(DATA / "paper_published_in.csv").iterrows():
    j   = jid(r.journalName)
    vol = vid(r.journalName, r.volume)
    p   = pid(r.paperId)

    g.add((j, RDF.type, PUB.Journal))
    g.add((vol, RDF.type, PUB.Volume))
    g.add((vol, PUB.belongsTo, j))

    year_val = pd.to_numeric(r.year, errors="coerce")
    if pd.notna(year_val):
        yr = year_uri(year_val)                         
        g.add((yr, RDF.type, PUB.Year))
        g.add((vol, PUB.publishedInYear, yr))

    g.add((p, PUB.publishedIn, vol))

In [12]:
part_df = pd.read_csv(DATA/"proceeding_part_of.csv")
proc2event = {}
for _,r in part_df.iterrows():
    name = r.conferenceName.strip()
    ev_class = PUB.Workshop if re.search(r"workshop", name, re.I) else PUB.Conference
    ev  = eid(name)
    proc2event[r.proceedingId] = ev
    g.add((ev,RDF.type,ev_class))
    g.add((ev,PUB.label,Literal(name)))

In [13]:
for _,r in pd.read_csv(DATA/"proceedings_nodes.csv").iterrows():
    prc = proc(r.proceedingId)
    g.add((prc,RDF.type,PUB.Proceedings))
    yr,city = str(r.year).strip(), str(r.city).strip()
    if yr and not math.isnan(float(yr)):
        yuri = year_uri(float(yr))
        g.add((yuri,RDF.type,PUB.Year))
        g.add((prc,PUB.presentedInYear,yuri))
    if city and city!='nan':
        curi = city_uri(city)
        g.add((curi,RDF.type,PUB.City))
        g.add((prc,PUB.ofVenueIn,curi))
    ev = proc2event.get(r.proceedingId)
    if not ev:
        name = r.conferenceName.strip()
        ev_class = PUB.Workshop if re.search(r"workshop", name, re.I) else PUB.Conference
        ev = eid(name)
        g.add((ev,RDF.type,ev_class))
        g.add((ev,PUB.label,Literal(name)))
    g.add((prc,PUB.isPartOf,ev))

In [14]:
for _,r in pd.read_csv(DATA/"paper_presented_in.csv").iterrows():
    g.add((pid(r.paperId),PUB.presentedIn,proc(r.proceedingId)))


In [15]:
# one pass over the graph â€“ a few milliseconds
name2author = { str(label): subj
                for subj, _, label in g.triples((None, PUB.name, None)) }

for _, row in pd.read_csv(DATA/"h_index.csv").iterrows():
    a_uri = name2author.get(row.authorName)
    if a_uri:
        g.add((a_uri,
               PUB.hIndex,
               Literal(int(row.hIndex), datatype=XSD.integer)))

In [16]:
out = pathlib.Path("publication_ABOX.ttl")
g.serialize(out,format="turtle")
print("A-Box written to",out)

A-Box written to publication_ABOX.ttl
