In [1]:
import getpass
import os

from graphdatascience import GraphDataScience

In [2]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. NEO4J_URI and NEO4J_USER fall back to the local
# defaults; when NEO4J_PASSWORD is not set the password is asked for
# interactively instead of being hardcoded.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://localhost")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

In [3]:
def iterate_gdpr(file):
    """Parse a plain-text copy of the GDPR into flat chunk dictionaries.

    The function is a generator. It keeps a running context dictionary
    (regulation / chapter / article / paragraph / point) that is updated as
    structural lines are met, and yields a *copy* of that context every time
    a piece of text is attached to it.

    Line classification, in priority order:

    * ``CHAPTER <id>``  -- exactly two space-separated tokens; sets
      ``chapter`` and clears article, paragraph, point and text.
    * ``Article <id>``  -- exactly two space-separated tokens; sets
      ``article`` and clears paragraph, point and text.
    * ``<digits>.   <text>`` -- a numbered paragraph (any number of digits);
      sets ``paragraph`` (as ``int``) and ``text``, clears point, and yields.
    * ``(<id>)`` -- a point marker; sets ``point`` and clears text. Nothing is
      yielded until the following text line.
    * anything else -- free text attached to the current context; yields.

    Blank lines are skipped.

    Args:
        file: Path of the text file to read.

    Yields:
        dict: a snapshot of the current context. It always has the keys
        ``regulation`` and ``label`` (one of ``"Regulation"``, ``"Chapter"``,
        ``"Article"``, ``"Paragraph"``, ``"Point"``) and ``text``; the keys
        ``chapter``, ``article``, ``paragraph`` and ``point`` are present
        only when they apply. The very first item is the regulation title,
        emitted before the file is opened.

    Raises:
        OSError: if ``file`` cannot be opened (raised after the first item
            has been yielded).
    """
    res = {
        "regulation": "gdpr",
        "label": "Regulation",
    }
    res["text"] = """REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL

of 27 April 2016

on the protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)

(Text with EEA relevance)""".strip().replace("\n", " ")
    yield dict(res)

    # Read everything up front inside a context manager so the handle is
    # closed straight away instead of staying open while the consumer iterates.
    with open(file, "r") as handle:
        lines = handle.readlines()

    for line in lines:
        line = line.strip()
        if line == '':
            continue

        # A structural heading is exactly "<keyword> <identifier>". Requiring
        # two tokens keeps body text such as "Articles 15 to 22 ..." from
        # being mistaken for a heading (and from crashing the unpacking).
        words = line.split(' ')
        is_heading = len(words) == 2

        # Split on the FIRST ".   " only: the paragraph number may have several
        # digits and the paragraph text may itself contain the separator.
        number, separator, remainder = line.partition(".   ")
        is_paragraph = bool(separator) and number.isdecimal()

        if is_heading and line.startswith('CHAPTER'):
            res["chapter"] = words[1]
            res["label"] = "Chapter"
            res.pop("article", None)
            res.pop("paragraph", None)
            res.pop("point", None)
            res.pop("text", None)
        elif is_heading and line.startswith('Article'):
            res["article"] = words[1]
            res["label"] = "Article"
            res.pop("paragraph", None)
            res.pop("point", None)
            res.pop("text", None)
        elif is_paragraph:
            res["paragraph"] = int(number)
            res["label"] = "Paragraph"
            res["text"] = remainder
            res.pop("point", None)
            yield dict(res)
        elif line[0] == '(' and line[-1] == ')':
            # Point marker on its own line; its text arrives on the next line.
            res["point"] = line[1:-1]
            res["label"] = "Point"
            res.pop("text", None)
        else:
            res["text"] = line
            yield dict(res)
            

In [4]:
# Second pass over the parsed chunks: tell top-level points apart from nested
# sub-points (for example "(1)" followed by "(a)", "(b)").
#
# A chunk labelled "Point" is re-labelled "SubPoint" when
#   * the chunk right before it is a "Point" whose marker is of the other
#     kind (one numeric, the other not), or
#   * the chunk right before it is a "SubPoint" whose marker is of the same
#     kind (both numeric or both non-numeric).
# A re-labelled chunk keeps its own marker under "subpoint" and inherits the
# "point" of the preceding chunk.
data = list(iterate_gdpr("gdpr.txt"))
processed_data = []
for item in data:
    if item["label"] == "Point":
        previous = processed_data[-1]
        current_is_number = item["point"].isnumeric()
        if previous["label"] == "Point":
            nested = previous["point"].isnumeric() != current_is_number
        elif previous["label"] == "SubPoint":
            nested = previous["subpoint"].isnumeric() == current_is_number
        else:
            nested = False
        if nested:
            item["subpoint"] = item["point"]
            item["point"] = previous["point"]
            item["label"] = "SubPoint"
    processed_data.append(item)
        

In [6]:
# Bulk-load the processed chunks into the graph.
# The statement unwinds the $data list and, for every element, creates one
# node labelled Chunk and copies the element's entries onto it as properties
# (SET i += map). The CALL { ... } IN TRANSACTIONS clause commits the work in
# batches of 100 rows.
# NOTE: the statement uses CREATE, so running this cell again inserts a second
# copy of every chunk.
# NOTE(review): CALL { ... } IN TRANSACTIONS is only allowed in an implicit
# (auto-commit) transaction -- presumably what run_cypher uses; confirm.
query ="""
UNWIND $data AS i_data
CALL {
    WITH i_data
    CREATE (i:Chunk)
    SET i+= i_data
} IN TRANSACTIONS OF 100 ROWS
"""
# processed_data is passed as the $data query parameter.
gds.run_cypher(query, {"data":processed_data})