# Processing Archiclass Export

## Paramètres


In [1]:
from collections import defaultdict
import unicodedata
import os
from datetime import date, timedelta

In [2]:
import pandas as pd

In [3]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import  XSD, DC, DCTERMS, OWL, SKOS, RDFS


In [4]:
baseUrl = 'http://127.0.0.1:8080/rest'

In [5]:
archiclass_export = '066_ACV_REFGD_20221028.csv'
version = '1.0.0'
validationDate = date(2022,10,28)

In [6]:
archiclass_export_2 = '066_ACV_REFGD_20230110.csv'
version_2 = '2.0.0'
validationDate_2 = date(2023,1,10)

In [7]:
archiclass_export_3 = '066_ACV_REFGD_20230120.csv'
version_3 = '3.0.0'
validationDate_3 = date(2023,1,20)

In [8]:
output_folder = 'rico_output'

#### Ajout du prefix sur les identifiants

In [9]:
prefix = '066'

#### Préfixes

In [10]:
rico_ns = 'https://www.ica.org/standards/RiC/ontology#'
RICO = Namespace(rico_ns)

premis_ns = 'http://www.loc.gov/premis/rdf/v1#'
PREMIS = Namespace(premis_ns)

record_ns = baseUrl + '/refgd/'
RECORD = Namespace(record_ns)

agent_ns = baseUrl + '/agent/'
AGENT = Namespace(agent_ns)

type_ns = baseUrl + '/type/'
TYPE = Namespace(type_ns)

## Code 

In [11]:
def remove_diacritics(s):
    return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))

def remove_sepcial_chars(s):
    s2 = s
    # replace by underscores
    chr_to_replace = [ ' ', '"', "'", '/', '\\', '-', '&', ',', '.', ';', '~', '?', '!', '`', '^' ]
    for c in chr_to_replace:
        s2 = s2.replace(c, '_')    
    # remove multiple underscores
    s2 = s2.replace('__', '_').replace('__', '_').replace('__', '_')
    return s2

def normalize_title(s):
    return remove_sepcial_chars(str(remove_diacritics(s)))

def strip_textfield(s):
    if isinstance(s, str):
        return s.strip('"= ')
    else:
        return ''
    
def strip_textfield2(s):
        return s.strip('"= ')


In [12]:
conv_list = [ 'M3', 'M4', 'M5', 'M6', 'M8_Id', 'M8_ExternalId',
       'M8_Value', 'M9_Id', 'M9_ExternalId', 'M9_Value', 'M10_Id',
       'M10_ExternalId', 'M10_Value', 'M11_Id', 'M11_ExternalId', 'M11_Value',
       'M12', 'M13_Id', 'M13_ExternalId', 'M13_Value', 'M14', 'M15_Id',
       'M15_ExternalId', 'M15_Value', 'M16', 'M17_Id', 'M17_ExternalId',
       'M17_Value', 'M18.1_Id', 'M18.1_ExternalId', 'M18.1_Value', 'M18.2_Id',
       'M18.2_ExternalId', 'M18.2_Value', 'M18.3_Id', 'M18.3_ExternalId',
       'M18.3_Value', 'M19.1_Id', 'M19.1_ExternalId', 'M19.1_Value',
       'M19.2_Id', 'M19.2_ExternalId', 'M19.2_Value', 'M19.3_Id',
       'M19.3_ExternalId', 'M19.3_Value', 'M20_Id', 'M20_ExternalId',
       'M20_Value', 'M21_Id', 'M21_ExternalId', 'M21_Value', 'M22', 'M23',
       'M24_Id', 'M24_ExternalId', 'M24_Value', 'M25', 'M26_Id',
       'M26_ExternalId', 'M26_Value', 'M27_Id', 'M27_ExternalId', 'M27_Value',
       'M28', 'M29_Id', 'M29_ExternalId', 'M29_Value', 'M30_Id',
       'M30_ExternalId', 'M30_Value', 'M35', 'M36', 'M37', 'M38.1', 'M38.2',
       'M39.1', 'M39.2', 'M39.3', 'M40', 'M41', 'M42', 'M43', 'M44' ]

In [13]:
converters = {}
for x in conv_list:
    converters[x]=strip_textfield2

In [14]:
df = pd.read_csv(archiclass_export, sep=";", converters=converters)

In [15]:
df['id'] = df['M1']
df['parent_id'] = df['M2']
df['support'] = df['M10_ExternalId'].apply(strip_textfield).astype(str)

In [16]:
#df = df.applymap(strip_textfield)

In [17]:
df2 = pd.read_csv(archiclass_export_2, sep=";", converters=converters)
df2['id'] = df2['M1']
df2['parent_id'] = df2['M2']
df2['support'] = df2['M10_ExternalId'].apply(strip_textfield).astype(str)

In [18]:
df3 = pd.read_csv(archiclass_export_3, sep=";", converters=converters)
df3['id'] = df3['M1']
df3['parent_id'] = df3['M2']
df3['support'] = df3['M10_ExternalId'].apply(strip_textfield).astype(str)

### Add prefixes to ids

In [19]:
def add_prefix(value, sep=':'):
    return prefix + sep + str(value)

def add_prefix_d(value):
    return add_prefix(value, sep='.')

In [20]:
df['M1'] = df['M1'].apply(add_prefix)
df['M2'] = df['M2'].apply(add_prefix)
df['M3'] = df['M3'].apply(add_prefix_d)

In [21]:
sysIds = list(df['M1'])

In [22]:
df2['M1'] = df2['M1'].apply(add_prefix)
df2['M2'] = df2['M2'].apply(add_prefix)
df2['M3'] = df2['M3'].apply(add_prefix_d)

In [23]:
sysIds_2 = list(df2['M1'])

In [24]:
df3['M1'] = df3['M1'].apply(add_prefix)
df3['M2'] = df3['M2'].apply(add_prefix)
df3['M3'] = df3['M3'].apply(add_prefix_d)

In [25]:
sysIds_3 = list(df3['M1'])

### Parents and children

In [26]:
children = defaultdict(list)
for ix, row in df.iterrows():
    if row['M2'] != '-':
        children[row['M2']].append(row['M1'])

In [27]:
children2 = defaultdict(list)
for ix, row in df2.iterrows():
    if row['M2'] != '-':
        children2[row['M2']].append(row['M1'])

In [28]:
children3 = defaultdict(list)
for ix, row in df3.iterrows():
    if row['M2'] != '-':
        children3[row['M2']].append(row['M1'])

In [29]:
parents = defaultdict(list)
for k in children.keys():
    for x in children[k]:
        parents[x] = k

In [30]:
parents2 = defaultdict(list)
for k in children2.keys():
    for x in children2[k]:
        parents2[x] = k

In [31]:
parents3 = defaultdict(list)
for k in children3.keys():
    for x in children3[k]:
        parents3[x] = k

### Conver to RiC

In [32]:
for ix, r in df.iterrows():
    
    # Graph
    
    g = Graph()
    g.namespace_manager.bind('rico', URIRef(rico_ns))
    g.namespace_manager.bind('premis', URIRef(premis_ns))
    g.namespace_manager.bind('record', URIRef(record_ns))
    g.namespace_manager.bind('agent', URIRef(agent_ns))
    g.namespace_manager.bind('type', URIRef(type_ns))
 
    # Record and its identifiers
    
    sysId = r['M1']
    sysid = BNode()
    g.add((sysid, RDF.type, RICO.Identifier))
    g.add((sysid, RICO.hasIdentifierType, TYPE.IdRefGD ))
    g.add((sysid, RICO.name, Literal(sysId) ))
    
    refCode  = r['M3']
    refcode = BNode()
    g.add((refcode, RDF.type, RICO.Identifier))
    g.add((refcode, RICO.hasIdentifierType, TYPE.CoteIntellectuelle ))
    g.add((refcode, RICO.name, Literal(refCode) ))
 
    uid = RECORD.term(str(sysId) + '_' + str(version))
    uid_2 = RECORD.term(str(sysId) + '_' + str(version_2))
    
    g.add((uid, RDF.type, RICO.RecordSet))
    g.add((uid, RDF.type, TYPE.RefGD))
    g.add((uid, RICO.recordSetType, TYPE.RefGD))
    g.add((uid, RICO.hasOrHadIdentifier, sysid))
    g.add((uid, RICO.hasOrHadIdentifier, refcode))
    
    # Record parent
    
    if sysId in parents.keys():
        parent = RECORD.term(str( parents[sysId] + '_' + version ))
        g.add((uid, RICO.isOrWasIncludedIn, parent))
    
    # Record description metadata
    
    g.add((uid, RICO.title, Literal(r['M4'])))
    if r['M5'] != '':
        g.add((uid, RICO.scopeAndContent, Literal(r['M5'])))
     
    # Creator agent
    creator = AGENT.term(str( prefix ))
    g.add((uid, RICO.hasCreator, creator))
        
    # Version and dates
    
    g.add((uid, PREMIS.version, Literal(version)))
    
    date = BNode()
    g.add((date, RICO.type, RICO.SingleDate ))
    g.add((date, RICO.name, Literal('date de validation') ))
    g.add((date, RICO.normalizedDateVelue, Literal(validationDate) ))
    g.add((uid, RICO.beginningDate, date))
    
    dateEnd = BNode()
    g.add((dateEnd, RICO.type, RICO.SingleDate ))
    g.add((dateEnd, RICO.normalizedDateVelue, Literal(validationDate_2) ))
    g.add((uid, RICO.endDate, dateEnd))
    
    if sysId in sysIds_2 :
        g.add((uid, RICO.precedesInTime, uid_2))
 
    # Rules
    
    personalData= r['M13_ExternalId']
    pDN = BNode()
    g.add((pDN, RDF.type, RICO.Rule))
    g.add((pDN, RICO.hasOrHadRuleType, TYPE.DonneesPersonnelles  ))
    g.add((pDN, RICO.name, Literal(personalData) )) 
    g.add((uid, RICO.isAssociatedWithRule, pDN))
    
    closingPeriod = 'P' + r['M18.1_ExternalId'].replace('-','1').replace('A','') + 'Y'
    cP = BNode()
    g.add((cP, RDF.type, RICO.Rule))
    g.add( (cP, RICO.hasOrHadRuleType, TYPE.RegleCloture ) )
    g.add((cP, RDF.value, Literal(closingPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, cP))
    
    retentionPeriod =  'P' + r['M22'].replace('-','1').replace('A','') + 'Y'
    DU = BNode()
    g.add((DU, RDF.type, RICO.Rule))
    g.add((DU, RICO.hasOrHadRuleType, TYPE.DureeUtilite  ))
    g.add((DU, RDF.value, Literal(retentionPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, DU))
    
    # Save
    
    ttl = g.serialize(format="turtle", encoding='utf-8')
    with  open( output_folder+os.sep+sysId.replace(':','-')+'_'+str(version)+'.ttl', 'wb' ) as ttlf:
        ttlf.write(ttl)
    #print(ttl.decode('utf-8'))



In [33]:
for ix, r in df2.iterrows():
    
    # Graph
    
    g = Graph()
    g.namespace_manager.bind('rico', URIRef(rico_ns))
    g.namespace_manager.bind('premis', URIRef(premis_ns))
    g.namespace_manager.bind('record', URIRef(record_ns))
    g.namespace_manager.bind('agent', URIRef(agent_ns))
    g.namespace_manager.bind('type', URIRef(type_ns))

    # Record and its identifiers
    
    sysId = r['M1']
    sysid = BNode()
    g.add((sysid, RDF.type, RICO.Identifier))
    g.add((sysid, RICO.hasIdentifierType, TYPE.IdRefGD ))
    g.add((sysid, RICO.name, Literal(sysId) ))
    
    refCode  = r['M3']
    refcode = BNode()
    g.add((refcode, RDF.type, RICO.Identifier))
    g.add((refcode, RICO.hasIdentifierType, TYPE.CoteIntellectuelle ))
    g.add((refcode, RICO.name, Literal(refCode) ))
 
    uid = RECORD.term(str(sysId) + '_' + str(version_2))
    uid_2 = RECORD.term(str(sysId) + '_' + str(version_3))
    uid_old = RECORD.term(str(sysId) + '_' + str(version))
    
    g.add((uid, RDF.type, RICO.RecordSet))
    g.add((uid, RDF.type, TYPE.RefGD))
    g.add((uid, RICO.recordSetType, TYPE.RefGD))
    g.add((uid, RICO.hasOrHadIdentifier, sysid))
    g.add((uid, RICO.hasOrHadIdentifier, refcode))
    
    # Record parent
    
    if sysId in parents.keys():
        parent = RECORD.term(str( parents[sysId] + '_' + version ))
        g.add((uid, RICO.isOrWasIncludedIn, parent))
    
    # Record description metadata
    
    g.add((uid, RICO.title, Literal(r['M4'])))
    if r['M5'] != '':
        g.add((uid, RICO.scopeAndContent, Literal(r['M5'])))
     
    # Creator agent
    creator = AGENT.term(str( prefix ))
    g.add((uid, RICO.hasCreator, creator))
        
    # Version and dates
    
    g.add((uid, PREMIS.version, Literal(version_2)))
    
    date = BNode()
    g.add((date, RICO.type, RICO.SingleDate ))
    g.add((date, RICO.name, Literal('date de validation') ))
    g.add((date, RICO.normalizedDateVelue, Literal(validationDate_2) ))
    g.add((uid, RICO.beginningDate, date))
    
    dateEnd = BNode()
    g.add((dateEnd, RICO.type, RICO.SingleDate ))
    g.add((dateEnd, RICO.normalizedDateVelue, Literal(validationDate_3) ))
    g.add((uid, RICO.endDate, dateEnd))

    if sysId in sysIds_3 :
        g.add((uid, RICO.precedesInTime, uid_2))
    
    if sysId in sysIds :
        g.add((uid, RICO.followsInTime, uid_old))
 
    # Rules
    
    personalData= r['M13_ExternalId']
    pDN = BNode()
    g.add((pDN, RDF.type, RICO.Rule))
    g.add((pDN, RICO.hasOrHadRuleType, TYPE.DonneesPersonnelles  ))
    g.add((pDN, RICO.name, Literal(personalData) )) 
    g.add((uid, RICO.isAssociatedWithRule, pDN))
    
    closingPeriod = 'P' + r['M18.1_ExternalId'].replace('-','1').replace('A','') + 'Y'
    cP = BNode()
    g.add((cP, RDF.type, RICO.Rule))
    g.add( (cP, RICO.hasOrHadRuleType, TYPE.RegleCloture ) )
    g.add((cP, RDF.value, Literal(closingPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, cP))
    
    retentionPeriod =  'P' + r['M22'].replace('-','1').replace('A','') + 'Y'
    DU = BNode()
    g.add((DU, RDF.type, RICO.Rule))
    g.add((DU, RICO.hasOrHadRuleType, TYPE.DureeUtilite  ))
    g.add((DU, RDF.value, Literal(retentionPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, DU))
    
    # Save
    
    ttl = g.serialize(format="turtle", encoding='utf-8')
    with  open( output_folder+os.sep+sysId.replace(':','-')+'_'+str(version_2)+'.ttl', 'wb' ) as ttlf:
        ttlf.write(ttl)
    #print(ttl.decode('utf-8'))



In [34]:
for ix, r in df3.iterrows():
    
    # Graph
    
    g = Graph()
    g.namespace_manager.bind('rico', URIRef(rico_ns))
    g.namespace_manager.bind('premis', URIRef(premis_ns))
    g.namespace_manager.bind('record', URIRef(record_ns))
    g.namespace_manager.bind('agent', URIRef(agent_ns))
    g.namespace_manager.bind('type', URIRef(type_ns))
 
    # Record and its identifiers
    
    sysId = r['M1']
    sysid = BNode()
    g.add((sysid, RDF.type, RICO.Identifier))
    g.add((sysid, RICO.hasIdentifierType, TYPE.IdRefGD ))
    g.add((sysid, RICO.name, Literal(sysId) ))
    
    refCode  = r['M3']
    refcode = BNode()
    g.add((refcode, RDF.type, RICO.Identifier))
    g.add((refcode, RICO.hasIdentifierType, TYPE.CoteIntellectuelle ))
    g.add((refcode, RICO.name, Literal(refCode) ))
 
    uid = RECORD.term(str(sysId) + '_' + str(version_3))
    #uid_2 = RECORD.term(str(sysId) + '_' + str(version_3))
    uid_old = RECORD.term(str(sysId) + '_' + str(version_2))
    
    g.add((uid, RDF.type, RICO.RecordSet))
    g.add((uid, RDF.type, TYPE.RefGD))
    g.add((uid, RICO.recordSetType, TYPE.RefGD))
    g.add((uid, RICO.hasOrHadIdentifier, sysid))
    g.add((uid, RICO.hasOrHadIdentifier, refcode))
    
    # Record parent
    
    if sysId in parents.keys():
        parent = RECORD.term(str( parents[sysId] + '_' + version ))
        g.add((uid, RICO.isOrWasIncludedIn, parent))
    
    # Record description metadata
    
    g.add((uid, RICO.title, Literal(r['M4'])))
    if r['M5'] != '':
        g.add((uid, RICO.scopeAndContent, Literal(r['M5'])))
     
    # Creator agent
    creator = AGENT.term(str( prefix ))
    g.add((uid, RICO.hasCreator, creator))
        
    # Version and dates
    
    g.add((uid, PREMIS.version, Literal(version_3)))
    
    date = BNode()
    g.add((date, RICO.type, RICO.SingleDate ))
    g.add((date, RICO.name, Literal('date de validation') ))
    g.add((date, RICO.normalizedDateVelue, Literal(validationDate_2) ))
    g.add((uid, RICO.beginningDate, date))

    #if sysId in sysIds_3 :
    #    g.add((uid, RICO.precedesInTime, uid_2))
    
    if sysId in sysIds :
        g.add((uid, RICO.followsInTime, uid_old))
 
    # Rules
    
    personalData= r['M13_ExternalId']
    pDN = BNode()
    g.add((pDN, RDF.type, RICO.Rule))
    g.add((pDN, RICO.hasOrHadRuleType, TYPE.DonneesPersonnelles ))
    g.add((pDN, RICO.name, Literal(personalData) )) 
    g.add((uid, RICO.isAssociatedWithRule, pDN))
    
    closingPeriod = 'P' + r['M18.1_ExternalId'].replace('-','1').replace('A','') + 'Y'
    cP = BNode()
    g.add((cP, RDF.type, RICO.Rule))
    g.add( (cP, RICO.hasOrHadRuleType, TYPE.RegleCloture ) )
    g.add((cP, RDF.value, Literal(closingPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, cP))
    
    retentionPeriod =  'P' + r['M22'].replace('-','1').replace('A','') + 'Y'
    DU = BNode()
    g.add((DU, RDF.type, RICO.Rule))
    g.add((DU, RICO.hasOrHadRuleType, TYPE.DureeUtilite ))
    g.add((DU, RDF.value, Literal(retentionPeriod, datatype=XSD.duration) )) 
    g.add((uid, RICO.isAssociatedWithRule, DU))
    
    # Save
    
    ttl = g.serialize(format="turtle", encoding='utf-8')
    with  open( output_folder+os.sep+sysId.replace(':','-')+'_'+str(version_3)+'.ttl', 'wb' ) as ttlf:
        ttlf.write(ttl)
    #print(ttl.decode('utf-8'))



### Concatenate generated records

In [35]:
prefixes = set([])
refGDrico = ''

for f in os.listdir(output_folder):
    lines = open(output_folder+os.sep+f).readlines()
    for l in lines:
        if l.startswith('@prefix'):
            prefixes.add(l)
        else:
            refGDrico += l

p = ''.join(list(prefixes))
open('rico.ttl', 'w').write( p+refGDrico )
    

832640