In [None]:
pip install rdflib

In [None]:
pip install dateparser

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from rdflib import Graph, Literal, RDF, URIRef, Namespace 
from rdflib.namespace import FOAF , XSD, DC, FOAF, SKOS, RDF, RDFS
import urllib.parse 
from google.colab import files 
import io
import regex as re
from datetime import datetime
import dateparser
from dateparser.search import search_dates

## Input file

In [None]:
uploaded = files.upload()

In [None]:
eb = pd.read_excel('SP_DownloadLijst_Eredienstbesturen_20210108.xlsx', sheet_name='DATA')

In [None]:
gp = pd.read_excel('gemeente-provincie.xlsx', sheet_name='Feuil2')

In [None]:
eb.info()

In [None]:
eb

## Helper functions

### space_cleansing

In [None]:
def space_cleansing(space):
  """Return `space` with every whitespace character removed.

  Args:
    space: the string to clean.

  Returns:
    The input string without any character matched by the regex class `\\s`.
  """
  without_whitespace = re.sub(r'\s', '', space)
  return without_whitespace

### split_house_bus_number

In [None]:
def split_house_bus_number(house_bus_number):
  """Split a raw house-number value into a house number and a bus (box) number.

  Spaces are removed first. Values containing 'z/n' or 'nan' (any casing) are
  treated as "no number". Otherwise the value is split on the word 'bus'
  (any casing) or, failing that, on '/'; the first part is the house number
  and the second part the bus number.

  Args:
    house_bus_number: raw value as a string (missing values arrive as 'nan').

  Returns:
    A list `[house_number, bus_number, comment]`. Parts that could not be
    determined are `np.nan`; `comment` is 'Splitting. Check it.' when a split
    was performed and '' otherwise.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  house_number = bus_number = np.nan
  comment = []
  house_bus_number = house_bus_number.replace(' ', '')
  lowered = house_bus_number.lower()

  if 'z/n' not in lowered and 'nan' not in lowered:
    has_bus = 'bus' in lowered
    if has_bus or '/' in house_bus_number:
      comment.append('Splitting. Check it.')
      if has_bus:
        # Case-insensitive so that '12 Bus 3' and '12 BUS 3' are split as well.
        split = re.split('bus', house_bus_number, flags=re.IGNORECASE)
      else:
        split = house_bus_number.split('/')
      house_number = split[0]
      bus_number = split[1]
    else:
      house_number = house_bus_number
    # Remove leftover separators from the house number.
    house_number = house_number.replace('/', '').replace('-', '').replace(',', '')

  return [house_number, bus_number, ' - '.join(comment)]

### kbo_cleansing

In [None]:
def kbo_cleansing(kbo):
  """Normalise a KBO (enterprise) number to its digits and validate the length.

  Args:
    kbo: raw value as a string (missing values arrive as 'nan').

  Returns:
    A list `[kbo_cleansed, comment]`:
    - exactly 10 digits: the digits and `np.nan` as comment;
    - exactly 9 digits: the digits and a comment flagging a possibly missing
      leading zero;
    - anything else: `np.nan` and 'Wrong KBO format. Check it.';
    - 'nan' input: `np.nan` and 'No KBO nr found'.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  kbo_cleansed = comment = np.nan

  if kbo != 'nan':
    digits = re.sub(r'\D', '', kbo)
    # fullmatch: re.match only anchors the start, so values with more than
    # 10 digits used to be accepted as valid.
    if re.fullmatch(r'\d{10}', digits):
      kbo_cleansed = digits
    elif re.fullmatch(r'\d{9}', digits):
      kbo_cleansed = digits
      comment = 'only 9 digits. Missing first 0?'
    else:
      comment = 'Wrong KBO format. Check it.'
  else:
    comment = 'No KBO nr found'

  return [kbo_cleansed, comment]

### mail_cleansing

In [None]:
def mail_cleansing(mail):
  """Validate an e-mail address against a simple `local@domain.tld` pattern.

  Args:
    mail: raw value as a string (missing values arrive as 'nan').

  Returns:
    A list `[mail_cleansed, comment]`. For a valid address `mail_cleansed` is
    the address without surrounding whitespace and `comment` is `np.nan`.
    For an invalid value `mail_cleansed` is `np.nan` and `comment` is
    'Wrong mail format. Check it.'. For 'nan' both entries are `np.nan`.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  mail_cleansed = comment = np.nan

  if mail != 'nan':
    candidate = mail.strip()
    # fullmatch: re.match only anchors the start, so values with trailing
    # garbage (e.g. two addresses in one cell) used to be accepted.
    if re.fullmatch(r'[\w\.-]+@[\w\.-]+(\.[\w]+)+', candidate):
      mail_cleansed = candidate
    else:
      comment = 'Wrong mail format. Check it.'

  return [mail_cleansed, comment]

### telephone_number_cleansing

In [None]:
def telephone_number_cleansing(telephone_number):
  """Normalise a raw telephone value and split it into at most two numbers.

  The value is stripped of whitespace, of 'tel:' / 'tel' / '<br>' noise, its
  leading Belgian international prefix is rewritten to a leading 0, and all
  dots and ASCII letters are removed. The result is handed to
  `split_telephone_number`.

  Args:
    telephone_number: raw value as a string (missing values arrive as 'nan',
      which is reduced to '' by the letter removal).

  Returns:
    Whatever `split_telephone_number` returns:
    `[telephone_number_1, telephone_number_2, comment]`.
  """
  telephone_number = re.sub(r'\s', '', telephone_number)

  # Textual noise.
  for noise in (r'tel:', r'tel', r'<br>'):
    telephone_number = re.sub(noise, '', telephone_number)

  # Leading prefixes, applied one after the other in this exact order.
  # NOTE(review): the last two patterns replace a parenthesised three-digit
  # group by a single 0, which drops those digits -- confirm this is intended.
  leading_prefixes = (
    r'^\+32-\(0\)',
    r'^\+32',
    r'^32',
    r'^\+0032-\(0\)',
    r'^0032-\(0\)',
    r'^0032',
    r'^\(\d\d\d\)',
    r'^0\(\d\d\d\)',
  )
  for prefix in leading_prefixes:
    telephone_number = re.sub(prefix, '0', telephone_number)

  # Remove dots and letters, but keep the 'enGSM:' marker intact:
  # split_telephone_number uses it as a separator, and stripping its letters
  # first made that separator impossible to match.
  parts = telephone_number.split('enGSM:')
  telephone_number = 'enGSM:'.join(re.sub(r'[\.a-zA-Z]', '', part) for part in parts)

  return split_telephone_number(telephone_number)

def split_telephone_number(telephone_number):
  """Split a normalised telephone value into a first and a second number.

  The value is split on the first separator found among '//', '-', ';' and
  'enGSM:' (checked in that order). Remaining '/' characters are removed from
  each part and each part is validated with `check_telephone_number_lenght`.

  Args:
    telephone_number: normalised telephone string; '' means "no number".

  Returns:
    A list `[telephone_number_1, telephone_number_2, comment]`. Numbers that
    are absent or have a wrong length are `np.nan`; `comment` joins all
    remarks with ' - ' and is '' when there is nothing to report.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  telephone_number_1 = telephone_number_2 = np.nan
  comment = []

  split = [telephone_number]
  for separator in ('//', '-', ';', 'enGSM:'):
    if separator in telephone_number:
      split = telephone_number.split(separator)
      break

  split[0] = split[0].replace('/', '')

  if telephone_number != '':
    if check_telephone_number_lenght(split[0]):
      telephone_number_1 = split[0]
    else:
      comment.append('Wrong telephone number lenght. Check it.')

    if len(split) == 2:
      comment.append('Splitting. Check it.')
      split[1] = split[1].replace('/', '')
      if check_telephone_number_lenght(split[1]):
        telephone_number_2 = split[1]
      else:
        comment.append('Wrong telephone 2 number lenght. Check it.')
    elif len(split) > 2:
      # Only the first part is used; flag the row instead of silently
      # dropping the remaining parts.
      comment.append('More than two parts after splitting. Check it.')

  return [telephone_number_1, telephone_number_2, ' - '.join(comment)]

def check_telephone_number_lenght(telephone_number):
  """Return True when `telephone_number` is 9 or 10 characters long, else False."""
  return 9 <= len(telephone_number) <= 10

### postcode_cleansing

In [None]:
def postcode_cleansing(postcode):
  """Reduce a raw postcode to its digits and check that exactly four remain.

  Args:
    postcode: raw value as a string (missing values arrive as 'nan').

  Returns:
    A list `[postcode_cleansed, comment]`. A valid postcode yields the four
    digits and `np.nan`; an invalid one yields `np.nan` and
    'Wrong postcode format. Check it.'; 'nan' yields `np.nan` twice.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  postcode_cleansed = comment = np.nan

  if postcode != 'nan':
    # A float-typed column stringifies as '3500.0'; drop that decimal part
    # before removing non-digits, otherwise it turns into '35000'.
    postcode = re.sub(r'\.0+$', '', postcode.strip())
    postcode = re.sub(r'\D', '', postcode)
    # fullmatch: re.match only anchors the start and accepted 5+ digits.
    if re.fullmatch(r'\d{4}', postcode):
      postcode_cleansed = postcode
    else:
      comment = 'Wrong postcode format. Check it.'

  return [postcode_cleansed, comment]

### date_cleansing

In [None]:
def date_cleansing(date):
  """Extract every date found in a free-text remark.

  Two shapes are recognised: numeric dates such as '18/03/2020' or
  '18.03.2020', and textual dates such as '2 maart 2020'. Each candidate is
  parsed with `dateparser.parse` using day-month-year order.

  Args:
    date: free-text remark as a string (missing values arrive as 'nan').

  Returns:
    A list with the parsed results, numeric dates first, in order of
    appearance. The list is empty for 'nan' and when nothing could be parsed.
  """
  dates_parsed = []

  if date != "nan":
    print(">> " + date)

    # Explicit separators: the unescaped '.' used before matched ANY
    # character, so arbitrary digit/letter runs were treated as dates.
    numeric_candidates = re.findall(r'\d{1,2}[\s./-]\d{1,2}[\s./-]\d{2,4}', date)
    # Letters only for the month name, so a space-separated numeric date is
    # not picked up a second time by this pattern.
    textual_candidates = re.findall(r'\d{1,2} [^\W\d_]+ \d{2,4}', date)

    for candidate in numeric_candidates + textual_candidates:
      parsed = dateparser.parse(candidate, settings={'DATE_ORDER': 'DMY'})
      # dateparser returns None when it cannot parse; skip those so callers
      # never receive None as a "date".
      if parsed is not None:
        dates_parsed.append(parsed)

    for item in dates_parsed:
      print("..." + str(item))

  return dates_parsed

In [None]:
import hashlib
import uuid

def concept_uri(base_uri, input):
  """Build a reproducible URI from a base URI and an arbitrary string.

  Args:
    base_uri: prefix of the resulting URI.
    input: string whose UTF-8 encoding is hashed with MD5.

  Returns:
    A `URIRef` made of `base_uri` followed by the MD5 hex digest of `input`.
  """
  digest = hashlib.md5(input.encode('utf-8')).hexdigest()
  return URIRef(base_uri + digest)

def addLiteral(subject, predicate, objectColumn, datatype=None):
  """Add `row[objectColumn]` to the graph `g` as a literal, unless it is missing.

  Reads the module-level variables `row` (the record being processed) and `g`
  (the target graph).

  Args:
    subject: subject of the triple.
    predicate: predicate of the triple.
    objectColumn: name of the column in `row` that holds the value.
    datatype: optional datatype; when omitted the literal is tagged with
      language 'nl' instead.
  """
  value = row[objectColumn]
  if pd.isna(value):
    return

  text = str(value)
  if datatype is None:
    literal = Literal(text, lang='nl')
  else:
    literal = Literal(text, datatype=datatype)
  g.add((subject, predicate, literal))

# First names of newborns published by Statbel, used as a heuristic to tell
# first names from last names.
# The names are read from these four columns of each sheet.
name_columns = ['Unnamed: 1', 'Unnamed: 4', 'Unnamed: 7', 'Unnamed: 10']

# pd.concat replaces the chained Series.append calls: Series.append was
# removed in pandas 2.0.
m = pd.read_excel('Voornamen_Jongens_1995-2017_0.xls', sheet_name='1995-2019')
male_names = pd.concat([m[column] for column in name_columns]).unique()
f = pd.read_excel('Voornamen_meisjes_1995-2017.xls', sheet_name='1995-2019')
female_names = pd.concat([f[column] for column in name_columns]).unique()

# First names added by hand on top of the two lists above.
manual_entries = ['Friedo', 'Renilde', 'Jozef', 'Maria-André', 'Gedo', 'Yvo', 'Marie-Cecile', 'Fonny', 'Luciaan', 'Willy', 'Fredy']
# Used by splitname to break the tie when both the first and the last token
# of a name are known first names.
likely_last_names = ['Vos', 'Matthijs', 'Stevens', 'Maere', 'Rubens', 'Beer', 'Duran', 'Roos', 'Broos', 'Thijs', 'Perre', 'Joris', 'Winter', 'Claus', 'Thys', 'Massa', 'Roy']

first_names = np.concatenate([male_names,female_names, manual_entries])

# Drop two entries from the list of known first names.
first_names = np.delete(first_names, np.where(first_names == 'Van'))
first_names = np.delete(first_names, np.where(first_names == 'Blomme'))


def is_known_first_name(potential_name):
  """Tell whether `potential_name` occurs in the module-level `first_names`."""
  is_known = potential_name in first_names
  return is_known


def splitname(full_name):
  # Calls `.str.split(' ')` on `full_name` and discards the result, so this
  # function always returns None.
  # NOTE(review): a second `splitname` is defined further down in this
  # notebook and replaces this one as soon as its cell runs; this version
  # looks like a leftover draft -- confirm and remove.
  full_name.str.split(' ')


In [None]:
import re

# Courtesy titles ('mevrouw', 'mevr', 'dhr'), optionally followed by a dot and
# one whitespace character. Raw string: '\.' and '\s' are invalid escapes in a
# normal string literal. The word boundaries keep the pattern from removing
# 'dhr' / 'mevr' from the middle of a name.
mv = re.compile(r'\b(mevrouw|mevr|dhr)\b\.?\s?', re.IGNORECASE)

def remove_title(full_name):
  """Return `full_name` without the courtesy titles matched by `mv`.

  Args:
    full_name: a person's name as a string, possibly prefixed by a title.

  Returns:
    The name with every stand-alone 'mevrouw', 'mevr' or 'dhr' (any casing,
    optional trailing dot and whitespace) removed.
  """
  return mv.sub('', full_name)


def splitname(full_name):
  """Split a full name into first and last name using the known-first-name list.

  The courtesy title is removed with `remove_title`, then the first and the
  last token are looked up with `is_known_first_name`:
  - only the first token is a known first name: 'First Last...' order;
  - only the last token is a known first name: 'Last... First' order;
  - both are: `likely_last_names` breaks the tie, and the row is flagged as
    ambiguous either way;
  - neither is: nothing is split and the row is flagged.

  Args:
    full_name: the name as a string (missing values arrive as 'nan').

  Returns:
    A list `[first, last, comment]`; `first` and `last` are `np.nan` when the
    name could not be split, `comment` joins all remarks with ' - '.
  """
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
  first = last = np.nan
  comment = []

  # split() without argument: repeated or trailing spaces no longer produce
  # empty tokens that ended up inside the last name.
  split = remove_title(full_name).split()

  if len(split) <= 1:
    comment.append('Cannot split name')
  if not split:
    # Nothing left after removing the title; report it like any other name
    # without a recognisable first name.
    comment.append('No potential first name found - {}'.format(full_name))
    return [first, last, ' - '.join(comment)]

  potential_first_last = is_known_first_name(split[0])
  potential_last_first = is_known_first_name(split[-1])

  if potential_first_last and potential_last_first:
    if split[-1] in likely_last_names:
      first = split[0]
      last = ' '.join(split[1:])
    elif split[0] in likely_last_names:
      first = split[-1]
      last = ' '.join(split[0:-1])
    comment.append('Ambiguous: two possible first names - {}'.format(full_name))
  elif potential_first_last:
    first = split[0]
    last = ' '.join(split[1:])
  elif potential_last_first:
    first = split[-1]
    last = ' '.join(split[0:-1])
  else:
    comment.append('No potential first name found - {}'.format(full_name))
  return [first, last, ' - '.join(comment)]

## Data cleansing

### Titel

In [None]:
eb['Titel'].dropna().unique()

In [None]:
eb[['Titel Cleansed']] = pd.DataFrame(eb['Titel'].astype(str).apply(space_cleansing).values.tolist())

In [None]:
eb[eb['Titel Cleansed'].str.contains(r"\s", na=False)]

### Status_EB

In [None]:
eb['Status_EB'].dropna().unique()

In [None]:
eb['Status_EB Cleansed'] = eb['Status_EB']

### Gemeente_EB

In [None]:
eb['Gemeente_EB'].dropna().unique()

In [None]:
eb['Gemeente_EB Cleansed'] = eb['Gemeente_EB'].str.strip().str.title().replace('Antwerpen (Deurne', 'Antwerpen (Deurne)')

In [None]:
eb['Gemeente_EB Cleansed'].dropna().unique()

### Provincie_EB

In [None]:
eb['Provincie_EB'].dropna().unique()

In [None]:
#eb['Provincie_EB Cleansed'] = eb['Provincie_EB']

In [None]:
gp['Gemeente'].dropna().unique()

In [None]:
def find_city_provincie(city, lookup=None):
  """Return the rows of the municipality table whose name contains `city`.

  Args:
    city: municipality name to look for, matched as a literal substring.
    lookup: DataFrame with a 'Gemeente' column; defaults to the module-level
      `gp` table.

  Returns:
    The matching rows of `lookup` (an empty DataFrame when nothing matches).
  """
  if lookup is None:
    lookup = gp
  # regex=False: names such as 'Antwerpen (Deurne)' contain regex
  # metacharacters and must be matched literally.
  # na=False: rows without a municipality name simply do not match instead of
  # putting NaN into the boolean mask.
  matches = lookup['Gemeente'].str.contains(city, regex=False, na=False)
  return lookup[matches]

In [None]:
eb['Provincie_EB Cleansed'] = None
eb['Provincie_EB Comment'] = None

In [None]:
# Derive the province of every record from its cleansed municipality name.
for index, row in eb.iterrows():
  # Records without a municipality are left untouched. The previous guard
  # compared str(value) with 'NaN', but a missing value stringifies as 'nan',
  # so the guard never fired and 'nan' was looked up as a municipality name.
  if pd.isna(row['Gemeente_EB Cleansed']):
    continue

  city = str(row['Gemeente_EB Cleansed'])
  given_province = str(row['Provincie_EB'])
  result = find_city_provincie(city)

  if len(result) > 0:
    # Only the first matching municipality is considered.
    found_province = result.iloc[0]['Provincie']
    if str(found_province) != given_province:
      eb.at[index, 'Provincie_EB Cleansed'] = found_province.strip().title()
      eb.at[index, 'Provincie_EB Comment'] = "Different Provincie"
    else:
      eb.at[index, 'Provincie_EB Cleansed'] = given_province
  else:
    eb.at[index, 'Provincie_EB Comment'] = "Municipality Not Found"
    eb.at[index, 'Provincie_EB Cleansed'] = given_province

In [None]:
# The loop above stores str(value), so missing provinces arrive as the text
# 'nan'; turn them back into real missing values.
# np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
eb['Provincie_EB Cleansed'] = eb['Provincie_EB Cleansed'].replace('nan', np.nan)
eb['Provincie_EB Cleansed'].dropna().unique()

In [None]:
eb['Provincie_EB Comment'].dropna().unique()

In [None]:
eb[eb['Provincie_EB Comment'] == 'Different Provincie'][['Gemeente_EB Cleansed', 'Provincie_EB', 'Provincie_EB Cleansed', 'Provincie_EB Comment']]

### Straat_EB >> simplified

In [None]:
eb['Straat_EB'].dropna().unique()

In [None]:
eb[eb['Straat_EB'].str.contains(r"\d", na=False)]

In [None]:
eb['Straat_EB Cleansed'] = eb['Straat_EB'].str.strip()

### Huisnr_EB / Busnummer_EB

In [None]:
eb['Huisnr_EB'].dropna().unique()

In [None]:
eb[['Huisnr_EB Cleansed', 'Busnummer_EB Cleansed', 'Huisnr_EB Comment']] = pd.DataFrame(eb['Huisnr_EB'].astype(str).apply(split_house_bus_number).values.tolist(), columns=['house_number', 'bus_number', 'comment'])
# Show the rows that received a comment. Raw string: '\w' is an invalid
# escape sequence in a normal string literal.
eb[eb['Huisnr_EB Comment'].str.contains(r'\w', na=False)]

In [None]:
eb['Huisnr_EB Cleansed'].dropna().unique()

In [None]:
eb['Busnummer_EB Cleansed'].dropna().unique()

### Postcode_EB

In [None]:
eb['Postcode_EB'].unique()

In [None]:
eb[['Postcode_EB Cleansed', 'Postcode_EB Comment']] = pd.DataFrame(eb['Postcode_EB'].astype(str).apply(postcode_cleansing).values.tolist(), columns=['postcode_cleansed','comment'])

In [None]:
eb['Postcode_EB Cleansed'].unique()

### Naam_EB

In [None]:
eb['Naam_EB'].dropna().unique()

In [None]:
eb[eb['Naam_EB'].str.contains(r"\d", na=False)]

In [None]:
eb['Naam_EB Cleansed'] = eb['Naam_EB']

### KBO_EB

In [None]:
eb['KBO_EB'].dropna().unique()

In [None]:
eb[eb['KBO_EB'].str.contains(r'\D', na=False)]

In [None]:
eb[['KBO_EB Cleansed', 'KBO_EB Comment']] = pd.DataFrame(eb['KBO_EB'].astype(str).apply(kbo_cleansing).values.tolist(), columns=['kbo_cleansed','comment'])

In [None]:
# Show the rows that received a comment. Raw string: '\w' is an invalid
# escape sequence in a normal string literal.
eb[eb['KBO_EB Comment'].str.contains(r'\w', na=False)]

In [None]:
eb['organization_id'] = eb['KBO_EB Cleansed'].fillna(eb['Titel'])

### Voorzitter_EB

#### Naam_voorzitter_EB

In [None]:
eb['Naam_voorzitter_EB'].dropna().unique()

In [None]:
eb['Naam_voorzitter_EB Cleansed'] = eb['Naam_voorzitter_EB'].str.replace('<br>', '').str.strip()
eb[['Naam_voorzitter_EB First', 'Naam_voorzitter_EB Last', 'Naam_voorzitter_EB Comment']] = pd.DataFrame(eb['Naam_voorzitter_EB Cleansed'].astype(str).apply(splitname).values.tolist(), columns=['first', 'last', 'comment'])

In [None]:
eb['Naam_voorzitter_EB First'].dropna().unique()

In [None]:
eb['Naam_voorzitter_EB Last'].dropna().unique()

#### Adres_voorzitter_EB >> simplified

In [None]:
eb['Adres_voorzitter_EB'].dropna().unique()

In [None]:
eb['Adres_voorzitter_EB Cleansed'] = eb['Adres_voorzitter_EB'].str.replace('<br>', '').str.strip()

#### Mail_voorzitter_EB

In [None]:
eb['Mail_voorzitter_EB'].dropna().unique()

In [None]:
eb[['Mail_voorzitter_EB Cleansed', 'Mail_voorzitter_EB Comment']] = pd.DataFrame(eb['Mail_voorzitter_EB'].astype(str).apply(mail_cleansing).values.tolist(), columns=['mail_cleansed','comment'])

In [None]:
# Show the rows that received a comment. Raw string: '\w' is an invalid
# escape sequence in a normal string literal.
eb[eb['Mail_voorzitter_EB Comment'].str.contains(r'\w', na=False)]

#### Tel_voorzitter_EB

In [None]:
eb[['Tel_voorzitter_EB 1', 'Tel_voorzitter_EB 2', 'Tel_voorzitter_EB Comment']] = pd.DataFrame(eb['Tel_voorzitter_EB'].astype(str).apply(telephone_number_cleansing).values.tolist(), columns=['telephone_number_1', 'telephone_number_2', 'comment'])

In [None]:
eb[eb['Tel_voorzitter_EB Comment'].str.contains(r'\w', na=False)]

### Penningmeester_EB

#### Naam_penningmeester_EB

In [None]:
eb['Naam_penningmeester_EB Cleansed'] = eb['Naam_penningmeester_EB'].str.replace('<br>', '').str.strip()
eb[['Naam_penningmeester_EB First', 'Naam_penningmeester_EB Last', 'Naam_penningmeester_EB Comment']] = pd.DataFrame(eb['Naam_penningmeester_EB Cleansed'].astype(str).apply(splitname).values.tolist(), columns=['first', 'last', 'comment'])

#### Adres_penningmeester_EB >> simplified

In [None]:
eb['Adres_penningmeester_EB Cleansed'] = eb['Adres_penningmeester_EB'].str.replace('<br>', '').str.strip()

#### Mail_penningmeester_EB

In [None]:
eb[['Mail_penningmeester_EB Cleansed', 'Mail_penningmeester_EB Comment']] = pd.DataFrame(eb['Mail_penningmeester_EB'].astype(str).apply(mail_cleansing).values.tolist(), columns=['mail_cleansed','comment'])

In [None]:
# Show the rows that received a comment. Raw string: '\w' is an invalid
# escape sequence in a normal string literal.
eb[eb['Mail_penningmeester_EB Comment'].str.contains(r'\w', na=False)]

#### Tel_penningmeester_EB

In [None]:
eb[['Tel_penningmeester_EB 1', 'Tel_penningmeester_EB 2', 'Tel_penningmeester_EB Comment']] = pd.DataFrame(eb['Tel_penningmeester_EB'].astype(str).apply(telephone_number_cleansing).values.tolist(), columns=['telephone_number_1', 'telephone_number_2', 'comment'])

In [None]:
eb[eb['Tel_penningmeester_EB Comment'].str.contains(r'\w', na=False)]

### Secretaris_EB

#### Naam_secretaris_EB

In [None]:
eb['Naam_secretaris_EB Cleansed'] = eb['Naam_secretaris_EB'].str.replace('<br>', '').str.strip()
eb[['Naam_secretaris_EB First', 'Naam_secretaris_EB Last', 'Naam_secretaris_EB Comment']] = pd.DataFrame(eb['Naam_secretaris_EB Cleansed'].astype(str).apply(splitname).values.tolist(), columns=['first', 'last', 'comment'])

#### Adres_secretaris_EB >> simplified

In [None]:
eb['Adres_secretaris_EB Cleansed'] = eb['Adres_secretaris_EB'].str.replace('<br>', '').str.strip()

#### Mail_secretaris_EB

In [None]:
eb[['Mail_secretaris_EB Cleansed', 'Mail_secretaris_EB Comment']] = pd.DataFrame(eb['Mail_secretaris_EB'].astype(str).apply(mail_cleansing).values.tolist(), columns=['mail_cleansed','comment'])

In [None]:
# Show the rows that received a comment. Raw string: '\w' is an invalid
# escape sequence in a normal string literal.
eb[eb['Mail_secretaris_EB Comment'].str.contains(r'\w', na=False)]

#### Tel_secretaris_EB

In [None]:
eb[['Tel_secretaris_EB 1', 'Tel_secretaris_EB 2', 'Tel_secretaris_EB Comment']] = pd.DataFrame(eb['Tel_secretaris_EB'].astype(str).apply(telephone_number_cleansing).values.tolist(), columns=['telephone_number_1', 'telephone_number_2', 'comment'])

In [None]:
eb[eb['Tel_secretaris_EB Comment'].str.contains(r'Wrong', na=False)]

### Naam_Lid4

In [None]:
eb['Naam_Lid4'].dropna().unique()

In [None]:
eb['Naam_Lid4 Cleansed'] = eb['Naam_Lid4'].str.replace('<br>', '').str.strip()
eb[['Naam_Lid4 First', 'Naam_Lid4 Last', 'Naam_Lid4 Comment']] = pd.DataFrame(eb['Naam_Lid4 Cleansed'].astype(str).apply(splitname).values.tolist(), columns=['first', 'last', 'comment'])

In [None]:
eb['Naam_Lid4 First'].dropna().unique()

### Naam_Lid5

In [None]:
eb['Naam_Lid5'].dropna().unique()

In [None]:
eb['Naam_Lid5 Cleansed'] = eb['Naam_Lid5'].str.replace('<br>', '').str.strip()
eb[['Naam_Lid5 First', 'Naam_Lid5 Last', 'Naam_Lid5 Comment']] = pd.DataFrame(eb['Naam_Lid5 Cleansed'].astype(str).apply(splitname).values.tolist(), columns=['first', 'last', 'comment'])

In [None]:
eb['Naam_Lid5 First'].dropna().unique()

### Verkiezingen17_Opmerkingen

In [None]:
eb['Verkiezingen17_Opmerkingen'].dropna().unique()

In [None]:
  # Extract the dates mentioned in the free-text 2017 election remarks.
  # The first date goes to the 'Cleansed' column, any further dates to the
  # 'Comment' column; remarks without a parsable date are flagged there too.
  remarks_column = 'Verkiezingen17_Opmerkingen'
  cleansed_column = remarks_column + ' Cleansed'
  comment_column = remarks_column + ' Comment'

  # Create both output columns up front with dtype object. Previously they
  # were created implicitly by the first eb.at assignment, and the
  # eb[comment_column] read in the "no date found" branch raised KeyError
  # when that branch was reached before any comment had been written.
  eb[cleansed_column] = pd.Series(np.nan, index=eb.index, dtype=object)
  eb[comment_column] = pd.Series(np.nan, index=eb.index, dtype=object)

  for index, row in eb.iterrows():
    date = str(row[remarks_column])
    if date == 'nan':
      continue

    # Ignore candidates that could not be parsed.
    dates_parsed = [parsed for parsed in date_cleansing(date) if parsed is not None]

    if dates_parsed:
      eb.at[index, cleansed_column] = dates_parsed[0]
      if len(dates_parsed) > 1:
        eb.at[index, comment_column] = ' - '.join(str(extra) for extra in dates_parsed[1:])
    else:
      eb.at[index, comment_column] = 'Wrong date format. Check it.'


In [None]:
eb['Verkiezingen17_Opmerkingen Comment'].dropna().unique()

In [None]:
eb['Verkiezingen17_Opmerkingen Cleansed'].dropna().unique()

### Verkiezingen2020_Opmerkingen 

In [None]:
eb['Verkiezingen2020_Opmerkingen'].dropna().unique()

In [None]:
  # Extract the dates mentioned in the free-text 2020 election remarks.
  # The first date goes to the 'Cleansed' column, any further dates to the
  # 'Comment' column; remarks without a parsable date are flagged there too.
  remarks_column = 'Verkiezingen2020_Opmerkingen'
  cleansed_column = remarks_column + ' Cleansed'
  comment_column = remarks_column + ' Comment'

  # Create both output columns up front with dtype object. Previously they
  # were created implicitly by the first eb.at assignment, and the
  # eb[comment_column] read in the "no date found" branch raised KeyError
  # when that branch was reached before any comment had been written.
  eb[cleansed_column] = pd.Series(np.nan, index=eb.index, dtype=object)
  eb[comment_column] = pd.Series(np.nan, index=eb.index, dtype=object)

  for index, row in eb.iterrows():
    date = str(row[remarks_column])
    if date == 'nan':
      continue

    # Ignore candidates that could not be parsed.
    dates_parsed = [parsed for parsed in date_cleansing(date) if parsed is not None]

    if dates_parsed:
      eb.at[index, cleansed_column] = dates_parsed[0]
      if len(dates_parsed) > 1:
        eb.at[index, comment_column] = ' - '.join(str(extra) for extra in dates_parsed[1:])
    else:
      eb.at[index, comment_column] = 'Wrong date format. Check it.'

In [None]:
# Experiment: let dateparser find the dates in a full remark.
date_parsed_search = search_dates('Verkiezing grote helft 2020 bij kerkraadsbesluit van 2 maart 2020 (voor de strenge maatregelen ivm corona van 18 maart 2020)', settings={'DATE_ORDER': 'DMY'})
# search_dates returns None when it finds nothing; fall back to an empty list
# so the loop does not raise TypeError.
for item in date_parsed_search or []:
  print("..." + str(item[1]))

In [None]:
date_parsed_match = dateparser.parse('2 maart 2020)', settings={'DATE_ORDER': 'DMY'})
print(date_parsed_match)
  

In [None]:
date_parsed_match = dateparser.parse('2020)', settings={'DATE_ORDER': 'DMY'})
print(date_parsed_match)

In [None]:
# Experiment: a remark that only mentions a year.
date_parsed_search = search_dates('In 2019 verkiezing KH)', settings={'DATE_ORDER': 'DMY'})
# search_dates returns None when it finds nothing; fall back to an empty list
# so the loop does not raise TypeError.
for item in date_parsed_search or []:
  print("..." + str(item[1]))

### Datum verkiezing voorzitter

In [None]:
eb['Datum verkiezing voorzitter Cleansed'] = eb['Datum verkiezing voorzitter']

In [None]:
eb['Datum verkiezing voorzitter Cleansed'].dropna().unique()

### Datum verkiezing penningmeester

In [None]:
eb['Datum verkiezing penningmeester'].dropna().unique()

In [None]:
# Cells that hold a single space are treated as missing.
# np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
eb['Datum verkiezing penningmeester Cleansed'] = eb['Datum verkiezing penningmeester'].replace(' ', np.nan)

In [None]:
eb['Datum verkiezing penningmeester Cleansed'].dropna().unique()

### Datum verkiezing secretaris

In [None]:
eb['Datum verkiezing secretaris Cleansed'] = eb['Datum verkiezing secretaris']

In [None]:
eb['Datum verkiezing secretaris Cleansed'].dropna().unique()

### Datum verkiezing lid 4

In [None]:
eb['Datum verkiezing lid 4 Cleansed'] = eb['Datum verkiezing lid 4']

In [None]:
eb['Datum verkiezing lid 4 Cleansed'].dropna().unique()

### Datum verkiezing lid 5

In [None]:
eb['Datum verkiezing lid 5 Cleansed'] = eb['Datum verkiezing lid 5']

In [None]:
eb['Datum verkiezing lid 5 Cleansed'].dropna().unique()

# Export data

In [None]:
eb.to_excel('eredienstbesturen.xlsx')

## Init graph

In [None]:
# Namespaces: generic vocabularies.
org = Namespace('http://www.w3.org/ns/org#')
locn = Namespace('http://www.w3.org/ns/locn#')
dc_terms= Namespace('http://purl.org/dc/terms/')
schema = Namespace('http://schema.org/')
regorg = Namespace('http://www.w3.org/ns/regorg#')
person = Namespace('http://www.w3.org/ns/person#')
vcard = Namespace('http://www.w3.org/2006/vcard/ns#')
dbpedia = Namespace('https://dbpedia.org/ontology/')

# Namespaces under data.vlaanderen.be.
organisatie = Namespace('https://data.vlaanderen.be/ns/organisatie#')
persoon = Namespace('https://data.vlaanderen.be/ns/persoon#')
adres = Namespace('https://data.vlaanderen.be/ns/adres#')
generiek = Namespace('https://data.vlaanderen.be/ns/generiek#')
mandaat = Namespace('http://data.vlaanderen.be/ns/mandaat#')
besluit = Namespace('http://data.vlaanderen.be/ns/besluit#')

# Base URI for the resources minted by this notebook (QA environment; the
# commented-out line is the DEV alternative).
#lblod = Namespace('https://contacthub-dev.lblod.info/id/')
lblod = Namespace('https://contacthub-qa.lblod.info/id/')
# Concept schemes for organisation status and classification.
# NOTE(review): `os` is also the name of a Python standard-library module;
# that module is not imported in the visible part of this notebook, but the
# name would clash if it ever is.
os = Namespace('https://data.vlaanderen.be/id/concept/OrganisatieStatus/')
oc = Namespace('https://data.vlaanderen.be/id/concept/OrganisatieClassificatie/')


# Predefined concepts:
bestuur_van_de_eredienst = URIRef("http://data.vlaanderen.be/id/concept/BestuurseenheidClassificatieCode/66ec74fd-8cfc-4e16-99c6-350b35012e86")


In [None]:
g = Graph()

In [None]:
# One SKOS concept per distinct organisation status; statuses starting with
# 'Operationeel' are placed under `os.actief`, all others under `os.nietactief`.
for status in eb['Status_EB'].dropna().unique():
  subject = concept_uri(os, status)
  label = Literal(status, lang='nl')
  broader = os.actief if status.startswith('Operationeel') else os.nietactief

  g.add((subject, RDF.type, SKOS.Concept))
  g.add((subject, SKOS.prefLabel, label))
  g.add((subject, SKOS.definition, label))
  g.add((subject, SKOS.broader, broader))

In [None]:
# Concept URIs for the three board functions, all minted under the same base.
function_code_base = lblod + 'concept/BestuursFunctieCode/'
voorzitter_concept, secretaris_concept, penningmeester_concept = (
  concept_uri(function_code_base, function_name)
  for function_name in ('voorzitter', 'secretaris', 'penningmeester')
)

# Mapping

**To do**

* Titel
* Status_EB
* Provincie_EB
* Straat_EB
* Huisnr_EB >> Needs to be split in housenr and busnr
* Busnummer_EB
* Postcode_EB
* Gemeente_EB
* Naam_EB
* KBO_EB
* Adres_voorzitter_EB
* Mail_voorzitter_EB
* Tel_voorzitter_EB
* Naam_penningmeester_EB
* Adres_penningmeester_EB
* Mail_penningmeester_EB
* Tel_penningmeester_EB
* Naam_secretaris_EB
* Adres_secretaris_EB
* Mail_secretaris_EB
* Tel_secretaris_EB





FYI: the latest version of the "centrale kerkbesturen" notebook is available at:

https://colab.research.google.com/drive/1NpNfYxUPWq-WcQ58DezGjbYxn0qdOZCa#scrollTo=MtdteyNS3_Vk&line=1&uniqifier=1

In [None]:
eb.info()

In [None]:
def add_board_member(row, organization_id, bestuur_temporary, role, role_concept):
  """Add the person, contact details, address and mandate for one board role to `g`.

  All URIs are minted with `concept_uri` from the organisation id and the
  member's first and last name.

  Must be called from inside the `for index, row in eb.iterrows()` loop:
  `addLiteral` reads the module-level `row`, so the `row` passed here has to
  be that same loop variable.

  Args:
    row: the record being mapped.
    organization_id: `str(row['organization_id'])`.
    bestuur_temporary: URI of the time-specialised governing body that
      receives the post.
    role: 'voorzitter', 'secretaris' or 'penningmeester'; selects the
      `Naam_<role>_EB ...`, `Tel_<role>_EB ...`, `Mail_<role>_EB ...` and
      `Adres_<role>_EB ...` columns.
    role_concept: concept URI for the role, attached to the mandate.
  """
  first_column = 'Naam_{}_EB First'.format(role)
  last_column = 'Naam_{}_EB Last'.format(role)
  # NOTE(review): when both name parts are missing this is the text 'nannan',
  # so every record without a name for this role shares one person URI.
  full_name = str(row[first_column]) + str(row[last_column])
  member_key = organization_id + full_name

  # Person
  member = concept_uri(lblod + 'persoon/', full_name)
  g.add((member, RDF.type, person.Person))
  addLiteral(member, persoon.gebruikteVoornaam, first_column)
  addLiteral(member, FOAF.familyName, last_column)

  # Site the person is based at
  site = concept_uri(lblod + 'vestiging/', member_key)
  g.add((site, RDF.type, org.Site))
  g.add((member, org.basedAt, site))

  # Contact points. All roles now link them with organisatie.contactinfo;
  # the copy-pasted sections mixed org.siteAddress and organisatie.contactinfo.
  first_phone_column = 'Tel_{}_EB 1'.format(role)
  contact = concept_uri(lblod + 'contactinfo/', member_key + first_phone_column)
  g.add((contact, RDF.type, schema.ContactPoint))
  g.add((site, organisatie.contactinfo, contact))
  addLiteral(contact, schema.telephone, first_phone_column)
  addLiteral(contact, schema.email, 'Mail_{}_EB Cleansed'.format(role))

  second_phone_column = 'Tel_{}_EB 2'.format(role)
  if pd.notna(row[second_phone_column]):
    contact_2 = concept_uri(lblod + 'contactinfo/', member_key + second_phone_column)
    g.add((contact_2, RDF.type, schema.ContactPoint))
    g.add((site, organisatie.contactinfo, contact_2))
    addLiteral(contact_2, schema.telephone, second_phone_column)

  # Address
  address = concept_uri(lblod + 'adresvoorstelling/', member_key)
  g.add((address, RDF.type, locn.Address))
  g.add((site, organisatie.bestaatUit, address))
  addLiteral(address, locn.fullAddress, 'Adres_{}_EB Cleansed'.format(role))

  # Mandataris (start, end and status are not mapped yet)
  mandataris = concept_uri(lblod + 'mandataris/', member_key + role)
  g.add((mandataris, RDF.type, mandaat.Mandataris))
  g.add((member, mandaat.isAangesteldAls, mandataris))
  g.add((mandataris, mandaat.isBestuurlijkeAliasVan, member))

  # Mandate (post) held by the mandataris within the governing body
  post = concept_uri(lblod + 'mandaat/', member_key)
  g.add((post, RDF.type, mandaat.Mandaat))
  g.add((mandataris, org.holds, post))
  g.add((post, org.role, role_concept))
  g.add((bestuur_temporary, org.hasPost, post))


# Map every record to an organisation with its primary site, address, KBO
# identifier, governing body and the three board members.
for index, row in eb.iterrows():
  organization_id = str(row['organization_id'])

  abb_id = concept_uri(lblod + 'organisatie/', organization_id)
  g.add((abb_id, RDF.type, org.Organization))

  # Skip the status when it is missing: no status concept exists for it.
  if pd.notna(row['Status_EB']):
    g.add((abb_id, regorg.orgStatus, concept_uri(os, str(row['Status_EB']))))

  site_id = concept_uri(lblod + 'vestiging/', organization_id)
  g.add((site_id, RDF.type, org.Site))

  address_id = concept_uri(lblod + 'adresvoorstelling/', organization_id)
  g.add((address_id, RDF.type, locn.Address))
  addLiteral(address_id, locn.adminUnitL2, 'Provincie_EB Cleansed')
  # Cleansed (stripped) street, like the other address fields.
  addLiteral(address_id, locn.thoroughfare, 'Straat_EB Cleansed')
  addLiteral(address_id, adres['Adresvoorstelling.huisnummer'], 'Huisnr_EB Cleansed', XSD.string)
  addLiteral(address_id, adres['Adresvoorstelling.busnummer'], 'Busnummer_EB Cleansed', XSD.string)
  addLiteral(address_id, locn.postCode, 'Postcode_EB Cleansed', XSD.string)
  # 'gemeentenaam': the previous spelling 'gemeenttenaam' was a typo.
  addLiteral(address_id, adres.gemeentenaam, 'Gemeente_EB Cleansed', XSD.string)
  g.add((address_id, adres.land, Literal('België', lang='nl')))

  g.add((site_id, organisatie.bestaatUit, address_id))
  g.add((abb_id, org.hasPrimarySite, site_id))

  addLiteral(abb_id, SKOS.prefLabel, 'Naam_EB')
  addLiteral(abb_id, regorg.legalName, 'Naam_EB')

  # Only create the identifier node when a KBO number is available.
  # NOTE(review): no triple links this identifier to the organisation --
  # confirm which predicate should be used.
  if pd.notna(row['KBO_EB Cleansed']):
    kbo_id = concept_uri(lblod + 'gestructureerdeIdentificator/', str(row['KBO_EB Cleansed']))
    g.add((kbo_id, RDF.type, generiek.GestructureerdeIdentificator))
    addLiteral(kbo_id, generiek.lokaleIdentificator, 'KBO_EB Cleansed', XSD.string)

  g.add((abb_id, org.classification, bestuur_van_de_eredienst))

  # Governing body and its time specialisation. The current year is part of
  # the hashed key, so this URI changes when the notebook is re-run in
  # another year.
  bestuur = concept_uri(lblod + 'bestuursorgaan/', organization_id)
  g.add((bestuur, RDF.type, besluit.Bestuursorgaan))
  g.add((bestuur, besluit.bestuurt, abb_id))

  bestuur_temporary = concept_uri(lblod + 'bestuursorgaan/', organization_id + str(datetime.now().year))
  g.add((bestuur_temporary, RDF.type, besluit.Bestuursorgaan))
  g.add((bestuur_temporary, generiek.isTijdspecialisatieVan, bestuur))

  # Board members
  add_board_member(row, organization_id, bestuur_temporary, 'voorzitter', voorzitter_concept)
  add_board_member(row, organization_id, bestuur_temporary, 'secretaris', secretaris_concept)
  add_board_member(row, organization_id, bestuur_temporary, 'penningmeester', penningmeester_concept)



In [None]:
g.serialize('eb-qa.ttl',format='turtle')

# Checks/tests

In [None]:
# NOTE(review): `ckb` is not defined in the visible part of this notebook;
# this cell presumably comes from a sibling notebook and would raise
# NameError on a fresh run -- confirm and remove or load `ckb` first.
ckb['Status_CKB'].unique()

In [None]:
hash('a')

In [None]:
import random

In [None]:
# Unseeded generator: no seed is passed, so its output is not reproducible
# across runs.
rd = random.Random()

In [None]:
# Draw a random 128-bit integer (128 bits is the width of a UUID).
rd.getrandbits(128)

In [None]:
import hashlib
import uuid

# Hash a fixed string with MD5 and reinterpret the 128-bit digest as a UUID.
# The same input string therefore always yields the same UUID.
m = hashlib.md5("aaa".encode('utf-8'))
new_uuid = uuid.UUID(hex=m.hexdigest())


In [None]:
# Display the UUID currently bound to `new_uuid`.
new_uuid

In [None]:
# hashlib objects accumulate input: if `m` already consumed "aaa" in the cell
# above, this second update() makes the digest cover "aaaaaa", so the UUID
# differs from the previous one. Re-running this cell appends again and
# changes the result each time.
m.update("aaa".encode('utf-8'))
new_uuid = uuid.UUID(m.hexdigest())

In [None]:
# Display `new_uuid` again to compare it with the value shown before the
# second update().
new_uuid

In [None]:
# Try the reproducible_uuid helper (defined outside the visible cells) on a
# sample string.
a = reproducible_uuid('qqq')

In [None]:
# Show the hex form of `a`; this is the string appended to the base URI below.
a.hex

In [None]:
# Build a URI by appending the hex form of `a` to the `lblod` base.
s = URIRef(lblod+a.hex)

In [None]:
# Display the resulting URIRef.
s

In [None]:
# Peek at the first lines of a serialized Turtle file.
# NOTE(review): the visible cells write 'eb-qa.ttl' and 'oslo.ttl'; nothing
# visible writes 'output.ttl' — confirm the file name.
!head output.ttl

In [None]:
import os

# The original cell was an unfinished `os.` expression, which is a SyntaxError
# and stops "Restart & Run All"; `os` is also not among the imports at the top
# of the notebook. List the working directory instead so the generated files
# can be checked.
# NOTE(review): the intended call is a guess — adjust if something else was meant.
os.listdir('.')

In [None]:
# Distinct Status_CKB values again, as the reference for the mask built in
# the next cell.
ckb['Status_CKB'].unique()

In [None]:
# Boolean mask over the distinct Status_CKB values: True wherever the status
# is anything other than 'Niet actief - niet van toepassing'.
distinct_statuses = pd.Series(ckb['Status_CKB'].unique()).values
distinct_statuses != 'Niet actief - niet van toepassing'

In [None]:
# Preview Naam_Voorzitter_CKB with the literal '<br>' markers removed.
# The result is only displayed, not written back to `ckb`.
ckb['Naam_Voorzitter_CKB'].str.replace('<br>', '', regex=False)

In [None]:
# Preview Naam_secretaris_CKB with the literal '<br>' markers removed.
# The result is only displayed, not written back to `ckb`.
ckb['Naam_secretaris_CKB'].str.replace('<br>', '', regex=False)

In [None]:
from nameparser import HumanName

In [None]:
# Check how nameparser splits a sample name.
# NOTE(review): the sample looks surname-first — confirm that is the order
# used in the source data.
HumanName('Coussement Johan')

In [None]:
# Spot-check the is_known_first_name helper (defined outside the visible
# cells) with a single name.
is_known_first_name('Didier')

In [None]:
# Load the '1995-2019' sheet of the boys' first-name workbook.
# Note: this rebinds `m`, which earlier in this section held the hashlib md5
# object; cells that call m.update() will fail if run after this one.
m = pd.read_excel('Voornamen_Jongens_1995-2017_0.xls', sheet_name='1995-2019')

In [None]:
# Display the loaded sheet to see which 'Unnamed: N' columns to use.
m

In [None]:
# Pool four columns of `m` (presumably the first-name columns of the sheet)
# and keep the distinct values.
# pd.concat replaces the chained Series.append calls: Series.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
male_names = pd.concat(
    [m[column] for column in ('Unnamed: 1', 'Unnamed: 4', 'Unnamed: 7', 'Unnamed: 10')]
).unique()

In [None]:
# Load the '1995-2019' sheet of the girls' first-name workbook.
f = pd.read_excel('Voornamen_meisjes_1995-2017.xls', sheet_name='1995-2019')

In [None]:
# Pool four columns of `f` (presumably the first-name columns of the sheet)
# and keep the distinct values.
# pd.concat replaces the chained Series.append calls: Series.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
female_names = pd.concat(
    [f[column] for column in ('Unnamed: 1', 'Unnamed: 4', 'Unnamed: 7', 'Unnamed: 10')]
).unique()

# Oslo

In [None]:
# Load the OSLO mapping sheet; the cells below read its 'Uri', 'label' and
# 'comment' columns.
oslo = pd.read_excel('OSLO_Mapping_20210322.xlsx', sheet_name='vocabulary_CH-core_FORMAT-Nordi')

In [None]:
# Display the loaded mapping sheet.
oslo 

In [None]:
# Build a small graph `o` that declares every mapped OSLO term together with
# its Dutch label and comment, then print it as Turtle.
o = Graph()

# Expand the 'org:' prefix to the full W3C Organization namespace.
# regex=False keeps this a literal replacement on every pandas version.
oslo['Uri'] = oslo['Uri'].str.replace('org:', 'http://www.w3.org/ns/org#', regex=False)

for index, row in oslo.dropna(subset=['Uri']).iterrows():
  s = URIRef(row['Uri'])
  # TODO: not every term is an object property; some are datatype properties.
  o.add((s, RDF.type, URIRef('http://www.w3.org/2002/07/owl#ObjectProperty')))
  # Skip empty cells so a missing comment/label does not end up in the graph
  # as a NaN literal instead of text.
  if pd.notna(row['comment']):
    o.add((s, RDFS.comment, Literal(row['comment'], lang='nl')))
  if pd.notna(row['label']):
    o.add((s, RDFS.label, Literal(row['label'], lang='nl')))

# Graph.serialize() returns bytes on rdflib < 6 but str on rdflib >= 6, where
# an unconditional .decode() raises AttributeError. Handle both.
oslo_turtle = o.serialize(format='turtle')
if isinstance(oslo_turtle, bytes):
  oslo_turtle = oslo_turtle.decode('UTF-8')
print(oslo_turtle)

In [None]:
# Write the OSLO vocabulary graph `o` to disk in Turtle syntax.
o.serialize(destination='oslo.ttl', format='turtle')

# Open issues/questions


*   Identifier: not every entity has a (valid) KBO number (cf. cleansing). URIs are currently based on the internal identifier.
*   heeftvestiging ipv adres? https://data.vlaanderen.be/ns/organisatie (hasSite)
*   Base URIs
*   Secretaris ~ hoedanigheid budgethouder?? <> penningmeester


