# KG Construction based on RML mapping.

> The goal is to use [pandas](https://pandas.pydata.org/docs/) and [RML](https://rml.io/) to construct a simple knowledge graph. We will then construct an ontology to give semantic structure to the column headings, and SHACL shapes to validate that structure. We are also using the [literate programming](https://en.wikipedia.org/wiki/Literate_programming) methodology, with FastAI [nbdev](https://nbdev.fast.ai/) as the tool.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#|echo: true
# I want to load the csv file "./data/CO2Meter_GPS_iso.csv" to a pandas data frame.
import pandas as pd

# Parse the CSV with pandas' default settings and keep the result as `df`.
df = pd.read_csv(filepath_or_buffer='./data/CO2Meter_GPS_iso.csv')


In [None]:
#|echo: true
# I want to print the column headers from the data frame.
import pandas as pd

# Re-read the CSV so this cell does not depend on an earlier cell having run.
df = pd.read_csv(filepath_or_buffer='./data/CO2Meter_GPS_iso.csv')

# Header names first (exactly as parsed), then a preview of the first five rows.
print(list(df.columns))
print(df.head(n=5))


['CO2 (PPM)', ' Latitude', ' Longitude', ' Altitude', ' Air Speed (m/s)', ' Mode', ' Fixed Satellites', ' Available Satellites', 'voltage', 'current', 'level', 'id']
   CO2 (PPM)   Latitude   Longitude   Altitude   Air Speed (m/s)       Mode  \
0        395  44.382214  -73.258468      66.12              0.02  STABILIZE   
1        394  44.382213  -73.258468      66.12              0.03  STABILIZE   
2        394  44.382213  -73.258468      66.13              0.05  STABILIZE   
3        394  44.382214  -73.258467      66.16              0.08  STABILIZE   
4        395  44.382214  -73.258467      66.19              0.01     GUIDED   

    Fixed Satellites   Available Satellites  voltage current level  \
0                  4                     11      0.0    None  None   
1                  4                     11      0.0    None  None   
2                  4                     11      0.0    None  None   
3                  4                     11      0.0    None  None   
4        

In [None]:
#|echo: true
# I want to construct an RML mapping for this CSV file. Given each column header in CO2Meter_GPS_iso.csv,
# generate the mapping to load this CSV into a knowledge graph. I want to define the RML as a Python string and
# write the mapping to a file called "mappings.ttl". You don't need rdflib for this task. Just a Python string and a
# Python write to the mappings file. Make sure to include the necessary prefixes in the string. Make sure to
# include rml:logicalSource
#        rml:source "CO2Meter_GPS_iso.csv";
# as part of the string. Use the column headers loaded in the previous pandas data frame for the RML mapping.


In [None]:
# Build an RML mapping for the CO2 meter CSV as a plain Turtle string and
# write it to "obs_mapper.ttl".
#
# The generated document has a single triples map, <#ObservationMap>, with:
#   * one rml:logicalSource pointing at the CSV,
#   * one rr:subjectMap built from the "id" column,
#   * one rr:predicateObjectMap per remaining column.
import re

import pandas as pd

df = pd.read_csv('./data/CO2Meter_GPS_iso.csv')

# Take the headers from the data frame instead of hard-coding them, so the
# mapping always matches the file that was actually loaded.
column_headers = df.columns.to_list()


def _local_name(header):
    """Return *header* reduced to letters, digits and underscores.

    Headers such as "CO2 (PPM)" or "Air Speed (m/s)" contain characters that
    are not allowed unescaped in a Turtle prefixed name, so every run of other
    characters is collapsed to a single underscore.
    """
    return re.sub(r'[^0-9A-Za-z]+', '_', header).strip('_')


def _turtle_literal(text):
    """Return *text* as a double-quoted Turtle string literal."""
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


# Each entry is one "predicate object" clause of <#ObservationMap>; they are
# joined with ";" at the end so the document is terminated by exactly one ".".
clauses = [
    'a rr:TriplesMap',
    'rml:logicalSource [\n'
    '        rml:source "CO2Meter_GPS_iso.csv" ;\n'
    '        rml:referenceFormulation ql:CSV\n'
    '    ]',
]

for header in column_headers:
    header_clean = header.strip()
    if header_clean == 'id':
        # The "id" column identifies a row, so it becomes the subject IRI.
        clauses.append(
            'rr:subjectMap [\n'
            '        rr:template "http://example.com/{' + header + '}/" ;\n'
            '        rr:class ex:Observation\n'
            '    ]'
        )
    else:
        # The predicate uses a sanitized name; the reference keeps the raw
        # header (including any leading space) because it has to name the CSV
        # column exactly.
        # NOTE(review): assumes the mapping engine matches references against
        # the unstripped header line - confirm with the engine in use.
        clauses.append(
            'rr:predicateObjectMap [\n'
            '        rr:predicate ex:' + _local_name(header_clean) + ' ;\n'
            '        rr:objectMap [ rml:reference ' + _turtle_literal(header) + ' ]\n'
            '    ]'
        )

rml_mapping = (
    '@prefix rr: <http://www.w3.org/ns/r2rml#> .\n'
    '@prefix rml: <http://semweb.mmlab.be/ns/rml#> .\n'
    '@prefix ql: <http://semweb.mmlab.be/ns/ql#> .\n'
    '@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n'
    # "ex:" is used for the class and the predicates, so it must be declared;
    # the namespace matches the base of the subject template.
    '@prefix ex: <http://example.com/> .\n'
    '\n'
    '<#ObservationMap> ' + ' ;\n    '.join(clauses) + ' .\n'
)

with open('obs_mapper.ttl', 'w', encoding='utf-8') as f:
    f.write(rml_mapping)


In [None]:
# Build the RML mapping for the CO2 meter CSV as a Turtle string and write it
# to "mappings.ttl".
#
# The triples map co2:co2_measurement gets exactly one subject map (keyed on
# the "id" column) and one predicate-object map per CSV column.
import re

import pandas as pd

df = pd.read_csv('./data/CO2Meter_GPS_iso.csv')

# Constructing RML mapping as a string
rml_prefixes = '''
@prefix rml: <http://semweb.mmlab.be/ns/rml#> .
@prefix rr: <http://www.w3.org/ns/r2rml#> .
@prefix ql: <http://semweb.mmlab.be/ns/ql#> .
@prefix co2: <http://example.com/co2#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
'''

# Column whose value identifies a row. The template placeholder must name a
# real CSV column: a derived name such as "_altitude" makes the mapping engine
# ask the CSV reader for a column that does not exist.
subject_column = 'id'
if subject_column not in df.columns:
    raise ValueError(
        f'CSV has no {subject_column!r} column to build subject IRIs from'
    )

# Header of the triples map: logical source plus the single subject map.
rml_mapping = rml_prefixes + f'''
co2:co2_measurement a rr:TriplesMap ;
    rml:logicalSource [
        rml:source "CO2Meter_GPS_iso.csv";
        rml:referenceFormulation ql:CSV
    ] ;
    rr:subjectMap [
        rr:template "http://example.com/co2/{{{subject_column}}}" ;
        rr:class co2:CO2
    ]'''

for column in df.columns:
    # Predicate local name: lower-case, with every run of characters other
    # than letters and digits collapsed to "_" (so " Air Speed (m/s)" becomes
    # "air_speed_m_s"), which keeps the prefixed name valid Turtle.
    column_name = re.sub(r'[^0-9a-z]+', '_', column.lower()).strip('_')
    # The reference keeps the header exactly as pandas read it, because it has
    # to name the CSV column.
    rml_mapping += f''' ;
    rr:predicateObjectMap [
        rr:predicate co2:{column_name} ;
        rr:objectMap [
            rml:reference "{column}" ;
            rr:datatype xsd:string
        ]
    ]'''

# Close the triples map; a trailing ";" would leave the document unterminated.
rml_mapping += ' .\n'

# Writing RML mapping to file
with open('mappings.ttl', 'w', encoding='utf-8') as f:
    f.write(rml_mapping)


In [None]:
# Assemble the INI-style configuration text that points at the generated
# mapping file, and echo it.
import os
from os.path import dirname

# Parent directory of the current working directory (not used in the config).
cwd = dirname(os.getcwd())

# Directory of the resolved "mappings.ttl" path, re-joined with the file name.
mapping_path = os.path.join(
    dirname(os.path.realpath('mappings.ttl')),
    'mappings.ttl',
)

# One section with a single `mappings` key, wrapped in a leading and a
# trailing newline.
config = "\n".join(["", "[SimpleExample]", f"mappings={mapping_path}", ""])

print(config)


[SimpleExample]
mappings=/Users/cvardema/dev/git/landrs-toolkit/llm-experiments/mappings.ttl



In [None]:
# Run morph-kgc with the `config` string built in the previous cell, whose
# `mappings` entry points at the generated mappings.ttl.
import morph_kgc
# NOTE(review): the variable name suggests the result is an rdflib graph -
# confirm against the morph-kgc documentation.
g_rdflib = morph_kgc.materialize(config)

INFO | 2023-03-08 13:30:34,055 | Parallelization is not supported for darwin when running as a library. If you need to speed up your data integration pipeline, please run through the command line.
INFO | 2023-03-08 13:30:34,897 | 156 mapping rules retrieved.
INFO | 2023-03-08 13:30:34,923 | Mapping partition with 13 groups generated.
INFO | 2023-03-08 13:30:34,923 | Maximum number of rules within mapping group: 12.
INFO | 2023-03-08 13:30:34,923 | Mappings processed in 0.861 seconds.


ValueError: Usecols do not match columns, columns expected but not found: ['_altitude']

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev

nbdev.nbdev_export()