# Analysis of Named Entities in ISSA dataset

**Analysing quality of named entities found in the document text**  

Use case: Agritrop ISSA

Endpoint: https://data-issa.cirad.fr/sparql

In [None]:
import os
try:
    isColab = os.environ['GCE_METADATA_TIMEOUT']
    if isColab:
        !pip install SPARQLWrapper
except:
    pass

In [None]:
import pandas as pd
import json

from SPARQLWrapper import SPARQLWrapper, JSON

import matplotlib.pyplot as plt


In [None]:
def sparql_service_to_dataframe(service, query):
    """
    Run a SPARQL query against an endpoint and return the result as a Pandas DataFrame.

    The columns are the variables listed in the head of the JSON result, in
    that order. Each cell holds the 'value' of the corresponding binding, or
    None when the variable is not bound in that row.

    Credit to Ted Lawless https://lawlesst.github.io/notebook/sparql-dataframe.html
    """
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    query_result = client.query()

    payload = json.load(query_result.response)
    columns = payload['head']['vars']

    # One list per result row; a variable missing from a binding yields None.
    rows = [
        [binding.get(column, {}).get('value') for column in columns]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(rows, columns=columns)

def sparql_service_to_dataframe_with_scrollable_cursor(service, query_templ, params=(), page_size=10000):
    """
    Fetch the complete result of a paginated SPARQL query, page by page.

    Parameters
    ----------
    service : endpoint passed through to sparql_service_to_dataframe.
    query_templ : %-style query template. Its last two placeholders receive
        the current offset and the page size, in that order; any placeholders
        before them are filled from `params`.
    params : sequence of values for the leading placeholders of the template.
    page_size : number of rows requested per page.

    Returns
    -------
    DataFrame with all pages concatenated in fetch order (each page keeps its
    own row index). When the very first page is already empty, that empty
    frame is returned, so the column names of the query are preserved instead
    of pd.concat failing on an empty list.
    """
    offset = 0
    pages = []

    # Progress trace: the row count of every fetched page, ending with 0.
    print('fetching... ', end='')
    while True:
        # tuple() lets callers pass a list as well as a tuple.
        query = query_templ % (tuple(params) + (offset, page_size))
        page = sparql_service_to_dataframe(service, query)
        row_count = page.shape[0]
        if row_count == 0:
            print(row_count)
            break
        print(row_count, end=', ')
        pages.append(page)
        offset += page_size

    if not pages:
        # `page` is the empty frame returned by the first (and only) request.
        return page
    return pd.concat(pages)

def hyperlink(val):
    """Return an HTML anchor whose target and visible text are both `val`."""
    anchor_template = '<a href="{}">{}</a>'
    return anchor_template.format(val, val)

def hyperlink_multi(val):
    """
    Turn a ', '-separated string of URIs into ', '-separated HTML anchors.

    Each URI is used as both the target and the visible text of its anchor.
    """
    anchor_template = '<a href="{}">{}</a>'
    anchors = []
    for uri in val.split(', '):
        anchors.append(anchor_template.format(uri, uri))
    return ', '.join(anchors)

In [None]:
# SPARQL endpoint that the queries in this notebook are sent to.
# NOTE(review): the notebook header gives this URL with the https:// scheme — confirm which one is intended.
issa_agritrop_endpoint = 'http://data-issa.cirad.fr/sparql'

## Dataset info

In [164]:
# Dataset-level metadata of the ISSA Agritrop dataset: title, description,
# triple count, version and dates (the modification date is optional).
# The dct: and owl: prefixes are declared explicitly so that the query is valid
# SPARQL on its own and does not rely on namespaces pre-registered on the endpoint.
dataset_info_query = '''
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX dct:  <http://purl.org/dc/terms/>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX issa: <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>

SELECT * WHERE {
  issa:issa-agritrop dct:title ?dataset_name;
                     dct:description ?dataset_description;
                     void:triples ?triple_count;
                     owl:versionInfo ?version;
                     dct:issued ?initial_date;
                     prov:wasGeneratedAtTime ?generated_date.

  OPTIONAL{   issa:issa-agritrop dct:modified ?updated_date. }
}
'''
dataset_info = sparql_service_to_dataframe(issa_agritrop_endpoint, dataset_info_query)
# Transposed so that the single result row reads as a property/value listing.
dataset_info.transpose()

Unnamed: 0,0
dataset_name,ISSA Agritrop dataset
dataset_description,This RDF dataset was produced by processing articles from Agritrop - the open repository of CIRA...
triple_count,171783648
version,1.2.20230306
initial_date,2022-04-29
generated_date,2023-03-14
updated_date,2023-03-14


## Count named entities from all NER tools

In [190]:
# Per annotator (NER tool): total number of annotations and number of distinct
# entity URIs, over the four named-entity graphs.
# ?annotator is projected next to aggregates, so it must be a GROUP BY key;
# the explicit GROUP BY makes the query valid SPARQL 1.1 instead of relying on
# the endpoint tolerating its absence.
entity_count_per_annotator_query = '''
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>


SELECT ?annotator
       (count(?entity) as ?cnt)
       (count(distinct ?uri) as ?unique_cnt)
FROM <http://data-issa.cirad.fr/graph/dbpedia-spotlight-nes>
FROM <http://data-issa.cirad.fr/graph/entity-fishing-nes>
FROM <http://data-issa.cirad.fr/graph/geographic-nes>
FROM <http://data-issa.cirad.fr/graph/pyclinrec-nes>
WHERE {
    ?entity a prov:Entity;
          oa:hasBody ?uri;
          prov:wasAttributedTo ?annotator.
}
GROUP BY ?annotator
'''

In [191]:
# Run the per-annotator count query; SPARQL values arrive as strings, so both
# count columns are cast to integers.
df_entity_count = (
    sparql_service_to_dataframe(issa_agritrop_endpoint, entity_count_per_annotator_query)
    .astype({'cnt': 'int', 'unique_cnt': 'int'})
)

print(df_entity_count.shape)

(4, 3)


In [192]:
# CSS rules reused by the styled tables of this notebook: a light blue outer
# border for the table itself and a thin grey border around every data (td)
# and header (th) cell.
table_style = [{"selector": "", "props": [("border", "3px solid lightblue !important")]},
               {"selector": "td", "props": [("border", "1px solid grey !important")]},
               {"selector": "th", "props": [("border", "1px solid grey !important")]}  ]
               
# Display the per-annotator counts with the borders defined above.
df_entity_count.style.set_table_styles(table_style)

Unnamed: 0,annotator,cnt,unique_cnt
0,http://data-issa.cirad.fr/DBPediaSpotlight,1876939,73390
1,http://data-issa.cirad.fr/EntityFishing,4480307,142397
2,http://data-issa.cirad.fr/GeographicEntityExtractor,247790,12028
3,http://data-issa.cirad.fr/Pyclinrec,3564076,17382


In [193]:
# Paginated query template: for one annotator, lists every entity URI with the
# number of distinct papers it annotates and the min/max confidence, ordered by
# descending paper count.
# Placeholders, in order: %s = annotator (e.g. issa:EntityFishing),
# %d = OFFSET, %d = LIMIT.
# The label is the first one available among: English label, French rdfs:label,
# language-independent gn:name (see the COALESCE at the end of the inner query).
annotations_per_entity_count_query = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>
PREFIX issapr:  <http://data-issa.cirad.fr/property/> 

SELECT ?uri ?entityLabel ?cnt ?min_conf ?max_conf
FROM <http://data-issa.cirad.fr/graph/dbpedia-spotlight-nes>
FROM <http://data-issa.cirad.fr/graph/entity-fishing-nes>
FROM <http://data-issa.cirad.fr/graph/geographic-nes>
FROM <http://data-issa.cirad.fr/graph/pyclinrec-nes>
FROM <http://data-issa.cirad.fr/graph/dbpedia-named-entities>
FROM <http://data-issa.cirad.fr/graph/wikidata-named-entities>
FROM <http://geonames.org/graph>
FROM <http://agrovoc.fao.org/graph>
WHERE {

      SELECT ?uri ?entityLabel ?cnt ?min_conf ?max_conf
      WHERE {

      {
      SELECT ?uri ?annotator
            (count(distinct ?paper) as ?cnt)
            (min(?conf) as ?min_conf)
            (max(?conf) as ?max_conf)

      WHERE {
            ?entity a prov:Entity;
            oa:hasBody ?uri;
            schema:about ?paper;
            issapr:confidence ?conf;
            prov:wasAttributedTo ?annotator.
            FILTER (?annotator = %s) # issa:EntityFishing | issa:DBPediaSpotlight | issa:GeographicEntityExtractor | issa:Pyclinrec
            }
      GROUP by  ?uri ?annotator
      }
      
      OPTIONAL {
            ?uri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName ?entityLabel_en.
            FILTER langMatches(lang(?entityLabel_en), "en" )
      }
      # for DBPedia.fr
      OPTIONAL {
            ?uri rdfs:label ?entityLabel_fr.
            FILTER langMatches(lang(?entityLabel_fr), "fr" )
      }
      # for Geonames
      OPTIONAL {
            # |gn:alternateName||gn:shortName
            ?uri gn:name ?entityLabel_nolang.
      }
      
            BIND(COALESCE(?entityLabel_en, ?entityLabel_fr, ?entityLabel_nolang) as ?entityLabel)
      }
      ORDER BY DESC(?cnt)
}
OFFSET %d
LIMIT %d
'''

In [194]:
# Maps each vocabulary key to the prefixed name of the annotator (NER tool)
# that is substituted into the annotator FILTER of the query templates.
annotators = {'wikidata': 'issa:EntityFishing',
              'geonames': 'issa:GeographicEntityExtractor',
              'dbpedia' : 'issa:DBPediaSpotlight',
              'agrovoc' : 'issa:Pyclinrec'}

In [195]:
# In some queries a long alternation of property paths to a label proves to be too expensive.
# For these queries the path is restricted per vocabulary and passed in by string substitution.
# Keys match those of `annotators`; values are SPARQL property paths.
label_path = {'wikidata': 'rdfs:label|skos:prefLabel|dct:title|schema:name',
              'geonames': 'gn:officialName',
              'dbpedia' : 'rdfs:label|skos:prefLabel|dct:title|schema:name',
              'agrovoc' : 'skosxl:prefLabel/skosxl:literalForm'}

## Annotation quality assessment per vocabulary
Select one of the vocabularies and run the cells below

In [None]:
# Vocabulary analysed by the cells below; used as a key into `annotators` and `label_path`.
vocab = 'agrovoc'

In [None]:
# Per-entity annotation counts for the selected vocabulary, fetched page by
# page. SPARQL values arrive as strings, hence the numeric casts.
df_ne_count = {}
key, value = vocab, annotators[vocab]
df_ne_count[key] = (
    sparql_service_to_dataframe_with_scrollable_cursor(
        issa_agritrop_endpoint,
        annotations_per_entity_count_query,
        (value, ),
    )
    .astype({'cnt': 'int', 'min_conf': 'float', 'max_conf': 'float'})
)
print(annotators[key], df_ne_count[key].shape)

In [None]:
#import pickle
#with open('df_ne_count.pkl', 'wb') as f:
#    pickle.dump(df_ne_count, f)

#with open('df_ne_count.pkl', 'rb') as f:
#    df_ne_count = pickle.load(f)

In [None]:
# Summary statistics of the numeric columns, one row per column, shown with
# two decimals and the shared table borders.
(
    df_ne_count[vocab]
    .describe()
    .transpose()
    .style
    .format(precision=2)
    .set_table_styles(table_style)
)

OBSERVATIONS: 
- AGROVOC: Confidence score is always 1.0. It should be addressed in the *pyclinrec* library.
- GEONAMES: 
- WIKIDATA: we don't filter the wikidata NE but we probably should as min confidence is 0.35.
- DBPEDIA: we do filter the DBpedia NEs by setting up the Spotlight parameters, therefore the min confidence is 0.75. Otherwise there would be a lot of NEs.

### Most popular entities 

In [None]:
# First 20 rows; the query orders its rows by descending count.
df_ne_count[vocab].head(20)

In [None]:
# Horizontal bar chart of the first 20 rows (the query returns rows ordered by
# descending count).
top_entities = df_ne_count[vocab].head(20)

# Entities without any label come back as None, which matplotlib cannot use as
# a category name, so those bars are labelled with the entity URI instead.
bar_labels = top_entities.entityLabel.fillna(top_entities.uri)

fig, ax = plt.subplots(figsize=(12, 6))
fig.suptitle(f'Most popular {vocab} named entities')

ax.barh(bar_labels, width=top_entities.cnt)
ax.set_xlabel('number of distinct documents')  # cnt = count(distinct ?paper)
ax.invert_yaxis()  # highest count at the top

plt.show()

OBSERVATIONS: 
- AGROVOC: top named entities are general concepts
- GEONAMES: as expected 
- WIKIDATA: as expected the top named entities are either geographic or agriculture related.
- DBPEDIA: not as agricultural as wikidata. Odd one: *Seconde Guerre Mondiale*.

In [None]:
fig = plt.figure(figsize=(18, 6))
fig.suptitle(f'Distribution of per entity counts for {vocab}')

def show_hist(i, df_f1, range=None, title=None):
    """
    Draw a 10-bin histogram of `df_f1.cnt` in panel `i` (0-based) of a 1x3 grid.

    Parameters
    ----------
    i : index of the subplot, 0 to 2.
    df_f1 : DataFrame with a numeric `cnt` column.
    range : optional (lower, upper) tuple restricting the histogram bins.
    title : title of the panel.

    The mean and median of the whole `cnt` column (not only of the selected
    range) are marked with vertical lines and labelled.
    """
    mean_cnt = df_f1.cnt.mean()
    median_cnt = df_f1.cnt.median()

    ax = plt.subplot(1, 3, i + 1)
    ax.hist(df_f1.cnt, 10, color='lightblue', range=range)

    # Place the labels relative to the actual height of the histogram instead
    # of at fixed y values, so they stay inside the panel whatever the scale.
    y_top = ax.get_ylim()[1]

    ax.axvline(mean_cnt, color='indigo')
    ax.text(mean_cnt + 0.01, 0.5 * y_top, 'mean=%.2f' % mean_cnt, color="indigo")

    ax.axvline(median_cnt, color="blue")
    ax.text(median_cnt + 0.01, 0.8 * y_top, 'median=%.2f' % median_cnt, color="blue")

    ax.set_title(title)


show_hist(0, df_ne_count[vocab], title='All counts')
show_hist(1, df_ne_count[vocab], range=(df_ne_count[vocab].cnt.quantile(0.05),
                                        df_ne_count[vocab].cnt.quantile(0.95)),
          title='middle 90%')
show_hist(2, df_ne_count[vocab], range=(df_ne_count[vocab].cnt.quantile(0.25),
                                        df_ne_count[vocab].cnt.quantile(0.75)),
          title='Middle 50%')

### Missing labels

In [None]:
# Entities for which no label was found; the URI column is rendered as links.
(
    df_ne_count[vocab]
    .loc[df_ne_count[vocab].entityLabel.isna()]
    .style
    .format({'uri': hyperlink})
)

In [None]:
# (rows, columns) of the entities without a label; the first number is the count of such entities.
df_ne_count[vocab][df_ne_count[vocab].entityLabel.isna()].shape

OBSERVATIONS: 
- AGROVOC: no missing values!
- GEONAMES:
- WIKIDATA: investigate why 463 entities do not have labels
- DBPEDIA:

### Named entities recognised from short words (surface <= 3 characters)

In [None]:
# Query template: (surface form, entity) pairs of one annotator, restricted by
# the length of the surface form, with the number of occurrences of each pair.
# Placeholders, in order: %s = annotator (e.g. issa:Pyclinrec),
# %s = comparison applied to the surface length (e.g. '< 4'),
# %s = property path to the English label, %s = property path to the French label.
# The inner GROUP BY lists only ?surface and ?entityUri: ?entityLabel is bound
# in the outer query only, so it was a meaningless grouping key.
short_text_query_templ = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>
PREFIX issapr:  <http://data-issa.cirad.fr/property/>

SELECT ?surfaceLen ?surface ?entityLabel_en ?entityLabel_fr ?entityLabel ?entityUri ?cnt
FROM <http://data-issa.cirad.fr/graph/dbpedia-spotlight-nes>
FROM <http://data-issa.cirad.fr/graph/entity-fishing-nes>
FROM <http://data-issa.cirad.fr/graph/geographic-nes>
FROM <http://data-issa.cirad.fr/graph/pyclinrec-nes>
FROM <http://data-issa.cirad.fr/graph/dbpedia-named-entities>
FROM <http://data-issa.cirad.fr/graph/wikidata-named-entities>
FROM <http://geonames.org/graph>
FROM <http://agrovoc.fao.org/graph>
WHERE {
      {SELECT ?surface ?entityUri (count(?surface) as ?cnt) (max(?surfaceLEN) as ?surfaceLen)
      WHERE {
            #VALUES (?entity) {(<http://data-issa.cirad.fr/ann/00006684522793ef568fee34752ac6371f33d3c3>)}
            ?entity a prov:Entity;
                  oa:hasBody ?entityUri;
                  oa:hasTarget/oa:hasSelector ?selector;
                  oa:hasTarget/oa:hasSource ?source;
                  issapr:confidence ?conf;
                  prov:wasAttributedTo ?annotator.
            FILTER (?annotator = %s) # issa:EntityFishing | issa:DBPediaSpotlight | issa:GeographicEntityExtractor | issa:Pyclinrec

            ?selector oa:exact ?surface;
                  oa:start ?start.

            BIND ( STRLEN(?surface) as ?surfaceLEN )
            FILTER ( ?surfaceLEN %s )
            }
      GROUP BY ?surface ?entityUri
      }
      OPTIONAL {
            #?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel.
            ?entityUri %s ?entityLabel_en.
            FILTER langMatches(lang(?entityLabel_en), "en" )
      }
      OPTIONAL {
            ?entityUri %s ?entityLabel_fr.
            FILTER langMatches(lang(?entityLabel_fr), "fr" )
      }
      # for Geonames
      #OPTIONAL {
      #      ?entityUri gn:name ?entityLabel_nolang.
      #}

      BIND(COALESCE(?entityLabel_en, ?entityLabel_fr, ?entityLabel_nolang) as ?entityLabel)
}
ORDER BY ?surface ?entityLabel
'''

In [None]:
# Short surface forms (fewer than 4 characters) for the selected vocabulary.
# SPARQL values arrive as strings, so the numeric columns are cast to integers.
df_short_text_count = {}

df_short_text_count[vocab] = (
    sparql_service_to_dataframe(
        issa_agritrop_endpoint,
        short_text_query_templ % (annotators[vocab], '< 4', label_path[vocab], label_path[vocab]),
    )
    .astype({'cnt': 'int', 'surfaceLen': 'int'})
)
print(annotators[vocab], df_short_text_count[vocab].shape)

#### len(surface) == 1

In [None]:
# Annotations whose surface form is a single character, sorted by English label and count.
df_short_text_1 = df_short_text_count[vocab].loc[lambda frame: frame.surfaceLen == 1]

df_short_text_1.sort_values(by=['entityLabel_en', 'cnt'])

OBSERVATIONS: 
- AGROVOC: filtering surface string >= 3 in effect
- GEONAMES:
- WIKIDATA: 
- DBPEDIA: 

#### len(surface) == 2

In [None]:
pd.options.display.max_rows = 10

# Annotations whose surface form has exactly two characters, sorted by English label and count.
df_short_surface_2 = df_short_text_count[vocab].loc[lambda frame: frame.surfaceLen == 2]

df_short_surface_2.sort_values(by=['entityLabel_en', 'cnt'])

CONCLUSION: 
- AGROVOC: the 2 letter surface also does not match very well with the exceptions of UK, EU, and pH. I think they should be filtered as well (0.4% of all) => filtering surface string >= 3 in effect
- GEONAMES: hard to tell, some of the 2 letter entities look ok, like *UK* or *US*, but some are questionable, like *LA* or *SC*. Removing these entities would mean removing 4% of annotations. Definitely needs a closer look.
- WIKIDATA: see the conclusion above
- DBPEDIA: see the conclusion  above

#### len(surface) == 3

In [None]:
pd.options.display.max_rows = 100

# Annotations whose surface form has exactly three characters; the 20 most frequent are shown.
df_short_text_3 = df_short_text_count[vocab].loc[lambda frame: frame.surfaceLen == 3]

(
    df_short_text_3
    .sort_values(by=['cnt', 'entityLabel_en'], ascending=False)
    .head(20)
)


CONCLUSION: 
- AGROVOC: for the populous 3 letter words the annotation is pretty accurate. For less popular words and abbreviations it's hard to tell without the context
- GEONAMES:
- WIKIDATA: hard to tell; some of the 3 letter entities look ok, like *CWD* (Coffee Wilt Disease). Definitely needs a closer look.
- DBPEDIA:

##### surface text starts with digit

In [None]:
pd.options.display.max_rows = 10

# Short surface forms whose first character is a digit.
# str[:1] (rather than str[0]) yields '' for an empty surface, so the mask
# stays strictly boolean; str[0] would produce NaN there, and .loc rejects a
# mask that contains NaN.
df_short_text_count[vocab].loc[df_short_text_count[vocab].surface.str[:1].str.isdigit()]

CONCLUSION: 
- AGROVOC: filtering of surfaces that start with digits or punctuation is in effect
- GEONAMES: the text *7E* annotated as *Paris 07 Palais-Bourbon* is most likely junk. But having one junk annotation is insignificant.
- WIKIDATA: Consider filtering out the short text that starts with a digit.
- DBPEDIA: the same as above

##### surface matches (or not) the entity label

In [None]:
pd.options.display.max_rows = 10

def unCapitalise(s):
    """
    Lower-case the first letter of a capitalised word.

    Only strings whose first character is upper case and whose second
    character is lower case are changed (e.g. 'Rice' -> 'rice'); acronyms and
    mixed-case forms such as 'UK' or 'pH' are returned unchanged. Strings
    shorter than two characters are returned unchanged instead of raising an
    IndexError.
    """
    if len(s) >= 2 and s[0].isupper() and s[1].islower():
        return s[:1].lower() + s[1:]
    return s


# Three-letter surface forms that, once un-capitalised, equal neither the
# English nor the French label of their entity; most frequent first.
(
    df_short_text_3
    .loc[lambda frame: (frame.surface.apply(unCapitalise) != frame.entityLabel_en)
                       & (frame.surface.apply(unCapitalise) != frame.entityLabel_fr)]
    .sort_values(by=['cnt', 'entityLabel_en'], ascending=False)
)

CONCLUSION: 
- AGROVOC: 3 letter surfaces annotations look pretty good especially for the mostly used. The percentage of not-correct annotations should be negligible. 
- GEONAMES: geonames labels try to expand the abbreviations, which is why there is no match between 3 letter text and concept labels.
- WIKIDATA: hard to judge the abbreviations, but non-abbreviated 3 letter text looks good.
- DBPEDIA: the same

### Surface forms that begin with non-letters

In [105]:
# Query template: (surface form, entity) pairs of one annotator whose surface
# form does NOT start with a Latin letter, with the number of occurrences.
# Placeholders, in order: %s = annotator (e.g. issa:Pyclinrec),
# %s = property path to the English label, %s = property path to the French label.
# Regex notes: the character class contains no commas (inside [...] a comma is
# a literal, so surfaces starting with ',' used to be treated as letters), and
# the accented range is split to leave out the multiplication (U+00D7) and
# division (U+00F7) signs that sit inside À-ÿ.
# The inner GROUP BY lists only ?surface and ?entityUri: ?entityLabel is bound
# in the outer query only.
text_starts_with_non_letter_query_templ = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>
PREFIX issapr:  <http://data-issa.cirad.fr/property/>

SELECT ?surfaceLen ?surface ?entityLabel_en ?entityLabel_fr ?entityLabel ?entityUri ?cnt
FROM <http://data-issa.cirad.fr/graph/dbpedia-spotlight-nes>
FROM <http://data-issa.cirad.fr/graph/entity-fishing-nes>
FROM <http://data-issa.cirad.fr/graph/geographic-nes>
FROM <http://data-issa.cirad.fr/graph/pyclinrec-nes>
FROM <http://data-issa.cirad.fr/graph/dbpedia-named-entities>
FROM <http://data-issa.cirad.fr/graph/wikidata-named-entities>
FROM <http://geonames.org/graph>
FROM <http://agrovoc.fao.org/graph>
WHERE {
      {SELECT ?surface ?entityUri (count(?surface) as ?cnt) (max(?surfaceLEN) as ?surfaceLen)
      WHERE {
            #VALUES (?entity) {(<http://data-issa.cirad.fr/ann/00006684522793ef568fee34752ac6371f33d3c3>)}
            ?entity a prov:Entity;
                  oa:hasBody ?entityUri;
                  oa:hasTarget/oa:hasSelector ?selector;
                  oa:hasTarget/oa:hasSource ?source;
                  issapr:confidence ?conf;
                  prov:wasAttributedTo ?annotator.
            FILTER (?annotator = %s) # issa:EntityFishing | issa:DBPediaSpotlight | issa:GeographicEntityExtractor | issa:Pyclinrec

            ?selector oa:exact ?surface;
                  oa:start ?start.

            BIND ( STRLEN(?surface) as ?surfaceLEN )
            FILTER ( ! REGEX(?surface, "^[a-zA-ZÀ-ÖØ-öø-ÿ]")  ) # Greek letters ,\\p{Greek},µ
            }
      GROUP BY ?surface ?entityUri
      }
      OPTIONAL {
            #?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel.
            ?entityUri %s ?entityLabel_en.
            FILTER langMatches(lang(?entityLabel_en), "en" )
      }
      OPTIONAL {
            ?entityUri %s ?entityLabel_fr.
            FILTER langMatches(lang(?entityLabel_fr), "fr" )
      }
      # for Geonames
      OPTIONAL {
            ?entityUri gn:name ?entityLabel_nolang.
      }

      BIND(COALESCE(?entityLabel_en, ?entityLabel_fr, ?entityLabel_nolang) as ?entityLabel)
}
ORDER BY ?surface ?entityLabel
'''

In [106]:
# Surface forms that do not start with a letter, for the selected vocabulary.
# SPARQL values arrive as strings, so the numeric columns are cast to integers.
df_non_letter_count = {}

df_non_letter_count[vocab] = (
    sparql_service_to_dataframe(
        issa_agritrop_endpoint,
        text_starts_with_non_letter_query_templ % (annotators[vocab], label_path[vocab], label_path[vocab]),
    )
    .astype({'cnt': 'int', 'surfaceLen': 'int'})
)
print(annotators[vocab], df_non_letter_count[vocab].shape)

issa:Pyclinrec (0, 7)


In [107]:
# All non-letter surface forms in alphabetical order of the surface text.
df_non_letter_count[vocab].sort_values(by=['surface'])
                  

Unnamed: 0,surfaceLen,surface,entityLabel_en,entityLabel_fr,entityLabel,entityUri,cnt


In [108]:
# Total number of annotations whose surface form has 4 characters and whose
# French label consists of digits only.
# NOTE(review): presumably this isolates the four-digit years mentioned in the
# DBPEDIA observation below — confirm.
df_non_letter_count[vocab].loc[(df_non_letter_count[vocab].surfaceLen==4) & 
                              ( df_non_letter_count[vocab].entityLabel_fr.str.isdigit()) ].cnt.sum()


0

In [109]:
# Quick look at the first rows of the non-letter surface forms.
df_non_letter_count[vocab].head()

Unnamed: 0,surfaceLen,surface,entityLabel_en,entityLabel_fr,entityLabel,entityUri,cnt


In [110]:
# Total number of annotations on surface forms that do not start with a letter.
df_non_letter_count[vocab].cnt.sum()

0

OBSERVATIONS:
- AGROVOC: filtering in effect 
- GEONAMES: 24 surface forms that start with various non-alphabetical and non-digit characters. Some of the annotations look ok, though the offsets are incorrect. So I think it would be better to filter them out too; we would not lose much information.
- WIKIDATA: about 1% of entity-fishing annotations correspond to text that begins with non-alphabetical characters. In most cases they are interpreted as:
    - country calling codes - e.g. *+223*	= *Telephone numbers in Mali* - correct and clever but irrelevant for the corpus
    - file extensions - e.g. *.csv file*	= *comma-separated values* - also correct and clever but irrelevant
    - quantity annotations - e.g. *0.31%* = *1998 United States Senate elections* - mostly meaningless
    - dates - e.g. *01/09* = *September 1* - correct but not interesting I suppose
    - chemical formulas - e.g. *1-MCP* = *1-methylcyclopropene*  - these could be good to keep but hard to distinguish
    - biological names - e.g. *β2-microglobulin* = *Beta-2-microglobulin* - these could be good to keep, and if they begin with a Greek letter that is easy to do
    - units - e.g. *μmol* = *mole* - also good to keep
    
    It looks like the majority of annotations for text starting with a digit are meaningless and can be filtered out; we would only lose the chemical and biological entities along the way. Other starting characters are ok, including the Greek letters.
- DBPEDIA: 2.5% of dbpedia spotlight annotations correspond to text that begins with non-alphabetical characters. Quite a lot! And it seems that 70% of them are annotations of years in the French DBpedia. The other issues are similar to the wikidata case. Thus, if we filter out the annotations for surface forms starting with a non-alphabetical character, in the worst case scenario we would lose less than 0.5% of good annotations.  



### Multiple entities for one piece of text

#### More than 3 named entities per text

In [127]:
# Query template: text spans (same source, surface form and start offset) to
# which one annotator attached several entities, most entities first.
# Placeholders, in order: %s = annotator (e.g. issa:Pyclinrec),
# %s = comparison applied to the number of entities per span (e.g. '> 3').
# The concatenated entity URIs (?entities) are computed by the subquery but
# only returned when the commented-out projection is re-enabled.
multiple_entities_query_templ = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>


SELECT ?source ?surface ?start ?cnt 
       #?entities 
WHERE {
  { SELECT ?source ?surface ?start 
          (count(?entityUri) as ?cnt)
          (group_concat(?entityUri; separator=", ") as ?entities) 
    WHERE {
          #VALUES (?entity) {(<http://data-issa.cirad.fr/ann/00006684522793ef568fee34752ac6371f33d3c3>)}
          ?entity a prov:Entity;
                  oa:hasBody ?entityUri;       
                  oa:hasTarget/oa:hasSelector ?selector;
                  oa:hasTarget/oa:hasSource ?source;
                  prov:wasAttributedTo ?annotator.
          FILTER (?annotator = %s) # issa:EntityFishing | issa:DBPediaSpotlight | issa:GeographicEntityExtractor | issa:Pyclinrec

          ?selector oa:exact ?surface;
                    oa:start ?start.
          } 
    GROUP BY ?source ?surface ?start
    HAVING ((count(?entityUri) %s ) && (count(?entityUri) > 1) )
  }

}
order by desc(?cnt)
'''

In [132]:
pd.options.display.max_colwidth = 100

# Keep the results already collected for other vocabularies when this cell is
# re-run; start with an empty dict only on the first run.
if 'df_multi_entities' not in locals():
    df_multi_entities = {}

# Text spans annotated with more than 3 entities by the selected annotator.
df_multi_entities[vocab] = (
    sparql_service_to_dataframe(
        issa_agritrop_endpoint,
        multiple_entities_query_templ % (annotators[vocab], '> 3'),
    )
    .astype({'cnt': 'int'})
)

print(annotators[vocab], df_multi_entities[vocab].shape)

issa:Pyclinrec (107, 4)


In [153]:
pd.options.display.max_rows = 10

# Spans with the largest number of attached entities first.
df_multi_entities[vocab].sort_values(by=['cnt'], ascending=False)

Unnamed: 0,source,surface,start,cnt,repeat
0,http://data-issa.cirad.fr/document/593761#body_text,seeds in seed,655,98,False
1,http://data-issa.cirad.fr/document/459218#body_text,dried in a drying,10073,37,False
2,http://data-issa.cirad.fr/document/593680#abstract,espèce par espèce,1516,36,True
4,http://data-issa.cirad.fr/document/465032#abstract,espèce par espèce,1289,36,True
5,http://data-issa.cirad.fr/document/486470#abstract,espèce par espèce,706,36,True
...,...,...,...,...,...
88,http://data-issa.cirad.fr/document/455036#abstract,Gallus gallus,91,5,True
103,http://data-issa.cirad.fr/document/597318#abstract,Carassius carassius,1270,4,True
104,http://data-issa.cirad.fr/document/513625#abstract,Bison bison,228,4,True
105,http://data-issa.cirad.fr/document/553570#abstract,organic carbon carbon,1848,4,True


In [134]:
def detect_repeat(string_to_split):
    """
    Tell whether a surface form contains the same word more than once.

    The text is lower-cased and split on spaces and hyphens, so both
    'Gallus gallus' and 'sud-sud-ouest' are reported as repeats.
    Empty tokens produced by adjacent separators (e.g. 'soil - water' or a
    double space) are discarded; otherwise they would be counted as a
    repeated "word" and yield a false positive.

    Parameters
    ----------
    string_to_split : str
        The annotated text (surface form).

    Returns
    -------
    bool
        True if at least one non-empty token occurs more than once.
    """
    import re
    from collections import Counter

    tokens = [tok for tok in re.split('-| ', string_to_split.lower()) if tok]
    # default=0 covers the case where no token is left (empty input)
    return max(Counter(tokens).values(), default=0) > 1

# Flag the surface forms in which a word occurs more than once.
df_multi_entities[vocab]['repeat'] = df_multi_entities[vocab]['surface'].map(detect_repeat)

In [155]:
# Surface forms containing a repeated word, highest entity count first.
# Append .style.format({'source': hyperlink}) to render the sources as links.
pd.set_option('display.max_rows', 10)
(
    df_multi_entities[vocab]
    .loc[df_multi_entities[vocab]['repeat']]
    .sort_values(by=['cnt', 'surface'], ascending=False)
)

Unnamed: 0,source,surface,start,cnt,repeat
2,http://data-issa.cirad.fr/document/593680#abstract,espèce par espèce,1516,36,True
3,http://data-issa.cirad.fr/document/388166#abstract,espèce par espèce,125,36,True
4,http://data-issa.cirad.fr/document/465032#abstract,espèce par espèce,1289,36,True
5,http://data-issa.cirad.fr/document/486470#abstract,espèce par espèce,706,36,True
6,http://data-issa.cirad.fr/document/457753#body_text,espèce par espèce,6168,36,True
...,...,...,...,...,...
101,http://data-issa.cirad.fr/document/599862#body_text,Anser anser,1665,5,True
106,http://data-issa.cirad.fr/document/414119#body_text,sud-sud-ouest,2881,4,True
105,http://data-issa.cirad.fr/document/553570#abstract,organic carbon carbon,1848,4,True
103,http://data-issa.cirad.fr/document/597318#abstract,Carassius carassius,1270,4,True


In [157]:
# Surface forms without any repeated word, highest entity count first.
pd.set_option('display.max_rows', 10)
(
    df_multi_entities[vocab]
    .loc[df_multi_entities[vocab]['repeat'].eq(False)]
    .sort_values(by=['cnt', 'surface'], ascending=False)
)

Unnamed: 0,source,surface,start,cnt,repeat
0,http://data-issa.cirad.fr/document/593761#body_text,seeds in seed,655,98,False
1,http://data-issa.cirad.fr/document/459218#body_text,dried in a drying,10073,37,False
10,http://data-issa.cirad.fr/document/569874#body_text,cells of cell,5210,30,False
9,http://data-issa.cirad.fr/document/575027#body_text,cells in cell,1145,30,False
31,http://data-issa.cirad.fr/document/413880#body_text,liquide-liquid,1962,16,False
86,http://data-issa.cirad.fr/document/578565#body_text,land uses using,4772,8,False


In [158]:
# Complementary query: get all the entities for a specific text.
# For one annotated text, identified by its source URI and start offset, the query
# returns every entity URI attached by the given annotator, together with the
# English and French labels of the entity when such labels exist.
# Placeholders, in order: source URI (%s), start offset (%d), annotator (%s).

entities_for_surface_query_templ = ''' 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>
PREFIX issapr:  <http://data-issa.cirad.fr/property/> 

SELECT ?source ?start ?surface ?entityUri ?entityLabel_en ?entityLabel_fr
WHERE 
{
    VALUES (?source) {(<%s>)}
    VALUES (?start) {(%d)}
    
    ?entity a prov:Entity;
            oa:hasBody ?entityUri;       
            oa:hasTarget/oa:hasSelector ?selector;
            oa:hasTarget/oa:hasSource ?source;
            prov:wasAttributedTo ?annotator.
    FILTER (?annotator = %s)

    ?selector oa:exact ?surface;
              oa:start ?start.

    OPTIONAL {
        ?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel_en.
        FILTER langMatches(lang(?entityLabel_en), "en" )  }

	OPTIONAL {
	    ?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel_fr.
        FILTER langMatches(lang(?entityLabel_fr), "fr" ) }
}
'''

# Annotated text to inspect: copy/paste the source and start values here.
source = 'http://data-issa.cirad.fr/document/593761#body_text'
start  = 655

df_nes = sparql_service_to_dataframe(
    issa_agritrop_endpoint,
    entities_for_surface_query_templ % (source, start, annotators[vocab]),
)
print(df_nes.shape)
df_nes

(98, 6)


Unnamed: 0,source,start,surface,entityUri,entityLabel_en,entityLabel_fr
0,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_14170,seed inoculation,inoculation des semences
1,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_330719,cultivar selection,sélection de cultivars
2,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_330749,sowing methods,méthode d'ensemencement
3,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_12827,niger seed,niger (plante)
4,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_18705,castor beans,ricin
...,...,...,...,...,...,...
93,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_330627,wet seeding,
94,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_f3019d2b,seed shattering,
95,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_3d8dc048,seed aging,
96,http://data-issa.cirad.fr/document/593761#body_text,655,seeds in seed,http://aims.fao.org/aos/agrovoc/c_330767,sprouting of seed,


OBSERVATIONS: 
- AGROVOC: The number of overlapping entities has been reduced tenfold by the new version of pyclinrec. However, the problem of multiple (> 3) entities for surfaces with repeated words still persists, although on a smaller scale. 
- GEONAMES: no overlapping.
- WIKIDATA: no overlapping.
- DBPEDIA: no overlapping.

#### 2 or 3 entities for one surface form 
The vast majority of them do not contain repeated text.

In [147]:
# Keep the per-vocabulary results of earlier runs when this cell is re-executed.
if 'df_few_entities' not in locals():
    df_few_entities = {}

# The '< 4' condition is combined with the template's own '> 1' condition,
# i.e. annotated texts linked to 2 or 3 entities.
df_few_entities[vocab] = sparql_service_to_dataframe(
    issa_agritrop_endpoint,
    multiple_entities_query_templ % (annotators[vocab], '< 4'),
).astype({'cnt': 'int'})

print(annotators[vocab], df_few_entities[vocab].shape)

issa:Pyclinrec (23984, 4)


In [159]:
# Highest entity count first, ties ordered by surface form (descending).
(
    df_few_entities[vocab]
    .sort_values(by=['cnt', 'surface'], ascending=False)
)

Unnamed: 0,source,surface,start,cnt
1007,http://data-issa.cirad.fr/document/568930#title,soils polluted,51,3
529,http://data-issa.cirad.fr/document/591757#abstract,soil salinization,4300,3
762,http://data-issa.cirad.fr/document/551785#abstract,soil salinization,107,3
1441,http://data-issa.cirad.fr/document/561446#abstract,soil salinization,731,3
1901,http://data-issa.cirad.fr/document/597369#body_text,soil salinization,15680,3
...,...,...,...,...
18408,http://data-issa.cirad.fr/document/455751#body_text,Acide ascorbique,5793,2
18513,http://data-issa.cirad.fr/document/455752#body_text,Acide ascorbique,58,2
19554,http://data-issa.cirad.fr/document/421912#body_text,Acide ascorbique,4375,2
20035,http://data-issa.cirad.fr/document/560487#abstract,Acid soils,12,2


In [161]:
# Flag the surface forms in which a word occurs more than once, then show them.
df_few_entities[vocab]['repeat'] = df_few_entities[vocab]['surface'].map(detect_repeat)
(
    df_few_entities[vocab]
    .loc[df_few_entities[vocab]['repeat']]
    .sort_values(by=['cnt', 'surface'], ascending=False)
)

Unnamed: 0,source,surface,start,cnt,repeat
2312,http://data-issa.cirad.fr/document/484053#body_text,monachus monachus,5550,3,True
5122,http://data-issa.cirad.fr/document/264904#body_text,development stage by stage,340,3,True
1237,http://data-issa.cirad.fr/document/555037#abstract,Sud-est est,10,3,True
2010,http://data-issa.cirad.fr/document/444556#body_text,Sud-Sud-Est,8064,3,True
767,http://data-issa.cirad.fr/document/549787#abstract,Sud-Est est,69,3,True
...,...,...,...,...,...
23792,http://data-issa.cirad.fr/document/488765#body_text,cas par cas,40812,2,True
13654,http://data-issa.cirad.fr/document/599562#body_text,Sprattus sprattus,612,2,True
19223,http://data-issa.cirad.fr/document/420881#body_text,Nord-Est est,69719,2,True
12419,http://data-issa.cirad.fr/document/601907#body_text,Melolontha melolontha,7684,2,True


In [162]:
# Surface forms without any repeated word.
(
    df_few_entities[vocab]
    .loc[df_few_entities[vocab]['repeat'].eq(False)]
    .sort_values(by=['cnt', 'surface'], ascending=False)
)

Unnamed: 0,source,surface,start,cnt,repeat
1007,http://data-issa.cirad.fr/document/568930#title,soils polluted,51,3,False
529,http://data-issa.cirad.fr/document/591757#abstract,soil salinization,4300,3,False
762,http://data-issa.cirad.fr/document/551785#abstract,soil salinization,107,3,False
1441,http://data-issa.cirad.fr/document/561446#abstract,soil salinization,731,3,False
1901,http://data-issa.cirad.fr/document/597369#body_text,soil salinization,15680,3,False
...,...,...,...,...,...
18408,http://data-issa.cirad.fr/document/455751#body_text,Acide ascorbique,5793,2,False
18513,http://data-issa.cirad.fr/document/455752#body_text,Acide ascorbique,58,2,False
19554,http://data-issa.cirad.fr/document/421912#body_text,Acide ascorbique,4375,2,False
20035,http://data-issa.cirad.fr/document/560487#abstract,Acid soils,12,2,False


OBSERVATIONS:
- AGROVOC: a seemingly large number, but in fact only a small percentage (23984/3505441, 0.7%), of surface forms are annotated with 2 or 3 entities.

In [163]:
# Query template: for each (surface text, combination of entity URIs) pair, count
# the annotated occurrences that carry more than one entity.
# Placeholders, in order: annotator (%s), extra condition on the number of
# entities (%s, e.g. '< 4'); the condition '> 1' is always applied.
# The inner GROUP BY lists only variables bound by the pattern (?source, ?surface,
# ?start), consistent with multiple_entities_query_templ.
# NOTE(review): the order of the URIs inside group_concat does not appear to be
# stable, so one set of entities may show up as several ?entities values — confirm.
multiple_surface_entities_templ = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>


SELECT ?surface ?entities  (count(?source) as ?surface_cnt) 
WHERE {
  { SELECT ?source ?surface ?start 
          (count(?entityUri) as ?cnt)
          (group_concat(?entityUri; separator=", ") as ?entities) 
    WHERE {
          ?entity a prov:Entity;
                  oa:hasBody ?entityUri;       
                  oa:hasTarget/oa:hasSelector ?selector;
                  oa:hasTarget/oa:hasSource ?source;
                  prov:wasAttributedTo ?annotator.
          FILTER (?annotator = %s) # issa:EntityFishing | issa:DBPediaSpotlight | issa:GeographicEntityExtractor | issa:Pyclinrec

          ?selector oa:exact ?surface;
                    oa:start ?start.

    } 
    GROUP BY ?source ?surface ?start
    HAVING ((count(?entityUri) %s) && (count(?entityUri) > 1) )
    }

}
group by ?surface ?entities
#order by desc(?cnt)
#limit 100
'''

In [181]:
# Complementary query: fetches the English and French labels of a set of entity URIs.
# It is kept separate because adding the label fetching to the main query makes its
# execution too long.
# The single %s placeholder receives the rows of the VALUES clause, one "(<uri>)"
# per entity.
labels_for_uris_query_templ = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>

SELECT *
WHERE {

    VALUES(?entityUri) { %s }

    OPTIONAL {
        ?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel_en.
        FILTER langMatches(lang(?entityLabel_en), "en" )  }

	OPTIONAL {
	    ?entityUri rdfs:label|skos:prefLabel|(skosxl:prefLabel/skosxl:literalForm)|dct:title|schema:name|gn:officialName|gn:alternateName|gn:name|gn:shortName ?entityLabel_fr.
        FILTER langMatches(lang(?entityLabel_fr), "fr" ) }

}

'''
def decorate_uris_multi(val):
    """
    Turn a ', '-separated list of URIs into rows for a SPARQL VALUES clause.

    Every URI is wrapped as '(<uri>)' and the rows are joined with a single space.
    """
    return ' '.join(map('(<{}>)'.format, val.split(', ')))

def fetch_entity_labels(entityUris):
    """
    Query the ISSA Agritrop endpoint for the labels of the given entities.

    entityUris is a ', '-separated string of URIs; the result is the DataFrame
    built from the response to labels_for_uris_query_templ.
    """
    values_rows = decorate_uris_multi(entityUris)
    labels_query = labels_for_uris_query_templ % values_rows
    return sparql_service_to_dataframe(issa_agritrop_endpoint, labels_query)

def entity_labels(entityUris):
    """
    Collect the English and French labels of the given entities.

    entityUris is a ', '-separated string of URIs. Returns a two-item list
    [english_labels, french_labels]; each item is the ', '-joined content of the
    corresponding label column, with missing labels replaced by an empty string.
    """
    labels = fetch_entity_labels(entityUris).fillna('')
    english = ', '.join(labels['entityLabel_en'])
    french = ', '.join(labels['entityLabel_fr'])
    return [english, french]


In [176]:
# Keep the per-vocabulary results of earlier runs when this cell is re-executed.
df_multi_entities_3 = df_multi_entities_3 if 'df_multi_entities_3' in locals() else {}

# Run the query only when the vocabulary has not been loaded yet: the cell then
# works on a fresh kernel (Restart & Run All) and does not repeat the query when
# it is re-executed.
if vocab not in df_multi_entities_3:
    df_multi_entities_3[vocab] = sparql_service_to_dataframe(
        issa_agritrop_endpoint,
        multiple_surface_entities_templ % (annotators[vocab], '< 4'),
    )

# The count column is cast to int so that it sorts numerically.
df_multi_entities_3[vocab] = df_multi_entities_3[vocab].astype({'surface_cnt': 'int'})
print(annotators[vocab] , df_multi_entities_3[vocab].shape)

issa:Pyclinrec (1599, 3)


In [182]:
# One label query per row: resolve the labels of each combination of entity URIs,
# then split the [english, french] pairs into two columns.
s_lbls = df_multi_entities_3[vocab]['entities'].map(entity_labels)
df_multi_entities_3[vocab]['labels_en'] = s_lbls.map(lambda lbls: lbls[0])
df_multi_entities_3[vocab]['labels_fr'] = s_lbls.map(lambda lbls: lbls[1])

In [189]:
# Two different ways to visualize the table; uncomment the preferred one.

# 1) entity URIs combined with the original row index as a two-level index
idx = ['entities', df_multi_entities_3[vocab].index]
(
    df_multi_entities_3[vocab]
    .set_index(idx)
    .sort_values(by=['surface_cnt', 'surface'], ascending=False)
)

# 2) flat table with the entity URIs rendered as links
#df_multi_entities_3[vocab].sort_values(by=['surface_cnt', 'surface'], ascending=False)\
#    .style.format({'entities': hyperlink_multi})

Unnamed: 0_level_0,Unnamed: 1_level_0,surface,surface_cnt,labels_en,labels_fr
entities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"http://aims.fao.org/aos/agrovoc/c_2509, http://aims.fao.org/aos/agrovoc/c_5330, http://aims.fao.org/aos/agrovoc/c_5514",165,oil palm,2556,"palm oils, oil palms, Elaeis guineensis","huile de palme, palmier oléifère, Elaeis guineensis"
"http://aims.fao.org/aos/agrovoc/c_14343, http://aims.fao.org/aos/agrovoc/c_2809",970,exploitations agricoles,871,"smallholders, farms","exploitant agricole, exploitation agricole"
"http://aims.fao.org/aos/agrovoc/c_2509, http://aims.fao.org/aos/agrovoc/c_5330, http://aims.fao.org/aos/agrovoc/c_5514",1449,palm oil,797,"palm oils, oil palms, Elaeis guineensis","huile de palme, palmier oléifère, Elaeis guineensis"
"http://aims.fao.org/aos/agrovoc/c_2221, http://aims.fao.org/aos/agrovoc/c_2222",1292,developing countries,715,"developed countries, developing countries","pays développé, pays en développement"
"http://aims.fao.org/aos/agrovoc/c_2ac7c9e1, http://aims.fao.org/aos/agrovoc/c_8678, http://aims.fao.org/aos/agrovoc/c_9fe82378",1291,agricultural production,673,"agricultural products, agricultural productivity, agricultural production","produit agricole, productivité agricole, production agricole"
"http://aims.fao.org/aos/agrovoc/c_2ac7c9e1, http://aims.fao.org/aos/agrovoc/c_8678, http://aims.fao.org/aos/agrovoc/c_9fe82378",...,...,...,...,...
"http://aims.fao.org/aos/agrovoc/c_2ac7c9e1, http://aims.fao.org/aos/agrovoc/c_8678, http://aims.fao.org/aos/agrovoc/c_9fe82378",656,Agricultural Productions,1,"agricultural products, agricultural productivity, agricultural production","produit agricole, productivité agricole, production agricole"
"http://aims.fao.org/aos/agrovoc/c_9fe82378, http://aims.fao.org/aos/agrovoc/c_8678, http://aims.fao.org/aos/agrovoc/c_2ac7c9e1",937,Agricultural Production,1,"agricultural products, agricultural productivity, agricultural production","produit agricole, productivité agricole, production agricole"
"http://aims.fao.org/aos/agrovoc/c_2604, http://aims.fao.org/aos/agrovoc/c_27513",146,Active enZymes,1,"enzyme activators, enzyme activity","activateur d'enzyme, activité enzymatique"
"http://aims.fao.org/aos/agrovoc/c_34901, http://aims.fao.org/aos/agrovoc/c_89",1585,Acid soils,1,"acid soils, soil pH","sol acide, pH du sol"


OBSERVATIONS: 
- AGROVOC: The annotations with multiple entities look pretty consistent, i.e. the same text is annotated with the same few entities, possibly because an alternative label of one AGROVOC term is identical to the preferred label of another. 
- GEONAMES: no overlapping :thumbsup:
- WIKIDATA: no overlapping :thumbsup:
- DBPEDIA: no overlapping :thumbsup:


### Overlapping surface forms

In [None]:
# SPARQL text for the analysis of overlapping surface forms.
# NOTE(review): as assigned here the string holds only the PREFIX declarations;
# no SELECT/WHERE part is visible in this assignment — confirm where the query
# body is supposed to come from.
overlaping_surface_forms_query = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX oa:     <http://www.w3.org/ns/oa#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX skosxl: <http://www.w3.org/2008/05/skos-xl#>
PREFIX issa:   <http://data-issa.cirad.fr/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX schema: <http://schema.org/>
PREFIX gn:     <http://www.geonames.org/ontology#>

'''