## Generate a mapping for multilabel training
### Given a label, convert it to a list of parent labels

In [1]:
import json

In [2]:
# Read Hierarchy json and convert it into tree
with open('./datasets/material_hierarchy.json', 'r') as f:
  data = json.load(f)
data

{'https://w3id.org/isample/vocabulary/material/0.9/material': {'label': {'en': 'Material'},
  'children': [{'https://w3id.org/isample/vocabulary/material/0.9/anyanthropogenicmaterial': {'label': {'en': 'Any anthropogenic material'},
     'children': [{'https://w3id.org/isample/vocabulary/material/0.9/anthropogenicmetal': {'label': {'en': 'Anthropogenic metal material'},
        'children': [{'https://w3id.org/isample/vocabulary/opencontext/material/0.1/brass': {'label': {'en': 'brass'},
           'children': []}},
         {'https://w3id.org/isample/vocabulary/opencontext/material/0.1/bronze': {'label': {'en': 'bronze'},
           'children': []}},
         {'https://w3id.org/isample/vocabulary/opencontext/material/0.1/copper': {'label': {'en': 'Copper'},
           'children': []}},
         {'https://w3id.org/isample/vocabulary/opencontext/material/0.1/gold': {'label': {'en': 'Gold'},
           'children': []}},
         {'https://w3id.org/isample/vocabulary/opencontext/material/0

In [3]:
# space out multi-word labels or ones that are not mapped directly 
spaced_map = {
    "carbonatenitratemineral": "Mineral-Carbonate or Nitrate",
    "organicmaterial": "Organic material",
    "otheranthropogenicmaterial" : "Anthropogenic material",
    "boratemineral": "Mineral-Borate",
    "liquidwater": "Liquid water",
    "rockorsediment":"Rock/sediment",
    "mixedsoilsedimentrock" : "Soil/sediment/rock",
    "anthropogenicmetal":"Anthropogenic metal material",
    "biogenicnonorganicmaterial":"Biogenic non-organic material",
    "sulfateselenatetelluratemineral": "Mineral-Sulfate, Selenate, or Tellurate",
    "ceramicclay": "Ceramic clay",
    "oxidemineral":"Mineral-Oxide",
    "phosphatearsenatevanadatemineral":"Mineral-Phosphate, Arsenate, or Vanadate",
    "silicategermanatemineral":"Mineral-Silicate or Germanate",
    "sulfidesulfosaltmineral":"Mineral-Sulfide or Sulfosalt",
    "nativeelementmineral":"Mineral-Native Element",
    "halidemineral":"Mineral-Halide",
    "organicmineral":"Mineral-Organic Compound",
    "gas":"gaseous material"
}

In [4]:
parent = {} # store parent of key

In [5]:
prefixes = ["rksd:","mat:","ming:","ocmat:"]
def process_label(value):
    if value is None:
        return value
    value = value.lower().replace("_"," ")
    for prefix in prefixes:
        if value.startswith(prefix):
            value = value[len(prefix):].lower()
            if value in spaced_map:
                value = spaced_map[value]
            break
    return value

In [6]:
leaf_labels = [] # list of labels that are leaf in the hierarchy
def updateParent(dic, key, label, parentLabel):
    
    # store the parent label as parent
    parentLabel = process_label(parentLabel)
    label = process_label(label)
        
    parent[label] = parentLabel
    
    # store if no children exists
    if len(dic[key]["children"]) == 0:
        leaf_labels.append(label)
        
    # recurse
    for child in dic[key]["children"]:

        for key, value in child.items():
            childKey = key
            childLabel = value["label"]["en"].lower()
            updateParent(child,childKey, childLabel, label)

for key in data:
    updateParent(data, key, data[key]["label"]["en"], None)
    break


In [8]:
print(leaf_labels)

['brass', 'bronze', 'copper', 'gold', 'iron', 'lead', 'pewter', 'plastic (material)', 'brick clay', 'bucchero', 'faience', 'porcelain', 'terracotta', 'terra sigilata', 'fiber material', 'glass', 'paper', 'plaster', 'plaster or mortar', 'rubber', 'frozen water', 'amber', 'bone', 'charcoal', 'coal', 'shell', 'dispersed media', 'hematite', 'kaolin', 'mica', 'quartz', 'mineral-borate', 'mineral-carbonate or nitrate', 'mineral-halide', 'mineral-native element', 'mineral-organic compound', 'mineral-oxide', 'mineral-phosphate, arsenate, or vanadate', 'mineral-silicate or germanate', 'mineral-sulfate, selenate, or tellurate', 'mineral-sulfide or sulfosalt', 'mixed soil sediment or rock', 'cinder', 'basalt', 'flint', 'cinder', 'coal', 'dolomite', 'gabbro', 'greywacke', 'limestone', 'marble', 'obsidian', 'pumice', 'slate', 'travertine', 'aphanite', 'breccia', 'cataclasite series', 'mylonitic rock', 'breccia gouge series', 'pyroclastic rock', 'dacite', 'alkali feldspar granite', 'granite', 'grano

## Get list of labels up to depth threshold

In [8]:
# depth 1 threshold (right below material)
depth_level_1 = []
for child, direct_parent in parent.items():
    if direct_parent == "material":
        depth_level_1.append(child)
print(depth_level_1)
print(len(depth_level_1))

['any anthropogenic material', 'any ice', 'biogenic non-organic material', 'dispersed media', 'natural solid material', 'fluid material', 'organic material']
7


In [9]:
# depth 2 threshold
depth_level_2 = [] 
for child, direct_parent in parent.items():
    if direct_parent in depth_level_1:
        depth_level_2.append(child)
print(depth_level_2)
print(len(depth_level_2))

['anthropogenic metal material', 'anthropogenic material', 'anthropogenic organic material', 'frozen water', 'amber', 'bone', 'charcoal', 'shell', 'mineral', 'mixed soil sediment or rock', 'particulate', 'rock or sediment', 'soil', 'gaseous material', 'liquid water', 'non-aqueous liquid material', 'organic animal material', 'organic animal product', 'organic plant material', 'plant material']
20


In [10]:
# depth 3 threshold
depth_level_3 = [] 
for child, direct_parent in parent.items():
    if direct_parent in depth_level_2:
        depth_level_3.append(child)
print(depth_level_3)
print(len(depth_level_3))

['brass', 'bronze', 'copper', 'gold', 'iron', 'lead', 'pewter', 'plastic (material)', 'ceramic clay', 'fiber material', 'glass', 'paper', 'plaster', 'plaster or mortar', 'rubber', 'hematite', 'kaolin', 'mica', 'quartz', 'mineral-borate', 'mineral-carbonate or nitrate', 'mineral-halide', 'mineral-native element', 'mineral-organic compound', 'mineral-oxide', 'mineral-phosphate, arsenate, or vanadate', 'mineral-silicate or germanate', 'mineral-sulfate, selenate, or tellurate', 'mineral-sulfide or sulfosalt', 'rock', 'sediment', 'hair', 'leather', 'wood', 'plant fiber']
35


## Get entire path of parents

In [11]:
# get entire path of parents 
parents = {}
for key, value in parent.items():
    parents[key] = [value]

In [12]:
# expand to get all parents

def expandParents(curr, child): # current node / node we want to get all the parents
    if curr not in parents[child] and curr != child:
        parents[child.lower()].append(curr)
    # expand
    if parent[curr] is not None:
        expandParents(parent[curr],child)

for key in parent:
    expandParents(key, key)

In [13]:
parents # all parents 
print(parents, len(parents))

{'material': [None], 'any anthropogenic material': ['material'], 'anthropogenic metal material': ['any anthropogenic material', 'material'], 'brass': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'bronze': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'copper': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'gold': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'iron': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'lead': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'pewter': ['anthropogenic metal material', 'any anthropogenic material', 'material'], 'anthropogenic material': ['any anthropogenic material', 'material'], 'anthropogenic organic material': ['organic material', 'material'], 'plastic (material)': ['anthropogenic organic material', 'organic material', 'material'], 'ceramic clay': ['anthropogenic mate

## Generate map to convert labels to specific depth threshold

In [14]:
# generate map to convert all labels to depth level 1 
map_to_depth_level_1 = {"material":"material"}
for label, parent_labels in parents.items():
    if len(parent_labels) == 1 and parent_labels[0] == "material":
        map_to_depth_level_1[label] = label
        continue
        
    for parent_label in parent_labels:
        if parent_label in depth_level_1:
            map_to_depth_level_1[label] = parent_label
        
map_to_depth_level_1
assert len(map_to_depth_level_1) == len(parents)
assert len(set(map_to_depth_level_1.values())) == len(depth_level_1) + 1

In [15]:
map_to_depth_level_2 = {"material":"material"}

for label, parent_labels in parents.items():
    if len(parent_labels) == 1 and parent_labels[0] == "material":
        map_to_depth_level_2[label] = label # depth level 1 labels itself
        continue
    if label in depth_level_2:
        map_to_depth_level_2[label] = label
        continue
    for parent_label in parent_labels:
        if parent_label in depth_level_2:
            map_to_depth_level_2[label] = parent_label # found the substituting label
            continue

assert len(map_to_depth_level_2) == len(parents)
assert len(set(map_to_depth_level_2.values())) == len(depth_level_1) + len(depth_level_2) + 1
#map_to_depth_level_2

{'material': 'material',
 'any anthropogenic material': 'any anthropogenic material',
 'anthropogenic metal material': 'anthropogenic metal material',
 'brass': 'anthropogenic metal material',
 'bronze': 'anthropogenic metal material',
 'copper': 'anthropogenic metal material',
 'gold': 'anthropogenic metal material',
 'iron': 'anthropogenic metal material',
 'lead': 'anthropogenic metal material',
 'pewter': 'anthropogenic metal material',
 'anthropogenic material': 'anthropogenic material',
 'anthropogenic organic material': 'anthropogenic organic material',
 'plastic (material)': 'anthropogenic organic material',
 'ceramic clay': 'anthropogenic material',
 'brick clay': 'anthropogenic material',
 'bucchero': 'anthropogenic material',
 'faience': 'anthropogenic material',
 'porcelain': 'anthropogenic material',
 'terracotta': 'anthropogenic material',
 'terra sigilata': 'anthropogenic material',
 'fiber material': 'anthropogenic material',
 'glass': 'anthropogenic material',
 'paper'

In [16]:
map_to_depth_level_3 = {"material":"material"}

for label, parent_labels in parents.items():
    if len(parent_labels) == 1 and parent_labels[0] == "material":
        map_to_depth_level_3[label] = label # depth level 1 labels itself
        continue
    if label in depth_level_3:
        map_to_depth_level_3[label] = label
        continue
    if label in depth_level_2:
        map_to_depth_level_3[label] = label
        continue
        
    for parent_label in parent_labels:
        if parent_label in depth_level_3:
            map_to_depth_level_3[label] = parent_label # found the substituting label
            break # found it 
        elif parent_label in depth_level_2:
            map_to_depth_level_3[label] = parent_label
            break
        elif parent_label in depth_level_1:
            map_to_depth_level_3[label] = parent_label
            break

assert len(map_to_depth_level_3) == len(parents)
#print(len(set(map_to_depth_level_3.values())), len(depth_level_1) , len(depth_level_2) , len(depth_level_3) )
map_to_depth_level_3

63 7 20 35


{'material': 'material',
 'any anthropogenic material': 'any anthropogenic material',
 'anthropogenic metal material': 'anthropogenic metal material',
 'brass': 'brass',
 'bronze': 'bronze',
 'copper': 'copper',
 'gold': 'gold',
 'iron': 'iron',
 'lead': 'lead',
 'pewter': 'pewter',
 'anthropogenic material': 'anthropogenic material',
 'anthropogenic organic material': 'anthropogenic organic material',
 'plastic (material)': 'plastic (material)',
 'ceramic clay': 'ceramic clay',
 'brick clay': 'ceramic clay',
 'bucchero': 'ceramic clay',
 'faience': 'ceramic clay',
 'porcelain': 'ceramic clay',
 'terracotta': 'ceramic clay',
 'terra sigilata': 'ceramic clay',
 'fiber material': 'fiber material',
 'glass': 'glass',
 'paper': 'paper',
 'plaster': 'plaster',
 'plaster or mortar': 'plaster or mortar',
 'rubber': 'rubber',
 'any ice': 'any ice',
 'frozen water': 'frozen water',
 'biogenic non-organic material': 'biogenic non-organic material',
 'amber': 'amber',
 'bone': 'bone',
 'charcoal'

In [17]:
# store the mapping
import json

with open('depth_level_1_mapping.json', 'w') as f:
    json.dump(map_to_depth_level_1, f)
    
with open('depth_level_2_mapping.json', 'w') as f:
    json.dump(map_to_depth_level_2, f)

with open('depth_level_3_mapping.json', 'w') as f:
    json.dump(map_to_depth_level_3, f)

## Read Extension type and add that info

In [18]:
import pandas as pd

In [19]:
gold = pd.read_csv("./datasets/SESAR_CV_labeled.csv")
gold

  gold = pd.read_csv("./datasets/SESAR_CV_labeled.csv")


Unnamed: 0.1,Unnamed: 0,description_supplementMetadata_locality,description_material,description_supplementMetadata_country,description_supplementMetadata_province,description_sampleType,description_supplementMetadata_platformType,description_supplementMetadata_geologicalAge,description_supplementMetadata_locationDescription,description_supplementMetadata_purpose,...,description_supplementMetadata_primaryLocationName,description_supplementMetadata_geologicalUnit,description_supplementMetadata_localityDescription,description_supplementMetadata_originalArchive,description_supplementMetadata_platformDescr,description_collectionMethod,description_igsnPrefix,description_supplementMetadata_cruiseFieldPrgrm,description_supplementMetadata_publicationUrl_description,original_high_label
0,0,,ming:sulfateselenatetelluratemineral,United States,New Jersey,"a sample that is an individual unit, including...",,,Coordinates for Sterling Hill Mine (MRDS ID: W...,,...,Sterling Hill Mine,,Ogdensburg,,,,NHB,,Smithsonian collections record for NMNH C6294-...,mat:mineral
1,1,,ming:sulfateselenatetelluratemineral,United States,New Jersey,"a sample that is an individual unit, including...",,,"Matched to the GeoNames record for Franklin, S...",,...,Franklin Mine,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6295-...,mat:mineral
2,2,,ming:sulfateselenatetelluratemineral,United States,New Jersey,"a sample that is an individual unit, including...",,,"Matched to the GeoNames record for Franklin, S...",,...,Franklin Mine,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6295-...,mat:mineral
3,3,,ming:silicategermanatemineral,United States,New Jersey,"a sample that is an individual unit, including...",,,"Coordinates from GEOLocate for parse pattern ""...",,...,Franklin Mining District,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6296-...,mat:mineral
4,4,,ming:silicategermanatemineral,United States,New Jersey,"a sample that is an individual unit, including...",,,Coordinates for Sterling Hill Mine (MRDS ID: W...,,...,Sterling Hill Mine,,Ogdensburg,,,,NHB,,Smithsonian collections record for NMNH C6299-...,mat:mineral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987548,995644,,rksd:Metamorphic Rock,,,"a sample that is an individual unit, including...",,,,,...,"Mitchell Peak, West Fosdick Mountains, Antarctica",,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,MBL,,,mat:rock
987549,995645,,rksd:Metamorphic Rock,,,"a sample that is an individual unit, including...",,,,,...,"Mitchell Peak, West Fosdick Mountains, Antarctica",,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,MBL,,,mat:rock
987550,995646,,rksd:Metamorphic Rock,,,"a sample that is an individual unit, including...",,,,,...,Ben Burton Park,,,"Department of Geology, Bryn Mawr College, Bryn...",,Manual,UGA,,,mat:rock
987551,995647,,rksd:Metamorphic Rock,,,"a sample that is an individual unit, including...",,,,,...,"Salamander Range, north Victoria Land, Antarctica",,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,LTR,,,mat:rock


In [20]:
gold_labels = set(gold["description_material"].values.tolist())
print(gold_labels) # extension type 

{'rksd:Generic Mudstone/rksd:Coal', 'rksd:Generic Sandstone/rksd:Generic Mudstone', 'rksd:Tephritoid', 'mat:rockorsediment', 'mat:particulate', 'rksd:Carbonate Sedimentary Rock/rksd:Generic Mudstone', 'mat:mineral', 'rksd:Pyroclastic Rock/rksd:Basalt', 'rksd:Non Clastic Siliceous Sediment', 'rksd:Cataclasite Series', 'rksd:Phonolitoid', 'rksd:Generic Sandstone/rksd:Tuffite', 'rksd:Massive Sulphide', 'rksd:Rhyolitoid/rksd:Glass Rich Igneous Rock', 'rksd:Metasomatic Rock', 'rksd:Diamictite', 'mat:anthropogenicmetal', 'rksd:Fragmental Igneous Rock/rksd:Fine Grained Igneous Rock', 'rksd:Carbonate Sedimentary Rock', 'rksd:Generic Mudstone/rksd:Biogenic Sediment', 'rksd:Exotic Composition Igneous Rock', 'rksd:Basalt/rksd:Chemical Sedimentary Material', 'rksd:Anorthositic Rock', 'rksd:Tephritoid/rksd:Pyroclastic Rock', 'rksd:Generic Conglomerate', 'rksd:Basalt', 'mat:liquidwater', 'rksd:Impact Generated Material', 'rksd:High Magnesium Fine Grained Igneous Rock', 'rksd:Fragmental Igneous Rock'

In [21]:
# store original column
gold = gold.assign(original_label=gold.description_material)

In [22]:
# prefixes to remove 
prefixes = ["rksd:","mat:","ming:","ocmat:"]

def cleanup(value):
    # remove prefix and space out 
    splitted = value.split("/") # multiple labels can exist 
    cleanedup=[]
    for split in splitted:
        for prefix in prefixes:
            if split.startswith(prefix):
                transformed = split[len(prefix):].lower()
                if transformed in spaced_map:
                    # map to spaced label
                    transformed = spaced_map[transformed]
                cleanedup.append(transformed)
    if len(cleanedup) == 0:
        # no label starting with prefix 
        cleanedup = [split.lower() for split in splitted]
    return "/".join(cleanedup) # add back the parents
    
        
# strip off prefix and space out 
gold["description_material"]=gold["description_material"].apply(cleanup)

In [23]:
gold_labels = set(gold["description_material"].values.tolist())
print(gold_labels)

{'dioritoid', 'coal', 'hornblendite', 'generic sandstone', 'fragmental igneous rock', 'metasomatic rock/ultramafic igneous rock', 'metamorphic rock/fine grained igneous rock', 'acidic igneous rock', 'mineral', 'diamicton', 'pyroclastic rock/basalt', 'andesite/diamicton', 'anorthositic rock', 'tuffite', 'dacite', 'mud size sediment', 'fault related material', 'tonalite', 'basalt/chemical sedimentary material', 'fragmental igneous rock/doleritic rock', 'Rock/sediment', 'metamorphic rock/dioritoid', 'diamicton/dacite', 'residual material', 'Biogenic non-organic material', 'basalt', 'material', 'Ceramic clay', 'sediment', 'quartz rich igneous rock', 'pyroclastic rock/rhyolitoid', 'granite', 'Anthropogenic material', 'granodiorite', 'rhyolitoid/glass rich igneous rock', 'basic igneous rock', 'gabbroid', 'clastic sediment', 'generic conglomerate', 'igneous rock', 'ultramafic igneous rock', 'particulate', 'Mineral-Oxide', 'Mineral-Sulfide or Sulfosalt', 'generic mudstone/biogenic sediment', '

In [24]:
# add parent labels for each possible label
def expand(value):
    splitted = value.split("/")
    expanded = set(splitted)
    
    for split in splitted:
        split = split.lower()
        for p in parents[split]:
            if p in spaced_map: # add spaced out version
                expanded.add(spaced_map[p])
            else:
                expanded.add(p)
    expanded = set([x for x in expanded if x is not None])
                 
    return "/".join(list(expanded))
    

gold["description_material"]=gold["description_material"].apply(expand)

In [25]:
gold_labels = set(gold["description_material"].values.tolist())
print(gold_labels)

{'material/granitoid/phaneritic igneous rock/rock or sediment/igneous rock/natural solid material/rock/granodiorite', 'material/rock or sediment/igneous rock/natural solid material/glass rich igneous rock/rock', 'material/rock or sediment/igneous rock/natural solid material/rock/fine grained igneous rock/dacite', 'material/sediment/rock or sediment/generic mudstone/natural solid material/biogenic sediment/sedimentary rock/rock', 'material/fluid material/gaseous material', 'Anthropogenic metal material/material/any anthropogenic material', 'material/clastic sedimentary rock/rock or sediment/natural solid material/sedimentary rock/rock', 'Liquid water/material/fluid material', 'material/phaneritic igneous rock/rock or sediment/igneous rock/natural solid material/rock', 'material/rock or sediment/natural solid material/residual material/rock', 'material/rock or sediment/massive sulphide/natural solid material/rock', 'material/natural solid material/Mineral-Phosphate, Arsenate, or Vanadate

In [26]:
# replace multilabels and remove duplicates for parent labels that are not handled
duplicates = {
    "rock or sediment" : "rock/sediment",
    "mixed soil sediment or rock": "soil/sediment/rock"
}
def remove_parent_duplicates(value):
    splitted = value.split("/")
    unique = set([x.lower() for x in splitted])
    
    for label in splitted:
        if label in duplicates:
            unique = list(unique)
            unique.remove(label)
            unique.extend(duplicates[label].split("/"))
            unique = set(unique)
    
    unique = set([x for x in unique if x is not None and x  not in duplicates])
                 
    return "/".join(list(unique))
    


gold["description_material"]=gold["description_material"].apply(remove_parent_duplicates)

In [27]:
gold_labels = set(gold["description_material"].values.tolist())
print(gold_labels)

{'material/natural solid material/mineral/mineral-oxide', 'material/fluid material/gaseous material', 'material/sediment/non clastic siliceous sedimentary rock/natural solid material/sedimentary rock/rock', 'material/granitoid/phaneritic igneous rock/sediment/granodiorite/igneous rock/natural solid material/rock', 'material/sediment/igneous rock/natural solid material/ultramafic igneous rock/rock', 'material/granitoid/phaneritic igneous rock/sediment/igneous rock/natural solid material/alkali feldspar granite/rock', 'biogenic non-organic material/material', 'material/sediment/massive sulphide/natural solid material/rock', 'material/sediment/fault related material/natural solid material/cataclasite series/rock', 'material/clastic sedimentary rock/sediment/coal/natural solid material/organic rich sedimentary rock/sedimentary rock/rock', 'material/phaneritic igneous rock/sediment/igneous rock/natural solid material/foid syenitoid/rock', 'material/sediment/igneous rock/natural solid materi

In [28]:
gold

Unnamed: 0.1,Unnamed: 0,description_supplementMetadata_locality,description_material,description_supplementMetadata_country,description_supplementMetadata_province,description_sampleType,description_supplementMetadata_platformType,description_supplementMetadata_geologicalAge,description_supplementMetadata_locationDescription,description_supplementMetadata_purpose,...,description_supplementMetadata_geologicalUnit,description_supplementMetadata_localityDescription,description_supplementMetadata_originalArchive,description_supplementMetadata_platformDescr,description_collectionMethod,description_igsnPrefix,description_supplementMetadata_cruiseFieldPrgrm,description_supplementMetadata_publicationUrl_description,original_high_label,original_label
0,0,,material/natural solid material/mineral-sulfat...,United States,New Jersey,"a sample that is an individual unit, including...",,,Coordinates for Sterling Hill Mine (MRDS ID: W...,,...,,Ogdensburg,,,,NHB,,Smithsonian collections record for NMNH C6294-...,mat:mineral,ming:sulfateselenatetelluratemineral
1,1,,material/natural solid material/mineral-sulfat...,United States,New Jersey,"a sample that is an individual unit, including...",,,"Matched to the GeoNames record for Franklin, S...",,...,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6295-...,mat:mineral,ming:sulfateselenatetelluratemineral
2,2,,material/natural solid material/mineral-sulfat...,United States,New Jersey,"a sample that is an individual unit, including...",,,"Matched to the GeoNames record for Franklin, S...",,...,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6295-...,mat:mineral,ming:sulfateselenatetelluratemineral
3,3,,material/natural solid material/mineral/minera...,United States,New Jersey,"a sample that is an individual unit, including...",,,"Coordinates from GEOLocate for parse pattern ""...",,...,,Franklin,,,,NHB,,Smithsonian collections record for NMNH C6296-...,mat:mineral,ming:silicategermanatemineral
4,4,,material/natural solid material/mineral/minera...,United States,New Jersey,"a sample that is an individual unit, including...",,,Coordinates for Sterling Hill Mine (MRDS ID: W...,,...,,Ogdensburg,,,,NHB,,Smithsonian collections record for NMNH C6299-...,mat:mineral,ming:silicategermanatemineral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987548,995644,,material/sediment/natural solid material/metam...,,,"a sample that is an individual unit, including...",,,,,...,,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,MBL,,,mat:rock,rksd:Metamorphic Rock
987549,995645,,material/sediment/natural solid material/metam...,,,"a sample that is an individual unit, including...",,,,,...,,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,MBL,,,mat:rock,rksd:Metamorphic Rock
987550,995646,,material/sediment/natural solid material/metam...,,,"a sample that is an individual unit, including...",,,,,...,,,"Department of Geology, Bryn Mawr College, Bryn...",,Manual,UGA,,,mat:rock,rksd:Metamorphic Rock
987551,995647,,material/sediment/natural solid material/metam...,,,"a sample that is an individual unit, including...",,,,,...,,,"Dept of Geology, Colorado College, Colorado Sp...",,Manual,LTR,,,mat:rock,rksd:Metamorphic Rock


In [29]:
# store the map of multi-label mapping
mapping = {}
for _, row in gold.iterrows():
    mapped = row['description_material']
    original = row['original_label']
    if original == "material":
        mapped = "material"
    else:
        mapped = "/".join([x for x in mapped.split("/") if x != "material"])
    
    mapping[original] = mapped

In [30]:
mapping

{'ming:sulfateselenatetelluratemineral': 'natural solid material/mineral-sulfate, selenate, or tellurate/mineral',
 'ming:silicategermanatemineral': 'natural solid material/mineral/mineral-silicate or germanate',
 'ming:nativeelementmineral': 'natural solid material/mineral/mineral-native element',
 'ming:oxidemineral': 'natural solid material/mineral/mineral-oxide',
 'mat:soil': 'natural solid material/soil',
 'ming:carbonatenitratemineral': 'natural solid material/mineral-carbonate or nitrate/mineral',
 'rksd:Sedimentary Rock': 'sediment/natural solid material/sedimentary rock/rock',
 'ming:halidemineral': 'natural solid material/mineral-halide/mineral',
 'ming:phosphatearsenatevanadatemineral': 'natural solid material/mineral/mineral-phosphate, arsenate, or vanadate',
 'ming:sulfidesulfosaltmineral': 'natural solid material/mineral/mineral-sulfide or sulfosalt',
 'rksd:Generic Mudstone': 'sediment/generic mudstone/natural solid material/sedimentary rock/rock',
 'ming:boratemineral':

In [31]:
for key, value in mapping.items():
    if value == "":
        print(key)

mat:material


In [32]:
import json

with open('./datasets/multilabel_mapping.json', 'w') as f:
    json.dump(mapping, f)

In [10]:
import json
with open('./datasets/multilabel_mapping.json') as f:
    multilabel_mapping = json.load(f)
multilabel_mapping

{'ming:sulfateselenatetelluratemineral': 'natural solid material/mineral-sulfate, selenate, or tellurate/mineral',
 'ming:silicategermanatemineral': 'natural solid material/mineral/mineral-silicate or germanate',
 'ming:nativeelementmineral': 'natural solid material/mineral/mineral-native element',
 'ming:oxidemineral': 'natural solid material/mineral/mineral-oxide',
 'mat:soil': 'natural solid material/soil',
 'ming:carbonatenitratemineral': 'natural solid material/mineral-carbonate or nitrate/mineral',
 'rksd:Sedimentary Rock': 'sediment/natural solid material/sedimentary rock/rock',
 'ming:halidemineral': 'natural solid material/mineral-halide/mineral',
 'ming:phosphatearsenatevanadatemineral': 'natural solid material/mineral/mineral-phosphate, arsenate, or vanadate',
 'ming:sulfidesulfosaltmineral': 'natural solid material/mineral/mineral-sulfide or sulfosalt',
 'rksd:Generic Mudstone': 'sediment/generic mudstone/natural solid material/sedimentary rock/rock',
 'ming:boratemineral':

In [11]:
# get the version that excludes the leaf labels
mapping_wo_leaf = {}
for key, value in multilabel_mapping.items():
    splitted = value.split("/")
    
    parents_wo_leaf = []
    for split in splitted:
        if split not in leaf_labels:
            parents_wo_leaf.append(split)
    mapping_wo_leaf[key] = "/".join(parents_wo_leaf)
    

In [12]:
mapping_wo_leaf

{'ming:sulfateselenatetelluratemineral': 'natural solid material/mineral',
 'ming:silicategermanatemineral': 'natural solid material/mineral',
 'ming:nativeelementmineral': 'natural solid material/mineral',
 'ming:oxidemineral': 'natural solid material/mineral',
 'mat:soil': 'natural solid material',
 'ming:carbonatenitratemineral': 'natural solid material/mineral',
 'rksd:Sedimentary Rock': 'sediment/natural solid material/sedimentary rock/rock',
 'ming:halidemineral': 'natural solid material/mineral',
 'ming:phosphatearsenatevanadatemineral': 'natural solid material/mineral',
 'ming:sulfidesulfosaltmineral': 'natural solid material/mineral',
 'rksd:Generic Mudstone': 'sediment/natural solid material/sedimentary rock/rock',
 'ming:boratemineral': 'natural solid material/mineral',
 'mat:organicmaterial': 'organic material',
 'rksd:Generic Sandstone': 'sediment/natural solid material/sedimentary rock/rock',
 'mat:liquidwater': 'fluid material',
 'mat:mineral': 'natural solid material/mi

In [13]:
with open('./datasets/multilabel_mapping_wo_leaf.json', 'w') as f:
    json.dump(mapping_wo_leaf, f)

In [270]:
# store converted dataset 