# Create hierarchy
Creates a hierarchy from the Wikidata entries using their parent attribute.
The resulting tree is filtered and saved as JSON for visualization.
It is also saved back to CSV so that the reduced set of keywords can be used for clustering.

Uses the CSV `keyword_wikidata_small.csv` from the reducing notebook. Could also be used with all keywords.
Expects the CSV `keyword_wikidata_small.csv` to have the columns keyword, wikidata and parent0.

In [1]:
#config

# Directory that holds the input csv files; every result file of this notebook
# (json and csv) is written to the same directory.
data_path = '../../../data'

In [2]:
import os
import pandas as pd
import copy
import sys
import requests
import json

In [3]:
# load csv file
# The 'wikidata' column becomes the index; the index value is used as node id
# when the rows are converted to nodes.

keyword_wiki_path = os.path.join(data_path, 'keyword_wikidata_small.csv')
df_keywords_wiki = pd.read_csv(keyword_wiki_path, index_col='wikidata')

In [4]:
# remove numerical keywords
# na=False makes a missing keyword count as "not numeric". Without it the mask
# contains NaN for such rows and cannot be inverted with ~.

numeric_keyword = df_keywords_wiki.keyword.str.fullmatch('[0-9]+', na=False)
df_keywords_wiki = df_keywords_wiki[~numeric_keyword]

In [5]:
# number of non-null values per column (parent0 can be missing for a keyword)
df_keywords_wiki.count()

keyword    60529
parent0    55706
dtype: int64

In [6]:
# get parents to list
def getParents(s):
    """Split the serialized parent0 value into a list of parent ids.

    The first two and the last two characters of ``s`` are dropped and the
    remainder is split on the separator '", "'.
    Example: '["Q1", "Q2"]' -> ['Q1', 'Q2'].
    """
    inner = s[2:-2]
    return inner.split('", "')

In [7]:
# convert wikidata entries to nodes

# Parent links that are dropped to remove cycles, keyed by the id of the child node.
cycle_edges = {
    'Q12136': ['Q179630', 'Q814207'],
    'Q954007': ['Q954007'],
    'Q35535': ['Q35535'],
    'Q832237': ['Q20113959'],
}

nodes = []

for index, row in df_keywords_wiki.iterrows():
    node = {'id': index, 'name': row['keyword'], 'children': []}

    if not pd.isnull(row['parent0']):
        parent = getParents(row['parent0'])

        # remove cycles
        # Done only when this row has parents, so a parent list left over from
        # an earlier row is never modified by accident. A listed edge that is
        # absent from the data is skipped instead of raising.
        for cyclic_parent in cycle_edges.get(index, ()):
            if cyclic_parent in parent:
                parent.remove(cyclic_parent)

        node['parent'] = parent

    nodes.append(node)

In [8]:
# add nodes that are not in the data

def addNode(id, name):
    """Append a node without parents and without children to the global ``nodes`` list."""
    new_node = dict(id=id, name=name, children=[])
    nodes.append(new_node)
    

# nodes that serve as parents but are not contained in the csv data
for extra_id, extra_name in (
    ('Q17339814', 'chemical substances'),
    ('Q47154513', 'structural class of chemical compounds'),
    ('Q214609', 'base material'),
):
    addNode(extra_id, extra_name)

In [9]:
# build tree from the nodes, creates multiple trees

def build_tree(nodes):
    """Link the nodes through their 'parent' lists and return the root nodes.

    Each node is a dict with 'id', 'children' and optionally 'parent' (a list
    of parent ids). A node is appended to the 'children' of every listed
    parent that exists in ``nodes``. Parent ids that are not present are
    ignored.

    Returns the nodes that were not attached to any parent. The passed list is
    deep-copied first, so the caller's nodes are not modified.
    """
    nodes = copy.deepcopy(nodes)

    node_dict = {node['id']: node for node in nodes}

    # every node starts as a root and is dropped once it is attached to a parent
    roots = node_dict.copy()

    for node in nodes:
        for parent in node.get('parent', ()):
            if parent == node['id']:
                # A node that lists itself as parent would become its own
                # child, which breaks every recursive walk over the tree.
                continue
            if parent in node_dict:
                roots.pop(node['id'], None)
                node_dict[parent]['children'].append(node)

    return list(roots.values())


# build_tree returns only the root nodes, i.e. nodes that were not attached to any parent
trees = build_tree(nodes)

In [10]:
# keep only the trees whose root has at least one child

trees_with_children = [tree for tree in trees if len(tree['children']) > 0]

In [11]:
# number of trees that have at least one child
len(trees_with_children)

1008

In [12]:
def tree_size(node):
    """Return the number of nodes in the subtree rooted at ``node``, the node itself included.

    A node without a 'children' key counts as a single node.
    """
    if 'children' not in node:
        return 1
    return 1 + sum(tree_size(child) for child in node['children'])

In [13]:
# check for cycles in a combined tree

def find_cycle(node):
    """Print every cycle that is reachable from the children of ``node``.

    ``node`` itself is only read for its 'children', so it needs no 'id'.
    Nothing is returned; cycles are reported with print().
    """
    discovered = set()
    finished = set()

    for child in node['children']:
        if child['id'] not in discovered and child['id'] not in finished:
            discovered, finished = dfs_visit(child, discovered, finished)


def dfs_visit(node, discovered, finished):
    """Depth-first visit of ``node`` that reports back edges.

    ``discovered`` holds the ids on the current path and ``finished`` the ids
    whose subtree was processed completely. Both sets are updated in place and
    returned.
    """
    discovered.add(node['id'])

    for child in node['children']:
        # Detect cycles
        if child['id'] in discovered:
            print(f"Cycle detected: found a back edge from {child['name']}, {child['id']} to {node['name']},{node['id']}.")

            # TODO: remove the edge that closes the cycle
            # Do not follow the back edge: the child is still on the current
            # path, so descending into it again would never terminate.
            continue

        # Recurse into DFS tree
        if child['id'] not in finished:
            dfs_visit(child, discovered, finished)

    discovered.discard(node['id'])
    finished.add(node['id'])

    return discovered, finished


# run the check on a deep copy that is wrapped in an artificial root
dfs_parents = copy.deepcopy(trees_with_children)
# find_cycle only reads the 'children' of the root, so the root carries no 'id'
dfs_root = {'name': 'root', 'keyword': 'root', 'children': dfs_parents}
find_cycle(dfs_root)


In [14]:
# number of nodes over all trees (one (name, size) pair per tree)

sizes = [(tree['name'], tree_size(tree)) for tree in trees_with_children]
sum(size for _, size in sizes)

44014

In [15]:
# use only larger trees (more than 20 nodes)
large_nodes = [tree for tree in trees_with_children if tree_size(tree) > 20]
# number of trees
len(large_nodes)

96

In [16]:
# number of nodes when only using larger trees
sum(map(tree_size, large_nodes))

40368

In [17]:
# create one tree from all trees by hanging them below an artificial root node
root = dict(name='root', id='root', children=large_nodes)

In [18]:
# filter a deep copy so that root and large_nodes keep the unfiltered trees
root_to_filter = copy.deepcopy(root)

In [19]:
# functions to filter the tree

def remove_subtree(node, id):
    """Remove every subtree whose root has the given id, anywhere below ``node``.

    ``node`` itself is never removed. The 'children' lists are modified in
    place.
    """
    # Filter into a new list instead of calling remove() inside the loop:
    # removing while iterating skips the sibling that follows the removed
    # child, so that sibling would be neither checked nor searched.
    kept = [child for child in node['children'] if child['id'] != id]
    node['children'][:] = kept
    for child in kept:
        remove_subtree(child, id)
            
def remove_if_not_in(node, id, parent):
    """Remove the children with the given id from every node except ``parent``.

    The search stops at nodes that carry ``id`` themselves, so their subtrees
    are left untouched. Children of the node with id ``parent`` are kept. The
    'children' lists are modified in place.
    """
    if node['id'] == id:
        return
    if node['id'] != parent:
        # Filter in one step; calling remove() while iterating would skip the
        # element that follows a removed child.
        node['children'][:] = [child for child in node['children'] if child['id'] != id]
    for child in node['children']:
        remove_if_not_in(child, id, parent)

In [20]:
# remove subtrees that seem not necessary for a chemical context
# Each entry is the id of a subtree root; the trailing comment gives its name.
# The '!' and '?' markers in some comments are annotations of the original
# author (meaning not documented here).

to_remove = [
    'Q3305213', #painting
    'Q11424', #film
    'Q1348305', #erratum
    'Q134556', #single
    'Q93184', #drawing
    'Q871232', #editorial
    'Q125191', #photograph
    'Q41176', #building
    'Q4022', #river
    'Q860861', #sculpture
    'Q8502', #mountain
    'Q20026787', #!chemical component
    'Q4026292', #?action
    'Q24229398', #?agent
    'Q151885', #concept
    'Q71421787', #hydroxides
    'Q61788060', #human activity
    'Q386724', #work
    'Q16889133', #class
    'Q34749', #social science
    'Q21198', #computer science
    'Q7748', #law
    'Q7991', #natural science
    'Q6256', #country
    'Q23442', #island
    'Q34770', #language
    'Q267628', #column
    'Q107425', #landscape
    'Q1076968', #digital media
    'Q1186952', #interactive media
    'Q11024', #communication
    'Q11028', #information
    'Q16387', #information science
    'Q79782', #motion
    'Q131257', #intellectual property
    'Q170658', #creativity
    'Q2684591', #statement
    'Q28797', #applied science
    'Q362482', #operation
    'Q395', #mathematics
    'Q41217', #mechanics
    'Q702492', #urban area
    'Q8008', #earth sciences
    'Q9081', #knowledge
    'Q214339', #role
    'Q3769299', #human behaviour
    'Q11348', #function
]


# the subtrees are removed in place from the copied tree
for r in to_remove:
    remove_subtree(root_to_filter, r)


In [21]:
# remove subtrees that have multiple occurrences in the tree
# Each pair is (id of the subtree, id of the only parent below which it is kept).

remove_not_in = [
    ('Q8054', 'Q17339814'), #protein
    ('Q3249551', 'Q1190554'), #process
    ('Q1183543', 'Q39546'), #device
    ('Q39546', 'Q16798631'), #tool
    ('Q42889', 'Q11019'), #vehicle
    ('Q1799072', 'Q2695280'), #method
    ('Q46857', 'Q1799072'), #scientific method
    ('Q1379672', 'Q1799072'), #evaluation
    ('Q133500', 'Q2200417'), #learning
    ('Q156', 'Q174211'), #alcohol
    ('Q217602', 'Q46857'), #analysis
]

for subtree_id, kept_parent_id in remove_not_in:
    remove_if_not_in(root_to_filter, subtree_id, kept_parent_id)

In [22]:
# tree size after the two filter steps above (the artificial root is counted as well)

tree_size(root_to_filter)

16299

In [23]:
# load entries to remove
# Each line of the file is split on '/': the last part is the node id, the
# parts before it are the names on the path down to that node.

remove_file = os.path.join(data_path, 'remove.txt')

# node id -> list of name paths under which the node is removed
remove_file_nodes = {}
# defined up front so that the check in the next cell also works without a file
content = []

if os.path.isfile(remove_file):
    with open(remove_file) as f:
        # blank lines carry no entry and are skipped
        content = [line.strip() for line in f if line.strip()]

    for entry in content:
        *name_path, node_id = entry.split('/')
        remove_file_nodes.setdefault(node_id, []).append(name_path)

else:
    print('Remove file not found. Ignore')

In [24]:
# check if file entries are correct

def find_in_tree(tree, id, h, path):
    """Return True if a node with the given id is reached through the name path ``h``.

    The name of ``tree`` is appended to ``path`` (the passed list is modified);
    children are searched with a copy of the extended path. The global
    ``count`` is incremented for a successful lookup and the search stops at
    the first match.
    """
    global count
    path.append(tree['name'])
    if tree['id'] == id and path == h:
        count += 1
        return True
    return any(find_in_tree(child, id, h, path.copy()) for child in tree['children'])

# number of file entries that were found in the tree (incremented by find_in_tree)
count = 0

for k,v in remove_file_nodes.items():
    for h in v:
        if not find_in_tree(root_to_filter, k, h, []):
            print('Not found: {} {}'.format(k, h))

# Every line of the remove file produced exactly one (id, path) entry, so the
# number of entries is taken from remove_file_nodes. This also works when the
# file does not exist and no lines were read.
expected_count = sum(len(paths) for paths in remove_file_nodes.values())
if count == expected_count:
    print('All nodes found')

All nodes found


In [25]:
def remove_from_file(tree, id, h, path):
    """Remove the first node with the given id whose parent is reached through ``h[:-1]``.

    ``h`` is the name path down to the node to remove, so ``h[:-1]`` is the
    path to its parent. The name of ``tree`` is appended to ``path`` (the
    passed list is modified); children are searched with a copy. Returns True
    as soon as one node was removed, otherwise False.
    """
    path.append(tree['name'])
    parent_path = h[:-1]
    for child in tree['children']:
        if child['id'] == id and path == parent_path:
            tree['children'].remove(child)
            return True
        removed_below = remove_from_file(child, id, h, path.copy())
        if removed_below:
            return True
    return False

# apply every (id, name path) entry of the remove file to the tree
for node_id, name_paths in remove_file_nodes.items():
    for name_path in name_paths:
        remove_from_file(root_to_filter, node_id, name_path, [])

In [26]:
# final size after applying the entries of the remove file
tree_size(root_to_filter)

13721

In [27]:
# remove parents from the visualization tree for a smaller file size
# (done on a deep copy, root_to_filter keeps its 'parent' lists for the csv export)

vis_tree = copy.deepcopy(root_to_filter)

def remove_parents(node):
    """Delete the 'parent' entry from ``node`` and from all of its descendants, in place."""
    if 'parent' in node:
        del node['parent']
    for descendant in node['children']:
        remove_parents(descendant)

# strips the 'parent' entries from vis_tree in place
remove_parents(vis_tree)

In [28]:
# output tree to json file (the tree without the 'parent' entries)

out_file = os.path.join(data_path, 'hierarchy_tree.json')

with open(out_file, 'w+') as json_handle:
    json.dump(vis_tree, json_handle)

In [29]:
# get set of tree nodes
# A set is used so that a node occurring at several places of the tree is stored only once.

node_set = set()

def traverse_tree_to_set(root):
    """Add an (id, name, parent0) tuple to the global ``node_set`` for every node below ``root``.

    The node with id 'root' is skipped. parent0 is the list of parent ids
    joined with ', ' and wrapped in double quotes, or '' for a node without a
    'parent' entry.
    NOTE(review): this format differs from the one getParents parses -
    confirm that the consumer of the exported csv expects it.
    """
    if root['id'] != 'root':
        parent0 = '"{}"'.format(', '.join(root['parent'])) if 'parent' in root else ''
        node_set.add((root['id'], root['name'], parent0))
    for descendant in root['children']:
        traverse_tree_to_set(descendant)
        
# uses root_to_filter (not vis_tree) because its nodes still carry the 'parent' lists
traverse_tree_to_set(root_to_filter)

In [30]:
# convert nodes to dataframe

# Collect all rows first and build the frame once. DataFrame.append copied the
# whole frame on every call and is no longer available in pandas >= 2.0.
result_rows = []

for node in node_set:
    record = {'keyword': node[1], 'wikidata': node[0]}
    # nodes without parents get no 'parent0' key, so the cell stays empty (NaN)
    if node[2] != '':
        record['parent0'] = node[2]
    result_rows.append(record)

results_keywords = pd.DataFrame(result_rows, columns=['keyword', 'wikidata', 'parent0'])

In [31]:
# save filtered keywords to csv (without the dataframe index)

keyword_result_path = os.path.join(data_path, 'keyword_result.csv')
results_keywords.to_csv(keyword_result_path, index = False)

## Further methods for analysis

In [32]:
# calculate sizes of all subtrees

# one row per visited node: its name, id and the size of the subtree below it
sub_tree_sizes = pd.DataFrame(columns=['keyword', 'id', 'size'])

def tree_size_to_df(node):
    """Return the size of the subtree rooted at ``node`` and record all subtree sizes.

    For ``node`` and every descendant one row (keyword, id, size) is added to
    the global ``sub_tree_sizes`` frame. Children are recorded before their
    parent. A node without a 'children' key counts as a single node.
    """
    global sub_tree_sizes
    # Rows are gathered in a list and added in one step. DataFrame.append per
    # node copied the frame each time and is gone in pandas >= 2.0.
    rows = []

    def collect(current):
        size = 1
        for child in current.get('children', ()):
            size += collect(child)
        rows.append({'keyword': current['name'], 'id': current['id'], 'size': size})
        return size

    total = collect(node)
    new_rows = pd.DataFrame(rows, columns=['keyword', 'id', 'size'])
    if sub_tree_sizes.empty:
        sub_tree_sizes = new_rows
    else:
        sub_tree_sizes = pd.concat([sub_tree_sizes, new_rows], ignore_index=True)
    return total

# fills sub_tree_sizes and returns the size of the whole tree
tree_size_to_df(root_to_filter)

13721

In [33]:
# check for larger subtrees that occur multiple times in the tree
# count = number of occurrences of a subtree, sum = summed size over all occurrences

grouped_sizes = (
    sub_tree_sizes
    .groupby(['id', 'keyword'])
    .agg(['count', 'sum'])
    .sort_values([('size', 'count')], ascending=False)
)

occurrence_count = grouped_sizes[('size', 'count')]
summed_size = grouped_sizes[('size', 'sum')]
grouped_sizes[(occurrence_count > 1) & (summed_size > 50)]

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
id,keyword,Unnamed: 2_level_2,Unnamed: 3_level_2
Q80294,cellulose,23,69
Q41534,starch,14,70
Q173600,disaccharide,12,72
Q407553,glycosaminoglycan,9,90
Q320607,oligosaccharide,8,80
Q134219,polysaccharides,8,512
Q2250497,unsaturated fatty acids,6,60
Q416796,glucan,5,90
Q8066,amino acid,5,95
Q61476,fatty acid,5,90


# Update Categories
Merge categories with the feedback from the expert team.

In [34]:
def addNodeAndMove(name, tree, children):
    """Append a new node called ``name`` to ``tree`` and move the listed children below it.

    ``children`` holds ids of direct children of ``tree``. An id that is not
    found is reported with print() and skipped. The id of the new node is
    ``name`` with spaces replaced by underscores.
    """
    group = {'id': name.replace(' ', '_'), 'name': name, 'children': []}
    for wanted_id in children:
        match = next((c for c in tree['children'] if wanted_id == c['id']), None)
        if match is None:
            print('{} not found'.format(wanted_id))
            continue
        group['children'].append(match)
        tree['children'].remove(match)
    tree['children'].append(group)
    
def moveNode(tree, moveId, toId):
    """Move the direct child ``moveId`` of ``tree`` below the direct child ``toId``.

    Both ids are looked up among the direct children of ``tree`` only, the
    target first. If one of them is missing, its id is reported with print()
    and nothing is changed.
    """
    siblings = tree['children']

    target = next((c for c in siblings if c['id'] == toId), None)
    if target is None:
        print('{} not found'.format(toId))
        return

    moving = next((c for c in siblings if c['id'] == moveId), None)
    if moving is None:
        print('{} not found'.format(moveId))
        return

    target['children'].append(moving)
    siblings.remove(moving)
    
def renameNode(tree, nodeId, name):
    """Set the name of the first direct child of ``tree`` with id ``nodeId``.

    A missing id is reported with print().
    """
    for candidate in tree['children']:
        if candidate['id'] == nodeId:
            candidate['name'] = name
            return
    print('{} not found'.format(nodeId))
    

In [35]:
# restructure a deep copy of the filtered tree
root_new_categories = copy.deepcopy(root_to_filter)

# (id of the target node, ids of the nodes moved below it), applied in this order
moves_before_rename = [
    ('Q12136', ['Q389735']),
    ('Q420', [
        'Q2996394',
        'Q30336093',
        'Q7239',
        'Q417841',
        'Q40397',
        'Q40867',
        'Q66589580',
        'Q7187',
        'Q420168',
        'Q189118',
    ]),
    ('Q35825432', ['Q407479', 'Q96286', 'Q407355']),
]

moves_after_rename = [
    ('Q17339814', ['Q19829510', 'Q350176']),
    ('Q937228', ['Q4373292']),
    ('Q35758', ['Q11435', 'Q11438', 'Q1621273']),
]

# (name of the new category node, ids of the top-level nodes collected below it)
new_categories = [
    ('agriculture', ['Q2746959', 'Q28969364', 'Q864939']),
    ('medicine', ['Q409205', 'Q181394', 'Q12136', 'Q179289', 'Q1149305']),
    ('chemistry', ['Q2281940', 'Q11369', 'Q47154513', 'Q17339814', 'Q169336', 'Q6671777']),
    ('material science', ['Q214609']),
]

for target_id, moved_ids in moves_before_rename:
    for moved_id in moved_ids:
        moveNode(root_new_categories, moved_id, target_id)

renameNode(root_new_categories, 'Q35825432', 'catalysis')

for target_id, moved_ids in moves_after_rename:
    for moved_id in moved_ids:
        moveNode(root_new_categories, moved_id, target_id)

for category_name, member_ids in new_categories:
    addNodeAndMove(category_name, root_new_categories, member_ids)


In [36]:
# final categories: the names of the top-level nodes after restructuring
[child['name'] for child in root_new_categories['children']]

['occurrence',
 'catalysis',
 'representation',
 'physical chemistry',
 'energy',
 'nanotechnology',
 'matter',
 'biology',
 'light',
 'separation process',
 'property',
 'experiment',
 'agriculture',
 'medicine',
 'chemistry',
 'material science']

# Create keywords - class table

In [37]:
# load all keywords
# The 'name' column becomes the index; it is compared with the categorised keywords below.

keywords_path = os.path.join(data_path, 'keyword.csv')
df_keywords = pd.read_csv(keywords_path, index_col='name')

In [38]:
# id -> name of every top-level node; these nodes are the categories
categories = {
    top_level['id']: top_level['name']
    for top_level in root_new_categories['children']
}

In [39]:
def label_tree(node):
    """Label the subtree of every node whose id is a key of the global ``categories``.

    The walk continues below a labelled node as well, so a category node that
    is nested inside another category labels its own subtree again.
    """
    if node['id'] in categories:
        label_subtree(node, categories[node['id']])
    for child in node['children']:
        label_tree(child)


def label_subtree(node, category):
    """Append a keyword/category entry for ``node`` and all of its descendants to ``keyword_category_list``."""
    entry = {'keyword': node['name'], 'category': category}
    keyword_category_list.append(entry)
    for descendant in node['children']:
        label_subtree(descendant, category)

In [40]:
# filled by label_tree / label_subtree with one {'keyword', 'category'} dict per labelled node
keyword_category_list = []

label_tree(root_new_categories)

In [41]:
# one row per distinct keyword/category pair
keyword_category_df = pd.DataFrame(
    keyword_category_list,
    columns=['keyword', 'category'],
).drop_duplicates()

In [42]:
# every keyword that received no category is assigned to 'others'
has_category = df_keywords.index.isin(keyword_category_df.keyword)
other_keywords = df_keywords.index[~has_category].to_list()
others_df = pd.DataFrame({
    'keyword': other_keywords,
    'category': ['others' for _ in other_keywords],
})

In [43]:
# categorised keywords followed by the 'others' keywords, with a fresh index
# (pd.concat replaces DataFrame.append, which is no longer available in pandas >= 2.0)
keyword_category_result_df = pd.concat([keyword_category_df, others_df], ignore_index=True)

In [44]:
# number of keywords per category
keyword_category_result_df.groupby('category').count()

Unnamed: 0_level_0,keyword
category,Unnamed: 1_level_1
agriculture,110
biology,1507
catalysis,145
chemistry,3196
energy,30
experiment,21
light,18
material science,249
matter,247
medicine,1345


In [45]:
# save keyword-class table to csv (without the dataframe index)

keyword_category_result_path = os.path.join(data_path, 'keyword_categories.csv')
keyword_category_result_df.to_csv(keyword_category_result_path, index = False)

# export graph

In [46]:
# one graph node per category plus the 'others' category;
# ids are 'category_' + name with spaces replaced by underscores
category_names = list(categories.values()) + ['others']
category_list = [
    {
        ':ID': 'category_' + category_name.replace(' ', '_'),
        'name': category_name,
        ':LABEL': 'Category',
    }
    for category_name in category_names
]

In [47]:
category_node_df = pd.DataFrame(category_list, columns=[':ID', 'name', ':LABEL'])
# descending rank: the first category gets the highest value, the last one gets 1
category_total = len(category_node_df.index)
category_node_df['rank:int'] = range(category_total, 0, -1)
category_node_df

Unnamed: 0,:ID,name,:LABEL,rank:int
0,category_occurrence,occurrence,Category,17
1,category_catalysis,catalysis,Category,16
2,category_representation,representation,Category,15
3,category_physical_chemistry,physical chemistry,Category,14
4,category_energy,energy,Category,13
5,category_nanotechnology,nanotechnology,Category,12
6,category_matter,matter,Category,11
7,category_biology,biology,Category,10
8,category_light,light,Category,9
9,category_separation_process,separation process,Category,8


In [48]:
# Build the edge table from the keyword/category table: the first column
# (keyword) becomes :START_ID and the second column (category) becomes :END_ID.
category_edge_df = keyword_category_result_df.copy()
category_edge_df.columns = [':START_ID', ':END_ID']
category_edge_df[':TYPE'] = 'IS_CATEGORY'
# same 'category_' prefix as the :ID values of the category nodes
category_edge_df[':END_ID'] = 'category_' + category_edge_df[':END_ID']
# replaces spaces by underscores in all string cells, i.e. in the keywords as well as in the category ids
category_edge_df = category_edge_df.replace(' ', '_', regex=True)

In [49]:
# preview of the edge table
category_edge_df

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,occurrence,category_occurrence,IS_CATEGORY
1,automation,category_occurrence,IS_CATEGORY
2,process_automation,category_occurrence,IS_CATEGORY
3,field_trial,category_occurrence,IS_CATEGORY
4,prediction,category_occurrence,IS_CATEGORY
...,...,...,...
42873,zwitterions,category_others,IS_CATEGORY
42874,zygote,category_others,IS_CATEGORY
42875,zymogram,category_others,IS_CATEGORY
42876,zymography,category_others,IS_CATEGORY


In [50]:
# nodes of the graph; the ':ID' column is used below to filter the category edges
graph_nodes = pd.read_csv(os.path.join(data_path, 'nodes.csv'))

In [51]:
# keep only the edges whose start node exists in the graph
start_is_known = category_edge_df[':START_ID'].isin(graph_nodes[':ID'])
category_edge_df = category_edge_df[start_is_known]

In [52]:
# write the category nodes and the keyword -> category edges as csv (without the dataframe index)
category_node_path = os.path.join(data_path, 'category_nodes.csv')
category_node_df.to_csv(category_node_path, index = False)

category_edge_path = os.path.join(data_path, 'category_edges.csv')
category_edge_df.to_csv(category_edge_path, index = False)