In [1]:
import os
import sys

base_path = '/home/jovyan'
private_data_path = os.path.join(base_path, 'data/_private/chaoran')
number_entries = 7836565

# Add path to /src to sys.path
script_base_path = os.path.join(base_path, 'src')
if script_base_path not in sys.path:
  sys.path.append(script_base_path)

# Import own modules
from data import data_models

# Reload own modules (since they will be changing quite often)
import importlib
importlib.reload(data_models)

None

In [143]:
import pickle

with open(os.path.join(private_data_path, 'playground.pickle'), 'wb') as f:
    data = {
        'topic_counter': topic_counter,
        'subject_counter': subject_counter,
        'stats': {
            'has_t': has_t,
            'has_s': has_s,
            'has_full_text': has_full_text
        },
        'edge_dict': edge_dict,
        'G': G,
        'G2': G2
    }
    pickle.dump(data, f)

In [2]:
from collections import Counter
import re
from tqdm.notebook import tqdm

topic_counter = Counter()
subject_counter = Counter()

has_t = 0
has_s = 0
has_full_text = 0
reg = re.compile('[a-zA-Z0-9 ]*')

entries = data_models.basic_read_from_xz(os.path.join(base_path, 'data/11-basic/basics.json.xz'))

for i, entry in tqdm(enumerate(entries), total=number_entries): 
    ts = entry.topics
    ss = entry.subjects
    if ts is not None and len(ts) > 0:
        topic_counter.update([t.lower() for t in ts if reg.fullmatch(t) is not None])
        has_t += 1
    if ss is not None and len(ss) > 0:
        subject_counter.update([s.lower() for s in ss if reg.fullmatch(s) is not None])
        has_s += 1
    if entry.has_full_text:
        has_full_text += 1
print(i, has_t, has_s, has_full_text)

HBox(children=(FloatProgress(value=0.0, max=7836565.0), HTML(value='')))


7836564 6738168 7759934 1661261


In [144]:
print(len(topic_counter))
print('-----')

import csv
import io
output = io.StringIO()
writer = csv.writer(output)
pos = 200
for x in topic_counter.most_common()[pos:pos+10]:
    writer.writerow(x)
print(output.getvalue())
# Selected: 0, 200, 750, 3000, 10000, 25000 (+50)

1207489
-----
polymer,6164
raman spectroscopy,6141
metals,6110
liver,6068
research,6025
biodegradation,6006
stress,5994
carbon dioxide,5991
nitric oxide,5950
fish,5947



## Network

### Preparation

In [149]:
def count_edges():
    edge_dict = {}  # (node1:str, node2:str) -> weight where node1<node2; This contains ALL topics - without any filtering!
    entries = data_models.basic_read_from_xz(os.path.join(base_path, 'data/11-basic/basics.json.xz'))
    for entry in tqdm(entries, total=number_entries):
        if entry.topics is None:
            continue
        for i in range(0, len(entry.topics) - 1):
            for j in range(i, len(entry.topics)):
                t1 = entry.topics[i]
                t2 = entry.topics[j]
                if t2 < t1:
                    tmp = t1
                    t1 = t2
                    t2 = tmp
                if (t1, t2) not in edge_dict:
                    edge_dict[(t1, t2)] = 0
                edge_dict[(t1, t2)] += 1
    return edge_dict

edge_dict = count_edges()

HBox(children=(FloatProgress(value=0.0, max=7836565.0), HTML(value='')))




In [30]:
import pandas as pd

print('Number of edges: {:d}'.format(len(edge_dict)))

df = pd.DataFrame(list(edge_dict.items()), columns = ['edge_name', 'weight']) 
df.weight.describe()

Number of edges: 49259126


count    4.925913e+07
mean     7.314620e+00
std      1.022218e+03
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      1.772047e+06
Name: weight, dtype: float64

### The Actual And Filtered Network

In [169]:
import networkx as nx

def create_network(node_dict, edge_dict, node_set):
    """
    :param node_dict: dict<node id, weight>
    :param edge_dict: dict<(node1 id, node2 id), weight>
    :param node_set: a set of nodes that should be included in the network; other nodes will be ignored.
    """
    G = nx.Graph()
    for ((n1, n2), w) in edge_dict.items():
        if w < 5:
            continue
        if n1 == n2:
            continue
        if n1 not in node_set or n2 not in node_set:
            continue
        G.add_edge(n1, n2, weight=w)
    nx.set_node_attributes(G, topic_counter, name='weight')
    
    # Compute a normalized edge weight
    normalized_weights = {}
    for e in G.edges(data=True):
        n1, n2, data = e
        w = data['weight']
        normalized_weight = (w / G.degree(n1, weight='weight')) * (w / G.degree(n2, weight='weight'))
        normalized_weights[(n1, n2)] = normalized_weight
    nx.set_edge_attributes(G, normalized_weights, name='normalized_weight')
    
    return G

def reduce_network(G, k):
    """
    Let's only keep the top k edges per node
    """
    G2 = nx.Graph()
    for n1 in G.nodes():
        G2.add_node(n1, weight=G.nodes[n1]['weight'])
        neighbors = list(G[n1].items())
        top_neighbors = sorted(neighbors, key=lambda neigh: neigh[1]['weight'], reverse=True)[:k]
        for neigh in top_neighbors:
            n2 = neigh[0]
            w = neigh[1]['weight']
            normalized_w = neigh[1]['normalized_weight']
            if n2 < n1:
                continue
            G2.add_edge(n1, n2, weight=w, normalized_weight=normalized_w)
    return G2

nodes = [x[0] for x in topic_counter.most_common()[20:50000] if x[0] != '']
node_set = set(nodes)
G = create_network(topic_counter, edge_dict, node_set)
G2 = reduce_network(G, 20)

In [175]:
def export_graph(G, path):
    """
    Writes the files node_list.csv and edge_list.csv.
    """
    # Write node list
    with open(os.path.join(path, 'node_list.csv'), 'wt') as f:
        f.write('Name,Weight\n')
        for n, data in G.nodes(data=True):
            w = data['weight']
            line = '{},{}'.format(n, w)
            f.write(line + '\n')
        
    # Write edge list
    with open(os.path.join(path, 'edge_list.csv'), 'wt') as f:
        f.write('Source,Target,Type,Weight,NormalizedWeight\n')
        for n1, n2, data in G.edges(data=True):
            line = '{},{},undefined_relation,{},{:.16f}'.format(n1, n2, data['weight'], data['normalized_weight'])
            f.write(line + '\n')

export_graph(G, os.path.join(private_data_path, 'size_50000/G'))
export_graph(G2, os.path.join(private_data_path, 'size_50000/G2'))

In [176]:
def _node_name_to_neo4j_id(name):
    return name.replace(' ', '_')

def export_to_neo4j(G, path):
    """
    Writes csv files that can be imported into neo4j. This file can be imported with LOAD
    """
    # Write node list
    with open(os.path.join(path, 'nodes.csv'), 'wt') as f:
        f.write('id,name,weight\n')
        for n, data in G.nodes(data=True):
            w = data['weight']
            line = '{},{},{}'.format(_node_name_to_neo4j_id(n), n, w)
            f.write(line + '\n')
            
    # Write edge list
    with open(os.path.join(path, 'edges.csv'), 'wt') as f:
        f.write('node1,node2,type,weight,normalizedWeight\n')
        for n1, n2, data in G.edges(data=True):
            line = '{},{},RELATED_TO,{},{:.16f}'.format(_node_name_to_neo4j_id(n1), _node_name_to_neo4j_id(n2),
                                                         data['weight'], data['normalized_weight'])
            f.write(line + '\n')
            
def export_to_neo4j1(G, path):
    """
    Writes csv files that can be imported into neo4j. This file can be imported with the import tool
    """
    # Write node list
    with open(os.path.join(path, 'nodes_header.csv'), 'wt') as f:
        f.write(':ID,name,weight:int,:LABEL\n')
    with open(os.path.join(path, 'nodes_data.csv'), 'wt') as f:
        for n, data in G.nodes(data=True):
            w = data['weight']
            line = '{},{},{},Topic'.format(_node_name_to_neo4j_id(n), n, w)
            f.write(line + '\n')
            
    # Write edge list
    with open(os.path.join(path, 'edges_header.csv'), 'wt') as f:
        f.write(':START_ID,:END_ID,:TYPE,weight:int,normalizedWeight:float\n')
    with open(os.path.join(path, 'edges_data.csv'), 'wt') as f:
        for n1, n2, data in G.edges(data=True):
            line = '{},{},RELATED_TO,{},{:.16f}'.format(_node_name_to_neo4j_id(n1), _node_name_to_neo4j_id(n2),
                                                         data['weight'], data['normalized_weight'])
            f.write(line + '\n')
            
export_to_neo4j1(G, os.path.join(private_data_path, 'size_50000/G_neo4j'))