In [1]:
import os
import sys

base_path = '/home/jovyan'
private_data_path = os.path.join(base_path, 'data/_private/chaoran')
number_entries = 7836565

# Add path to /src to sys.path
script_base_path = os.path.join(base_path, 'src')
if script_base_path not in sys.path:
  sys.path.append(script_base_path)

# Import own modules
from data import data_models

# Reload own modules (since they will be changing quite often)
import importlib
importlib.reload(data_models)

None

In [97]:
import pickle

with open(os.path.join(private_data_path, 'playground.pickle'), 'wb') as f:
    data = {
        'topic_counter': topic_counter,
        'subject_counter': subject_counter,
        'stats': {
            'has_t': has_t,
            'has_s': has_s,
            'has_full_text': has_full_text
        },
        'edge_dict': edge_dict,
        'G': G,
        'G2': G2
    }
    pickle.dump(data, f)

In [2]:
from collections import Counter
import re
from tqdm.notebook import tqdm

topic_counter = Counter()
subject_counter = Counter()

has_t = 0
has_s = 0
has_full_text = 0
reg = re.compile('[a-zA-Z0-9 ]*')

entries = data_models.basic_read_from_xz(os.path.join(base_path, 'data/11-basic/basics.json.xz'))

for i, entry in tqdm(enumerate(entries), total=number_entries): 
    ts = entry.topics
    ss = entry.subjects
    if ts is not None and len(ts) > 0:
        topic_counter.update([t.lower() for t in ts if reg.fullmatch(t) is not None])
        has_t += 1
    if ss is not None and len(ss) > 0:
        subject_counter.update([s.lower() for s in ss if reg.fullmatch(s) is not None])
        has_s += 1
    if entry.has_full_text:
        has_full_text += 1
print(i, has_t, has_s, has_full_text)

HBox(children=(FloatProgress(value=0.0, max=7836565.0), HTML(value='')))


7836564 6738168 7759934 1661261


In [108]:
print(len(topic_counter))
print('-----')

import csv
import io
output = io.StringIO()
writer = csv.writer(output)
pos = 200
for x in topic_counter.most_common()[pos:pos+50]:
    writer.writerow(x)
print(output.getvalue())
# Selected: 0, 200, 750, 3000, 10000, 25000 (+50)

1207489
-----
polymer,6164
raman spectroscopy,6141
metals,6110
liver,6068
research,6025
biodegradation,6006
stress,5994
carbon dioxide,5991
nitric oxide,5950
fish,5947
crystallization,5908
graphene,5892
nanotechnology,5890
identification,5872
monitoring,5858
sintering,5855
brazil,5849
composites,5833
materials science,5831
gold,5777
climate change,5739
silicon,5737
biological sciences,5724
model,5709
c1,5709
amino acids,5678
silica,5644
biomarkers,5643
enzymes,5639
remote sensing,5607
nickel,5601
mechanical engineering and machinery,5588
cyclic voltammetry,5583
aging,5567
escherichia coli,5567
recycling,5566
density functional theory,5561
cadmium,5520
diabetes,5485
wireless sensor networks,5477
photoluminescence,5467
management,5439
cellulose,5422
nanomaterials,5413
ionic liquids,5412
energy,5386
lipids,5384
purification,5338
mitochondria,5330
milk,5299



## Network

### Preparation

In [61]:
nodes = [x[0] for x in topic_counter.most_common()[20:10000] if x[0] != '']
node_set = set(nodes)

In [27]:
edge_dict = {}  # (node1:str, node2:str) -> weight where node1<node2; This contains ALL topics - without any filtering!

entries = data_models.basic_read_from_xz(os.path.join(base_path, 'data/11-basic/basics.json.xz'))
for entry in tqdm(entries, total=number_entries):
    if entry.topics is None:
        continue
    for i in range(0, len(entry.topics) - 1):
        for j in range(i, len(entry.topics)):
            t1 = entry.topics[i]
            t2 = entry.topics[j]
            if t2 < t1:
                tmp = t1
                t1 = t2
                t2 = tmp
            if (t1, t2) not in edge_dict:
                edge_dict[(t1, t2)] = 0
            edge_dict[(t1, t2)] += 1

HBox(children=(FloatProgress(value=0.0, max=7836565.0), HTML(value='')))




In [30]:
import pandas as pd

print('Number of edges: {:d}'.format(len(edge_dict)))

df = pd.DataFrame(list(edge_dict.items()), columns = ['edge_name', 'weight']) 
df.weight.describe()

Number of edges: 49259126


count    4.925913e+07
mean     7.314620e+00
std      1.022218e+03
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      1.772047e+06
Name: weight, dtype: float64

### The Actual And Filtered Network

In [98]:
import networkx as nx

G = nx.Graph()
for ((n1, n2), w) in edge_dict.items():
    if w < 3:
        continue
    if n1 == n2:
        continue
    if n1 not in node_set or n2 not in node_set:
        continue
    G.add_edge(n1, n2, weight=w)

In [101]:
# Write edge list
with open(os.path.join(private_data_path, 'edge_list2.csv'), 'wt') as f:
    f.write('Source,Target,Type,Weight\n')
    for n1, n2, data in G2.edges(data=True):
        w = data['weight']
        line = '{},{},undefined_relation,{}'.format(n1, n2, w)
        f.write(line + '\n')


In [78]:
import numpy as np

x = np.array([len(G[n]) for n in G.nodes()])
np.histogram(x, bins=np.arange(0, 1300, 100))

(array([7854,  470,  130,   35,   23,    9,    9,    6,    1,    1,    0,
           0]),
 array([   0,  100,  200,  300,  400,  500,  600,  700,  800,  900, 1000,
        1100, 1200]))

In [100]:
# Let's only keep the top k edges per node
k = 20
G2 = nx.Graph()
for n1 in G.nodes():
    neighbors = list(G[n1].items())
    top_neighbors = sorted(neighbors, key=lambda neigh: neigh[1]['weight'], reverse=True)[:k]
    for neigh in top_neighbors:
        n2 = neigh[0]
        w = neigh[1]['weight']
        if n2 < n1:
            continue
        G2.add_edge(n1, n2, weight=w)

In [91]:
print(len(G2.edges()))
x = np.array([len(G2[n]) for n in G2.nodes()])
np.histogram(x, bins=np.arange(0, 1300, 100))

62195


(array([8398,   42,    6,    2,    0,    0,    0,    0,    0,    0,    0,
           0]),
 array([   0,  100,  200,  300,  400,  500,  600,  700,  800,  900, 1000,
        1100, 1200]))