In [4]:
x = 123

In [6]:
x

123

In [2]:
import os
import sys

# Root directory of the project on the Jupyter host; the other paths in this cell are derived from it.
# NOTE(review): absolute, user-specific path -- the notebook only runs unchanged on this machine.
base_path = '/home/jupyter-cchen'
# Directory used by this notebook for its own artefacts (pickles, CSV exports, embeddings).
private_data_path = os.path.join(base_path, 'data/_private/chaoran')

# Define source of entries
# `number_entries` is only used as `total=` for the tqdm progress bars, so it has to be adjusted by hand
# whenever `get_entries()` is switched to another source (presumably the alternatives below belong to
# other data sets -- TODO confirm).
# number_entries = 7836565
number_entries = 3444311
# number_entries = 9958
def get_entries():
    """
    Returns the entries delivered by `data_models.read_basics_from_database` for the connection
    settings hard-coded below. Every call starts a fresh read.
    """
    # Alternative source (kept for reference):
    # return data_models.basic_read_from_xz(os.path.join(base_path, 'data/11-basic/basics.json.xz'))
    # NOTE(review): the last two values look like database credentials committed to the notebook --
    # confirm and consider moving them out of the source.
    connection_args = ('localhost', '12210', 'icebreaker_network', 'postgres', 'postgres')
    return data_models.read_basics_from_database(*connection_args)

# Add path to /src to sys.path
# The membership check keeps sys.path free of duplicates when this cell is run more than once.
script_base_path = os.path.join(base_path, 'src')
if script_base_path not in sys.path:
  sys.path.append(script_base_path)

# Import own modules
# (only resolvable after the sys.path extension above)
from data import data_models

# Reload own modules (since they will be changing quite often)
# A plain `import` is a no-op once the module is loaded in the kernel; reload re-executes it.
import importlib
importlib.reload(data_models)


None

In [3]:
# Peek at the data: print the first entry only and stop iterating right away.
for e in get_entries():
    print(e)
    break

BasicDataEntry(icebreaker_id=1673331, doi=None, core_id='102549444', title='Concentration-Dependent Profiles for Describing the Scatter of Results of Interlaboratory Surveys', abstract='Summary: In an interlaboratory survey for the quantitative determination of a clinical chemical quantity, samples the same specimen are analysed in different laboratories. If the number of participating laboratories is sufficiently large, then the differences between the 50th percentile (median) and e. g. the 25th and 75th percentiles of the results give a very reliable impression of the ränge of interlaboratory scatter for the particular analytical technique. Results from a relatively large number of interlaboratory surveys, in which specimens containing different concen-trations of the analyte are investigated, can be handled in the same way. If the resulting differences between the ichosen percentiles are plotted against the median, and the corresponding two regression lines (upper and lower) are idr

In [None]:
import pickle

# Build the payload BEFORE opening the file: opening with 'wb' truncates an existing
# playground.pickle immediately, so a NameError on any of the variables below (e.g. because the
# cell that produces it has not been run in this kernel) would otherwise leave an empty file behind.
data = {
    'topic_counter': topic_counter,
    'subject_counter': subject_counter,
    'edge_dict': edge_dict,
    'G': G,
    'G2': G2,
    'nodes_label_to_numeric_id': nodes_label_to_numeric_id,
    'nodes_numeric_id_to_label': nodes_numeric_id_to_label
}
with open(os.path.join(private_data_path, 'playground.pickle'), 'wb') as f:
    pickle.dump(data, f)

In [7]:
import pickle

# Restore the dictionary stored in playground.pickle into `data`.
# NOTE: unpickling can execute arbitrary code -- only load files that this notebook wrote itself.
with open(os.path.join(private_data_path, 'playground.pickle'), 'rb') as f:
    data = pickle.load(f)
# Left-over unpacking code, currently disabled.
# NOTE(review): the 'stats' key is not among the keys written by the dump cell of this notebook --
# presumably it belongs to an older file layout; confirm before re-enabling.
#     topic_counter = data['topic_counter']
#     subject_counter = data['subject_counter']
#     has_t = data['stats']['has_t']
#     has_s = data['stats']['has_s']
#     has_full_text = data['stats']['has_full_text']

In [10]:
# Take the graph stored under 'G' out of the loaded pickle (the dump cell also stores a second graph under 'G2').
G = data['G']

In [None]:
from collections import Counter
import re
from tqdm.notebook import tqdm

def count_topics_and_subjects(entries):
    """
    Counts how often each topic and each subject occurs over all entries and prints a short summary.

    Only labels that consist solely of ASCII letters, digits and spaces are counted (the empty string
    passes this filter as well); labels are lower-cased before counting. An entry is included in the
    "has topics" / "has subjects" figures as soon as its list is non-empty, even if no label passes
    the filter.

    :param entries: iterable of entries providing the attributes `topics`, `subjects` and `has_full_text`
    :return: tuple (topic_counter, subject_counter) of collections.Counter
    """
    topic_counter = Counter()
    subject_counter = Counter()

    # Counted explicitly: the previous version reported the last loop index (one less than the
    # number of entries) and failed on an empty input because the index was never bound.
    total = 0
    has_t = 0
    has_s = 0
    has_full_text = 0
    reg = re.compile('[a-zA-Z0-9 ]*')

    for entry in tqdm(entries, total=number_entries):
        total += 1
        ts = entry.topics
        ss = entry.subjects
        if ts is not None and len(ts) > 0:
            topic_counter.update(t.lower() for t in ts if reg.fullmatch(t) is not None)
            has_t += 1
        if ss is not None and len(ss) > 0:
            subject_counter.update(s.lower() for s in ss if reg.fullmatch(s) is not None)
            has_s += 1
        if entry.has_full_text:
            has_full_text += 1
    print('total entries={}, has topics={}, has subjects={}, has full text={}'.format(total, has_t, has_s, has_full_text))
    return topic_counter, subject_counter

topic_counter, subject_counter = count_topics_and_subjects(get_entries())

In [None]:
# Number of distinct (lower-cased, filtered) topics.
print(len(topic_counter))
print('-----')

import csv
import io
# Render a window of 10 topics, starting at frequency rank `pos`, as CSV text ("topic,count" per line).
output = io.StringIO()
writer = csv.writer(output)
pos = 200
# NOTE: the loop variable rebinds the notebook-level name `x`.
for x in topic_counter.most_common()[pos:pos+10]:
    writer.writerow(x)
print(output.getvalue())
# Selected: 0, 200, 750, 3000, 10000, 25000 (+50)
# NOTE(review): presumably the values of `pos` that were inspected; the "(+50)" does not match the
# window of 10 used above -- confirm.

In [None]:
import csv

def write_entries_to_csv(entries, path):
    """
    Writes a small part of the data as csv to be imported by a relational database

    :param entries: iterable of entries providing the attributes icebreaker_id, doi, core_id, title and year
    :param path: path of the csv file to write (an existing file is overwritten)
    """
    fieldnames = ['icebreaker_id', 'doi', 'core_id', 'title', 'year']
    # newline='' is what the csv module asks for: without it, line breaks embedded in a quoted field
    # (e.g. in a title) and the row terminators may be translated by the text layer.
    # The explicit encoding makes the output independent of the locale of the machine.
    with open(path, 'wt', newline='', encoding='utf-8') as f:
        csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
        csv_writer.writeheader()
        for entry in tqdm(entries, total=number_entries):
            csv_writer.writerow({name: getattr(entry, name) for name in fieldnames})
        
# write_entries_to_csv(get_entries(), os.path.join(private_data_path, 'basics.csv'))

## Network

### Preparation

In [None]:
from tqdm.notebook import tqdm

def count_edges(entries):
    """
    Computes the weights for all edges. This contains ALL topics - without any filtering!

    The weight of an edge is the number of entries in which its two topics occur together.
    Topics are used exactly as they are stored in the entry (no lower-casing).

    :param entries: iterable of entries providing the attribute `topics` (a list of labels or None)
    :return: dict<(node1, node2), weight> where node1<node2
    """
    from collections import Counter
    from itertools import combinations

    edge_counter = Counter()  # (node1:str, node2:str) -> weight where node1<node2
    for entry in tqdm(entries, total=number_entries):
        if entry.topics is None:
            continue
        # sorted(set(...)) + combinations yields every unordered pair of distinct topics exactly once,
        # already ordered. The previous index loops also produced (t, t) self-pairs for every topic but
        # the last one and counted a pair several times when a topic was listed more than once.
        edge_counter.update(combinations(sorted(set(entry.topics)), 2))
    return dict(edge_counter)

edge_dict = count_edges(get_entries())

In [None]:
import pandas as pd

# Number of distinct topic pairs that were counted.
print('Number of edges: {:d}'.format(len(edge_dict)))

# One row per edge (key tuple and weight); the cell output is the summary statistics of the weights.
df = pd.DataFrame(list(edge_dict.items()), columns = ['edge_name', 'weight']) 
df.weight.describe()

### The Actual And Filtered Network

In [None]:
import networkx as nx

def create_network(node_dict, edge_dict, node_set):
    """
    Builds the undirected topic graph.

    Every edge carries the attributes 'weight' (taken from edge_dict) and 'normalized_weight'
    (w / weighted degree of node 1) * (w / weighted degree of node 2). Nodes receive the attribute
    'weight' from node_dict.

    :param node_dict: dict<node id, weight>
    :param edge_dict: dict<(node1 id, node2 id), weight>
    :param node_set: a set of nodes that should be included in the network; other nodes will be ignored.
    :return: networkx.Graph
    """
    G = nx.Graph()
    for ((n1, n2), w) in tqdm(edge_dict.items(), total=len(edge_dict), desc='Filter edges'):
#         if w < 3:
#             continue
        if n1 == n2:
            # no self-loops
            continue
        if n1 not in node_set or n2 not in node_set:
            continue
        G.add_edge(n1, n2, weight=w)
    # Use the weights that were passed in; the previous version ignored `node_dict` and read the
    # notebook-level `topic_counter` instead.
    nx.set_node_attributes(G, node_dict, name='weight')

    # Compute a normalized edge weight
    # The weighted degrees are computed once up front instead of twice per edge.
    weighted_degree = dict(G.degree(weight='weight'))
    normalized_weights = {}
    for n1, n2, data in tqdm(G.edges(data=True), total=G.number_of_edges(), desc='Computing normalized edge weights'):
        w = data['weight']
        normalized_weights[(n1, n2)] = (w / weighted_degree[n1]) * (w / weighted_degree[n2])
    nx.set_edge_attributes(G, normalized_weights, name='normalized_weight')

    return G

def reduce_network(G, k):
    """
    Let's only keep the top k edges per node

    An edge of G is kept if it is among the k heaviest edges (by 'weight') of at least one of its
    two end points. All nodes of G are kept together with their 'weight'.

    :param G: graph whose nodes have a 'weight' and whose edges have 'weight' and 'normalized_weight'
              (and optionally 'references')
    :param k: number of edges to keep per node
    :return: a new networkx.Graph; G itself is not modified
    """
    G2 = nx.Graph()
    for n1 in tqdm(G.nodes(), total=len(G.nodes()), desc='Reducing network'):
        G2.add_node(n1, weight=G.nodes[n1]['weight'])
        neighbors = list(G[n1].items())
        top_neighbors = sorted(neighbors, key=lambda neigh: neigh[1]['weight'], reverse=True)[:k]
        for n2, data in top_neighbors:
            # No `n2 < n1` skip here: it dropped a node's strongest edges towards alphabetically
            # smaller neighbours unless those neighbours listed the node in their own top k.
            # Adding the same undirected edge from both sides is harmless, the attributes are identical.
            G2.add_edge(n1, n2, weight=data['weight'], normalized_weight=data['normalized_weight'],
                        references=data.get('references'))
    return G2

def add_references_to_edges(entries, G):
    """
    Attaches to the edges of G the ids of the entries in which both end points occur.

    The ids are stored in the edge attribute 'references' as a single string in which every id is
    followed by a comma. G is modified in place; nothing is returned.

    :param entries: iterable of entries providing the attributes `topics` and `icebreaker_id`
    :param G: the graph whose edges should be annotated
    """
    graph_nodes = set(G.nodes())
    references = {}  # (node1:str, node2:str) -> ids of references as comma separated list where node1<node2
    for entry in tqdm(entries, total=number_entries, desc='Add edge references'):
        if entry.topics is None:
            continue
        # Only topics that are nodes of the graph can form an edge.
        present = list(graph_nodes.intersection(set(entry.topics)))
        for pos, first in enumerate(present[:-1]):
            for second in present[pos + 1:]:
                key = (second, first) if second < first else (first, second)
                references[key] = references.get(key, '') + '{},'.format(entry.icebreaker_id)
    nx.set_edge_attributes(G, references, name='references')

def assign_numeric_ids_to_nodes(node_set):
    """
    Numbers the given labels consecutively, starting at 0, in the order in which they are iterated.

    :param node_set: iterable of node labels
    :return: tuple (dict<label, numeric id>, dict<numeric id, label>)
    """
    numeric_id_to_label = dict(enumerate(node_set))
    label_to_numeric_id = {label: numeric_id for numeric_id, label in numeric_id_to_label.items()}
    return label_to_numeric_id, numeric_id_to_label

# Node selection: the topics ranked 21st to 50000th by frequency; the 20 most frequent topics are
# skipped and the empty label is dropped.
nodes = [x[0] for x in topic_counter.most_common()[20:50000] if x[0] != '']
node_set = set(nodes)
nodes_label_to_numeric_id, nodes_numeric_id_to_label = assign_numeric_ids_to_nodes(node_set)
# G: network restricted to the selected nodes; node weights are the topic frequencies.
G = create_network(topic_counter, edge_dict, node_set)
# Second pass over all entries; annotates the edges of G in place.
add_references_to_edges(get_entries(), G)
# G2: copy of G reduced with k=20.
G2 = reduce_network(G, 20)

### Export Network

In [None]:
def export_graph(G, path):
    """
    Writes the files node_list.csv and edge_list.csv.

    Fields are written through the csv module, so labels containing commas, quotes or line breaks
    are quoted instead of corrupting the file. Lines end with a single line feed and the files are
    encoded as UTF-8.

    :param G: graph whose nodes have a 'weight' and whose edges have 'weight' and 'normalized_weight'
    :param path: existing directory in which both files are created
    """
    import csv  # local import: the cell defining this function does not import csv itself

    # Write node list
    with open(os.path.join(path, 'node_list.csv'), 'wt', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['Name', 'Weight'])
        for n, data in G.nodes(data=True):
            writer.writerow([n, data['weight']])

    # Write edge list
    with open(os.path.join(path, 'edge_list.csv'), 'wt', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['Source', 'Target', 'Type', 'Weight', 'NormalizedWeight'])
        for n1, n2, data in G.edges(data=True):
            # The normalized weight is formatted explicitly to keep the fixed 16 decimal places.
            writer.writerow([n1, n2, 'undefined_relation', data['weight'],
                             '{:.16f}'.format(data['normalized_weight'])])

export_graph(G, os.path.join(private_data_path, 'size_50000/G'))
export_graph(G2, os.path.join(private_data_path, 'size_50000/G2'))

In [None]:
import csv

def _node_name_to_neo4j_id(name):
    """
    Derives the neo4j node id from a node name by replacing every space with an underscore.
    """
    return name.replace(' ', '_')


def export_to_neo4j(G, path):
    """
    Writes csv files that can be imported into neo4j. This file can be imported with the import tool

    Two files are created in `path`: nodes.csv (id, name, weight, label 'Topic') and edges.csv
    (start id, end id, type 'RELATED_TO', weight, normalized weight, references). Both are written
    through the csv module with a single line feed as row terminator, so header and data rows use
    the same line ending and fields with special characters are quoted. An edge without a
    'references' attribute gets an empty references field.

    :param G: graph whose nodes have a 'weight' and whose edges have 'weight' and 'normalized_weight'
    :param path: existing directory in which both files are created
    """
    # Write node list
    with open(os.path.join(path, 'nodes.csv'), 'wt', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f, lineterminator='\n')
        csv_writer.writerow([':ID', 'name', 'weight:int', ':LABEL'])
        for n, data in G.nodes(data=True):
            csv_writer.writerow([_node_name_to_neo4j_id(n), n, data['weight'], 'Topic'])

    # Write edge list
    with open(os.path.join(path, 'edges.csv'), 'wt', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f, lineterminator='\n')
        csv_writer.writerow([':START_ID', ':END_ID', ':TYPE', 'weight:int', 'normalizedWeight:float', 'references'])
        for n1, n2, data in G.edges(data=True):
            csv_writer.writerow([_node_name_to_neo4j_id(n1), _node_name_to_neo4j_id(n2), 'RELATED_TO',
                                 data['weight'], data['normalized_weight'], data.get('references')])

export_to_neo4j(G, os.path.join(private_data_path, 'size_50000/G_neo4j'))

In [23]:
import csv
def export_top_nodes(G, k, path):
    """
    Writes the k nodes with the highest weight, ordered by label, to a csv file with a header row.

    :param G: graph whose nodes have a 'weight' attribute
    :param k: number of nodes to export
    :param path: path of the csv file to write (an existing file is overwritten)
    """
    heaviest = sorted(G.nodes(data=True), key=lambda n: n[1]['weight'], reverse=True)[:k]
    heaviest.sort(key=lambda n: n[0])
    # newline='' as required by the csv module (the writer emits its own row terminators);
    # explicit encoding so the output does not depend on the locale of the machine.
    with open(path, 'wt', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['Begriff', 'Anzahl Vorkommnisse'])
        csv_writer.writerows((label, data['weight']) for label, data in heaviest)

export_top_nodes(G, 10000, os.path.join(private_data_path, 'begriffe.csv'))

In [22]:
G.nodes['2010']

{'weight': 90}

### Network Embedding

In [None]:
from node2vec import Node2Vec

# https://github.com/eliorc/node2vec
# Embeds the nodes of the full graph G into 64 dimensions.
# NOTE(review): no random seed is passed, so presumably the embedding differs between runs -- confirm.
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4, temp_folder='/tmp/node2vec')
model = node2vec.fit(window=10, min_count=1, batch_words=4)
# Persist the fitted model in three forms: as a pickle, via save_word2vec_format and via model.save.
# `pickle` is not imported in this cell; it has to be available from an earlier cell.
with open(os.path.join(private_data_path, 'G_node2vec_model.pickle'), 'wb') as f:
    pickle.dump(model, f)
model.wv.save_word2vec_format(os.path.join(private_data_path, 'G_node2vec.embd'))
model.save(os.path.join(private_data_path, 'G_node2vec.model'))