In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (such as the project-local `wiki`) are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
import os,sys
# Add the sibling ../module directory (relative to sys.path[0]) to the import
# path so that `wiki` can be imported below.
# NOTE(review): assumes sys.path[0] is the notebook's directory — confirm for your kernel.
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki

# Get index of articles

* [all indices on Wikipedia](https://en.wikipedia.org/wiki/Portal:Contents/Indices)
* topics not searched
* international trade ("topics"), theory of constraints (small)
* too big: mathematics, neuroscience

In [None]:
# Open the Wikipedia multistream dump together with its index file.
path_base = '/Users/harangju/Developer/data/wiki/dumps/'
name_xml, name_index = (
    'enwiki-20190801-pages-articles-multistream.xml.bz2',
    'enwiki-20190801-pages-articles-multistream-index.txt.bz2',
)
# path_base ends with a separator, so joining yields the same strings as
# plain concatenation.
path_xml = os.path.join(path_base, name_xml)
path_index = os.path.join(path_base, name_index)
dump = wiki.Dump(path_xml, path_index)

In [None]:
# Topics whose "Index of ... articles" pages are loaded, grouped by field.
topics = [
    # natural & physical sciences
    'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
    'genetics', 'immunology', 'molecular biology',
    'chemistry', 'biophysics', 'energy', 'optics',
    'earth science', 'geology', 'meteorology',
    # philosophy
    'philosophy of language', 'philosophy of law',
    'philosophy of mind', 'philosophy of science',
    # social sciences
    'economics', 'accounting', 'education', 'linguistics', 'law', 'psychology',
    'sociology',
    # technology & applied sciences
    'electronics', 'software engineering', 'robotics',
]

In [None]:
# For every topic, load its "Index of ... articles" page from the dump and
# record the page's article links (as strings) under the topic name.
links = {}
for topic in topics:
    dump.load_page('Index of %s articles' % topic)
    topic_links = list(map(str, dump.article_links))
    links[topic] = topic_links
    print('Topic "' + topic + '" has ' + str(len(topic_links)) + ' articles.')

In [None]:
# https://en.wikipedia.org/wiki/Lists_of_mathematics_topics
# Mathematics sub-fields whose "List of ... topics" pages are loaded.
# Commented-out entries are kept for reference but are not loaded.
# algebra
math_topics = ['calculus', 'geometry', 'abstract algebra',
               'Boolean algebra', 'commutative algebra',# 'homological algebra',
               'group theory',# 'representation theory', 
               'linear algebra']
# calculus & analysis
# math_topics += ['complex analysis', 'functional analysis',
#                 'integration and measure theory', 'harmonic analysis',
#                 'Fourier analysis', 'multivariable calculus', 'real analysis',
#                 'variational']
# geometry
# math_topics += ['geometry', 'curves', 'triangle', 'circle', 'general topology',
#                 'differential geometry', 'algebraic geometry', 'algebraic topology',
#                 'geometric topology', 'knot theory', 'Lie groups']
# number theory
math_topics += [#'algebraic number theory',
                'number theory']
# applied math
math_topics += ['dynamical systems and differential equations']
#                 'partial differential equation']
# Append only names that are not already present, so re-running this cell does
# not add the same topics to `topics` a second time.
topics += [t for t in math_topics if t not in topics]

In [None]:
# Add the links of each mathematics "List of ... topics" page to `links`.
# `links` is intentionally not reset here: the later per-topic loops index
# `links[topic]` for every entry in `topics`, so wiping the entries collected
# from the "Index of ... articles" pages would raise KeyError there.
try:
    links
except NameError:
    # Cell run on its own, before any links were collected.
    links = {}
for topic in math_topics:
    dump.load_page(f"List of {topic} topics")
    links[topic] = [str(l) for l in dump.article_links]
    print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
# Register physics as a topic; guarded so re-running the cell is a no-op
# instead of appending a duplicate entry.
if 'physics' not in topics:
    topics += ['physics']

In [None]:
import string

# The physics index is split across one page per leading character; gather
# the links of all of those pages under the single topic 'physics'.
topic = 'physics'
index_suffixes = ['!$@', '0–9']
index_suffixes.extend(string.ascii_uppercase)
links[topic] = []
for letter in index_suffixes:
    dump.load_page('Index of physics articles (%s)' % letter)
    links[topic] += [str(link) for link in dump.article_links]
print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

In [None]:
# Register mathematics as a topic; guarded so re-running the cell is a no-op
# instead of appending a duplicate entry.
if 'mathematics' not in topics:
    topics += ['mathematics']

In [None]:
import string  # also imported by the physics cell; repeated so this cell runs on its own

# Gather the links of every per-character page of the WikiProject Mathematics
# article list under the single topic 'mathematics'.
topic = 'mathematics'
links[topic] = []
for letter in ['0–9'] + list(string.ascii_uppercase):
    # The original literal opened with a double quote and closed with a single
    # quote, which is a syntax error; both delimiters are now double quotes.
    dump.load_page(
        f"Wikipedia:WikiProject Mathematics/List of mathematics articles ({letter})"
    )
    links[topic].extend([str(l) for l in dump.article_links])
print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

# Build graphs of topics

In [None]:
import pickle
import gensim.utils as gu

# Load the saved tf-idf model and the dictionary that are passed to
# build_graph as `model` and `dct`.
path_models = '/Users/harangju/Developer/data/wiki/models/'
tfidf = gu.SaveLoad.load(path_models + 'tfidf.model')
# Use a context manager so the file handle is closed deterministically.
# NOTE(review): pickle.load executes arbitrary code; only load files you trust.
with open(path_models + 'dict.model', 'rb') as dict_file:
    dct = pickle.load(dict_file)

## One network per topic

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# Build one network per topic from its links, then save the graph in two
# formats plus its barcodes.
networks = {}
for topic in topics:
    topic_nodes = links[topic]
    print('\nTopic: ' + topic)
    net = networks[topic] = wiki.Net()
    net.build_graph(name=topic, dump=dump, nodes=topic_nodes, model=tfidf, dct=dct)
    for extension in ('.pickle', '.gexf'):
        net.save_graph(path + topic + extension)
    net.save_barcodes(path + topic + '.barcode')

## Redo barcodes

In [None]:
# Re-create each topic's network from its saved pickle.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    pickle_path = os.path.join(path_saved, topic+'.pickle')
    networks[topic] = wiki.Net(path_graph=pickle_path)

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/barcode/'

# Write each loaded network's barcodes to the barcode directory.
for topic in topics:
    print('\nTopic: ' + topic)
    barcode_path = os.path.join(path, topic+'.barcode')
    networks[topic].save_barcodes(barcode_path)

## Subnetworks

## Big network

In [None]:
# De-duplicated union of the links of every topic.
all_links = list({
    link
    for topic_links in links.values()
    for link in topic_links
})
len(all_links)

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# Build a single network over the union of all links, with the core-periphery
# and community computations switched off, and save it in two formats.
big_network = wiki.Net()
build_options = dict(
    name='big_network',
    dump=dump,
    nodes=all_links,
    model=tfidf,
    dct=dct,
    compute_core_periphery=False,
    compute_communities=False,
    compute_community_cores=False,
)
big_network.build_graph(**build_options)
for filename in ('big_network_physics_math.pickle', 'big_network_physics_math.gexf'):
    big_network.save_graph(os.path.join(path, filename))
# big_network.save_barcodes(os.path.join(path, 'big_network.barcode'))

## Nodes without years

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated-noyear/'

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists.
os.makedirs(path, exist_ok=True)

# Build one network per topic with fill_empty_years=False and the
# core-periphery / community computations switched off, then save each graph.
# NOTE(review): unlike the other build_graph calls in this notebook, no
# `model`/`dct` is passed here — confirm that is intended.
networks_noyear = {}
for topic in topics:
    print('\nTopic: ' + topic)
    networks_noyear[topic] = wiki.Net()
    networks_noyear[topic].build_graph(
        name=topic, dump=dump, nodes=links[topic],
        fill_empty_years=False,
        compute_core_periphery=False,
        compute_communities=False,
        compute_community_cores=False
    )
    networks_noyear[topic].save_graph(path + topic + '.pickle')
    networks_noyear[topic].save_graph(path + topic + '.gexf')

In [None]:
path = '/Users/harangju/Developer/data/wiki/graphs/dated-noyear/'

# Load each topic's saved no-year graph into a fresh Net.
networks_noyear = {}
for topic in topics:
    print(topic, end=' ')
    net = wiki.Net()
    networks_noyear[topic] = net
    net.load_graph(path + topic + '.pickle')

In [None]:
import pandas as pd
import networkx as nx
import plotly.express as px


def _fraction_with_year(graph):
    """Return the fraction of nodes in `graph` whose 'year' attribute is truthy."""
    years = nx.get_node_attributes(graph, 'year')
    dated = [year for year in years.values() if year]
    return len(dated) / len(graph.nodes)


# One row per topic: the topic name and its fraction of nodes with a year.
fraction_years = pd.DataFrame(
    [
        [topic, _fraction_with_year(networks_noyear[topic].graph)]
        for topic in topics
    ],
    columns=['topics', 'fraction']
)
fraction_years

In [None]:
# for topic in topics:
#     print(topic, end='\t')
#     print( 
#         len([
#             y
#             for n, y in nx.get_node_attributes(
#                 networks_noyear[topic].graph, 'year'
#             ).items()
#             if y
#         ]) / len(networks_noyear[topic].graph.nodes)
#     )

In [None]:
path_fig = '/Users/harangju/Library/Mobile Documents/com~apple~CloudDocs/' +\
    'Documents/research/wikipedia/results'
path_plot = '0 graphs'

# Histogram of the per-topic fractions, shown inline and exported as a PDF.
fig = px.histogram(fraction_years['fraction'])
layout_options = {
    'width': 500,
    'height': 360,
    'template': 'plotly_white',
    'xaxis': {'range': [0, 1]},
    'showlegend': False,
}
fig.update_layout(**layout_options)
fig.show()
pdf_path = os.path.join(path_fig, path_plot, 'fraction_years_with_math.pdf')
fig.write_image(pdf_path)

# Generate null networks

## Load networks

In [None]:
# Load each topic's saved graph into a fresh Net.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    net = wiki.Net()
    networks[topic] = net
    net.load_graph(path_saved + topic + '.pickle')

## Randomized target & year

Randomizing only the year yields the same structures; it changes only when those structures appear.
Randomizing both year and target randomizes the structures and how they come about, without changing any basic network statistics.

In [None]:
# For each null model and each topic, generate `num_nulls` randomized copies
# of the topic's network via Net.randomize, saving each graph and its barcodes.
null_models = ['target', 'year']
num_nulls = 10
nulls = {}
for null_model in null_models:
    print('Null model: ' + null_model)
    path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'\
                        +null_model+'/'
    # Create the output directory up front (as the jitter cell does) so the
    # first save does not fail on a missing directory.
    os.makedirs(path_to_save_null, exist_ok=True)
    nulls[null_model] = {}
    for topic, network in networks.items():
        print('Topic: ' + topic)
        nulls[null_model][topic] = []
        for i in range(num_nulls):
            print('Null: ' + str(i))
            null = network.randomize(null_model)
            null.graph.name = topic+'-null-'+str(i)
            null.save_graph(path_to_save_null + null.graph.name + '.pickle')
            null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
            nulls[null_model][topic].append(null)

## Jittered years

In [None]:
# Settings for the jittered-year nulls: number of copies per topic and the
# largest absolute shift applied to a node's year.
num_jitters = 1
max_jitter = 1
null_model = 'jitter'
path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'+null_model+'/'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists.
os.makedirs(path_to_save_null, exist_ok=True)
jittered = {}

In [None]:
import copy
import numpy as np
import numpy.random

# For every topic, make `num_jitters` deep copies of its graph in which each
# node's year is shifted by a random integer in [-max_jitter, max_jitter],
# then save each copy and its barcodes.
# NOTE(review): no random seed is set, so the jitter differs between runs.
for topic, network in networks.items():
    print('Topic: ' + topic)
    jittered[topic] = []
    print('Null: ', end='')
    for i in range(num_jitters):
        print(str(i), end=' ')
        null = wiki.Net()
        null.graph = copy.deepcopy(network.graph)
        for node in null.graph.nodes:
            attributes = null.graph.nodes[node]
            year = attributes['year']
            attributes['year'] = year + np.random.randint(-max_jitter, max_jitter+1)
        null.graph.name = topic+'-null-'+str(i)
        file_stem = path_to_save_null + null.graph.name
        null.save_graph(file_stem + '.pickle')
        null.save_barcodes(file_stem + '.barcode')
        jittered[topic].append(null)
    print()

Gephi notes
* node size, fruchterman reingold = [10, 40], force atlas 2 = [4 16]
* text size = [1 1.4]
* preview font size = 5

## Generative networks
* random geometric graph (modularity)
* stochastic block model (modularity)
* caveman graph (modularity, cliques, most clustered & sparse)
* random clustered graph (clustering)

In [None]:
# Unfinished table of generative null-graph builders, keyed by a short name.
# Each value is a lambda taking a graph `g` and calling a networkx generator.
# NOTE(review): the calls are incomplete — 'rgg' passes only the node count and
# the other three pass no arguments — so invoking any of these lambdas is
# expected to raise TypeError until the generator parameters are filled in.
# TODO: choose parameters (radius, block sizes/probabilities, clique counts,
# joint degree sequence) derived from `g`.
num_nulls = 10
gen_functions = {
    # random geometric graph
    'rgg': lambda g: nx.random_geometric_graph(
        g.number_of_nodes(), 
    ),
    # stochastic block model
    'sbm': lambda g: nx.stochastic_block_model(
        
    ),
    # caveman graph
    'cg': lambda g: nx.caveman_graph(
        
    ),
    # random clustered graph
    'rcg': lambda g: nx.random_clustered_graph(
        
    )
}

In [None]:
# Unfinished loop over the generative models in `gen_functions`.
# NOTE(review): the loop variables `name` and `function` are never used and
# `gen_nulls` is never filled. The body instead reads `null_model`, which is
# whatever value an earlier cell left behind, so every iteration repeats the
# same work: it resets nulls[null_model], calls network.randomize(null_model)
# and writes to that model's directory. TODO: generate graphs with `function`,
# key results and paths by `name`, and store them in `gen_nulls`.
num_nulls = 10
gen_nulls = {}
for name, function in gen_functions.items():
    print('Null model: ' + null_model)
    path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'+\
        null_model+'/'
    nulls[null_model] = {}
    for topic, network in networks.items():
        print('Topic: ' + topic)
        nulls[null_model][topic] = []
        for i in range(num_nulls):
            print('Null: ' + str(i))
            null = network.randomize(null_model)
            null.graph.name = topic+'-null-'+str(i)
            null.save_graph(path_to_save_null + null.graph.name + '.pickle')
            null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
            nulls[null_model][topic].append(null)

# Generate networks for D3

In [None]:
# Load each topic's saved graph into a fresh Net for the D3 export below.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    net = wiki.Net()
    networks[topic] = net
    net.load_graph(path_saved + topic + '.pickle')

In [None]:
# (topic, number of nodes) for every topic.
[
    (name, len(networks[name].graph.nodes))
    for name in topics
]

In [None]:
import json as js
import networkx as nx

path = '/Users/harangju/Developer/data/wiki/graphs/json/'

# Export every topic network as a node-link JSON file: nodes are sorted by
# year, and each link refers to its endpoints by position in that sorted list.
for topic, network in networks.items():
    graph = network.graph
    nodes = sorted(graph.nodes, key=lambda n: graph.nodes[n]['year'])
    # Position of every node in the sorted list, built once; this replaces a
    # linear `nodes.index(target)` scan for every edge.
    position = {node: i for i, node in enumerate(nodes)}
    json = js.dumps({
        'nodes': [
            {
                'id': node,
                'year': int(graph.nodes[node]['year']),
#                 'core_be': int(graph.nodes[node]['core_be']),
#                 'core_rb': graph.nodes[node]['core_rb'],
                'community': int(graph.nodes[node]['community']),
                'degree': graph.degree(node)
            }
            for node in nodes
        ],
        'links': [
            {
                'source': i,
                'target': position[target],
                'weight': graph.edges[node, target]['weight']
            }
            for i, node in enumerate(nodes)
            for target in graph.successors(node)
        ]
    })
    with open(os.path.join(path, topic+'.json'), 'w') as file:
        file.write(json)

# Generate barcodes for D3

In [None]:
# Re-create each topic's network from its saved pickle; the saved barcode
# file is not passed in (see the commented-out argument).
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    pickle_path = os.path.join(path_saved, topic+'.pickle')
    networks[topic] = wiki.Net(
        path_graph=pickle_path,
#         path_barcodes=os.path.join(path_saved, topic+'.barcode')
    )

In [None]:
# Walk the persistence structure of the cognitive-science network and print
# one line per entry that is not paired with an earlier index.
cognitive_net = networks['cognitive science']
f = cognitive_net.filtration
m = cognitive_net.persistence
for i, c in enumerate(m):
    if m.pair(i) < i:
        # skip negative simplices
        continue
    dim = f[i].dimension()
    if m.pair(i) == m.unpaired:
        print(f"{i}, {dim}, {c}, {m[i]}")
    else:
        print(f"{i}, {dim}, {c}, {m[i]}, {m.pair(i)}, {m[m.pair(i)]}")

In [None]:
# Inspect the entry of `m` at the index paired with index 13
# (`m` is the persistence object bound in the preceding cell).
m[m.pair(13)]

In [None]:
import dionysus as d

# Build persistence diagrams for the earth-science network from its
# persistence and filtration objects.
earth_net = networks['earth science']
dgms = d.init_diagrams(earth_net.persistence, earth_net.filtration)
dgms

In [None]:
# Print each diagram on one line: its index followed by all of its points.
for i, dgm in enumerate(dgms):
    points = ''.join(str(p) + '; ' for p in dgm)
    print(f"dim {i}" + ' ' + points)

In [None]:
# Show the first point of the diagram at index 2 together with its `data` attribute.
dgms[2][0], dgms[2][0].data

In [None]:
import numpy as np

# Copy the cognitive-science barcodes and drop every bar whose lifetime is 0,
# renumbering the remaining rows from zero.
barcodes = networks['cognitive science'].barcodes.copy()
zero_lifetime_index = barcodes[barcodes.lifetime==0].index
barcodes = barcodes.drop(index=zero_lifetime_index).reset_index(drop=True)
barcodes

In [None]:
# Show the 'death simplex' and 'homology nodes' entries of the row at position 27.
barcodes.iloc[27]['death simplex'], barcodes.iloc[27]['homology nodes']

In [None]:
import csv as csv_module  # aliased so the name `csv` is not rebound to the module

path = '/Users/harangju/Developer/data/wiki/graphs/barcode csv/'

# Export the barcodes (currently only for 'cognitive science') as a CSV with
# one row per bar of non-zero lifetime. Bars that never die get death = 2100.
for topic, network in {'cognitive science': networks['cognitive science']}.items(): #networks.items():
    barcodes = network.barcodes.copy(deep=True)
    barcodes = barcodes\
        .drop(index=barcodes[barcodes.lifetime==0].index)\
        .reset_index(drop=True)
    barcodes.death = barcodes.death\
        .replace(np.inf, 2100)\
        .astype(int)
    rows = [['i', 'birth', 'death', 'dim', 'cavity', 'death_nodes']]
    for i, row in barcodes.iterrows():
        # Infinite bars are described by their birth simplex, finite ones by
        # their homology nodes.
        if row.lifetime==np.inf:
            cavity = row['birth simplex']
        else:
            cavity = row['homology nodes']
        rows.append([
            f"{i}", f"{row.birth}", f"{row.death}", f"{row.dim}",
            ';'.join(cavity), ';'.join(row['death nodes'])
        ])
    # csv.writer quotes fields that contain commas, quotes or newlines, so node
    # names with those characters no longer corrupt the file; rows without such
    # characters are written exactly as before ('\n' line endings).
    with open(os.path.join(path, topic+'.csv'), 'w', newline='') as file:
        csv_module.writer(file, lineterminator='\n').writerows(rows)