In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
import os,sys
# Add the sibling '../module' directory (resolved relative to sys.path[0]) to
# the import search path. NOTE(review): presumably this is where the
# project-local `wiki` module lives, and sys.path[0] is assumed to be the
# notebook's directory -- confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki

## Get index of articles

* [all indices on Wikipedia](https://en.wikipedia.org/wiki/Portal:Contents/Indices)
* topics not searched
  * international trade ("topics"), theory of constraints (small)
  * too big: mathematics, neuroscience

In [4]:
# Directory and file names of the Wikipedia dump archive and its index file.
path_base = '/Users/harangju/Developer/data/wiki/dumps/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'

# Both full paths are the base directory followed by the file name.
path_xml, path_index = (path_base + file_name
                        for file_name in (name_xml, name_index))

# Open the dump through the project's wiki module using both paths.
dump = wiki.Dump(path_xml, path_index)

In [5]:
# Topics to look up, grouped by subject area. The groups are listed in the
# order in which their topics must appear in the flattened `topics` list.
topic_groups = [
    ('natural & physical sciences', [
        'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
        'genetics', 'immunology', 'molecular biology',
        'chemistry', 'biophysics', 'energy', 'optics',
        'earth science', 'geology', 'meteorology',
    ]),
    ('philosophy', [
        'philosophy of language', 'philosophy of law',
        'philosophy of mind', 'philosophy of science',
    ]),
    ('social sciences', [
        'economics', 'accounting', 'education', 'linguistics', 'law',
        'psychology', 'sociology',
    ]),
    ('technology & applied sciences', [
        'electronics', 'software engineering', 'robotics',
    ]),
]

# Flatten the groups into the single ordered list used by later cells.
topics = [name for _, group in topic_groups for name in group]

In [6]:
# Map each topic to the stringified article links read from the dump after
# loading that topic's "Index of ... articles" page.
links = {}
for topic in topics:
    index_title = 'Index of %s articles' % topic
    dump.load_page(index_title)
    topic_links = list(map(str, dump.article_links))
    links[topic] = topic_links
    print('Topic "' + topic + '" has ' + str(len(topic_links)) + ' articles.')

Dump: Loading index...
Dump: Loaded.
Topic "anatomy" has 2331 articles.
Topic "biochemistry" has 1216 articles.
Topic "cognitive science" has 127 articles.
Topic "evolutionary biology" has 287 articles.
Topic "genetics" has 1441 articles.
Topic "immunology" has 572 articles.
Topic "molecular biology" has 507 articles.
Topic "chemistry" has 1088 articles.
Topic "biophysics" has 773 articles.
Topic "energy" has 158 articles.
Topic "optics" has 386 articles.
Topic "earth science" has 135 articles.
Topic "geology" has 116 articles.
Topic "meteorology" has 761 articles.
Topic "philosophy of language" has 275 articles.
Topic "philosophy of law" has 208 articles.
Topic "philosophy of mind" has 109 articles.
Topic "philosophy of science" has 448 articles.
Topic "economics" has 562 articles.
Topic "accounting" has 154 articles.
Topic "education" has 872 articles.
Topic "linguistics" has 420 articles.
Topic "law" has 3657 articles.
Topic "psychology" has 1801 articles.
Topic "sociology" has 772 

In [13]:
import string

# The physics index is spread over several pages, one per suffix: a symbols
# page, a digits page, and one page for each uppercase ASCII letter.
topic = 'physics'
page_suffixes = ['!$@', '0–9']
page_suffixes += list(string.ascii_uppercase)

# Register the list first, then grow it in place page by page.
collected = links[topic] = []
for suffix in page_suffixes:
    dump.load_page('Index of physics articles (%s)' % suffix)
    collected.extend(str(l) for l in dump.article_links)

print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

Topic "physics" has 15215 articles.


In [9]:
# The mathematics article list is maintained by WikiProject Mathematics and
# is split into a digits page plus one page per uppercase ASCII letter.
# Relies on `string` having been imported by an earlier cell.
topic = 'mathematics'
page_suffixes = ['0–9']
page_suffixes += list(string.ascii_uppercase)

# Register the list first, then grow it in place page by page.
collected = links[topic] = []
for suffix in page_suffixes:
    page_title = ('Wikipedia:WikiProject Mathematics/List of mathematics articles (%s)'
                  % suffix)
    dump.load_page(page_title)
    collected.extend(str(l) for l in dump.article_links)

print('Topic "' + topic + '" has ' + str(len(links[topic])) + ' articles.')

Topic "mathematics" has 26001 articles.


## Build graphs of topics

### 1 network per topic

In [10]:
import pickle
import gensim.utils as gu

# Directory holding the saved model files loaded below.
path_save = '/Users/harangju/Developer/data/wiki/models/'

# Loaded through gensim's SaveLoad; passed later as `model=` to build_graph.
tfidf = gu.SaveLoad.load(path_save + 'tfidf.model')

# Read the pickled dictionary inside a context manager so the file handle is
# closed as soon as loading finishes (the bare open() call never closed it).
# NOTE: pickle can execute arbitrary code while loading -- only use this on
# files you produced yourself or otherwise trust.
with open(path_save + 'dict.model', 'rb') as dict_file:
    dct = pickle.load(dict_file)

In [None]:
# Output directory for the per-topic graph and barcode files.
path_to_save = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# Build one wiki.Net per topic from its collected links, then write the graph
# and its barcodes next to each other under the same file stem.
networks = {}
for topic, article_titles in links.items():
    print('\nTopic: ' + topic)
    net = networks[topic] = wiki.Net()
    net.build_graph(name=topic, dump=dump, nodes=article_titles,
                    model=tfidf, dct=dct)
    file_stem = path_to_save + topic
    net.save_graph(file_stem + '.pickle')
    net.save_barcodes(file_stem + '.barcode')

In [20]:
# networks['mathematics'].save_graph(topic + '.pickle')
# networks['mathematics'].save_barcodes(topic + '.barcode')

## Generate null networks

In [32]:
# Recreate the per-topic networks from the graph files saved earlier.
# Uses `path_to_save` as assigned in the graph-building cell.
networks = {}
for topic in links:
    net = networks[topic] = wiki.Net()
    net.load_graph(path_to_save + topic + '.pickle')

In [33]:
# Names of the randomization schemes passed to network.randomize().
null_models = ['target', 'year']
# Number of randomized copies generated per topic and per null model.
num_nulls = 2
# Initialise the result container here so the cell also runs on a fresh
# kernel; layout is nulls[null_model][topic] -> list of randomized networks.
nulls = {}
# Iterate over the null-model names themselves.
# NOTE(review): the previous version built `_null_models` by indexing the
# `null_models` list with 'physics' / 'mathematics', which raises TypeError
# (list indices must be integers). It looks like an unfinished attempt to
# rerun only those two topics after an interrupted run -- confirm. To do
# that, filter `networks` instead of the null-model list.
for null_model in null_models:
    print('Null model: ' + null_model)
    path_to_save_null = '/Users/harangju/Developer/data/wiki/graphs/null-'+null_model+'/'
    nulls[null_model] = {}
    for topic, network in networks.items():
        print('Topic: ' + topic)
        nulls[null_model][topic] = []
        for i in range(num_nulls):
            print('Null: ' + str(i))
            null = network.randomize(null_model)
            # Name each null after its topic and index; the same name is
            # used as the file stem for the saved graph and barcodes.
            null.graph.name = topic+'-null-'+str(i)
            null.save_graph(path_to_save_null + null.graph.name + '.pickle')
            null.save_barcodes(path_to_save_null + null.graph.name + '.barcode')
            nulls[null_model][topic].append(null)

Null model: target
Topic: anatomy
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 8027/8029
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 8041/8042
Topic: biochemistry
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 5288/5290
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 5282/5287
Topic: cognitive science
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 369/384
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wik

wiki.Net: barcode 1813/1824
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 1847/1864
Topic: electronics
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 4185/4268
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 4233/4346
Topic: software engineering
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 692/716
Null: 1
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip negatives)
wiki.Net: barcode 703/724
Topic: robotics
Null: 0
wiki.Net: computing core-periphery...
wiki.Net: computing communities...
wiki.Net: computing barcodes... (skip nega

KeyboardInterrupt: 

Gephi notes
* node size: Fruchterman–Reingold = [10, 40], ForceAtlas 2 = [4, 16]
* text size = [1, 1.4]
* preview font size = 5