In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `wiki` module) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

## Exploring the wiki dump

In [2]:
import wiki

# Local directory holding the 2019-08-01 English Wikipedia multistream dump.
path_base = '/Users/harangju/Developer/data/wiki/'

# Archive of page contents and its companion index file.
name_xml, name_index = (
    'enwiki-20190801-pages-articles-multistream.xml.bz2',
    'enwiki-20190801-pages-articles-multistream-index.txt.bz2',
)
path_xml, path_index = path_base + name_xml, path_base + name_index

# Reader object used by every later cell to load pages from the dump.
dump = wiki.Dump(path_xml, path_index)

In [3]:
# First page load, timed. The recorded output shows the dump index being
# loaded during this call (about 1 min 15 s wall time).
%time dump.load_page('Portal:Physics/Topics')
# Preview the first five entries of dump.links after the load.
dump.links[:5]

Dump: Loading index...
Dump: Loaded.
CPU times: user 1min 11s, sys: 3.46 s, total: 1min 14s
Wall time: 1min 15s


['Classical physics', 'Mechanics', 'Optics', 'Electricity', 'Magnetism']

In [4]:
# Load a second page and preview its first five links; dump.links now
# reflects this page rather than the one loaded in the previous cell.
dump.load_page('Danielle Bassett')
dump.links[:5]

['University of pennsylvania',
 'Pennsylvania state university',
 'University of cambridge',
 'Sloan research fellowship',
 'Macarthur fellowship']

In [5]:
# Load 'Matter' with filter_top=True (presumably keeps only the top section
# of the page -- TODO confirm against wiki.Dump.load_page), call strip_code()
# on the returned object and preview the first 200 characters.
dump.load_page('Matter', filter_top=True).strip_code()[:200]

"In classical physics and general chemistry, '''matter''' is any substance that has mass and takes up space by having volume. All everyday objects that can be touched are ultimately composed of atoms, "

## Get index of physics articles

* [all indices on Wikipedia](https://en.wikipedia.org/wiki/Portal:Contents/Indices)
* topics not searched:
  * international trade ("topics"), theory of constraints (small)
  * too big: mathematics, neuroscience

In [6]:
# Topics to crawl; each one is substituted into the page title
# 'Index of <topic> articles' by the next cell.
topics = [
    # natural & physical sciences
    'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
    'genetics', 'immunology', 'molecular biology',
    'chemistry', 'biophysics', 'energy', 'optics',
    'earth science', 'geology', 'meteorology',
    # philosophy
    'philosophy of language', 'philosophy of law',
    'philosophy of mind', 'philosophy of science',
    # social sciences
    'economics', 'accounting', 'education', 'linguistics',
    'law', 'psychology', 'sociology',
    # technology & applied sciences
    'electronics', 'software engineering', 'robotics',
]

In [7]:
# Map each topic to the article links found on its Wikipedia index page.
links = {}
for topic in topics:
    # Loading the page refreshes dump.article_links for this topic.
    dump.load_page('Index of %s articles' % topic)
    links[topic] = list(map(str, dump.article_links))
    n_articles = len(links[topic])
    print('Topic "' + topic + '" has ' + str(n_articles) + ' articles.')

Topic "anatomy" has 2331 articles.
Topic "biochemistry" has 1216 articles.
Topic "cognitive science" has 127 articles.
Topic "evolutionary biology" has 287 articles.
Topic "genetics" has 1441 articles.
Topic "immunology" has 572 articles.
Topic "molecular biology" has 507 articles.
Topic "chemistry" has 1088 articles.
Topic "biophysics" has 773 articles.
Topic "energy" has 158 articles.
Topic "optics" has 386 articles.
Topic "earth science" has 135 articles.
Topic "geology" has 116 articles.
Topic "meteorology" has 761 articles.
Topic "philosophy of language" has 275 articles.
Topic "philosophy of law" has 208 articles.
Topic "philosophy of mind" has 109 articles.
Topic "philosophy of science" has 448 articles.
Topic "economics" has 562 articles.
Topic "accounting" has 154 articles.
Topic "education" has 872 articles.
Topic "linguistics" has 420 articles.
Topic "law" has 3657 articles.
Topic "psychology" has 1801 articles.
Topic "sociology" has 772 articles.
Topic "electronics" has 127

In [8]:
import string

# The physics index is spread over several pages, one per leading
# character: symbols, digits, then A-Z. Concatenate the links of all of them.
links['physics'] = []
for letter in ['!$@', '0–9', *string.ascii_uppercase]:
    dump.load_page('Index of physics articles (%s)' % letter)
    links['physics'] += map(str, dump.article_links)
print('Topic "' + 'physics' + '" has ' + str(len(links['physics'])) + ' articles.')

Topic "physics" has 15215 articles.


## Build graphs of topics

### 1 network per topic

In [9]:
import networkx as nx

# Output directory for the per-topic graphs (full-page variant).
# Alternative target for the top-section variant:
# path_to_save = '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/top_section/undated/'
path_to_save = '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/full_page/undated/'

graphs = {}       # topic -> directed graph filled in by the crawler
page_noload = {}  # topic -> value returned by Crawler.bfs (presumably pages that failed to load -- TODO confirm)
depth = 1
for topic, seeds in links.items():
    print('Graph topic: ' + topic)
    topic_graph = nx.DiGraph()
    graphs[topic] = topic_graph
    # The topic's own article list is passed both as the starting set and as `nodes`.
    page_noload[topic] = wiki.Crawler.bfs(
        topic_graph, dump, seeds,
        depth_goal=depth, nodes=seeds, filter_top=False,
    )
    nx.write_gexf(topic_graph, path_to_save + topic + '.gexf')

Graph topic: anatomy
Depth: 0
Crawler: len(queue) = 1228
Depth: 1
Graph topic: biochemistry
Depth: 0
Crawler: len(queue) = 7320
Depth: 1
Graph topic: cognitive science
Depth: 0
Crawler: len(queue) = 800
Depth: 1
Graph topic: evolutionary biology
Depth: 0
Crawler: len(queue) = 178
Depth: 1
Graph topic: genetics
Depth: 0
Crawler: len(queue) = 6630
Depth: 1
Crawler: len(queue) = 662Graph topic: immunology
Depth: 0
Crawler: len(queue) = 302
Depth: 1
Crawler: len(queue) = 301Graph topic: molecular biology
Depth: 0
Crawler: len(queue) = 237
Depth: 1
Graph topic: chemistry
Depth: 0
Crawler: len(queue) = 7550
Depth: 1
Graph topic: biophysics
Depth: 0
Crawler: len(queue) = 386
Depth: 1
Crawler: len(queue) = 385Graph topic: energy
Depth: 0
Crawler: len(queue) = 550
Depth: 1
Graph topic: optics
Depth: 0
Crawler: len(queue) = 243
Depth: 1
Graph topic: earth science
Depth: 0
Crawler: len(queue) = 760
Depth: 1
Graph topic: geology
Depth: 0
Crawler: len(queue) = 530
Depth: 1
Graph topic: meteorology


In [12]:
# Drop nodes that have no edges from every topic graph, then export each
# graph to '<path_to_save><topic>.gexf'.
for topic, graph in graphs.items():
    # nx.isolates() yields nodes lazily from the live graph. Materialise it
    # first: removing nodes while that generator is still being consumed
    # mutates the graph mid-iteration and can raise
    # "RuntimeError: dictionary changed size during iteration".
    isolated_nodes = list(nx.isolates(graph))
    graph.remove_nodes_from(isolated_nodes)
    nx.write_gexf(graph, path_to_save + topic + '.gexf')

Gephi notes
* node size: Fruchterman–Reingold = [10, 40], ForceAtlas 2 = [4, 16]
* text size = [1, 1.4]
* preview font size = 5

### 1 network for all topics
i.e., connect all networks into 1 big network

In [None]:
# big_list = [item for sublist in list(links.values()) for item in sublist]
# big_graph = nx.DiGraph()
# wiki.Crawler.bfs(big_graph, dump, big_list, depth_goal = 2, nodes = big_list)
# nx.write_gexf(big_graph, path_base + 'graphs/big_graph.gexf')

### Building larger networks with breadth-first search -- then coarse community detection

In [13]:
import networkx as nx

# Crawl a single topic to depth 2 into a fresh directed graph.
# Unlike the per-topic cell, no `nodes` argument is passed to the crawler here.
topic = 'biochemistry'
graph = nx.DiGraph()
depth = 2
# Timed because the crawl is slow (recorded run: about 7 min wall time).
%time wiki.Crawler.bfs(graph, dump, links[topic], depth_goal = depth);

Depth: 0
Crawler: len(queue) = 6771
Depth: 1
Crawler: len(queue) = 31398
Depth: 2
CPU times: user 6min 25s, sys: 6.75 s, total: 6min 32s
Wall time: 7min 20s


['Abc-transporter genes',
 'Alpha-beta t-cell antigen receptor',
 'Amino acid receptor',
 'Annexin ii',
 'Cam photosynthesis',
 'Cam plants',
 'Ccr5 receptor',
 'Cd4 antigen',
 'Cd45 antigen',
 'Cd95 antigen',
 'Cdc28 protein kinase',
 'Complement 3a',
 'Complement 5a',
 'Bf (protein)',
 'Cooperativity cellular respiration',
 'Cxcr4 receptor',
 'Cyclic amp receptor',
 'Cyclic amp receptor protein',
 'Cyclic amp-responsive dna-binding protein',
 'Coenzyme q - cytochrome c reductase',
 'Cytochrome p-450',
 'Dna fragmentation',
 'Dna topology',
 'Dna transposable element',
 'Dna-binding protein',
 'Dopamine d1 receptor',
 'Dopamine d2 receptor',
 'Eif-2',
 'Elispot',
 'Energy decomposition cycles',
 'Env gene product',
 'Equine gonadotropin',
 'Erba gene',
 'Erbb gene',
 'Erbb-2 gene',
 'Fadh',
 'Fadh2',
 'Fms gene',
 'Fos gene',
 'Fsh receptor',
 'G3p',
 'Gaba-a receptor',
 'Gamma-delta t-cell antigen receptor',
 'Gastrointestinal hormone receptor',
 'Gtp-binding protein',
 'Gtpase',
 'H

In [14]:
# Point path_to_save at the depth-2 output directory and export the current
# `graph` as '<topic>.gexf'. Note this rebinds the notebook-wide path_to_save.
path_to_save = '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/depth 2/undated/'
nx.write_gexf(graph, path_to_save + topic + '.gexf')

In [17]:
import bct

# Run bct.community_louvain (gamma=1) on the adjacency matrix of `graph`.
# NOTE(review): to_numpy_array builds a dense N x N array; the node-count cell
# reports 33615 nodes for this graph -- check memory before re-running.
# NOTE(review): the recorded run raised BCTParamError ("Modularity infinite
# loop style G"), so `ci` and `q` were never assigned and the print below
# failed with NameError.
%time ci, q = bct.community_louvain(nx.convert_matrix.to_numpy_array(graph), gamma=1)
print(ci, q)

BCTParamError: Modularity infinite loop style G. Please contact the developer.

NameError: name 'ci' is not defined

In [16]:
# Number of nodes in the depth-2 graph.
graph.number_of_nodes()

33615

In [None]:
import networkx as nx

# Output directory for the per-topic graphs (full-page variant).
# Alternative target for the top-section variant:
# path_to_save = '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/top_section/undated/'
# NOTE(review): this is the same directory the depth-1 cell writes to, so the
# depth-2 exports would replace those files -- confirm that is intended.
path_to_save = '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/full_page/undated/'

graphs = {}       # topic -> directed graph filled in by the crawler
page_noload = {}  # topic -> value returned by Crawler.bfs (presumably pages that failed to load -- TODO confirm)
depth = 2
for topic, seeds in links.items():
    print('Graph topic: ' + topic)
    topic_graph = nx.DiGraph()
    graphs[topic] = topic_graph
    # The topic's own article list is passed both as the starting set and as `nodes`.
    page_noload[topic] = wiki.Crawler.bfs(
        topic_graph, dump, seeds,
        depth_goal=depth, nodes=seeds, filter_top=False,
    )
    nx.write_gexf(topic_graph, path_to_save + topic + '.gexf')