### WikiEngine

In [1]:
from wiki.dump import WikiDump

path_base = '/Users/harangju/Developer/data/wiki/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
path_xml = path_base + name_xml
path_index = path_base + name_index
dump = WikiDump(path_xml, path_index)

### Exploring the wiki dump

In [2]:
dump.load_page('Portal:Physics/Topics')
dump.links[:5]

WikiDump: Loading index...
WikiDump: Loaded.


['Classical physics', 'mechanics', 'optics', 'electricity', 'magnetism']

In [3]:
dump.load_page('Danielle Bassett')
dump.links[:5]

['University of Pennsylvania',
 'Pennsylvania State University',
 'University of Cambridge',
 'Sloan Research Fellowship',
 'MacArthur fellowship']

In [4]:
dump.load_page('Matter', filter_top=True).strip_code()

'In classical physics and general chemistry, matter is any substance that has mass and takes up space by having volume. All everyday objects that can be touched are ultimately composed of atoms, which are made up of interacting subatomic particles, and in everyday as well as scientific usage, "matter" generally includes atoms and anything made up of them, and any particles (or combination of particles) that act as if they have both rest mass and volume. However it does not include massless particles such as photons, or other energy phenomena or waves such as light or sound.  Matter exists in various states (also known as phases). These include classical everyday phases such as solid, liquid, and gas – for example water exists as ice, liquid water, and gaseous steam – but other states are possible, including plasma, Bose–Einstein condensates, fermionic condensates, and quark–gluon plasma.\n\nUsually atoms can be imagined as a nucleus of protons and neutrons, and a surrounding "cloud" of

In [5]:
dump.load_page('Index of physics articles (P)')
dump.links[50:54]

['Pandemonium effect', 'Panemone windmill', 'Panofsky Prize', 'Pantur Silaban']

In [6]:
# testing XML cache
%time page = dump.load_page('AccessibleComputing')
%time page = dump.load_page('Anarchism')
%time page = dump.load_page('Angola')

CPU times: user 81.3 ms, sys: 1.98 ms, total: 83.2 ms
Wall time: 107 ms
CPU times: user 67.7 ms, sys: 62 µs, total: 67.8 ms
Wall time: 67.8 ms
CPU times: user 212 ms, sys: 3.66 ms, total: 215 ms
Wall time: 219 ms


### Wikipedia hypernet traversal

In [7]:
import networkx as nx

class WikiCrawler():
    @staticmethod
    def bfs(graph, dump, queue, depth_goal=1):
        depth = 0
        depth_num_items = len(queue)
        depth_inc_pending = False
        while queue:
            page = queue.pop(0)
            depth_num_items -= 1
            if depth_num_items == 0:
                depth += 1
                print('Depth: ' + str(depth))
                depth_inc_pending = True
            if dump.load_page(page, filter_top=True):
                for link in dump.links:
                    link = str(link).split('#')[0].capitalize()
                    if (page, link) not in graph.edges:
                        graph.add_edge(link, page, 
                                       weight=1)
                        queue.append(link)
            if depth_inc_pending:
                depth_num_items = len(queue)
                depth_inc_pending = False
            if depth == depth_goal:
                break

In [8]:
graph = nx.DiGraph()
queue = ['Matter']
WikiCrawler.bfs(graph, dump, queue, 3)

Depth: 1
Depth: 2
Depth: 3


In [9]:
len(graph.nodes)

9492

### Port to gephi

In [None]:
nx.write_gexf(graph, path_base + 'graphs/matter_d3.gexf')

### Edge weights