In [1]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0],
                                '..', 'module'))
import wiki
import pickle
import numpy as np
import gensim.models as gm
import gensim.utils as gu
import gensim.corpora as gc
import gensim.matutils as gmat
import matplotlib.pyplot as plt
import sklearn.metrics.pairwise as smp

In [2]:
# Locations of the English Wikipedia multistream dump (2019-08-01) and its index file.
path_base = '/Users/harangju/Developer/data/wiki/dumps/'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
# path_base already ends with a separator, so join() yields the same strings as plain concatenation.
path_xml = os.path.join(path_base, name_xml)
path_index = os.path.join(path_base, name_index)
# Handle on the dump; later cells load individual pages through it.
dump = wiki.Dump(path_xml, path_index)

### Test big model
See `preparation/build-tfidf-model.ipynb` for how the TF-IDF model loaded below was built.

In [3]:
# Directory holding the pre-built TF-IDF model and its token dictionary.
path_save = '/Users/harangju/Developer/data/wiki/models/'
tfidf = gu.SaveLoad.load(path_save + 'tfidf.model')
# Use a context manager so the file handle is closed as soon as the dictionary is read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by a trusted source.
with open(path_save + 'dict.model', 'rb') as dict_file:
    dct = pickle.load(dict_file)

In [4]:
# Load the 'Physics' article, then read the parsed page back from `dump.page`.
dump.load_page('Physics')
physics_text = dump.page.strip_code()
# Tokenise the stripped text with gensim's simple_preprocess.
words = gu.simple_preprocess(physics_text)

Dump: Loading index...
Dump: Loaded.


In [5]:
# Convert the tokens to a bag-of-words with the dictionary, apply the TF-IDF model,
# and preview the first five (term id, weight) pairs.
words_bow = dct.doc2bow(words)
tfidf[words_bow][:5]

[(4, 0.0030784267402274444),
 (5, 0.2103100457446028),
 (12, 0.0333281420135526),
 (13, 0.2007292418185778),
 (17, 0.04902932080705284)]

### Weight graph

In [6]:
# Load the index page for the topic and use its article links as the node list of the network.
topic = 'earth science'
dump.load_page('Index of %s articles' % topic)
links = list(map(str, dump.article_links))
network = wiki.Net()
network.build_graph(nodes=links, dump=dump)
# Display the nodes of the resulting graph.
network.graph.nodes

wiki.Net: traversing Wikipedia...
wiki.Net: depth = 0
wiki.Net: len(queue) = 530
wiki.Net: depth = 1
wiki.Net: removing isolates...
wiki.Net: adding years...
wiki.Net: filling empty years...


NodeView(('Planetary science', 'Earth science', 'Lithosphere', 'Hydrosphere', 'Biosphere', 'Holism', 'Geology', 'Geography', 'Chronology', 'Physics', 'Chemistry', 'Biology', 'Mathematics', 'Science', 'Planetary geology', 'Geochemistry', 'Geophysics', 'Oceanography', 'Hydrology', 'Glaciology', 'Life', 'Human geography', 'Physical geography', 'Geosphere', 'Atmospheric sciences', 'Geochronology', 'Plate tectonics', 'Paleoclimatology', 'Petrology', 'Environmental geology', "Earth's magnetic field", 'Geodynamics', 'Magma', 'Magnetosphere', 'Geodesy', 'Crust (geology)', 'Antarctic convergence', 'Atmospheric chemistry', 'Meteorology', 'Volcanology', 'Climatology', 'Atmospheric physics', 'Biogeography', 'Cartography', "Earth's spheres", 'Chemical oceanography', 'Earth (planet)', 'Erosion', 'Cryosphere', 'Mineralogy', 'Dynamo theory', 'Economic geology', 'Structural geology', 'Soil science', 'Edaphology', 'Engineering geology', 'Hydrogeology', 'Geomorphology', 'Environmental science', 'Limnolog

In [7]:
# Neighbours of 'Biosphere' with their edge attributes, as produced by build_graph.
network.graph['Biosphere']

AtlasView({'Earth science': {'weight': 1}, 'Geography': {'weight': 1}, 'Environmental geology': {'weight': 1}, 'Gaia hypothesis': {'weight': 1}, 'Pedosphere': {'weight': 1}, 'Physical geography': {'weight': 1}})

In [8]:
# Two connected nodes used as a worked example for the edge weight.
n1, n2 = 'Biosphere', 'Earth science'
# Load each page in turn, strip the wiki markup and tokenise the remaining text.
p1, p2 = [gu.simple_preprocess(dump.load_page(title).strip_code())
          for title in (n1, n2)]
p1[:5], p2[:5]

(['thumb', 'px', 'false', 'color', 'composite'],
 ['earth', 'science', 'or', 'geoscience', 'includes'])

In [9]:
# TF-IDF weighted bag-of-words for each of the two token lists.
b1, b2 = [tfidf[dct.doc2bow(tokens)] for tokens in (p1, p2)]
b1[:3], b2[:3]

([(5, 0.13155321660272415),
  (9, 0.005814052937824806),
  (12, 0.021480370659743037)],
 [(4, 0.018341276283594058),
  (5, 0.12114910825706296),
  (9, 0.005201260490809376)])

In [10]:
# Stack both TF-IDF documents into one sparse matrix with one column per document.
v = gmat.corpus2csc([b1, b2])
# Show the two document columns.
tuple(v[:, j] for j in range(2))

(<2050373x1 sparse matrix of type '<class 'numpy.float64'>'
 	with 660 stored elements in Compressed Sparse Column format>,
 <2050373x1 sparse matrix of type '<class 'numpy.float64'>'
 	with 605 stored elements in Compressed Sparse Column format>)

In [11]:
# Cosine similarity of the two documents; each column is transposed into a single-row matrix first.
smp.cosine_similarity(v[:, 0].transpose(), v[:, 1].transpose())

array([[0.41726377]])

In [12]:
# Same similarity, unwrapped from the 1x1 result array into a plain scalar.
smp.cosine_similarity(v[:, 0].transpose(), v[:, 1].transpose())[0, 0]

0.4172637706806667

#### Function: set edge weights from TF-IDF cosine similarity

In [13]:
def set_weights(graph, dump, model, dct):
    """Set every edge's ``'weight'`` to the cosine similarity of its two pages.

    Each node name is loaded as a page from ``dump``, stripped of markup,
    tokenised, converted to a bag-of-words with ``dct`` and transformed by
    ``model``. The resulting vectors are stacked as the columns of one sparse
    matrix, and the cosine similarity of the two endpoint columns is written
    to ``graph[n1][n2]['weight']`` for every edge.

    Parameters
    ----------
    graph : graph object exposing ``nodes``, ``edges`` and ``graph[n1][n2]``
        Modified in place.
    dump : object with ``load_page(name)`` returning a page that has
        ``strip_code()``.
    model : transformation applied as ``model[bow]`` (the TF-IDF model in
        this notebook).
    dct : dictionary providing ``doc2bow(tokens)``.

    Returns
    -------
    None

    Notes
    -----
    A falsy node name is not looked up in the dump; it contributes an empty
    bag-of-words instead.
    """
    nodes = list(graph.nodes)
    # Map each node to its column once; calling nodes.index() per edge
    # endpoint is a linear scan and made the edge loop O(edges * nodes).
    column_of = {node: i for i, node in enumerate(nodes)}
    bows = [model[dct.doc2bow(gu.simple_preprocess(dump.load_page(page).strip_code()))]
            if page else []
            for page in nodes]
    # One column per node, in the same order as `nodes`.
    vecs = gmat.corpus2csc(bows)
    for n1, n2 in graph.edges:
        # cosine_similarity compares rows, so turn each column into a single-row matrix.
        v1 = vecs[:, column_of[n1]].transpose()
        v2 = vecs[:, column_of[n2]].transpose()
        graph[n1][n2]['weight'] = smp.cosine_similarity(X=v1, Y=v2)[0, 0]

In [14]:
# Edge attributes around 'Biosphere' before set_weights is applied.
network.graph['Biosphere']

AtlasView({'Earth science': {'weight': 1}, 'Geography': {'weight': 1}, 'Environmental geology': {'weight': 1}, 'Gaia hypothesis': {'weight': 1}, 'Pedosphere': {'weight': 1}, 'Physical geography': {'weight': 1}})

In [15]:
# Overwrite every edge weight in network.graph with the TF-IDF cosine similarity of its two pages.
set_weights(network.graph, dump, tfidf, dct)

In [16]:
# Same node after set_weights: the weights are now cosine similarities.
network.graph['Biosphere']

AtlasView({'Earth science': {'weight': 0.4172637706806667}, 'Geography': {'weight': 0.2948794573348659}, 'Environmental geology': {'weight': 0.19913703642234504}, 'Gaia hypothesis': {'weight': 0.28702049086615644}, 'Pedosphere': {'weight': 0.28738002838917165}, 'Physical geography': {'weight': 0.2802678005676159}})

In [17]:
# Re-weighted edges around a second node, 'Chemistry'.
network.graph['Chemistry']

AtlasView({'Earth science': {'weight': 0.2502682157266995}, 'Science': {'weight': 0.4431454820261675}, 'Physics': {'weight': 0.3909709005752355}, 'Atmospheric chemistry': {'weight': 0.488462080888366}, 'Atmospheric physics': {'weight': 0.24918699013654128}, 'Mineralogy': {'weight': 0.3639238071195595}, 'Soil science': {'weight': 0.14720482139517832}, 'Environmental science': {'weight': 0.2777806346954509}, 'Geochemistry': {'weight': 0.45309574688801724}, 'Limnology': {'weight': 0.23635818232782133}, 'Oceanography': {'weight': 0.29344839147227514}, 'Paleoceanography': {'weight': 0.23278909380701496}, 'Pedosphere': {'weight': 0.3651094268777166}, 'Petrology': {'weight': 0.17507917347288227}})