First, we import neccessary packages and point to the datasets on the computer.

In [12]:
import bz2
import os
import numpy as np
from xml.etree import ElementTree as ET
from html import unescape
from nltk.tokenize import sent_tokenize, word_tokenize
import time
from playsound import playsound
from pathlib import Path, PurePath

dataroot = Path.home() / "Documents" / "Data"
glovepath = dataroot / "glove.6B" / "glove.6B.50d.txt"
wikipath = dataroot / "wikipedia" / "enwiki-20191220.xml.bz2"

glovefile = glovepath.resolve()
wikifile = wikipath.resolve()

Next, we instantiate the information we're extracting from wikipedia.

In [2]:
redirect_table = {} # table of all redirects. used for merging the tables below.
title_freq = {} # table of the frequency of links to each article.
anchor_title_freq = {} # table of Freq(anchor | title). So A[i][j] = COUNT(j | i)
title_contexts = {} # the KMEANS++
category_table = {} # Map Title Category

Then, we iterate over wikipedia to fill out those tables.

In [18]:
embed = {}
count = 0
with open(glovefile, 'r') as fp:
    for line in fp:
        parts = line.split()
        embed[parts[0]] = np.array(list(map(float, parts[1:])))
        
def embed_word(w):
    w = w.lower()
    if w in embed:
        return embed[w]
    else:
        return embed['unk']

In [28]:
def embed_wordlist(words, summary_fn):
    return summary_fn([embed_word(w) for w in words])

def embed_sentence(wordlist):
    summary_fn = lambda x : np.mean(x, axis=0)
    return embed_wordlist(wordlist, summary_fn)

def cosine_sim(u,v):
    return np.dot(u,v) / (np.linalg.norm(u) * np.linalg.norm(v))

[ 3.34456667e-01  5.11975000e-03  3.81589167e-02 -2.62422250e-01
  5.15089167e-01 -1.61305833e-02 -8.18162500e-01 -1.02467500e-01
 -3.13640872e-01 -7.94299525e-02 -5.48941667e-02  5.28040833e-02
 -2.48377667e-01 -2.57688750e-01  5.27354250e-01  1.62426083e-01
 -2.02633917e-01  6.17953333e-02 -3.48771167e-01 -2.53367583e-01
  3.87532075e-01  2.63060333e-01  1.18577142e-01  1.22785667e-01
  8.02510000e-02 -1.51455667e+00 -4.52058333e-01  4.73327500e-02
  3.06048833e-01 -4.71920167e-01  2.92805042e+00  1.28444167e-01
 -1.47871000e-01  2.46600833e-02  2.36680355e-01  8.75444000e-02
 -7.18991667e-02  1.68826917e-01  2.70514833e-01  1.55518333e-02
 -2.72602500e-03  1.85462167e-01  3.49908333e-03  9.22332500e-02
 -3.85677500e-02  5.12871000e-02 -1.91620600e-01 -2.69056417e-01
 -1.96925833e-02 -2.62114917e-01]


In [70]:
lines = 0
pages = 0

start = time.time()

keeping = False
articles = []
buffer = ""
for line in bz2.BZ2File(wikifile, 'r'):
    line = line.decode('utf-8').strip()
    lines += 1
    if line == '<page>':
        keeping = True
    if keeping:
        buffer += line
    if keeping and line == '</page>':
        keeping = False
        pages += 1
        process_article(buffer)
        buffer = ""
        if pages % 10000 == 0:
            print("Found {} pages at {} pages per second.".format(pages, pages / (time.time() - start)))
    if lines % 1000000 == 0:
        print("Processed {} lines at {} lines per second".format(lines, lines / (time.time() - start)))

Processed 1000000 lines at 91684.92836568487 lines per second
Processed 2000000 lines at 94758.33805876356 lines per second
Found 10000 pages at 409.78580024335446 pages per second.
Processed 3000000 lines at 96545.65990616998 lines per second
Processed 4000000 lines at 96303.22249440274 lines per second
Found 20000 pages at 408.94790979413125 pages per second.
Processed 5000000 lines at 96244.45822170719 lines per second
Processed 6000000 lines at 97702.52645452421 lines per second
Found 30000 pages at 447.9189069665639 pages per second.
Processed 7000000 lines at 98111.22971995671 lines per second
Processed 8000000 lines at 98445.40297749237 lines per second
Found 40000 pages at 474.44301666877817 pages per second.
Processed 9000000 lines at 99196.20867436587 lines per second
Found 50000 pages at 503.81198150210486 pages per second.
Processed 10000000 lines at 100245.64689196595 lines per second
Processed 11000000 lines at 101471.6432418664 lines per second
Found 60000 pages at 539.4

KeyboardInterrupt: 

In [272]:
"""
We care about the following states:
0: scanning, the default state.
1: article-page

If we are in an article-page state, we concatenate all lines...
until we hit the </page> tag.

Once the lines have been concatenated, we process the article-page.

The article-page is either a DISAMBIGUATION, REDIRECT, or ARTICLE
DISAMBIGUATION pages are largely ignored.
REDIRECT pages update a redirect table
ARTICLE pages have their individual sentences processed.

Each sentence updates three things:
title_freqs, representing the frequency of each article title appearing in an internal link.
anchor_title_freqs, representing frequency of each anchor linking to a title.
title_contexts, the set of all embedding contexts in which an article is linked.
"""

def get_title(title, block):
    if '<title>' in block:
        return block.strip()[7:-8]
    else:
        return title

def get_state(state, line):
    """
    This method determines the current state by examining a line.
    If the line says namespace = 0, then we've entered a page of the article type.
    If the line says /page, then the current page is over -- so if we're in the article type, we now exit.
    """
    new_state = state
    if state == 0 and '<ns>0</ns>' in line:
        new_state = 1
    if state == 1 and '</page>' in line:
        new_state = 0
    return new_state, state

def string_at(txt, idx, substr):
    start = idx
    end = idx + len(substr)
    if end >= len(txt):
        return False
    elif txt[start:end] == substr:
        return True
    else:
        return False  
    
def remove_tags(text, tag_pairs):
    res = []
    looking_for = []
    for i, c in enumerate(text):
        for (a,b) in tag_pairs:
            if string_at(text, i, a):
                looking_for.append(b)
        if looking_for == []:
            res.append(c)
        elif string_at(text, i-len(looking_for[-1])+1, looking_for[-1]):
            looking_for = looking_for[:-1]
    return ''.join(res)

def clean_wiki(body_text):
    body_text = unescape(body_text)
    tags = [('{{','}}'),('<!--', '-->'),('<ref', '>'),('</ref','>'),('[[File:', ']]')]
    body_text = remove_tags(body_text, tags)
    body_text = body_text.strip()

    return body_text

def find_and_remove_categories(body_text):
    skip_ahead = 0
    reading = False
    ignoring = False
    categories = []
    buffer = ""
    body = ""
    for i in range(len(body_text)):
        if skip_ahead > 0:
            skip_ahead -= 1
            continue
            
        c = body_text[i]
        if c == '[' and body_text[i+1] == '[' and string_at(body_text, i+2, 'Category:'):
            skip_ahead = len('[Category:')
            reading = True
        elif reading == True:
            if c == '|':
                ignoring = True
            if c == ']' and body_text[i+1] == ']':
                categories.append(buffer.strip())
                buffer = ""
                reading = False
                ignoring = False
                skip_ahead += 1
            elif not ignoring:
                buffer += c
        else:
            body += c
    return categories, body.strip()
    
def process_article(buffer, title):
    full_xml = '\n'.join(buffer)
    root = ET.fromstring("<root>" + full_xml + "</root>")
    bodies = root.iter('text')
#     xml = ET.tostring(root)
    for body in bodies:
        body_text = body.text
        body_text = clean_wiki(body_text)
        categories, body_text = find_and_remove_categories(body_text)
        category_table[title] = categories
        sentences = sent_tokenize(body_text)
        
        for sent in sentences:
            state = 0 #1=1open, 2=inlink, 3=insurface, 4=1close,
            surface = ""
            surface_sentence = ""
            link = ""
            links = []
            for i,c in enumerate(sent):                    
                if state == 0 and c == '[':
                    state = 1
                elif state == 1 and c == '[':
                    state = 2
                elif state == 2 and c != '|' and c != ']':
                    link += c
                elif state == 2 and c == '|':
                    state = 3
                elif state == 2 and c == ']':
                    state = 4
                    surface = link
                elif state == 3 and c != ']':
                    surface += c
                elif state == 3 and c == ']':
                    state = 4
                elif state == 4 and c == ']':
                    state = 0

                    if link not in title_freq:
                        title_freq[link] = 0
                    title_freq[link] += 1

                    if link not in anchor_title_freq:
                        anchor_title_freq[link] = {}
                    if surface not in anchor_title_freq[link]:
                        anchor_title_freq[link][surface] = 0
                    anchor_title_freq[link][surface] += 1

                    surface_sentence += surface
                    links.append(link)
                    surface = ""
                    link = ""
                else:
                    state = state
                    surface_sentence += c
                    
            emb = embed_wordlist(surface_sentence.strip().lower().strip())
            for link in links:
                if link not in title_contexts:
                    title_contexts[link] = []
                title_contexts[link].append(emb)
            
            
def process_redirect(buffer, title):
    source = title
    for x in buffer:
        if '<redirect' in x:
            target = x.strip()[17:-4]
            if source not in redirect_table:
                redirect_table[source] = target
            return True
    return False

def process_disambiguation(buffer, title):
    for x in buffer:
        if '(disambiguation)' in x:
            return True
    return False
        
def process_contents(buffer, title, verbose=False):
    """
    This method takes a list of article lines and processes them.
    """    
    article_count = 0
    if process_disambiguation(buffer, title):
        if verbose:
            print("Disambiguation: {}".format(title))
    
    elif process_redirect(buffer, title):
        if verbose:
            print("Redirect: {} -> {}".format(title, redirect_table[title]))
        
    else:
        process_article(buffer, title)
        article_count += 1
        if verbose:
            print("Article: {}".format(title))
    
    return article_count

In [273]:
def process_wikifile(wp, max_count):
    start = time.time()
    with bz2.BZ2File(wp, 'rb') as wiki_file:
        state = 0
        buffer = []
        counter = 0
        title = None
        article_count = 0
        while True:
            # get the line.
            block = wiki_file.readline().decode('utf-8')
            if block == None or counter >= max_count:
                break
            counter += 1

            # get the reader state, store contents, and process each block.
            title = get_title(title, block)
            state, old_state = get_state(state, block)
            if state == 1:
                # this state indicates we're in an article, and should keep track of content.
                buffer.append(block.strip())
            if old_state == 1 and state == 0:
                # this case indicates a transition away from the article.
                ac = process_contents(buffer, title, verbose=False)
                article_count += ac
                buffer = []
    end = time.time()
    return end - start, counter, article_count

In [293]:
redirect_table = {} # table of all redirects. used for merging the tables below.
title_freq = {} # table of the frequency of links to each article.
anchor_title_freq = {} # table of Freq(anchor | title). So A[i][j] = COUNT(j | i)
title_contexts = {} # the KMEANS++ if we need to compress. MAX neighbor otherwise.
category_table = {}

t, l, ac = process_wikifile(wiki_path, 100000)
print(t, l, ac)

38.955867767333984 100000 194


In [296]:
print(len(redirect_table))
print(len(anchor_title_freq))
print(len(title_freq))
print(len(category_table))
print(len(title_contexts))

for title in redirect_table:
    if title in title_freq:
        title_freq[redirect_table[title]] += title_freq[title]
        title_freq[title] = 0
    if title in anchor_title_freq:
        for anchor in anchor_title_freq[title]:
            anchor_title_freq[redirect_table[title]][anchor] += anchor_title_freq[title][anchor]
            anchor_title_freq[title][anchor] = 0
            
title_anchor_freq = {}
for title in anchor_title_freq:
    for anchor in anchor_title_freq[title]:
        if anchor not in title_anchor_freq:
            title_anchor_freq[anchor] = {}
        title_anchor_freq[anchor][title] = anchor_title_freq[title][anchor]

151
33610
33610
194
33610


In [298]:
def get_candidate_titles(link):
    xs = title_anchor_freq[link].keys()
    freqs = np.array([title_anchor_freq[link][x] for x in xs])
    probs = freqs / sum(freqs)
    return list(xs), probs

In [None]:
sentence = ""
word = ""
candidates, probs = get_candidate_titles(word)
prob_table = {}

def get_link_given_candidate(candidate, link):
    link_freqs = [anchor_title_freq[candidate][x] for x in anchor_title_freq[candidate]]
    return anchor_title_freq[candidate][link] / sum(link_freqs)

def get_total_candidate_freq():
    acc = 0
    for x in title_freq:
        acc += title_freq[x]
    return acc

def get_context_given_candidate(candidate, sentence):
    wordlist = sentence.strip().lower().split()
    emb = embed_wordlist(wordlist)
    for context in title_contexts
        
for i, candidate in enumerate(candidates):
    p_context_given_candidate =
    p_link_given_candidate = get_link_given_candidate(candidate, word)
    p_candidate = title_freq[candidate] / get_total_candidate_freq()
#     p_link_given_context = ignored for now.
    
    
    prob_table[candidate] = probs[i]
    