In [87]:
import os
import bz2
import json

from revscoring.features import wikitext
from revscoring.datasources.meta import mappers, vectorizers
from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve
from revscoring.features.meta import aggregators

import mwxml

In [20]:
def get_files(indir):
    """Return the path of every entry in ``indir``, each joined onto ``indir``.

    Entries are returned in whatever order ``os.listdir`` yields them; an
    empty directory gives an empty list.
    """
    paths = []
    for entry in os.listdir(indir):
        paths.append(os.path.join(indir, entry))
    return paths

In [84]:
def process_dump(filepath, save_text=False, save_tokens=False):
    """Build one record per revision from a bz2-compressed MediaWiki XML dump.

    Args:
        filepath: Path of the bz2-compressed dump to read.
        save_text: When true, store the raw revision text under ``'text'``.
        save_tokens: When true, store the tokenizer output under
            ``'tokenized_text'``.

    Returns:
        A list of dicts, one per revision, each holding ``'title'``,
        ``'page_id'``, ``'rev_id'``, ``'redirect'`` and ``'feature_vector'``,
        plus the optional keys selected by the two flags.
    """
    observation_list = []
    with bz2.open(filepath) as filestream:
        dump = mwxml.Dump.from_file(filestream)
        for page in dump:
            for rev in page:
                observation = {
                    'title': page.title,
                    'page_id': page.id,
                    'rev_id': rev.id,
                    'redirect': page.redirect
                }

                if save_text:
                    observation['text'] = rev.text

                # NOTE(review): rev.text is handed to the tokenizer unchanged;
                # confirm the tokenizer copes with a revision that has no text.
                tokenized_text = tokenize_and_clean_text(rev.text, TOKENIZER)

                if save_tokens:
                    observation['tokenized_text'] = tokenized_text

                # Vectorize from the local variable, not from the observation
                # dict: the 'tokenized_text' key only exists when save_tokens
                # is true, so reading it back raised KeyError with the default
                # flags.
                observation['feature_vector'] = vectorize_text(tokenized_text, w2v)
                observation_list.append(observation)

    return observation_list

In [77]:
def tokenize_and_clean_text(revision_text,tokenizer):
    """Solve ``tokenizer`` with ``revision_text`` supplied as the revision text.

    The text is placed in the solver cache under
    ``revision_oriented.revision.text`` and no context is passed.
    """
    seeded_cache = {revision_oriented.revision.text: revision_text}
    return solve(tokenizer, context=None, cache=seeded_cache)

In [78]:
def vectorize_text(tokenized_text,vectorizer):
    """Solve ``vectorizer`` with ``tokenized_text`` supplied as the tokenizer output.

    The tokens are placed in the solver cache under the module-level
    ``TOKENIZER`` and no context is passed.
    """
    seeded_cache = {TOKENIZER: tokenized_text}
    return solve(vectorizer, context=None, cache=seeded_cache)

In [89]:
# Filesystem locations: the directory main() lists for dumps, the directory
# it writes JSON output to, and the keyed-vector file loaded below.
INDIR = '/home/jmads/data/enwiki/'
OUTDIR = '/home/jmads/wiki-ltt-cluster/datasets/vectorized/'
ENWIKI_KVS = '/home/jmads/wiki-ltt-cluster/word2vec/enwiki-20200501-learned_vectors.50_cell.10k.kv'

# Datasource applying the lower_case mapper to the revision's wikitext words.
TOKENIZER = mappers.lower_case(wikitext.revision.datasources.words)

# Keyed vectors loaded from ENWIKI_KVS, with mmap="r" passed to the loader.
enwiki_kvs = vectorizers.word2vec.load_gensim_kv(path=ENWIKI_KVS, mmap="r")

def vectorize_words(words):
    """Vectorize ``words`` against the module-level ``enwiki_kvs`` keyed vectors."""
    word_vectors = vectorizers.word2vec.vectorize_words(enwiki_kvs, words)
    return word_vectors

# Per-word vectors for a revision. The input datasource is the shared
# TOKENIZER object rather than a second, separately constructed lower_case
# datasource: vectorize_text() seeds the solver cache under TOKENIZER, so
# building on the same object makes that dependency explicit.
revision_text_vectors = vectorizers.word2vec(
    TOKENIZER,
    vectorize_words,
    name="revision.text.en_vectors")

# Mean of the per-word vectors (vector=True); this is the feature that
# process_dump() passes to vectorize_text().
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.en_vectors_mean"
)

def main():
    """Vectorize every dump in ``INDIR`` and write one JSON file per dump to ``OUTDIR``.

    Each dump is processed with both the raw text and the tokens retained.
    The output file is named after the input with only its final extension
    replaced by ``.json``. ``OUTDIR`` is created if it does not exist, and
    inputs are handled in sorted order so runs are repeatable.
    """
    os.makedirs(OUTDIR, exist_ok=True)

    for infile in sorted(os.listdir(INDIR)):
        infile_path = os.path.join(INDIR, infile)
        page_list = process_dump(infile_path, save_text=True, save_tokens=True)

        # Strip only the final extension. Cutting at the first '.' reduced
        # 'enwiki-latest-pages-articles1.xml-p1p41242.bz2' to
        # 'enwiki-latest-pages-articles1', so two dumps differing only in the
        # page-range part were written to the same output file.
        outfile_name = '{0}.json'.format(os.path.splitext(infile)[0])
        outfile_path = os.path.join(OUTDIR, outfile_name)
        with open(outfile_path, 'w') as outfile:
            json.dump(page_list, outfile)

main()

In [81]:
# Scratch check: list the entries currently present in the input directory.
os.listdir(INDIR)

['enwiki-latest-pages-articles1.xml-p1p41242.bz2']

In [83]:
# Scratch check of how a dump filename splits on '.': the first piece drops
# the '.xml-p1p41242' page-range part as well as the '.bz2' extension.
'enwiki-latest-pages-articles1.xml-p1p41242.bz2'.split('.')

['enwiki-latest-pages-articles1', 'xml-p1p41242', 'bz2']