In [2]:
from __future__ import print_function, division, unicode_literals
import six
import os, sys
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
import codecs
import random
from time import time
from nltk.stem import SnowballStemmer
# Shared English Snowball stemmer, used by the tokenizing/cleaning helpers below.
stemmer = SnowballStemmer("english")

In [3]:
# Root directory that holds every input and output file of this notebook.
# expanduser("~") is used instead of os.environ["HOME"] so the lookup does not
# raise KeyError on platforms where HOME is not set (e.g. Windows); when HOME
# is set the resulting path is the same.
ROOT_DATA = join(os.path.expanduser("~"), "data/allen-ai-challenge")

Datasets
----

In [18]:
WIKI = join(ROOT_DATA, "parsed_wiki_data") # directory of parsed Wikipedia text files (top-5 search hits), read line by line below
CK12 = join(ROOT_DATA, "ck12_dump") # directory of parsed CK12 articles, loaded as JSON below
# Raw and cleaned versions of the tab-separated training / validation sets.
TRAINING = join(ROOT_DATA, "training_set.tsv")
TRAINING_CLEANED = join(ROOT_DATA, "training_set_cleaned.tsv")
VALIDATION = join(ROOT_DATA, "validation_set.tsv")
VALIDATION_CLEANED = join(ROOT_DATA, "validation_set_cleaned.tsv")

In [5]:
# Output file for the merged, cleaned CK12 + Wikipedia corpus (one paragraph per line).
MERGED = join(ROOT_DATA, "merged_corpus.txt")

In [6]:
def tokenize(text):
    """Lower-case `text`, split it into NLTK word tokens, drop stopwords and stem the rest.

    Returns the list of stems in their original order.

    NOTE(review): `stopwords` and `stemmer` are notebook-level globals that are
    looked up at call time. The top-of-notebook import binds `stopwords` to the
    NLTK corpus reader until the "Cleaning datasets" cell rebinds it to a set,
    so confirm this function is only called after that cell has run.
    """
    stems = []
    for word in nltk.word_tokenize(text.lower()):
        if word in stopwords:
            continue
        stems.append(stemmer.stem(word))
    return stems

In [98]:
%%time
# Collect every paragraph of every CK12 article into one flat list.
ck12_paragraphs = []
# os.listdir returns names in arbitrary order; sorting makes the paragraph
# order (and therefore the merged corpus) reproducible between runs.
for fn_short in sorted(os.listdir(CK12)):
    fn = join(CK12, fn_short)
    # errors='ignore' is a deliberate best-effort: undecodable bytes are dropped.
    with open(fn, encoding='utf-8', errors='ignore') as f:
        ck12_article = json.load(f)
    # Each article is a JSON object whose 'contents' maps a subtitle to its
    # paragraph text; only the paragraph text is kept.
    ck12_paragraphs.extend(
        paragraph.strip() for paragraph in ck12_article['contents'].values()
    )

CPU times: user 124 ms, sys: 16 ms, total: 140 ms
Wall time: 140 ms


In [99]:
# Sanity check: number of CK12 paragraphs collected.
len(ck12_paragraphs)

7148

In [100]:
%%time
# Collect every non-empty, non-heading line of every parsed Wikipedia file.
wiki_paragraphs = []
# os.listdir returns names in arbitrary order; sorting makes the paragraph
# order (and therefore the merged corpus) reproducible between runs.
for fn_short in sorted(os.listdir(WIKI)):
    fn = join(WIKI, fn_short)
    # errors='ignore' is a deliberate best-effort: undecodable bytes are dropped.
    with open(fn, encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and lines starting with '=='
            # (presumably wiki section headings such as "== History ==").
            if not line or line.startswith('=='):
                continue
            wiki_paragraphs.append(line)

CPU times: user 5.6 s, sys: 267 ms, total: 5.87 s
Wall time: 5.88 s


In [101]:
# Sanity check: number of Wikipedia paragraphs collected.
len(wiki_paragraphs)

561764

In [102]:
from itertools import chain

Cleaning datasets
-----

In [8]:
# Import the corpus reader under an alias: the name `stopwords` is reserved for
# the final *set* that the cleaning helpers test membership against, so this
# statement no longer rebinds the very name it reads from.
from nltk.corpus import stopwords as nltk_stopwords

# Punctuation tokens plus boilerplate words that should never reach the corpus.
extra_stopwords = '. , ! ? !? ?! ... ; : - — summary youtube www'.split()
# Words NLTK lists as stopwords but which are kept here (negations, and
# 'above' - presumably so that answers like "all of the above" survive).
kept_words = "no not above".split()

# Set difference instead of .remove(): it does not raise KeyError should a
# future NLTK list lack one of the kept words.
stopwords = (set(nltk_stopwords.words('english')) | set(extra_stopwords)) - set(kept_words)

In [9]:
def text_clean(text):
    """Normalise raw text into a list of stemmed, stopword-free tokens.

    Steps, in order:
      1. delete every character that is neither a word character nor whitespace;
      2. lower-case the result and split it with ``nltk.word_tokenize``;
      3. drop tokens found in the notebook-level ``stopwords`` set;
      4. stem the remaining tokens with the notebook-level ``stemmer``.

    Punctuation is deleted, not replaced by a space, so "well-known" becomes
    the single token "wellknown".

    :param text: unicode string to clean.
    :return: list of stems, in their original order.
    """
    # re.UNICODE: on Python 2 a bare \w matches only [a-zA-Z0-9_], which would
    # silently delete accented / non-Latin letters from inside words. Python 3
    # str patterns are already Unicode-aware, so the flag is a no-op there.
    # (\d was dropped from the class: \w already covers digits.)
    s = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    return [stemmer.stem(w) for w in nltk.word_tokenize(s.lower()) if w not in stopwords]

In [10]:
# Smoke test: punctuation, ordinary stopwords and the custom tokens
# 'summary' / 'www' / 'youtube' are dropped, while 'above' is kept (as the stem 'abov').
text_clean("hello Died \\ went I Summary all of the above 45 5 www youtube")

[u'hello', u'die', u'went', u'abov', u'45', u'5']

In [106]:
%%time
# Write the merged corpus: CK12 paragraphs first, then Wikipedia paragraphs,
# one cleaned paragraph per line with its stems separated by single spaces.
# A paragraph that cleans to nothing still produces an (empty) line.
with open(MERGED, encoding="utf-8", mode="w") as merged_out:
    for paragraph in chain(ck12_paragraphs, wiki_paragraphs):
        stems = text_clean(paragraph)
        merged_out.write(" ".join(stems) + "\n")

CPU times: user 7min 1s, sys: 2.46 s, total: 7min 4s
Wall time: 7min 4s


Train/validation
-----

In [16]:
def text_clean_join(t):
    """Clean `t` with ``text_clean`` and return the stems as one space-separated string."""
    stems = text_clean(t)
    return " ".join(stems)

In [17]:
# Rewrite the training TSV with cleaned text. The output has no header row;
# each row is: id, cleaned question, third column `r` verbatim (presumably the
# correct-answer label), then the four remaining columns cleaned (answer
# options a-d, going by the original variable names).
# The input is opened first so that a missing input file does not create or
# truncate the output file.
with open(TRAINING, encoding="utf-8") as f, \
        open(TRAINING_CLEANED, encoding="utf-8", mode="w") as fo:
    next(f, None)  # skip the header row; tolerate a completely empty file
    for line in f:
        if not line.strip():
            continue  # ignore blank lines, e.g. a stray newline at end of file
        # Remove only the line terminator before splitting: a plain strip()
        # also eats trailing tabs, so a row whose last field is empty would
        # lose a column and fail to unpack.
        fields = [field.strip() for field in line.rstrip("\r\n").split("\t")]
        # Still raises ValueError on rows that do not have exactly 7 columns.
        qid, question, r, ans_a, ans_b, ans_c, ans_d = fields
        print(qid, text_clean_join(question), r,
              text_clean_join(ans_a),
              text_clean_join(ans_b),
              text_clean_join(ans_c),
              text_clean_join(ans_d),
              sep="\t", file=fo)

In [19]:
# Rewrite the validation TSV with cleaned text. The output has no header row;
# each row is: id, cleaned question, then the four remaining columns cleaned
# (answer options a-d, going by the original variable names).
# The input is opened first so that a missing input file does not create or
# truncate the output file.
with open(VALIDATION, encoding="utf-8") as f, \
        open(VALIDATION_CLEANED, encoding="utf-8", mode="w") as fo:
    next(f, None)  # skip the header row; tolerate a completely empty file
    for line in f:
        if not line.strip():
            continue  # ignore blank lines, e.g. a stray newline at end of file
        # Remove only the line terminator before splitting: a plain strip()
        # also eats trailing tabs, so a row whose last field is empty would
        # lose a column and fail to unpack.
        fields = [field.strip() for field in line.rstrip("\r\n").split("\t")]
        # Still raises ValueError on rows that do not have exactly 6 columns.
        qid, question, ans_a, ans_b, ans_c, ans_d = fields
        print(qid, text_clean_join(question),
              text_clean_join(ans_a),
              text_clean_join(ans_b),
              text_clean_join(ans_c),
              text_clean_join(ans_d),
              sep="\t", file=fo)