In [1]:
from __future__ import print_function, division, unicode_literals
import os, sys
from os.path import join
import json
from codecs import open
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
import random
from itertools import chain
from time import time
from nltk.stem import SnowballStemmer
# Single English Snowball stemmer instance, shared by the tokenize/text_clean helpers below.
stemmer = SnowballStemmer("english")

In [2]:
# Root directory holding every dataset used in this notebook.
# expanduser("~") honours $HOME when it is set but also resolves the home
# directory when it is not (e.g. Windows, stripped-down environments), where
# a direct os.environ["HOME"] lookup would raise KeyError.
ROOT_DATA = join(os.path.expanduser("~"), "data", "allen-ai-challenge")

Datasets
----

In [14]:
# Raw source corpora. Each is a directory that a loading cell below walks with os.listdir.
WIKI = join(ROOT_DATA, "parsed_wiki_data") # top-5 Wikipedia search hits, read as plain text (one paragraph per line, '==' lines are headings)
CK12 = join(ROOT_DATA, "ck12_dump") # parsed CK-12 dump: JSON files with a 'contents' mapping of subtitle -> paragraph
# Quizlet dumps: every line of every file is a JSON object with a 'terms' list of term/definition pairs.
QUIZLET = join(ROOT_DATA, 'quizlet')
# StudyStack dumps: text files read one card per line.
TOPKEK = join(ROOT_DATA, 'studystack')

# Question sets: tab-separated with a header row. The *_CLEANED paths are
# outputs written by the "Train/validation" cells at the end of the notebook.
# Training rows unpack to 7 columns (id, question, r, answers A-D); validation
# rows to 6 (no r column). NOTE(review): r is presumably the correct-answer label - confirm.
TRAINING = join(ROOT_DATA, "training_set.tsv")
TRAINING_CLEANED = join(ROOT_DATA, "training_set_cleaned.tsv")
VALIDATION = join(ROOT_DATA, "validation_set.tsv")
VALIDATION_CLEANED = join(ROOT_DATA, "validation_set_cleaned.tsv")

# Output corpus: one cleaned paragraph per line, tokens separated by spaces.
CORPUS_PARAGRAPH = join(ROOT_DATA, "corpus_paragraph.txt")

In [4]:
def tokenize(text, stop_words=None):
    """Lower-case, tokenise and stem ``text``, dropping stop words.

    Unlike ``text_clean`` this does not strip punctuation first.

    Parameters
    ----------
    text : unicode
        Raw text to tokenise.
    stop_words : collection of unicode, optional
        Tokens to drop; they are matched against the lower-cased tokens
        before stemming.  Defaults to the notebook-level ``stopwords``.

    Returns
    -------
    list of unicode
        Stemmed tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
        # On a top-to-bottom run this cell executes before the
        # "Cleaning datasets" cell rebinds ``stopwords`` to a set, so the
        # name may still be the NLTK corpus reader, which cannot be used
        # with ``in``.  Fall back to its plain English word list then.
        if hasattr(stop_words, 'words'):
            stop_words = set(stop_words.words('english'))
    return [stemmer.stem(w)
            for w in nltk.word_tokenize(text.lower())
            if w not in stop_words]

In [5]:
%%time
# Gather every paragraph from the CK-12 dump.  Each file is a JSON document
# whose 'contents' mapping goes from a subtitle to the paragraph text.
ck12_paragraphs = []
# os.listdir returns entries in arbitrary order; sorting keeps the paragraph
# order (and therefore the written corpus) reproducible between runs.
for fn_short in sorted(os.listdir(CK12)):
    fn = join(CK12, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        ck12_article = json.load(f)
    # Only the paragraph texts are kept; the subtitles (keys) are not used.
    ck12_paragraphs.extend(
        paragraph.strip() for paragraph in ck12_article['contents'].values())

CPU times: user 119 ms, sys: 14.3 ms, total: 133 ms
Wall time: 133 ms


In [6]:
len(ck12_paragraphs)

7148

In [7]:
%%time
# Collect body paragraphs from the parsed Wikipedia pages: every non-empty
# line is one paragraph, except lines starting with '==' (section headings).
wiki_paragraphs = []
# Sorted so the paragraph order does not depend on os.listdir's arbitrary order.
for fn_short in sorted(os.listdir(WIKI)):
    fn = join(WIKI, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line and not line.startswith('=='):
                wiki_paragraphs.append(line)

CPU times: user 5.58 s, sys: 219 ms, total: 5.8 s
Wall time: 5.81 s


In [8]:
len(wiki_paragraphs)

561764

In [9]:
%%time
# Build a term -> definition glossary from the Quizlet dumps.  Every line of
# every file is a JSON object with a 'terms' list of {'term', 'definition'}.
# A term seen more than once keeps the definition read last, so the files are
# visited in sorted order to make that choice (and the log) reproducible;
# os.listdir alone returns entries in arbitrary order.
terms = {}
for fn_short in sorted(os.listdir(QUIZLET)):
    term_count = 0  # counts every pair read from this file, duplicates included
    fn = join(QUIZLET, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing the JSON parse
            # json.loads parses the string directly; no StringIO wrapper
            # (a Python-2-only import) is needed.
            record = json.loads(line)
            for t in record['terms']:
                terms[t['term']] = t['definition']
                term_count += 1
    print(fn_short, term_count, sep=': ')
print()
# One "term definition" paragraph per unique term.  dict.items() works on
# both Python 2 and 3, unlike iteritems().
quizlet_paragraphs = [t + ' ' + d for t, d in terms.items()]

geology.txt: 211689
biology.txt: 238449
chem.txt: 186910
ck-12.txt: 9811
ck 12.txt: 9811
space.txt: 103268
lifescience.txt: 7104
earthscience.txt: 8281
climate.txt: 107490
anatomy.txt: 275689
physicalscience.txt: 4753
physiology.txt: 288193
earth science.txt: 180614
botany.txt: 247006
astronomy.txt: 190219
science.txt: 164241
life science.txt: 166535
physical science.txt: 191991
chemistry.txt: 209191
physics.txt: 196988
genetics.txt: 181100
ck12.txt: 4632
DNA.txt: 126230

CPU times: user 27.4 s, sys: 730 ms, total: 28.2 s
Wall time: 28.1 s


In [10]:
len(quizlet_paragraphs)

1194285

In [15]:
%%time
# Load the StudyStack cards (one per line) as a set of unique stripped lines.
# The set is filled file by file, so the name is never bound to a different
# type and the full list of duplicate lines is never held in memory at once.
topkek_cards = set()
for path in sorted(os.listdir(TOPKEK)):  # sorted: stable log order across runs
    fn = join(TOPKEK, path)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        docs = f.readlines()
    topkek_cards.update(d.strip() for d in docs)
    # Reports the raw line count of the file, before de-duplication.
    print(path, len(docs), sep=': ')
print()

ch_docs.txt: 426147
gen_docs.txt: 17119
es_docs.txt: 459625
bio_docs.txt: 1736698
apgeo_docs.txt: 81173
ph_docs.txt: 106345
science_docs.txt: 1094720
geo_docs.txt: 332818
anth_docs.txt: 36011
ps_docs.txt: 413622

CPU times: user 5.54 s, sys: 1.59 s, total: 7.13 s
Wall time: 7.12 s


In [16]:
len(topkek_cards)

1927951

Cleaning datasets
-----

In [18]:
# Stop-word set consulted by the cleaning helpers: NLTK's English list plus
# punctuation tokens and a few extra words, minus three words that must
# survive cleaning.
# The corpus reader is imported under an alias so that binding ``stopwords``
# to the resulting set does not shadow the NLTK object this cell reads from.
from nltk.corpus import stopwords as nltk_stopwords
extra_stopwords = '. , ! ? !? ?! ... ; : - — summary youtube www'.split()
kept_words = "no not above".split()
# Set difference, unlike set.remove, does not raise KeyError should one of
# the kept words ever be absent from NLTK's list.
stopwords = (set(nltk_stopwords.words('english')) | set(extra_stopwords)) - set(kept_words)

In [19]:
def text_clean(text):
    """Strip punctuation from ``text``, then lower-case, tokenise and stem it.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space, so "well-known" becomes "wellknown").
    Tokens found in the notebook-level ``stopwords`` set are dropped before
    stemming.

    Parameters
    ----------
    text : unicode
        Raw text to clean.

    Returns
    -------
    list of unicode
        Stemmed tokens, in their original order.
    """
    # re.UNICODE makes \w and \s Unicode-aware.  Without it Python 2 treats
    # only ASCII letters/digits/underscore as word characters, so accented or
    # Greek letters would be deleted as if they were punctuation.
    # \d is already contained in \w, so it is not repeated in the class.
    s = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    return [stemmer.stem(w) for w in nltk.word_tokenize(s.lower()) if w not in stopwords]

In [20]:
# Sanity check of text_clean: the backslash and the stop words (including the
# custom 'summary', 'www' and 'youtube') are dropped, "above" is kept, numbers
# survive, and the remaining tokens are lower-cased and stemmed.
text_clean("hello Died \\ went I Summary all of the above 45 5 www youtube")

[u'hello', u'die', u'went', u'abov', u'45', u'5']

In [None]:
%%time
# Write the cleaned corpus: one paragraph per line, its stemmed tokens
# separated by single spaces.  All four sources are streamed in sequence.
c = 0  # stays 0 if there is nothing to write
all_paragraphs = chain(ck12_paragraphs, wiki_paragraphs, quizlet_paragraphs, topkek_cards)
with open(CORPUS_PARAGRAPH, encoding="utf-8", mode="w") as f:
    # enumerate from 1 so that c ends up as the number of lines written
    for c, d in enumerate(all_paragraphs, 1):
        print(*text_clean(d), sep=" ", file=f)
print('Wrote', c, 'paragraphs')

Train/validation
-----

In [None]:
def text_clean_join(t):
    """Return ``t`` cleaned by ``text_clean`` as one space-separated string."""
    tokens = text_clean(t)
    return " ".join(tokens)

In [None]:
# Rewrite the training set with the question and the four answers cleaned.
# The id and the r column are copied unchanged.
# NOTE(review): r is presumably the correct-answer label - confirm.
# The input is opened first so that a missing or unreadable input does not
# leave behind a freshly truncated output file.
with open(TRAINING, encoding="utf-8") as f, \
        open(TRAINING_CLEANED, encoding="utf-8", mode="w") as fo:
    next(f, None)  # skip the header row; it is not copied to the cleaned file
    for l in f:
        # Remove only the line terminator.  A bare strip() would also eat a
        # trailing tab, so a row whose last field is empty would lose a
        # column and fail to unpack.
        qid, q, r, aa, ab, ac, ad = l.rstrip("\r\n").split("\t")
        print(qid, text_clean_join(q), r,
              text_clean_join(aa),
              text_clean_join(ab),
              text_clean_join(ac),
              text_clean_join(ad),
              sep="\t", file=fo)

In [None]:
# Rewrite the validation set with the question and the four answers cleaned.
# The id is copied unchanged; each row has six columns.
# The input is opened first so that a missing or unreadable input does not
# leave behind a freshly truncated output file.
with open(VALIDATION, encoding="utf-8") as f, \
        open(VALIDATION_CLEANED, encoding="utf-8", mode="w") as fo:
    next(f, None)  # skip the header row; it is not copied to the cleaned file
    for l in f:
        # Remove only the line terminator.  A bare strip() would also eat a
        # trailing tab, so a row whose last field is empty would lose a
        # column and fail to unpack.
        qid, q, aa, ab, ac, ad = l.rstrip("\r\n").split("\t")
        print(qid, text_clean_join(q),
              text_clean_join(aa),
              text_clean_join(ab),
              text_clean_join(ac),
              text_clean_join(ad),
              sep="\t", file=fo)

In [None]:
# Scratch check on a sample row: str.split() with no argument splits on runs
# of whitespace, so consecutive separators collapse and any empty field
# disappears from the result.
# NOTE(review): this suggests the cleaned TSVs should be read with
# split("\t") rather than split() - confirm against the consumer.
'+'.join('104080  germin compos mollusk kingdom   scallop protista        dna molecul     '.split())