In [29]:
from __future__ import print_function, division, unicode_literals
import six

from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh import scoring
from whoosh.qparser import QueryParser
from whoosh.support.charset import accent_map
from whoosh.analysis.analyzers import *
import whoosh

import os
import sys
from os.path import join
import json
from codecs import open
from collections import defaultdict
from operator import itemgetter
import nltk
import numpy as np

from nltk.corpus import stopwords

In [7]:
# --- Paths -----------------------------------------------------------------
# Everything lives under <home>/data/allen-ai-challenge. expanduser('~') is
# used instead of os.environ['HOME'] so the lookup does not raise KeyError
# when HOME is unset; on POSIX with HOME set the result is identical.
DATA_DIR = join(os.path.expanduser('~'), 'data/allen-ai-challenge')
WIKI_FILES = join(DATA_DIR, 'wiki_dump')    # directory of Wikipedia dump files
CK12_DIR = join(DATA_DIR, 'ck12_dump')      # directory of CK-12 JSON articles
TRAINING_SET = join(DATA_DIR, 'training_set.tsv')
TRAINING_SET_MERGED = join(DATA_DIR, 'training_set_merged.tsv')

# Directory of the Whoosh index.
# `INDEX_DIR` is the upper-case (constant-style) spelling; `index_dir` is kept
# as an alias so that cells using either name resolve to the same directory.
# Alternative index location kept for reference:
# INDEX_DIR = join(DATA_DIR, 'index-wiki-stop-stem')
INDEX_DIR = join(DATA_DIR, 'indexw-ck12-stem')
index_dir = INDEX_DIR

In [8]:
# English stop words from the NLTK corpus; passed to the analyzer below.
stoplist = stopwords.words('english')

# Text-analysis chain used for the indexed `content` field:
# whoosh's StemmingAnalyzer (configured with the stop list above) piped into
# a CharsetFilter driven by whoosh's `accent_map` translation table.
my_analyzer = (
    StemmingAnalyzer(stoplist=stoplist)
    | whoosh.analysis.filters.CharsetFilter(accent_map)
)
# Alternative analyzer kept for reference:
# my_analyzer = whoosh.analysis.NgramWordAnalyzer(1,2)

In [9]:
# Alternative index locations / schema kept for reference:
# index_dir = join(DATA_DIR, 'index-ck12-stem')
# index_dir = join(DATA_DIR, 'index-ck12-paragraph-stem')
# schema = Schema(title=TEXT(stored=True), content=TEXT(analyzer=my_analyzer), summary=TEXT(analyzer=my_analyzer), 
#                 file=ID(stored=True))

# Two stored text fields per document: `content` is run through `my_analyzer`,
# `title` uses the field type's default analyzer.
schema = Schema(
    content=TEXT(analyzer=my_analyzer, stored=True),
    title=TEXT(stored=True),
)

# makedirs (rather than mkdir) also creates missing parent directories, so the
# cell works on a machine where DATA_DIR itself does not exist yet.
if not os.path.isdir(index_dir):
    os.makedirs(index_dir)

# NOTE: per the whoosh documentation, create_in() starts a NEW index in the
# directory, discarding any index already stored there - re-running this cell
# requires re-running the indexing cell as well.
ix = create_in(index_dir, schema)

In [10]:
%%time
# writer = ix.writer()
# for i, w in enumerate(os.listdir(WIKI_FILES)):
#     fn = join(WIKI_FILES, w)
#     with open(fn, encoding='utf8') as f:
#         title, summary, content = json.load(f)
#     writer.add_document(title=title, summary=summary, content=content, file=fn)
# print('Read %d files' % i)
# writer.commit()

# writer = ix.writer()
# for i, fn_short in enumerate(os.listdir(CK12_DIR)):
#     fn = join(CK12_DIR, fn_short)
#     with open(fn, encoding='utf-8', errors='ignore') as f:
#         ck12_article = json.load(f)
#         for subtitle, paragraph in ck12_article['contents'].items():
#             writer.add_document(content=paragraph, title=fn_short+' in '+subtitle)
# print('Read %d files' % i)
# writer.commit()

writer = ix.writer()
for i, fn_short in enumerate(os.listdir(CK12_DIR)):
    fn = join(CK12_DIR, fn_short)
    with open(fn, encoding='utf-8', errors='ignore') as f:
        ck12_article = json.load(f)
        content = '. '.join([paragraph for subtitle, paragraph in ck12_article['contents'].items()])
        writer.add_document(content=content, title=fn_short)
print('Read %d files' % i)
writer.commit()

Read 1629 files
CPU times: user 12 s, sys: 128 ms, total: 12.2 s
Wall time: 12.2 s


In [38]:
query_text = 'when athletes begin to exercise , their heart rates and respiration rates increase . at at the tissue level does the human body coordinate these functions .'

# Re-open the index from `index_dir`, the same path the index was created in,
# so this cell searches exactly what the indexing cell wrote.
ix = open_dir(index_dir)


def search(searcher, query_txt):
    """Score indexed documents against every whitespace-separated token of a text.

    Each token of ``query_txt`` is parsed as its own query against the
    ``content`` field, and the score of every returned hit is added to the
    running total kept for that hit's stored ``title``.

    Parameters
    ----------
    searcher : whoosh searcher
        An open searcher on the index; its weighting model determines the
        per-hit scores.
    query_txt : str
        Free text; it is split on whitespace only, so punctuation tokens are
        handed to the query parser as-is.

    Returns
    -------
    collections.defaultdict
        Maps document title to the sum of its hit scores (float).

    Notes
    -----
    ``searcher.search`` is called without ``limit``, so whoosh's default cap
    on the number of hits per query applies (10 in the whoosh documentation -
    confirm for the installed version). The schema is read from the
    module-level ``ix``.
    """
    # The parser does not depend on the token, so build it once per call
    # instead of once per token.
    parser = QueryParser("content", ix.schema)
    relevances = defaultdict(float)
    for token in query_txt.split():
        for hit in searcher.search(parser.parse(token)):
            relevances[hit['title']] += hit.score
    return relevances


with ix.searcher(weighting=scoring.BM25F()) as searcher:
    relevances = search(searcher, query_text)
#     print(relevances)
    # Titles ranked by accumulated score, best first.
    print(sorted(relevances.items(), key=itemgetter(1), reverse=True))

[(u'biology_Organization-of-the-Human-Body.json', 23.538227657180567), (u'chemistry_Factors-Affecting-Reaction-Rate.json', 18.626529151427228), (u'earth-science_Future-Human-Population-Growth.json', 18.62597006171592), (u'chemistry_Chemical-Equilibrium.json', 18.51060808897595), (u'biology_Circulatory-System-Diseases.json', 15.976283798318159), (u'biology_Smooth-Skeletal-and-Cardiac-Muscles.json', 14.600964422058166), (u'biology_Respiration.json', 14.256633204439286), (u'chemistry_Rate-Law-and-Specific-Rate-Constant.json', 14.226703969021322), (u'biology_Demographic-Transition.json', 14.012193546125406), (u'chemistry_Determining-the-Rate-Law-from-Experimental-Data.json', 13.993931531490224), (u'chemistry_Order-of-Reaction.json', 13.887185251972129), (u'chemistry_Rate-Determining-Step.json', 13.836772118511988), (u'biology_Population-Growth.json', 13.584493684554628), (u'chemistry_Chemical-Reaction-Rate.json', 13.140079281169232), (u'biology_Organization-of-Living-Things.json', 12.23356

In [33]:
# Display the (title, score) pairs held in `relevances`, highest score first.
sorted(relevances.items(), key=lambda title_score: title_score[1], reverse=True)

[(u'Intramuscular fat', 19.468597356233502),
 (u'Muscle hypertrophy', 14.839836128856096),
 (u'Androgen', 13.698429760477254),
 (u'Rate-determining step', 11.104043772502674),
 (u'Reaction rate', 11.068265514728342),
 (u'Birth rate', 11.054941375545903),
 (u'Federal funds rate', 11.02680128637756),
 (u'Lindemann mechanism', 11.020739918204626),
 (u'Lapse rate', 11.019220864775175),
 (u'Power rating', 10.977132118894263),
 (u'Compound interest', 10.97506804741638),
 (u'Order of reaction', 10.973269970791167),
 (u'Rate equation', 10.969117005545705),
 (u'Epitestosterone', 10.870810094879438),
 (u'Effects of high altitude on humans', 10.71091996922197),
 (u'Altitude', 10.183911661162476),
 (u'Long-distance running', 10.133231122239982),
 (u'Soil respiration', 9.917618051405336),
 (u'Cellular waste product', 9.874770155258101),
 (u'Respiration (physiology)', 9.839063188337487),
 (u'Aerobic exercise', 9.772188787952539),
 (u'Physical exercise', 9.76941046513647),
 (u"Biot's respiration", 9.

In [18]:
# Show the per-answer scores of the most recently evaluated question.
# NOTE(review): in the visible part of this notebook `res` is only assigned
# inside the evaluation loop of a later cell, so on a fresh kernel this cell
# works only after that loop has run - consider moving it below that cell.
res

[129.53213387181756,
 136.29547413534897,
 129.53213387181756,
 129.53213387181756]

In [39]:
def total_score(relevances, top=5):
    """Return the sum of the ``top`` largest values of ``relevances``.

    Parameters
    ----------
    relevances : mapping
        Maps a key (e.g. a document title) to a numeric score.
    top : int, optional
        How many of the highest scores to add up (default 5). If fewer
        values exist, all of them are summed.

    Returns
    -------
    number
        The summed scores; ``0`` for an empty mapping.
    """
    ranked = sorted(relevances.values(), reverse=True)
    best = ranked[:top]
    return sum(best)

# Evaluate on the merged training set: for every question, score each of the
# four answer columns by the summed relevance of its 10 best-matching
# documents and predict the answer with the highest score.
# NOTE(review): the third column (`q`) is unpacked but never used - only the
# answer columns are searched; confirm this is intended for the merged file.
#
# The index is opened from `index_dir`, the path the index was created in.
ix = open_dir(index_dir)
corrects = []  # one bool per processed question: was the prediction right?
with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    with open(TRAINING_SET_MERGED, encoding='utf-8') as f:
        f.readline()  # skip header
#         for qid, question, correct, A, B, C, D in (line.strip().split('\t') for line in f):
        for qid, c, q, A, B, C, D in (line.strip().split('\t') for line in f):
            res = [total_score(search(searcher, V), 10) for V in [A, B, C, D]]
            # argmax picks the first maximum on ties; compute it once and
            # reuse it for both the bookkeeping and the progress line.
            predicted = 'ABCD'[np.argmax(res)]
            corrects.append(c == predicted)
            # Progress line: expected answer, predicted answer, running accuracy.
            print(c, predicted, np.mean(corrects))
            sys.stdout.flush()
#             query_text = QueryParser("content", ix.schema).parse(A)
#             results = searcher.search(query)
#             print(len(results))
#             for r in results:
#                 print(r.score, r['title'])
#             break

C D 0.0
D C 0.0
B C 0.0
B B 0.25
B D 0.2
A A 0.333333333333
A B 0.285714285714
D B 0.25
C C 0.333333333333
D D 0.4
D C 0.363636363636
A A 0.416666666667
C D 0.384615384615
B A 0.357142857143
B C 0.333333333333
C B 0.3125
A A 0.352941176471
B D 0.333333333333
B C 0.315789473684
C D 0.3
C C 0.333333333333
C B 0.318181818182
D D 0.347826086957
A C 0.333333333333
B D 0.32
D C 0.307692307692
D A 0.296296296296
B B 0.321428571429
B D 0.310344827586
B D 0.3
B C 0.290322580645
B C 0.28125
A A 0.30303030303
A C 0.294117647059
D A 0.285714285714
C B 0.277777777778
B C 0.27027027027
B D 0.263157894737
D D 0.282051282051
D B 0.275
A A 0.292682926829
A C 0.285714285714
A C 0.279069767442
C C 0.295454545455
C C 0.311111111111
A B 0.304347826087
A D 0.297872340426
A A 0.3125
B B 0.326530612245
C A 0.32
B C 0.313725490196
A D 0.307692307692
D D 0.320754716981
B B 0.333333333333
D A 0.327272727273
A B 0.321428571429
A A 0.333333333333
B D 0.327586206897
C C 0.338983050847
A A 0.35
D A 0.344262295082
D 

In [23]:
import telepot, os
import time
import getpass

# SECURITY: credentials must never be written into the notebook. They are read
# from the environment (TELEGRAM_BOT = bot token, TELEGRAM_ID = chat id of the
# receiver) and, only if missing, requested interactively without echo.
# The values are stored back into os.environ because the following cell reads
# them from there.
# NOTE: a bot token that was previously committed in this cell should be
# treated as compromised and revoked.
if not os.environ.get('TELEGRAM_BOT'):
    os.environ['TELEGRAM_BOT'] = getpass.getpass('Telegram bot token: ')
if not os.environ.get('TELEGRAM_ID'):
    os.environ['TELEGRAM_ID'] = getpass.getpass('Telegram receiver id: ')

In [37]:
class TelegramStream(object):
    """Minimal write-only, file-like object that forwards text to a Telegram chat.

    Intended to be passed as ``file=`` to ``print`` (or used in place of
    ``sys.stdout``): every ``write`` call is echoed locally and, unless the
    text is blank, sent as one Telegram message.

    Parameters
    ----------
    token : str
        Telegram bot token handed to ``telepot.Bot``.
    reciever_id : str or int
        Chat id the messages are sent to. (The parameter name keeps its
        original spelling so existing callers are unaffected.)
    """

    def __init__(self, token, reciever_id):
        self.bot = telepot.Bot(token)
        self.id = reciever_id

    def write(self, txt):
        """Echo ``txt`` to stdout and send it to the chat if it is not blank."""
        print('>>>', txt)
        # print() issues a separate write for the trailing newline; skipping
        # whitespace-only chunks avoids sending an empty message for it.
        if txt.strip():
            self.bot.sendMessage(self.id, txt)

    def flush(self):
        """No-op: messages are sent immediately by ``write``.

        Present so the object satisfies callers that flush their output
        stream (e.g. ``sys.stdout.flush()`` after redirecting stdout, or
        ``print(..., flush=True)``).
        """
        return None
    
# Stream bound to the bot token and receiver id taken from the environment.
tele = TelegramStream(
    token=os.environ['TELEGRAM_BOT'],
    reciever_id=os.environ['TELEGRAM_ID'],
)

In [39]:
# Smoke test for the Telegram stream: print() calls tele.write() twice - once
# with the text (embedded newline included) and once with the trailing
# newline - which is why two '>>>' echo lines appear in the output; only the
# first, non-blank chunk is sent as a message.
print('testing newlines\nhere', file=tele)

>>> testing newlines
here
>>> 

