# Setup Notes<br>
PLEASE SET YOUR WORKING DIRECTORY BEFORE PROCEEDING<br>
-*- coding: utf-8 -*-

Load necessary modules

In [None]:
import re
from urllib.request import urlopen
from pathlib import Path
import os
import glob
from tqdm import tqdm
import whoosh.index
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
import nltk.corpus
from whoosh.qparser import QueryParser, OrGroup, WildcardPlugin
from deeppavlov import build_model, configs

Create directories on local machine to hold data<br>
Do this step the first time you run this script, then comment out

In [None]:
# Create the local directories that hold the chapter text files and the
# Whoosh index. exist_ok=True makes this cell safe to re-run: a directory
# that already exists is left in place instead of raising FileExistsError.
os.makedirs('data/chapters/', exist_ok=True)
os.makedirs('wizco_idx', exist_ok=True)

Part 1: Mine Text

Download Wizard of Oz text from Project Gutenberg<br>
if you run into issues, clear your default browser cache/history

In [None]:
# Fetch the raw book bytes from Project Gutenberg, then decode them as UTF-8
# once the response has been fully read.
url = "http://www.gutenberg.org/cache/epub/55/pg55.txt"
with urlopen(url) as response:
    raw_book = response.read()
wizco = raw_book.decode('utf-8')

Split into prologue + epilogue + chapters (drop prefacing text)

In [None]:
# The two adjacent literals r'\d' and '.' are concatenated into the pattern
# r'\d.', i.e. one digit followed by ANY single character (the '.' is an
# unescaped wildcard, not a literal period).
# NOTE(review): the intent looks like "number followed by a literal '.'"
# (r'\d\.') - confirm before changing, because a different pattern would
# shift the split positions that the [31:55] slice below depends on.
chapter_split = re.compile(r'\d''.')
# Keep pieces 31..54 of the split text. Presumably these are the chapter
# bodies for this particular download - TODO confirm against the current file.
chapters = re.split(chapter_split, wizco)[31:55]

Create dictionary of named chapters

In [None]:
# Empty mapping that will hold chapter name -> chapter text.
named_chapters = dict()

In [None]:
# Store every chapter under the key 'chapter_<n>' (numbered from 1), with
# leading and trailing whitespace removed from the text.
named_chapters.update(
    (f'chapter_{number}', text.strip())
    for number, text in enumerate(chapters, start=1)
)

Save text to files

In [None]:
# Write each chapter to data/chapters/<title>.txt, replacing any existing file.
for title, chapter in named_chapters.items():
    chapter_file = Path('data', 'chapters', title).with_suffix('.txt')
    chapter_file.write_text(chapter)


Part 2: Build the Index

List out all text files to be indexed

In [None]:
# Collect the path of every .txt file in the chapters directory.
wizco_files = list(glob.iglob('data/chapters/*.txt'))

Load stopwords (the NLTK English list is used as-is; add custom words when building `qa_stopwords` to augment it)

In [None]:
# Make sure the NLTK stopword corpus is available locally, then build the
# English stopword set and an immutable copy of it for the analyzer.
nltk.download('stopwords')
nltk_stopwords = {word for word in nltk.corpus.stopwords.words('english')}
qa_stopwords = frozenset(nltk_stopwords)

Define schema with title as a unique key

In [None]:
# Analyzer for the chapter body: stemming, with qa_stopwords as the stop list.
chapter_analyzer = StemmingAnalyzer(stoplist=qa_stopwords)

# Two stored fields: the chapter title (unique ID) and the chapter text.
schema = Schema(
    chapter_title=ID(stored=True, unique=True),
    chapter_text=TEXT(stored=True, analyzer=chapter_analyzer),
)

Create an index named 'wizco' in 'wizco_idx' directory

In [None]:
# Create the index called 'wizco' inside the 'wizco_idx' directory, using the
# schema defined above. The directory must already exist.
idx = whoosh.index.create_in('wizco_idx', schema=schema, indexname='wizco')

In [None]:
# Queue one document per chapter file on the index writer. Nothing is
# finalized here: the writer is committed in a later cell.
writer = idx.writer()
for chapter_file in tqdm(wizco_files):
    chapter_path = Path(chapter_file)

    # File name without extension is the title; file content is the text.
    # chapter_title is declared unique in the schema, hence update_document.
    writer.update_document(
        chapter_title=chapter_path.stem,
        chapter_text=chapter_path.read_text(),
    )

Finalize index build

In [None]:
# Commit the document updates queued on `writer` to finalize the index build.
writer.commit()

Part 3: Build Question and Answer search

Load the index named 'wizco' in 'wizco_idx' directory

In [None]:
# Re-open the 'wizco' index stored in the 'wizco_idx' directory, passing the
# same schema object that was used to build it.
whoosh_idx = whoosh.index.open_dir('wizco_idx', schema=schema, indexname='wizco')

Define query parser to search the chapter_text field<br>
Only one field is searched here; to search multiple fields we would use MultifieldParser instead

In [None]:
# Parser for free-text questions against the chapter_text field. OrGroup is
# passed as the grouping class for the query terms.
query_parser = QueryParser(
    'chapter_text',
    schema=whoosh_idx.schema,
    group=OrGroup,
)

Remove WildcardPlugin from the query parser<br>
Our goal is to make queries as natural as possible

About wildcards in queries<br>
* Overview<br>
    * https://nlp.stanford.edu/IR-book/html/htmledition/wildcard-queries-1.html<br>
* Example characters<br>
    * https://support.office.com/en-us/article/examples-of-wildcard-characters-939e153f-bd30-47e4-a763-61897c87b3f4

In [None]:
# Drop wildcard handling from the parser so queries read as natural text.
# NOTE(review): presumably '*' and '?' are then treated as ordinary
# characters - confirm against the Whoosh docs.
query_parser.remove_plugin_class(WildcardPlugin)

Input a query and parse it

In [None]:
# Sample question. The '*' after "Dorothy" is deliberate: it exercises the
# parser now that WildcardPlugin has been removed.
query_text = 'What is the name of Dorothy* dog?'
# Turn the raw question into a query object for the chapter_text field.
parsed_query = query_parser.parse(query_text)

Search index and grab top hit

In [None]:
# Run the parsed query, ask for a single result, and keep the stored text of
# the first hit. Raises IndexError if the search returned no hits.
with whoosh_idx.searcher() as searcher:
    search_results = searcher.search(parsed_query, limit=1)
    hit_texts = [hit['chapter_text'] for hit in search_results]
    top_hit = hit_texts[0]

Is the known correct answer in the chapter that was returned as the top hit?

In [None]:
# Sanity check: does the retrieved chapter mention the expected answer?
contains_answer = "Toto" in top_hit
print(f'`"Toto" in top_hit`: {contains_answer}')

 Part 4: Integrate Question and Answer with BERT model

Initialize the Q&A model. The FIRST time you run this code, if you get an error, change the argument below to `download=True`.
This will take longer to run, but may prevent errors.

In [None]:
# Build the SQuAD question-answering model from DeepPavlov's squad config.
# download=False skips the download step; switch it to True if the model
# files are not available locally yet.
bert_squad_model = build_model(configs.squad.squad, download=False)

Search index and grab top hit

In [None]:
# Interactive question-answering loop: read a question, retrieve the best
# matching chapter from the index, and let the BERT model extract the answer
# from that chapter. Type "exit" to leave the loop.
with whoosh_idx.searcher() as searcher:
    while True:
        # Surrounding whitespace is ignored so ' exit ' still quits.
        query = input('Query ("exit" to quit): ').strip()
        if query == 'exit':
            break
        if not query:
            # Nothing to search for; prompt again.
            continue

        parsed_query = query_parser.parse(query)
        search_results = searcher.search(parsed_query, limit=1)
        hit_texts = [hit['chapter_text'] for hit in search_results]
        if not hit_texts:
            # Previously an IndexError here ended the whole session.
            print('No matching chapter found; try rephrasing the question.')
            continue

        top_hit = hit_texts[0]
        print(bert_squad_model([top_hit], [query]))

Now type a query at the input prompt to get an answer<br>
Example queries:<br>
What is the name of the dog?<br>
What color is Dorothy* dress?<br>
What is the name of Dorothy's aunt?<br>
What is cowardly?<br>
What is the road paved with?<br>
How far is it to the Emerald City?<br>
Where did Oz go?