### ** Don't forget to activate the appropriate environment!!
https://ipython.readthedocs.io/en/stable/install/kernel_install.html#kernels-for-different-environments

In [4]:
import json
import itertools
#import pandas as pd
import ast
import string
import operator
from collections import defaultdict
import re
import os
import nltk
import math
import nltk

#nltk.download('stopwords')
#nltk.download('punkt')

# encoding =utf8
#import sys
#reload(sys)
#sys.setdefaultencoding('utf8')

## Wikipedia Article Pre-Processing
To move our data through the pipeline and ensure that it is suitable for our app, we will format it like the SQuAD dataset.

### ONLY RUN THIS FOR TESTING

In [4]:
wikipedia_data = {"data": [], "version" : 1.0}

num_files = 1 # each folder has 100 articles
end_letter = 1 # because we only have folders up through 9

# at this point, just for one letter of the alphabet. Would need to add another loop.
wiki_input_file_dir = './A'
#os.mkdir('./output')
wiki_output_file_dir = './output/A'

### ONLY RUN THIS FOR FULL SET

In [2]:
num_files = 10 # each folder has 100 articles
end_letter = 1 # because we only have folders up through 9

wiki_input_file_dir = '../bucket-w210/wikipedia/json_out/A'
#os.mkdir('./wikipedia_squad')
wiki_output_file_dir = './wikipedia_squad/A'

### BACK TO NORMAL

In [10]:
# go through each file, add to mega-file of articles
for c in string.ascii_uppercase[:end_letter]:
    
    output_dir = wiki_output_file_dir + c

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # right now, it looks like all folders have exactly 100 files
    for i in range(num_files): 
        
        wikipedia_data = {"data": [], "version" : 1.0}

        input_file = wiki_input_file_dir + c + '/wiki_' + '%02d' % i
        #print(input_file)
        output_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'

        with open(input_file) as f:
            # each line represents a different wikipedia article
            # we will ignore the id and url for now, not needed
            for line in f:
                line_dict = ast.literal_eval(line)
                title = line_dict['title']
                #print(title)
                text = line_dict['text'].split("\n\n",1)[1] # title is duplicated within text as well

                # arbitrary length, should eliminate articles like "disambiguation" articles, etc. 
                if len(text) > 1000: 
                    # Break text up into paragraphs
                    paras = text.split("\n\n")

                    context = [{'context': para, 'qas' : []} for para in paras]

                    wikipedia_data['data'].append({'title' : title, 'paragraphs' : context})
        
        with open(output_file, 'w') as outfile:  
            json.dump(wikipedia_data, outfile)
    
#print(wikipedia_data)

## Sentence Selection

In [11]:
def sents_and_weights(d,paragraphs):
        """Clean sentences, remove digit, punctuation, upper case to lower
        Args: paragraphs = list of dicts with contexts
        Return: sentences_processed: dict of cleaned sentences. key = number of sentence; value = list of stemmed words.
        """
        labeled_sentences = {}
        stemmed_sentences = {}
        
        # this needs to be a default dict so it returns 0 if word not found 
        word_dict = defaultdict(int)
        word_distr = defaultdict(int)

        # initialize for stemming
        stop_words = nltk.corpus.stopwords.words('english')
        stemmer = nltk.stem.PorterStemmer()
        tokenize = nltk.word_tokenize

        def stem_and_add(wd):
            word_dict[wd] += 1
            word_dict['total_count'] += 1
            return stemmer.stem(wd)
        
        # need to go through each 'paragraph' (context) to gather info about sentences
        for i,context in enumerate(paragraphs):

            # split paragraph into sentences, make sure to keep digits, etc. together
            sentences = context['context'].split('. ') #last one still has a period at the end

            for j,sentence in enumerate(sentences):
                # save list of unprocessed sentences for later
                labeled_sentences[(d,i,j)] = sentence
                            
                # Remove all digits
                sentence = ''.join([x for x in sentence if not x.isdigit()])
                # Remove all punctuation (OK to include periods since it's been split)
                sentence = ''.join([x for x in sentence if x not in string.punctuation])
                
                # Lowercase everything
                sentence = sentence.strip()
                sentence = sentence.lower()
                
                # Split into words & rejoin (remove extra spaces)
                sentence = ' '.join(sentence.split())
                
                tokenized_stemmed_sent = [stem_and_add(word) for word in nltk.tokenize.word_tokenize(sentence) 
                                          if not word in stop_words]
                
                # keep track of tokenized for calculating sentence weight in next step
                stemmed_sentences[(d,i,j)] = tokenized_stemmed_sent
                
                # update our word dictionary to be relative frequencies rather than absolute values
                for word, ct in word_dict.items():
                    # but keep our total count, we may want that later (not sure)
                    if not word == 'total_count':
                        word_distr[word] = word_dict[word] / word_dict['total_count']
                
        #print("article length:",word_dict['total_count'])
        #print("word dict:",word_distr)

        return labeled_sentences,stemmed_sentences,word_distr

In [12]:
def calc_sent_weight(word_dist, stemmed_sents):
        """Compute weight with respect to sentences
        Args:
                word_distribution: dict with word: weight
                stemmed_sents: list with 
        Return:
                sentence_weight: dict of weight of each sentence. key = sentence #, value = weight
        """
        sentences_weight = {}
        # Iterate through each word in each sentence, if word distribution and sentence id are in dictionary, 
        # add to existing word distribution. Else, sentence weight for given sentence equals current word distribution
          
        for key, words in stemmed_sents.items():
            #print(words)
            # Sentence weight equals sum of word distributions divided by length of cleaned sentence
            if len(words) == 0:
                weight = 0
            else:
                weight = sum([word_dist[word] for word in words]) / len(words)
            
            sentences_weight[key] = weight
            
        sentences_weight = sorted(sentences_weight.items(), key=operator.itemgetter(1), reverse=True)
        #print('sentence weight: ',sentences_weight)

        return sentences_weight

In [13]:
def topically_important_sentence(sentences_weight, labeled_sentences):
        """Select topically import sentences
        Args:
                sentence_weight: list of tuples, (sentence_num, sentence_weight) computed in sentence_weight
                paragraph: set of sentences
        Return:
                sentences_selected: dict, topically important sentences selected
        """
        final_sentences = {}
        
        total_sentences = len(sentences_weight)
        # how many sentences to retain
        num_sentences_selected = math.ceil(float(0.05) * total_sentences)
        #print('num sentences for this passage:',num_sentences_selected)
        
        # key of selected sentences (# order of sentence in paragraph)
        #sentences_selected_key = []
        
        # dictionary of all sentences 
        sentences_dict = {}
        flag = 0
        
        # select num_sentences_selected # of sentences from list of sentence weights
        #print(sentence_weight[0])
        selected_keys = [k for k,v in sentence_weight[0:num_sentences_selected]]
        
        #print("selected sentence(s):",selected_keys)


        for sent_key in selected_keys:
            pre_processed_sentence = labeled_sentences[sent_key]
            
            processed_sentence = pre_processed_sentence.lower() #lowercase
            processed_sentence = processed_sentence.replace('[[','')
            processed_sentence = processed_sentence.replace(']]','')
            processed_sentence = processed_sentence.replace(']','')
            processed_sentence = processed_sentence.replace('[','')
            processed_sentence = re.sub('(?<!\d)([.,!?()])(?<!\d)', r' \1 ', processed_sentence)
            processed_sentence = re.sub(r'\(','-lrb- ',processed_sentence) # replace left parens, add space after
            processed_sentence = re.sub(r'\)',' -rrb-',processed_sentence) # replace left parens, add space after
            processed_sentence = re.sub(r'\([^)]*\)', '',processed_sentence) #replace brackets in links
            processed_sentence = re.sub('(?<=\s)\"','`` ',processed_sentence) # replace first double quotes with ``
            processed_sentence = re.sub(r'\"', " ''", processed_sentence) # replace second double quote with two single quotes ''

            #print(processed_sentence)

            final_sentences[sent_key] = processed_sentence
            
        return final_sentences

In [14]:
for c in string.ascii_uppercase[:end_letter]:
    
    output_dir = wiki_output_file_dir + c

    for i in range(num_files): 
    
        output_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'

        with open(output_file) as json_file:  
            data = json.load(json_file)

        data = pd.DataFrame.from_dict(data)
        df = data['data']

        sentences_onmt = open(output_dir + '/test_sents_qg_' + '%02d' % i, "w")
        sentences_labeled = open(output_dir + '/test_sents_labeled_' + '%02d' % i, "w")

        # for each article in the file
        for row,value in df.iteritems():
            # here is where we clean and stem words, build word distribution
            #print(value['title'])
            labeled_sentences, stemmed_sentences, word_distribution = sents_and_weights(row,value['paragraphs'])

            #print(labeled_sentences)

            # use this word distribution to get weights for each sentence and calculate most important sentences 
            sentence_weight = calc_sent_weight(word_distribution,stemmed_sentences)
            

            # pull out most important sentences
            # and keep track of where they came from: (doc #, context #, sentence #)
            chosen_sentences = topically_important_sentence(sentence_weight,labeled_sentences)

            for sents in chosen_sentences.items():

                #save selected sentences directly to file, for onmt model
                sentences_onmt.write(str(sents[1])+'\n')
                # keep track of their locations, though
                sentences_labeled.write(str(sents)+'\n')

## Question Generation

For reference: https://github.com/drewserles/GenerationQ. Assuming training has already been completed.

In [3]:
# update as needed, depending on which model has lowest perplexity
generationq_dir = '~/GenerationQ/model'

for c in string.ascii_uppercase[:end_letter]:
    
    output_dir = wiki_output_file_dir + c

    for i in range(num_files): 
    
        wiki_squad_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'
        sentences_onmt = output_dir + '/test_sents_qg_' + '%02d' % i
        sentences_labeled = output_dir + '/test_sents_labeled_' + '%02d' % i
        
        question_output = output_dir + '/test_questions_' + '%02d' % i + '.txt'
    
        print "generating questions for file #" + str(i)
        print sentences_onmt
    
        !python $generationq_dir/test.py -model $generationq_dir/trained/600rnn_step_14000.pt -src $sentences_onmt -output $question_output \
        -replace_unk -beam_size 3 -gpu 0 -batch_size 30

generating questions for file #0
./wikipedia_squad/AA/test_sents_qg_00
  var = torch.tensor(arr, dtype=self.dtype, device=device)
  return torch.tensor(a, requires_grad=False)
PRED AVG SCORE: -0.6408, PRED PPL: 1.8980
generating questions for file #1
./wikipedia_squad/AA/test_sents_qg_01
Traceback (most recent call last):
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 36, in <module>
    main(opt)
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 24, in main
    attn_debug=opt.attn_debug)
  File "/home/julia_buffinton/GenerationQ/model/onmt/translate/translator.py", line 229, in translate
    for batch in data_iter:
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/iterator.py", line 157, in __iter__
    yield Batch(minibatch, self.dataset, self.device)
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/batch.py", line 34, in __init__
    setattr(self, name, f

## Label Questions, add back to json file

In [4]:
# for each letter of the alphabet
for c in string.ascii_uppercase[:end_letter]:

    output_dir = wiki_output_file_dir + c

    # for each file in there (total: 100)
    for i in range(num_files): 
        
        sentences_labeled = output_dir + '/test_sents_labeled_' + '%02d' % i

        wiki_squad_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'
        
        #question_output = output_dir + '/test_sents_qg_' + '%02d' % i # THIS IS JUST FOR TESTING!!! 
        question_output = output_dir + '/test_questions_' + '%02d' % i + '.txt' # This is final!!
        question_output_qs = output_dir + '/test_questions_q_' + '%02d' % i + '.txt'
        
        with open(question_output) as f:
            pred_questions = f.read().splitlines()    
            pred_questions = pred_questions[::2] # start with the 1st line
            
        with open(question_output_qs, 'w') as f:
            for item in pred_questions:
                f.write("%s" % item) # removed newline character after this?
            
        with open(wiki_squad_file) as json_file:  
            data = json.load(json_file)

        #print(type(data))
        #data = pd.DataFrame.from_dict(data)
        wiki_df = data['data']
        #print(type(wiki_df))

        with open(question_output_qs) as f1:

            with open(sentences_labeled) as f2:
                for line_q,line_s in zip(f1,f2):
                    
                    line_tuple = eval(line_s)
                    nums = line_tuple[0]
                    #print(nums)
                    #sent = line_tuple[1]

                    doc = wiki_df[nums[0]] # pull the whole document
                    context = doc["paragraphs"][nums[1]]

                    context['qas'].append({"question": line_q, "answers": [], "id": str(nums)})
            
        with open(wiki_squad_file, 'w') as outfile:  
            json.dump(data, outfile)

## Run QA

In [8]:
sample_file = './wikipedia_squad/AA/wiki_squad_00.json'


In [9]:
!python ../../DrQA/scripts/pipeline/predict.py $sample_file

04/06/2019 04:10:17 PM: [ CUDA enabled (GPU -1) ]
04/06/2019 04:10:17 PM: [ Initializing pipeline... ]
04/06/2019 04:10:17 PM: [ Initializing document ranker... ]
04/06/2019 04:10:17 PM: [ Loading /home/julia_buffinton/DrQA/data/wikipedia/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz ]
04/06/2019 04:12:13 PM: [ Initializing document reader... ]
04/06/2019 04:12:13 PM: [ Loading model /home/julia_buffinton/DrQA/data/reader/multitask.mdl ]
04/06/2019 04:12:49 PM: [ Initializing tokenizers and document retrievers... ]
04/06/2019 04:12:51 PM: [ Loading queries from ./wikipedia_squad/AA/wiki_squad_00.json ]
Traceback (most recent call last):
  File "../../DrQA/scripts/pipeline/predict.py", line 109, in <module>
    queries.append(data['question'])
KeyError: 'question'


## Add answers back to file