### **Don't forget to activate the appropriate environment!**
https://ipython.readthedocs.io/en/stable/install/kernel_install.html#kernels-for-different-environments

In [1]:
import json
import itertools
import pandas as pd
import ast
import string
import operator
from collections import defaultdict
import re
import os
import nltk
import math
import nltk

#nltk.download('stopwords')
#nltk.download('punkt')

### 0.1 Data File Directories

In [71]:
# Root folder for all pipeline data. Every directory below is assumed to exist
# already; create them by hand before running the notebook if they do not.
data_files = './wikipedia_data'

# Input: folders of files, each holding a list of Wikipedia articles
wiki_dump = '{}/wikipedia_dump/'.format(data_files)

# Outputs: these should all be empty before the first run
wiki_squad = '{}/wikipedia_squad/'.format(data_files)
labeled_sents = '{}/labeled_sentences/'.format(data_files)
unlabeled_sents = '{}/unlabeled_sentences/'.format(data_files)
questions = '{}/questions/'.format(data_files)
answers = '{}/answers/'.format(data_files)

## 1. Wikipedia Article Pre-Processing
To move our data through the pipeline and ensure that it is suitable for our app, we will format it like the SQuAD dataset.

In [75]:
# Convert the dump of Wikipedia articles into SQuAD-style JSON files.
# The dump is grouped (by folder) into files that each list several articles,
# one article per line.
total_articles = 0

for foldername in os.listdir(wiki_dump):

    input_subfolder = wiki_dump + foldername
    output_subfolder = wiki_squad + foldername
    # exist_ok keeps the cell re-runnable when the folder was created by an earlier run
    os.makedirs(output_subfolder, exist_ok=True)

    # these are not files, just folders
    print("Processing files in {} folder...".format(foldername))
    num_articles = 0

    # each file represents several (variable #) wikipedia articles
    for filename in os.listdir(input_subfolder):

        # each file of articles becomes a separate .json of articles, so a
        # problematic file can be discarded as a whole without affecting the rest
        wikipedia_data_dict = {"data": [], "version" : 1.0}

        # saved to the 'wikipedia_squad' folder, mirroring the input layout
        output_file = output_subfolder + '/' + filename

        # the context manager guarantees the input handle is closed for every file
        with open(input_subfolder + '/' + filename) as f:

            # each line represents a different wikipedia article;
            # the id and url fields are ignored for now
            for line in f:
                if not line.strip():
                    continue  # a blank line carries no article and would break literal_eval

                line_dict = ast.literal_eval(line)
                title = line_dict['title']

                # empty articles are included in the dump and should be disregarded;
                # the title is duplicated as the first block of the text, so drop it
                try:
                    text = line_dict['text'].split("\n\n",1)[1]
                except Exception:
                    # Exception (not a bare except) so Ctrl-C can still interrupt the run
                    print("Skipping article:",title)
                    continue

                # arbitrary length threshold, meant to eliminate stubs such as
                # "disambiguation" articles
                if len(text) > 1000:
                    num_articles += 1

                    # one SQuAD "paragraph" entry per blank-line-separated paragraph
                    context = [{'context': para.rstrip(), 'qas' : []} for para in text.split("\n\n")]

                    wikipedia_data_dict['data'].append({'title' : title, 'paragraphs' : context})

        # only write a file when it actually contains articles
        if wikipedia_data_dict['data']:
            with open(output_file, 'w') as outfile:
                json.dump(wikipedia_data_dict, outfile)

    total_articles += num_articles
    print("Completing files in {} folder... {} articles processed.\n".format(foldername,num_articles))

print("Reformatting complete. Total {} articles processed for question generation.".format(total_articles))

Processing files in AA folder...
Skipping article: List of German proverbs
Skipping article: Floccinaucinihilipilification
Completing files in AA folder... 5280 articles processed.

Processing files in AB folder...
Skipping article: History of Christianity
Skipping article: Klaus Barbie
Skipping article: Kyoto Protocol
Skipping article: Index of philosophy articles (A–C)
Completing files in AB folder... 3358 articles processed.

Reformatting complete. Total 8638 articles processed for question generation.


## 2. Sentence Selection

### 2.1 Clean sentences, stem

In [58]:
def sents_and_weights(d,paragraphs):
    """Split an article into sentences, stem them, and build a stem frequency distribution.

    Each sentence is stripped of digits and punctuation, lower-cased, tokenized,
    filtered against the English stop-word list and stemmed.

    Args:
        d: identifier of the article; becomes the first element of every sentence key.
        paragraphs: list of dicts, each holding one paragraph under the 'context' key.
    Return:
        labeled_sentences: dict (d, paragraph #, sentence #) -> original sentence text.
        stemmed_sentences: dict with the same keys -> list of stemmed tokens.
        word_distr: defaultdict(int) mapping stem -> relative frequency among all
            stemmed tokens of the article (returns 0 for unseen stems).
    """
    labeled_sentences = {}
    stemmed_sentences = {}

    # Frequencies are keyed by the *stem*, i.e. by the same strings that are
    # stored in stemmed_sentences, so later lookups by stem actually find them.
    stem_counts = defaultdict(int)
    total_count = 0

    # initialize for stemming; a set makes the stop-word test O(1)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()

    # go through each 'paragraph' (context) to gather info about its sentences
    for i, context in enumerate(paragraphs):

        # naive sentence split; the last sentence keeps its trailing period
        for j, sentence in enumerate(context['context'].split('. ')):
            key = (d, i, j)

            # keep the unprocessed sentence for later
            labeled_sentences[key] = sentence

            # drop digits and punctuation (periods included, the split is already done)
            cleaned = ''.join(ch for ch in sentence
                              if not ch.isdigit() and ch not in string.punctuation)
            # lowercase and collapse runs of whitespace
            cleaned = ' '.join(cleaned.lower().split())

            stems = [stemmer.stem(word)
                     for word in nltk.tokenize.word_tokenize(cleaned)
                     if word not in stop_words]

            for stem in stems:
                stem_counts[stem] += 1
            total_count += len(stems)

            # keep the tokens for calculating sentence weight in the next step
            stemmed_sentences[key] = stems

    # Convert absolute counts to relative frequencies once, after all sentences
    # have been seen. stem_counts is empty whenever total_count is 0, so there
    # is no division by zero.
    word_distr = defaultdict(int)
    for stem, count in stem_counts.items():
        word_distr[stem] = count / total_count

    return labeled_sentences,stemmed_sentences,word_distr

In [59]:
def calc_sent_weight(word_dist, stemmed_sents):
    """Compute a weight for every sentence from the weights of its words.

    A sentence's weight is the sum of the weights of its words divided by the
    number of words; a sentence without words gets weight 0.

    Args:
        word_dist: mapping word -> weight. Words missing from the mapping
            count as 0, and the mapping is never modified.
        stemmed_sents: dict mapping sentence key -> list of words.
    Return:
        sentences_weight: list of (sentence key, weight) tuples sorted by
            descending weight; ties keep the order of stemmed_sents.
    """
    sentences_weight = []

    for key, words in stemmed_sents.items():
        if words:
            # .get() instead of [] so a defaultdict is not grown by the lookup
            # and a plain dict does not raise KeyError on an unseen word
            weight = sum(word_dist.get(word, 0) for word in words) / len(words)
        else:
            weight = 0

        sentences_weight.append((key, weight))

    sentences_weight.sort(key=operator.itemgetter(1), reverse=True)

    return sentences_weight

In [60]:
def topically_important_sentence(sentences_weight, labeled_sentences, fraction=0.05):
    """Select the highest-weighted sentences and normalise their text.

    Args:
        sentences_weight: list of (sentence key, weight) tuples, already sorted
            by descending weight (the format returned by calc_sent_weight).
        labeled_sentences: dict mapping sentence key -> original sentence text.
        fraction: share of the sentences to keep, rounded up. Defaults to 0.05.
    Return:
        final_sentences: dict mapping sentence key -> processed sentence text
            for each selected sentence.
    """
    final_sentences = {}

    # how many sentences to retain
    num_sentences_selected = math.ceil(fraction * len(sentences_weight))

    # Take the keys from the *parameter*; the previous code read a global named
    # `sentence_weight`, which only existed because a notebook cell leaked it.
    selected_keys = [key for key, _ in sentences_weight[:num_sentences_selected]]

    for sent_key in selected_keys:
        processed_sentence = labeled_sentences[sent_key].lower()

        # strip wiki-link brackets; removing single brackets also covers [[ and ]]
        for bracket in ('[[', ']]', ']', '['):
            processed_sentence = processed_sentence.replace(bracket, '')

        # Pad punctuation with spaces unless it directly follows a digit.
        # NOTE(review): the trailing (?<!\d) inspects the punctuation mark that
        # was just matched, so it never rejects anything; a lookahead (?!\d)
        # may have been intended. Behaviour is kept as-is.
        processed_sentence = re.sub(r'(?<!\d)([.,!?()])(?<!\d)', r' \1 ', processed_sentence)

        processed_sentence = re.sub(r'\(', '-lrb- ', processed_sentence)  # left parens, space after
        processed_sentence = re.sub(r'\)', ' -rrb-', processed_sentence)  # right parens, space before
        # (A former re.sub removing "(...)" groups ran at this point; it could
        # never match because no "(" survives the line above, so it was dropped.)

        processed_sentence = re.sub(r'(?<=\s)"', '`` ', processed_sentence)  # quote after whitespace -> ``
        processed_sentence = re.sub(r'"', " ''", processed_sentence)  # any remaining quote -> ''

        final_sentences[sent_key] = processed_sentence

    return final_sentences

In [86]:
# Run sentence selection over every SQuAD-formatted file and write, per input file,
# one file of selected sentences (model input) and one file that also records
# where each sentence came from.
total_skipped = 0
total_selected = 0

for foldername in os.listdir(wiki_squad):

    input_subfolder = wiki_squad + foldername
    output_subfolder_labeled = labeled_sents + foldername
    output_subfolder_unlabeled = unlabeled_sents + foldername

    # exist_ok keeps the cell re-runnable when the folders were created by an earlier run
    os.makedirs(output_subfolder_labeled, exist_ok=True)
    os.makedirs(output_subfolder_unlabeled, exist_ok=True)

    # these are not files, just folders
    print("Completing sentence selection for files in {} folder...".format(foldername))

    num_skipped = 0
    num_selected = 0

    # each file represents several (variable #) wikipedia articles
    for filename in os.listdir(input_subfolder):
        input_file = input_subfolder + '/' + filename

        with open(input_file) as json_file:
            data = json.load(json_file)

        # both outputs mirror the input layout; the context manager closes
        # (and therefore flushes) them even if an article raises unexpectedly
        with open(output_subfolder_labeled + '/' + filename, "w") as output_file_labeled, \
             open(output_subfolder_unlabeled + '/' + filename, "w") as output_file_unlabeled:

            # for each article in the file; `row` is its position in the file.
            # Plain enumerate replaces the DataFrame round-trip, whose
            # Series.iteritems() no longer exists in pandas 2.x.
            for row, value in enumerate(data['data']):
                try:
                    # clean and stem words, build the word distribution
                    labeled_sentences, stemmed_sentences, word_distribution = sents_and_weights(row,value['paragraphs'])

                    # use the distribution to weight each sentence
                    sentence_weight = calc_sent_weight(word_distribution,stemmed_sentences)

                    # pull out the most important sentences, keyed by where they
                    # came from: (doc #, context #, sentence #)
                    chosen_sentences = topically_important_sentence(sentence_weight,labeled_sentences)
                except Exception as exc:
                    # best-effort: skip the article, but say why, and let Ctrl-C through
                    num_skipped += 1
                    print("Skipping article:",value['title'],"({}: {})".format(type(exc).__name__, exc))
                else:
                    num_selected += 1
                    for sents in chosen_sentences.items():

                        # save selected sentences directly to file, for onmt model
                        output_file_unlabeled.write(str(sents[1])+'\n')
                        # keep track of their locations, though
                        output_file_labeled.write(str(sents)+'\n')

    total_skipped += num_skipped
    total_selected += num_selected
    print("Completing files in {} folder... {} articles processed, {} articles skipped.\n".format(foldername,num_selected,num_skipped))

# report this cell's own counter (previously printed total_articles from the pre-processing cell)
print("Sentence selection complete. Total {} articles processed, {} skipped for question generation.".format(total_selected, total_skipped))

Completing sentence selection for files in AA folder...
Skipping article: Father Christmas
Skipping article: Abracadabra
Skipping article: Daoism–Taoism romanization issue
Skipping article: Dolmen
Skipping article: Dictionary
Skipping article: Equatorial Guinea
Skipping article: Dative case
Skipping article: Arabic alphabet
Skipping article: Capitalism
Skipping article: Cognitive science
Skipping article: Dacoity
Skipping article: Beastie Boys
Skipping article: Gossip
Skipping article: Cognate
Skipping article: Gnome
Skipping article: Ghost
Skipping article: Estampie
Skipping article: Faroese language
Skipping article: First aid
Skipping article: N,N-Dimethyltryptamine
Completing files in AA folder... 5260 articles processed, 20 articles skipped.

Completing sentence selection for files in AB folder...
Skipping article: Mummy
Skipping article: Mode (music)
Skipping article: Hogshead
Skipping article: Kludge
Skipping article: Internet slang
Skipping article: GIF
Skipping article: Goddes

## 3. Question Generation

For reference, see https://github.com/drewserles/GenerationQ — this section assumes that model training has already been completed.

### 3.1 Question Generation

#### Don't forget to switch kernels!

In [None]:
generationq_dir = '~/GenerationQ/model'
model = generationq_dir + '/trained/600rnn_step_14000.pt' # update depending on which model has lowest perplexity

for foldername in os.listdir(unlabeled_sents):
    
    input_subfolder = unlabeled_sents + foldername
    output_subfolder = questions + foldername
    
    #os.mkdir(output_subfolder)

    # these are not files, just folders
    print("Completing question generation for files in {} folder...".format(foldername))
        
    # each file represents several (variable #) wikipedia articles
    for filename in os.listdir(input_subfolder):
        
        # input file is unlabeled sentence
        input_file = input_subfolder + '/' + filename
              
        # save list of questions
        output_file = output_subfolder + '/' + filename
        
        !python $generationq_dir/test.py -model $model -src $input_file -output $output_file \
        -replace_unk -beam_size 3 -gpu 0 -batch_size 30

Completing question generation for files in AA folder...
  var = torch.tensor(arr, dtype=self.dtype, device=device)
  return torch.tensor(a, requires_grad=False)
Traceback (most recent call last):
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 36, in <module>
    main(opt)
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 24, in main
    attn_debug=opt.attn_debug)
  File "/home/julia_buffinton/GenerationQ/model/onmt/translate/translator.py", line 229, in translate
    for batch in data_iter:
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/iterator.py", line 157, in __iter__
    yield Batch(minibatch, self.dataset, self.device)
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/batch.py", line 34, in __init__
    setattr(self, name, field.process(batch, device=device))
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torcht

In [3]:
# update as needed, depending on which model has lowest perplexity
generationq_model = '~/GenerationQ/model/trained/600rnn_step_14000.pt'

for c in string.ascii_uppercase[:end_letter]:
    
    output_dir = wiki_output_file_dir + c

    for i in range(num_files): 
    
        wiki_squad_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'
        sentences_onmt = output_dir + '/test_sents_qg_' + '%02d' % i
        sentences_labeled = output_dir + '/test_sents_labeled_' + '%02d' % i
        
        question_output = output_dir + '/test_questions_' + '%02d' % i + '.txt'
    
        print "generating questions for file #" + str(i)
        print sentences_onmt
    
        !python $generationq_dir/test.py -model $generationq_model -src $sentences_onmt -output $question_output \
        -replace_unk -beam_size 3 -gpu 0 -batch_size 30

generating questions for file #0
./wikipedia_squad/AA/test_sents_qg_00
  var = torch.tensor(arr, dtype=self.dtype, device=device)
  return torch.tensor(a, requires_grad=False)
PRED AVG SCORE: -0.6408, PRED PPL: 1.8980
generating questions for file #1
./wikipedia_squad/AA/test_sents_qg_01
Traceback (most recent call last):
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 36, in <module>
    main(opt)
  File "/home/julia_buffinton/GenerationQ/model/test.py", line 24, in main
    attn_debug=opt.attn_debug)
  File "/home/julia_buffinton/GenerationQ/model/onmt/translate/translator.py", line 229, in translate
    for batch in data_iter:
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/iterator.py", line 157, in __iter__
    yield Batch(minibatch, self.dataset, self.device)
  File "/home/julia_buffinton/anaconda3/envs/pytorch-env/lib/python2.7/site-packages/torchtext/data/batch.py", line 34, in __init__
    setattr(self, name, f

### 3.2 Remove "problem" files

### 3.3 Label questions, add back to json file

In [4]:
# Attach every generated question to the paragraph its source sentence came from.
# NOTE(review): `end_letter`, `wiki_output_file_dir` and `num_files` are not defined
# anywhere in the notebook as shown; this cell depends on kernel state from elsewhere.
# NOTE: re-running this cell appends the same questions to the JSON file again.

# for each letter of the alphabet
for c in string.ascii_uppercase[:end_letter]:

    output_dir = wiki_output_file_dir + c

    # for each file in there (total: 100)
    for i in range(num_files):

        sentences_labeled = output_dir + '/test_sents_labeled_' + '%02d' % i

        wiki_squad_file = output_dir + '/wiki_squad_' + '%02d' % i + '.json'

        question_output = output_dir + '/test_questions_' + '%02d' % i + '.txt'
        question_output_qs = output_dir + '/test_questions_q_' + '%02d' % i + '.txt'

        with open(question_output) as f:
            # keep every other line of the model output, starting with the 1st line
            pred_questions = f.read().splitlines()[::2]

        # Save the questions one per line. Without the newline they were all
        # fused into a single line, so at most one (garbled) question could be
        # paired with a sentence afterwards.
        with open(question_output_qs, 'w') as f:
            for item in pred_questions:
                f.write("%s\n" % item)

        with open(wiki_squad_file) as json_file:
            data = json.load(json_file)

        wiki_df = data['data']

        with open(sentences_labeled) as f2:
            labeled_lines = list(f2)

        # zip() stops at the shorter input; make a length mismatch visible
        if len(pred_questions) != len(labeled_lines):
            print("Warning: {} questions but {} labeled sentences for {}".format(
                len(pred_questions), len(labeled_lines), sentences_labeled))

        # pair the in-memory questions (no trailing newline) with their labels
        for line_q, line_s in zip(pred_questions, labeled_lines):

            # each line is the repr of ((doc #, context #, sentence #), sentence);
            # literal_eval parses it without executing arbitrary code
            line_tuple = ast.literal_eval(line_s)
            nums = line_tuple[0]

            doc = wiki_df[nums[0]] # pull the whole document
            context = doc["paragraphs"][nums[1]]

            context['qas'].append({"question": line_q, "answers": [], "id": str(nums)})

        with open(wiki_squad_file, 'w') as outfile:
            json.dump(data, outfile)

## 4. Answer Questions

### 4.1 Filter out paragraphs missing questions

In [1]:
# SQuAD-formatted sample file; passed to the DrQA predict script in section 4.2.
sample_file = './sample_data/AA/wiki_squad_00.json'

In [23]:
# Drop every paragraph that has no generated question and save the result next to
# the input as "<input name>_abbr.json".
for foldername in os.listdir('./sample_data/'):

    input_subfolder = './sample_data/' + foldername

    # only folders hold article files; skip stray files at the top level
    if not os.path.isdir(input_subfolder):
        continue

    print("Filtering paragraphs for files in {} folder...".format(foldername))

    # each file represents several (variable #) wikipedia articles
    for filename in os.listdir(input_subfolder):

        # Outputs are written into the folder being scanned, so skip files this
        # cell produced on an earlier run, and anything that is not a .json file.
        if not filename.endswith('.json') or filename.endswith('_abbr.json'):
            continue

        input_file = input_subfolder + '/' + filename

        # One output per input (e.g. wiki_squad_00.json -> wiki_squad_00_abbr.json);
        # a fixed output name made every input overwrite the same file.
        output_file = input_subfolder + '/' + os.path.splitext(filename)[0] + '_abbr.json'

        with open(input_file) as json_file:
            data = json.load(json_file)

        # keep only paragraphs whose 'qas' list is non-empty
        for article in data['data']:
            article['paragraphs'] = [ para for para in article['paragraphs'] if para['qas'] ]

        with open(output_file, 'w') as outfile:
            json.dump(data, outfile)
       

Filtering paragraphs for files in AA folder...
Anarchism
86
17
Autism
70
12
Albedo
41
5
A
20
2
Alabama
147
21
Achilles
60
11
Abraham Lincoln
148
26
Aristotle
91
12
An American in Paris
16
4
Academy Awards
70
12
Actrius
5
1
Animalia (book)
11
2
International Atomic Time
14
3
Altruism
56
8
Ayn Rand
55
13
Allan Dwan
11
2
Algeria
138
19
List of Atlas Shrugged characters
18
3
Anthropology
82
13
Agricultural science
11
1
Alchemy
75
15
Astronomer
10
2
ASCII
60
9
Animation
47
6
Apollo
178
28
Andre Agassi
82
18
Austroasiatic languages
41
5
Afroasiatic languages
37
4
Andorra
105
14
Arithmetic mean
19
3
American Football Conference
12
3
Animal Farm
53
8


### 4.2 DrQA question answering

#### Don't forget to use the right kernel!!

In [24]:
# Path of the filtered file written by the paragraph-filtering cell in section 4.1
# (paragraphs without questions removed).
sample_file_abbr = './sample_data/AA/wiki_squad_00_abbr.json'

In [26]:
# Run DrQA's pipeline prediction script on the sample file.
# NOTE(review): this passes `sample_file` (the unfiltered file), not the
# `sample_file_abbr` defined in the previous cell — confirm which input is intended.
!python ../DrQA/scripts/pipeline/predict.py $sample_file

Traceback (most recent call last):
  File "../DrQA/scripts/pipeline/predict.py", line 16, in <module>
    from drqa import pipeline
ImportError: No module named drqa


## Add answers back to file