In [2]:
import ast
import itertools
import json
import math
import operator
import re
import string
from collections import defaultdict

import nltk
import pandas as pd

# encoding =utf8
#import sys
#reload(sys)
#sys.setdefaultencoding('utf8')

## Wikipedia Article Pre-Processing
To move our data through the pipeline and ensure that it is suitable for our app, we will format it like the SQuAD dataset.

In [3]:
def _article_to_squad(line):
    """Convert one line of an extracted-Wikipedia file into a SQuAD-style article.

    Args:
        line: one line of the input file, holding a dict literal with at
            least the keys 'title' and 'text'.

    Returns:
        {'title': ..., 'paragraphs': [{'context': ..., 'qas': []}, ...]},
        or None when the line is blank.
    """
    line = line.strip()
    if not line:
        # Blank lines would make literal_eval raise; there is nothing to convert.
        return None

    # NOTE(review): if the extractor emits strict JSON (true/false/null),
    # json.loads would be the right parser here -- confirm the input format.
    line_dict = ast.literal_eval(line)

    # The title is repeated as the first block of the text; keep only what
    # follows the first blank line. An article with no body yields no paragraphs
    # instead of raising IndexError.
    _, separator, body = line_dict['text'].partition("\n\n")
    paras = body.split("\n\n") if separator else []

    return {
        'title': line_dict['title'],
        'paragraphs': [{'context': para, 'qas': []} for para in paras],
    }


# In-memory collection of every article converted by this cell.
wikipedia_data = {"data": [], "version" : 1.0}

num_files = 2 # update this as we know how many files we have

for i in range(num_files):

    # Zero-pad to two digits so that file 10 is 'wiki_10', not 'wiki_010'.
    input_file = './AA_wiki_00_01/wiki_{:02d}'.format(i)
    output_file = './AA_wiki_00_01/wiki_squad_{:02d}.json'.format(i)

    # Articles of THIS file only: each output file mirrors exactly one input
    # file rather than also containing every previously processed file.
    file_data = {"data": [], "version": wikipedia_data["version"]}

    with open(input_file, encoding='utf-8') as f:
        # each line represents a different wikipedia article
        # we will ignore the id and url for now, not needed
        for line in f:
            article = _article_to_squad(line)
            if article is not None:
                file_data['data'].append(article)

    wikipedia_data['data'].extend(file_data['data'])

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(file_data, outfile)
    
#print(wikipedia_data)

## Sentence Selection

In [4]:
def sents_and_weights(d,paragraphs):
    """Split an article into sentences and build its stemmed-word distribution.

    Each paragraph is split on '. '. Every sentence is stripped of digits and
    punctuation, lower-cased, tokenized, filtered against the English stop-word
    list and stemmed.

    Args:
        d: identifier of the article; used as the first element of every
            sentence key.
        paragraphs: list of dicts, each with a 'context' string (one paragraph).

    Returns:
        A tuple (labeled_sentences, stemmed_sentences, word_distr) where
            labeled_sentences: dict mapping (d, paragraph index, sentence index)
                to the unprocessed sentence text.
            stemmed_sentences: dict with the same keys mapping to the list of
                stemmed, non-stop-word tokens of that sentence.
            word_distr: defaultdict(int) mapping each stem to its relative
                frequency over the whole article (unknown stems read as 0).
    """
    labeled_sentences = {}
    stemmed_sentences = {}

    # Absolute occurrence count of every stem in the article.
    stem_counts = defaultdict(int)

    # A set gives O(1) stop-word membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()

    # need to go through each 'paragraph' (context) to gather info about sentences
    for i, context in enumerate(paragraphs):

        # Naive sentence split; the last sentence keeps its trailing period.
        sentences = context['context'].split('. ')

        for j, sentence in enumerate(sentences):
            key = (d, i, j)

            # save the unprocessed sentence for later
            labeled_sentences[key] = sentence

            # Drop digits and punctuation, lowercase, collapse whitespace.
            cleaned = ''.join(ch for ch in sentence
                              if not ch.isdigit() and ch not in string.punctuation)
            cleaned = ' '.join(cleaned.lower().split())

            stems = [stemmer.stem(word)
                     for word in nltk.tokenize.word_tokenize(cleaned)
                     if word not in stop_words]

            # Count the STEM, not the surface form: sentence weights are later
            # looked up by stem, so the distribution must use the same keys.
            for stem in stems:
                stem_counts[stem] += 1

            # keep track of tokenized for calculating sentence weight in next step
            stemmed_sentences[key] = stems

    # Convert absolute counts to relative frequencies once, after all
    # sentences have been seen.
    total_count = sum(stem_counts.values())
    word_distr = defaultdict(int)
    for stem, count in stem_counts.items():
        word_distr[stem] = count / total_count

    return labeled_sentences,stemmed_sentences,word_distr

In [5]:
def calc_sent_weight(word_dist, stemmed_sents):
    """Score every sentence by the mean relative frequency of its words.

    Args:
        word_dist: mapping word -> weight (relative frequency). Words that are
            absent count as 0; the mapping is never modified.
        stemmed_sents: dict mapping a sentence key to the list of (stemmed)
            words of that sentence.

    Returns:
        List of (sentence key, weight) tuples sorted by weight, highest first.
        Sentences with no words get weight 0. Ties keep the iteration order of
        stemmed_sents.
    """
    sentences_weight = {}

    for key, words in stemmed_sents.items():
        if words:
            # .get() avoids KeyError on plain dicts and avoids inserting zero
            # entries into a caller-owned defaultdict.
            weight = sum(word_dist.get(word, 0) for word in words) / len(words)
        else:
            weight = 0

        sentences_weight[key] = weight

    return sorted(sentences_weight.items(), key=operator.itemgetter(1), reverse=True)

In [6]:
def topically_important_sentence(sentences_weight, labeled_sentences, fraction=0.05):
    """Select the highest-weighted sentences and normalize their text.

    Args:
        sentences_weight: list of (sentence key, weight) tuples, already
            sorted by weight in descending order.
        labeled_sentences: dict mapping sentence key -> unprocessed sentence.
        fraction: share of the sentences to keep; the count is rounded up.
            Defaults to 0.05.

    Returns:
        Dict mapping each selected sentence key to its normalized text:
        lower-cased, square brackets removed, punctuation padded with spaces,
        parentheses replaced by -lrb- / -rrb- and double quotes replaced by
        `` / ''.
    """
    # how many sentences to retain
    num_sentences_selected = math.ceil(float(fraction) * len(sentences_weight))

    # Read from the parameter (not a same-looking global) so the function
    # works regardless of what the calling cell named its variable.
    selected_keys = [key for key, _ in sentences_weight[:num_sentences_selected]]

    final_sentences = {}
    for sent_key in selected_keys:
        processed_sentence = labeled_sentences[sent_key].lower()

        # Strip wiki-link brackets; double brackets first, then leftovers.
        for marker in ('[[', ']]', ']', '['):
            processed_sentence = processed_sentence.replace(marker, '')

        # Pad punctuation that is not preceded by a digit with spaces.
        # NOTE(review): the trailing (?<!\d) is always satisfied (the character
        # before that position is the punctuation mark itself); a lookahead
        # (?!\d) may have been intended. Behavior is kept as-is -- confirm.
        processed_sentence = re.sub(r'(?<!\d)([.,!?()])(?<!\d)', r' \1 ', processed_sentence)

        # Replace parentheses with their bracket tokens. No '(' survives this
        # step, so a later removal of "(...)" groups would be a no-op.
        processed_sentence = re.sub(r'\(', '-lrb- ', processed_sentence) # left paren, space after
        processed_sentence = re.sub(r'\)', ' -rrb-', processed_sentence) # right paren, space before

        # A double quote preceded by whitespace is an opening quote ...
        processed_sentence = re.sub(r'(?<=\s)"', '`` ', processed_sentence)
        # ... every remaining double quote is treated as a closing quote.
        processed_sentence = re.sub(r'"', " ''", processed_sentence)

        final_sentences[sent_key] = processed_sentence

    return final_sentences

In [7]:
# Load the SQuAD-formatted articles written by the pre-processing step.
# (A small file just for testing lives at
#  './sentence_selection/AA_wiki_00_01/wiki_squad_00_jb.json'.)
squad_path = './AA_wiki_00_01/wiki_squad_00.json'

with open(squad_path) as json_file:
    data = pd.DataFrame.from_dict(json.load(json_file))

# The 'data' column holds one article dict per row.
df = data['data']

In [8]:
# Write the selected sentences to two files:
#   ./test_sents_qg      -- one normalized sentence per line, input for the onmt model
#   ./test_sents_labeled -- the same sentences with their (doc #, context #, sentence #) key
# The `with` block guarantees both files are flushed and closed even if an
# article fails part-way through.
with open('./test_sents_qg', "w", encoding='utf-8') as sentences_onmt, \
        open('./test_sents_labeled', "w", encoding='utf-8') as sentences_labeled:

    # for each article in the file
    # (Series.items() replaces iteritems(), which was removed in pandas 2.0)
    for row, value in df.items():
        # here is where we clean and stem words, build word distribution
        labeled_sentences, stemmed_sentences, word_distribution = sents_and_weights(row, value['paragraphs'])

        # use this word distribution to get weights for each sentence and calculate most important sentences
        sentence_weight = calc_sent_weight(word_distribution, stemmed_sentences)

        # pull out most important sentences
        # and keep track of where they came from: (doc #, context #, sentence #)
        chosen_sentences = topically_important_sentence(sentence_weight, labeled_sentences)

        for sents in chosen_sentences.items():

            # save selected sentences directly to file, for onmt model
            sentences_onmt.write(str(sents[1]) + '\n')
            # keep track of their locations, though
            sentences_labeled.write(str(sents) + '\n')