# Generating human 'attention' from source / summary input.

load the data

In [5]:
import json
f = open('/Users/haldenl/nlpcapstone/data/attn_vis_data.json', 'r')
data = json.load(f)

print(data[0].keys())

dict_keys(['p_gens', 'article_lst', 'abstract_str', 'decoded_lst', 'attn_dists'])


process

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [7]:
import numpy as np
np.set_printoptions(threshold=np.inf)


softmax function, from @nolanbconaway

In [8]:
def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array. Probably should be floats. 
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the 
        first non-singleton axis.

    Returns an array the same size as X. The result will sum to 1
    along the specified axis.
    """

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)

    # multiply y against the theta parameter, 
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis = axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if len(X.shape) == 1: p = p.flatten()

    return p

### Calculates the similarity betweens sentences in article and summary (softmaxed)

In [32]:
def getSentenceSimilarities(article_sentences, human_sentences):
    sentence_similarities = np.zeros([len(human_sentences), len(article_sentences)])
    
    
    for human_index, human_sent in enumerate(human_sentences):
        for article_index, article_sent in enumerate(article_sentences):
            a_sent = article_sent
            h_sent = human_sent
            
            similarity = h_sent.similarity(a_sent)
            sentence_similarities[human_index][article_index] = similarity
        
    sentence_similarities = softmax(sentence_similarities, theta=100, axis=1)
    return sentence_similarities
    

### Calculates 'attention' weights between article and summary

In [36]:
def getWeights(article, human):
    weights = np.zeros([len(human), len(article)])
    
    article_sentences = list(article.sents)
    human_sentences = list(human.sents)
        
    sentence_similarities = getSentenceSimilarities(article_sentences, human_sentences)
    
    count = 0
    human_rows = {}
    for human_index, human_sent in enumerate(human_sentences):
        human_rows[human_index] = {}
        for human_tok_index, human_token in enumerate(human_sent):
            human_rows[human_index][human_tok_index] = count
            count += 1
            
    count = 0
    article_columns = {}
    for article_index, article_sent in enumerate(article_sentences):
        article_columns[article_index] = {}
        for article_tok_index, article_token in enumerate(article_sent):
            article_columns[article_index][article_tok_index] = count
            count += 1
    
    for human_index, human_sent in enumerate(human_sentences):
        human_sent = nlp(human_sent.text)
        
        for article_index, article_sent in enumerate(article_sentences):
            article_sent = nlp(article_sent.text)
        
            sentence_sim = sentence_similarities[human_index][article_index]
            
            for human_tok_index, human_token in enumerate(human_sent):
                count += 1
                for article_tok_index, article_token in enumerate(article_sent):
                    similarity = 0
                    if (len(human_token.text) == 1 or len(article_token.text) == 1):
                        similarity = 0
                    else:
                        similarity = human_token.similarity(article_token)
                        
                        
                    weight = sentence_sim * similarity
    
                    row = human_rows[human_index][human_tok_index]
                    column = article_columns[article_index][article_tok_index]
                    weights[row][column] = weight                  
                  
    weights = softmax(weights, theta=100, axis=1)
    
    return weights

In [40]:
article_str="A super slimmer who swelled to 26 stone after eating a loaf of bread a day is now toasting her diet - and literally becoming half the woman she used to be. Michelle Quinn, 42, ballooned after munching her way through toast, sarnies and slices of bread all day. Her diet used to be made up of white toast with margarine for breakfast, sandwiches and crisps for lunch and fish and chips and takeaways for dinner. Michelle Quinn has lost half of her body weight after ditching her bread-based diet which saw her eat a loaf a day, she has also dropped from a clothes size 30 (left) to a size 12 (right) Before losing weight 43-year-old Michelle was a size 30, here she proudly holds up a pair of her old trousers But she gave up the bread and started a diet of breakfast of cereal or fruit and yoghurt, home-made soup for lunch and healthy versions of her favourite meals. Michelle, of South Shields, Tyneside, says she feels like a new woman after losing 12.5st and dropping from dress size 30 to size 12. She has been named Slimming World's Greatest Loser in the West Harton area of South Shields. She said: 'I feel like a new woman since losing weight. In fact, I look so different that people who I havent seen for a while often can't believe I'm the same person. 'For me though it's the change on the inside that's been the biggest - I'm happier, healthier and much more confident now. Michelle, pictured with her uncle Derek, joined a slimming group in 2013 in a bid to shift the weight, she says she had struggled with high blood pressure, back pain and that she got breathless easily Michelle was not fat as a child but piled on the pounds thanks to her diet of fish and chips and sandwhiches Now a size 12, Michelle no longer feels the need to eat a whole loaf of bread every day She continued: 'I still enjoy all my favourite meals like burgers and chips and roast dinners but I've learned how to make small changes like using lean meat or cooking with low calorie spray instead of oil or butter. 'It fits in really well with the rest of my family and we can all eat the same meals.' Michelle weighed 25st 3lbs when she joined the group in April 2013 and has since dropped to 12st 10lbs. She said: 'Before I lost the weight I hid behind a big bubbly personality. I'd pretend it didn't bother me that I was bigger than most other people, but that was far from the truth. 'I hated shopping for clothes and found just climbing up stairs and doing simple everyday tasks would leave me tired and out of breath.' Michelle's weight was also putting a huge strain on her health and she suffered with high blood pressure, chronic back pain and got breathless easily."
human_str="Michelle Quinn ballooned thanks to her addiction to bread and chips The 42-year-old would eat a whole loaf of bread a day In 2013 she weighed more than 25st and was a size 30 dress size After joining a slimming group her weight has dropped to 12 stone 10lbs"



In [41]:
article = nlp(article_str)  
human = nlp(human_str)

weights = getWeights(article, human)

print(weights)

attentionRecords = []
inputRecords = []
outputRecords = []

for human_index, human_token in enumerate(human):
    for article_index, article_token in enumerate(article):
        
        attn_weight = weights[human_index][article_index]
                
        attentionRecords.append({
            'inputIndex': article_index,
            'outputIndex': human_index,
            'weight': attn_weight
        })
        
for human_index, human_token in enumerate(human):
    outputRecords.append({
        'index': human_index,
        'token': human_token.text
    })
    
for article_index, article_token in enumerate(article):
    inputRecords.append({
        'index': article_index,
        'token': article_token.text
    })
            
output = {
    'attentionRecords': attentionRecords,
    'inputTokens': inputRecords,
    'outputTokens': outputRecords
}

with open('/Users/haldenl/nlpcapstone/data/hierarchical_similarity_data_{0}.json'.format(0), 'w') as out:
    json.dump(output, out, indent=2)


[[0.00177922 0.00177922 0.00177922 0.00177922 0.00177922 0.00177922
  0.00177922 0.00177922 0.00177922 0.00177922 0.00177922 0.00177922
  0.00177922 0.00177922 0.00177922 0.00177922 0.00177922 0.00177922
  0.00177922 0.00177922 0.00177922 0.00177922 0.00177922 0.00177922
  0.00177922 0.00177922 0.00177922 0.00177922 0.00177922 0.00177922
  0.00177922 0.00177922 0.00177922 0.00178278 0.00178278 0.00178278
  0.00178278 0.00178278 0.00178278 0.00178278 0.00178278 0.00178278
  0.00178278 0.00178278 0.00178278 0.00178278 0.00178278 0.00178278
  0.00178278 0.00178278 0.00178278 0.00178278 0.00178278 0.00178278
  0.00177793 0.00177793 0.00177793 0.00177793 0.00177793 0.00177793
  0.00177793 0.00177793 0.00177793 0.00177793 0.00177793 0.00177793
  0.00177793 0.00177793 0.00177793 0.00177793 0.00177793 0.00177793
  0.00177793 0.00177793 0.00177793 0.00177793 0.00177793 0.00177793
  0.00177793 0.00177793 0.00177793 0.00177793 0.00177793 0.00177677
  0.00177677 0.00177677 0.00177677 0.00177677 0.