# Generating human 'attention' from source / summary input.

load the data

In [7]:
import json
# Read the attention-visualisation examples. The context manager closes the
# file handle as soon as parsing is done (the handle was previously leaked).
with open('/Users/haldenl/nlpcapstone/data/attn_vis_data.json', 'r') as f:
    data = json.load(f)

# Show which fields the first example carries.
print(data[0].keys())

dict_keys(['p_gens', 'article_lst', 'abstract_str', 'decoded_lst', 'attn_dists'])


process

In [8]:
import spacy
# Shared spaCy pipeline: used below to parse the article and summary strings
# and, inside getWeights, for sentence/token `.similarity` calls.
# NOTE(review): presumably the large model is chosen because `.similarity`
# needs real word vectors — confirm before swapping in a smaller model.
nlp = spacy.load('en_core_web_lg')

In [9]:
import numpy as np
# Never summarise arrays with "..." when printing; always show every element.
np.set_printoptions(threshold=np.inf)


softmax function, from @nolanbconaway

In [10]:
def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array or array-like (e.g. a list or nested list of numbers).
        It is converted with np.asarray, so plain Python sequences work.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis (axis 0 when every axis has length 1).
        Note that X is promoted to at least 2-D before `axis` is applied,
        so the elements of a 1-D input lie along axis 1.

    Returns an array the same size as X. The result will sum to 1
    along the specified axis.
    """

    # accept lists / tuples as well as arrays (they have no .shape)
    X = np.asarray(X)

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis; fall back to 0 when no axis is longer than 1 instead of
    # letting next() raise StopIteration
    if axis is None:
        axis = next((i for i, dim in enumerate(y.shape) if dim > 1), 0)

    # multiply y against the theta parameter
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis = axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # undo the 2-D promotion so the result has the caller's shape
    if X.ndim < 2:
        p = p.reshape(X.shape)

    return p

### Calculates the similarity between sentences in the article and the summary (softmaxed)

In [35]:
def getSentenceSimilarities(article_sentences, human_sentences):
    """
    Score every summary sentence against every article sentence.

    Parameters
    ----------
    article_sentences: sequence of article sentences (columns).
    human_sentences: sequence of summary sentences (rows); each must
        provide `.similarity(other)`.

    Returns an array of shape (len(human_sentences), len(article_sentences))
    holding the raw similarities passed through softmax(theta=20) along
    each row.
    """
    n_rows = len(human_sentences)
    n_cols = len(article_sentences)
    scores = np.zeros([n_rows, n_cols])

    # one row per summary sentence, filled left to right over the article
    for row, summary_sent in enumerate(human_sentences):
        scores[row, :] = [
            summary_sent.similarity(source_sent)
            for source_sent in article_sentences
        ]

    return softmax(scores, theta=20, axis=1)
    

### Calculates 'attention' weights between article and summary

In [40]:
def getWeights(article, human):
    """
    Build a token-level 'attention' matrix between a summary and its article.

    Each (summary token, article token) cell is the product of
      * the softmaxed similarity of the two sentences containing the tokens
        (from getSentenceSimilarities), and
      * the similarity of the two tokens themselves, forced to 0 when either
        token is a single character.
    Every row is then passed through softmax(theta=100) so it sums to 1.

    Parameters
    ----------
    article: parsed article (spaCy Doc); its tokens index the columns.
    human: parsed human summary (spaCy Doc); its tokens index the rows.

    Returns an array of shape (len(human), len(article)). When either
    document has no tokens the (empty) zero matrix is returned as-is.
    """
    weights = np.zeros([len(human), len(article)])

    # Nothing to score; softmax cannot reduce over a zero-length axis.
    if weights.size == 0:
        return weights

    article_sentences = list(article.sents)
    human_sentences = list(human.sents)

    sentence_similarities = getSentenceSimilarities(article_sentences, human_sentences)

    for human_index, human_sent in enumerate(human_sentences):
        for article_index, article_sent in enumerate(article_sentences):
            sentence_sim = sentence_similarities[human_index][article_index]

            # Iterate the tokens of the already-parsed sentences rather than
            # re-running nlp() on each sentence's text: re-parsing repeated
            # the work for every sentence pair and could tokenise differently
            # from the parent document, breaking the row/column alignment.
            for human_token in human_sent:
                for article_token in article_sent:
                    if len(human_token.text) == 1 or len(article_token.text) == 1:
                        # single-character tokens never contribute
                        similarity = 0.0
                    else:
                        similarity = human_token.similarity(article_token)

                    # Token.i is the token's index in its parent document,
                    # which is exactly the row/column of the matrix.
                    # Store the combined weight (the sentence-level score was
                    # being stored on its own, leaving `similarity` unused).
                    weights[human_token.i][article_token.i] = sentence_sim * similarity

    weights = softmax(weights, theta=100, axis=1)

    return weights

In [41]:
# Compute the similarity-based 'attention' for one article/summary pair and
# dump it as a flat list of (summary token, article token, weight) records.
for k, example in enumerate(data[:1]):
    # NOTE(review): `example` is never read — the article and summary are
    # hard-coded below and only `k` is used (for the output file name).
    # Confirm whether they should come from `example` instead.

    # The article is a single string with no line breaks; it is written as
    # adjacent literals because a "..." literal cannot span physical lines.
    article_str = (
        "Twitter is on the verge of its initial public offering and everyone's knickers have been in a knot all week over the company's lack of management diversity -- that is, women. As The New York Times put it last week, \"The board? All white men. The investors? All men. The executive officers? All men but for the general counsel, Vijaya Gadde, who has had the job for five weeks.\" Question: Why is this a problem? First, there aren't enough women at Twitter or in the tech world because there aren't enough women anywhere. Some facts: Women make up 6% of chief executives at the leading 100 tech companies, and that has taken years to accomplish. Most startups have all-male boards. In 2012 women held 16.6% of Fortune 500 board seats. Women of color were 3.3% of the total. Fully one-tenth had no women serving on their boards at all. In the past five years, women and minorities have lost ground despite evidence strongly suggesting that gender parity and board diversity have positive effects on profitability. Twitter had more than a year to prepare for this IPO: Where are the women? The company's immediate response focused on a \"paucity of candidates.\" The tech industry has a well-documented pipeline problem, one largely the result of gender stereotypes that reach into the educational system. However, companies regularly draw executives from outside their own industries -- sometimes, yes, even women -- when seeking senior-level and board positions. It enables them to cross-pollinate ideas, diversify their expertise and innovate. For example, Apple has hired Burberry CEO Angela Ahrendts. During her tenure at Burberry, the company doubled its revenues and tripled its share price. Companies redefine \"pipeline\" every day -- particularly when profits are involved. The technology sector dresses itself up as progressive when in reality it shows every indication of being, at its core, powerfully retrograde. "
        "Despite investing in diversity programs, the management of tech firms is distinctly not diverse, and indeed the industry continues to \"dazzle\" with incidents in which men gleefully display their not even implicit biases. Last month's Titshare debacle (click and weep), as well as Business Insider's long-overdue firing of CTO Pax Dickinson (who'd for years been churning out such sexist, racist, tweets as \"Jesus gets raped by a pack of n****s. It's his own fault for dressing like a whore though\") are two recent examples. Second, sexism is seamlessly coupled with the distribution of capital. Bryan Goldberg's September launch of Bustle, a \"women's centric\" website (\"world news and politics alongside beauty tips\") was a blunt force case in point. After announcing the site with a tone-deaf post, Goldberg was widely mocked for his personal failure to grasp, among other things, how ridiculous his claim to be starting the \"first site of its kind\" for women was. The real issue isn't Goldberg's cluelessness, but the institutional biases that enabled him to raise $6.5 million when far more able, knowledgeable and experienced women can't. Companies with at least one woman founder make up only 13% of those funded. Venture capitalists are less likely to invest in startups if there are women involved in their management; investors actively reduce holdings in companies that appoint female directors. These are particular ironies since women-run startups use 40% less capital to launch. These facts don't reflect women lacking in confidence ideas. They illustrate discrimination, whether it is conscious or not. Third, while we think of Twitter as a tech company, it is a media company and part of a larger environment that does little to correct its failure at diversity. "
        "Year after year, studies such as one last week from the Directors Guild of America, as well as reports from Who Makes the News and the Women's Media Center Status on Women in Media document exactly how distorted mainstream media ownership, management and production remain. Only online is the situation improving. But even there the difference is largely gains in women-oriented \"pink-collar\" content. How can we separate these facts from ownership? We can't. As reported by the Federal Communications Commission, our media are almost entirely owned and managed by white men. Lastly, what does this have to do with speech? Everything. Male experiences, interests, expectations and voices, mainly white, inform the way we think, decide how our resources are dispersed and define our norms. The ghettoized status of women and minorities in media and technology, coupled with the lack of venture capital investments, means that our attempts to express ourselves are limited, misrepresented and regularly repackaged to make what we say palatable to a sexist status quo. And this status quo is entirely uninterested in the idea of women as capable, autonomous leaders who might change norms in unsettling, risky ways. Stating these facts baldly is not an indictment of white men as individuals. It is a description of systemic problems that we refuse to confront with systemic solutions. The presence of a carefully selected handful of women in tech, regardless of how determined, able and prominently visible they are, has for decades done little to alter the makeup of management and ownership. Gender diversity at Twitter, as elsewhere, isn't a priority because people make reasonable decisions about what they believe will be profitable and successful: Twitter is looking for experience and expertise within a comfort zone. \"This to me is not a gender issue, it's an innovation issue,\" Twitter CEO Dick Costolo has explained. "
        "Gender issues aren't innovation issues only if your gender is dominant and the norm. This rationale, and others like it, are exactly the kind of \"1,000 paper cuts\" marginalization that result in fewer women pursuing tech careers to begin with. Twitter is one part of a male dominated social structure, economy and culture, all of which rely on cradle-to-grave sexism to be profitable. That's tweetable by the way."
    )
    human_str = " Soraya Chemaly: Twitter, on verge of IPO, is under fire with leadership that's virtually all men She says it reflects larger dearth of women in tech, media fields -- 6% of tech CEOs are women She says sexism abounds in male-dominated field and guides investment, leaving women out Chemaly: Controlling women's access makes men keepers of speech, keeps sexist status quo"
    article = nlp(article_str)
    human = nlp(human_str)

    # weights[i][j]: weight of article token j for summary token i
    weights = getWeights(article, human)

    # One record per (summary token, article token) pair.
    output = []
    for human_index, human_token in enumerate(human):
        for article_index, article_token in enumerate(article):
            record = {
                'outputIndex': human_index,
                'outputToken': human_token.text,
                'outputPos': human_token.pos_,
                'inputIndex': article_index,
                'inputToken': article_token.text,
                'inputPos': article_token.pos_,
                # plain Python float, so serialisation does not depend on
                # the NumPy scalar type of the matrix
                'weight': float(weights[human_index][article_index])
            }

            output.append(record)

    with open('/Users/haldenl/nlpcapstone/data/hierarchical_similarity_data_{0}.json'.format(k), 'w') as out:
        json.dump(output, out, indent=2)
        