In [1]:
# Jason Galvan

In [2]:
pwd

'/Users/batman/Downloads'

In [3]:
from spacy.util import minibatch, compounding
from pathlib import Path
import random
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

In [5]:
def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

In [6]:
def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

In [7]:
def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

In [8]:
def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

In [9]:
def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

In [10]:
def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

In [11]:
def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

In [12]:
def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

In [13]:
def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

In [14]:
def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

In [15]:
def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)
    #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
    #    objs.extend(potentialNewObjs)
    #    v = potentialNewVerb

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

In [16]:
def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

In [17]:
def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

def testSVOs():
    #nlp = English()

    tok = nlp("In a surprise blog post, Amazon said it will put the brakes on providing its facial recognition technology to police for one year, but refuses to say if the move applies to federal law enforcement agencies."  
              "The moratorium comes two days after IBM said in a letter it was leaving the facial recognition market altogether." 
              "Arvind Krishna, IBM's chief executive, cited a pursuit of justice and racial equity in light of the recent protests sparked by the killing of George Floyd by a white police officer in Minneapolis last month."  
              "Amazon's statement — just 102 words in length — did not say why it was putting the moratorium in place, but noted that Congress appears ready to work on stronger regulations governing the use of facial recognition — again without providing any details." 
              "It's likely in response to the Justice in Policing Act, a bill that would, if passed, restrict how police can use facial recognition technology."
              "We hope this one-year moratorium might give Congress enough time to implement appropriate rules, and we stand ready to help if requested, said Amazon in the unbylined blog post."
              "But the statement did not say if the moratorium would apply to the federal government, the source of most of the criticism against Amazon's facial recognition technology." 
              "Amazon also did not say in the statement what action it would take after the yearlong moratorium expires."
              "Amazon is known to have pitched its facial recognition technology, Rekognition, to federal agencies, like Immigration and Customs Enforcement." 
              "Last year, Amazon's cloud chief Andy Jassy said in an interview the company would provide Rekognition to any government department."
              "Amazon spokesperson Kristin Brown declined to comment further or say if the moratorium applies to federal law enforcement."
              "There are dozens of companies providing facial recognition technology to police, but Amazon is by far the biggest." 
              "Amazon has come under the most scrutiny after its Rekognition face-scanning technology showed bias against people of color.")
    svos = findSVOs(tok)
    #printDeps(tok)
    print(svos)

    


In [18]:
#Extract and print subject-verb-object (SVO) relations from each sentence 
if __name__ == "__main__":
    testSVOs()

[('it', 'put', 'brakes'), ('it', 'leaving', 'market'), ('krishna', 'cited', 'pursuit'), ('krishna', 'cited', 'equity'), ('it', 'putting', 'moratorium'), ('regulations', 'governing', 'use'), ('police', 'use', 'technology'), ('moratorium', 'give', 'congress'), ('moratorium', 'give', 'time'), ('time', 'implement', 'rules'), ('amazon', 'pitched', 'technology'), ('company', 'provide', 'rekognition'), ('company', 'provide', 'to'), ('companies', 'providing', 'technology'), ('technology', 'showed', 'bias')]


In [19]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [20]:
nlp = spacy.load('en_core_web_sm')

In [21]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [22]:
keyphrase_extractor = TextRank4Keyword()

In [23]:
#Apply TextRank for ranking and selecting key phrases, print the result

text = """In a surprise blog post, Amazon said it will put the brakes on providing its facial recognition technology to police for one year, but refuses to say if the move applies to federal law enforcement agencies.

The moratorium comes two days after IBM said in a letter it was leaving the facial recognition market altogether.

Arvind Krishna, IBM's chief executive, cited a pursuit of justice and racial equity in light of the recent protests sparked by the killing of George Floyd by a white police officer in Minneapolis last month.

Amazon's statement — just 102 words in length — did not say why it was putting the moratorium in place, but noted that Congress appears ready to work on stronger regulations governing the use of facial recognition — again without providing any details.

It's likely in response to the Justice in Policing Act, a bill that would, if passed, restrict how police can use facial recognition technology.

We hope this one-year moratorium might give Congress enough time to implement appropriate rules, and we stand ready to help if requested, said Amazon in the unbylined blog post.

But the statement did not say if the moratorium would apply to the federal government, the source of most of the criticism against Amazon's facial recognition technology.

Amazon also did not say in the statement what action it would take after the yearlong moratorium expires.

Amazon is known to have pitched its facial recognition technology, Rekognition, to federal agencies, like Immigration and Customs Enforcement.

Last year, Amazon's cloud chief Andy Jassy said in an interview the company would provide Rekognition to any government department.

Amazon spokesperson Kristin Brown declined to comment further or say if the moratorium applies to federal law enforcement.

There are dozens of companies providing facial recognition technology to police, but Amazon is by far the biggest. 

Amazon has come under the most scrutiny after its Rekognition face-scanning technology showed bias against people of color."""


tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN',"ADP"], window_size=8, lower=False)
tr4w.get_keywords(10)

Amazon - 3.8802230107695324
recognition - 3.063949530407874
technology - 2.771828162901035
police - 2.408977137971219
moratorium - 2.352823648224532
Rekognition - 1.8097523672725209
year - 1.7093370318643593
IBM - 1.3081532659520239
Congress - 1.2540659965274186
government - 1.2481422535786657
light - 1.2223783339281367
protests - 1.219482203432928


In [24]:
#Another TextRank Implementation 
#!conda install -c anaconda nltk

In [25]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [26]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

In [27]:
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk
    
    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary

In [28]:
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import operator
    import networkx, nltk
    
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keywords]}
                  #for word_rank in sorted(ranks.iteritems(), key=lambda x: x[1], reverse=True)[:n_keywords]}
                  
    #sorted(max_value_score.items(), key=operator.itemgetter(1), reverse=True)[:3]
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)
            
    return sorted(keyphrases.items(), key=operator.itemgetter(1), reverse=True)
    #return sorted(keyphrases.iteritems(), key=lambda x: x[1], reverse=True)

In [29]:
text = """In a surprise blog post, Amazon said it will put the brakes on providing its facial recognition technology to police for one year, but refuses to say if the move applies to federal law enforcement agencies.

The moratorium comes two days after IBM said in a letter it was leaving the facial recognition market altogether.

Arvind Krishna, IBM's chief executive, cited a pursuit of justice and racial equity in light of the recent protests sparked by the killing of George Floyd by a white police officer in Minneapolis last month.

Amazon's statement — just 102 words in length — did not say why it was putting the moratorium in place, but noted that Congress appears ready to work on stronger regulations governing the use of facial recognition — again without providing any details.

It's likely in response to the Justice in Policing Act, a bill that would, if passed, restrict how police can use facial recognition technology.

We hope this one-year moratorium might give Congress enough time to implement appropriate rules, and we stand ready to help if requested, said Amazon in the unbylined blog post.

But the statement did not say if the moratorium would apply to the federal government, the source of most of the criticism against Amazon's facial recognition technology.

Amazon also did not say in the statement what action it would take after the yearlong moratorium expires.

Amazon is known to have pitched its facial recognition technology, Rekognition, to federal agencies, like Immigration and Customs Enforcement.

Last year, Amazon's cloud chief Andy Jassy said in an interview the company would provide Rekognition to any government department.

Amazon spokesperson Kristin Brown declined to comment further or say if the moratorium applies to federal law enforcement.

There are dozens of companies providing facial recognition technology to police, but Amazon is by far the biggest. 

Amazon has come under the most scrutiny after its Rekognition face-scanning technology showed bias against people of color."""

score_keyphrases_by_textrank(text)

[('amazon', 0.05295206215281476),
 ('moratorium', 0.03853730582425931),
 ('facial', 0.02501702815608999),
 ('technology', 0.023647286518997324),
 ('—', 0.021017787716854034),
 ('rekognition', 0.02031813115085264),
 ('federal', 0.019365986155854005)]

In [30]:
pip install sumy

Note: you may need to restart the kernel to use updated packages.


In [31]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

class TextSummary(object):

    def __init__(self, feeds_str, num_sents):
        self.summary = str()
        
        parser = PlaintextParser.from_string(feeds_str, Tokenizer("english"))
        summarizer = LexRankSummarizer()

        sentences = summarizer(parser.document, num_sents)  # Summarize the document with 5 sentences
        for sentence in sentences:
            self.summary += (sentence.__unicode__())

    def output(self):
        return self.summary

In [32]:
input_text = """In a surprise blog post, Amazon said it will put the brakes on providing its facial recognition technology to police for one year, but refuses to say if the move applies to federal law enforcement agencies.

The moratorium comes two days after IBM said in a letter it was leaving the facial recognition market altogether.

Arvind Krishna, IBM's chief executive, cited a pursuit of justice and racial equity in light of the recent protests sparked by the killing of George Floyd by a white police officer in Minneapolis last month.

Amazon's statement — just 102 words in length — did not say why it was putting the moratorium in place, but noted that Congress appears ready to work on stronger regulations governing the use of facial recognition — again without providing any details.

It's likely in response to the Justice in Policing Act, a bill that would, if passed, restrict how police can use facial recognition technology.

We hope this one-year moratorium might give Congress enough time to implement appropriate rules, and we stand ready to help if requested, said Amazon in the unbylined blog post.

But the statement did not say if the moratorium would apply to the federal government, the source of most of the criticism against Amazon's facial recognition technology.

Amazon also did not say in the statement what action it would take after the yearlong moratorium expires.

Amazon is known to have pitched its facial recognition technology, Rekognition, to federal agencies, like Immigration and Customs Enforcement.

Last year, Amazon's cloud chief Andy Jassy said in an interview the company would provide Rekognition to any government department.

Amazon spokesperson Kristin Brown declined to comment further or say if the moratorium applies to federal law enforcement.

There are dozens of companies providing facial recognition technology to police, but Amazon is by far the biggest. 

Amazon has come under the most scrutiny after its Rekognition face-scanning technology showed bias against people of color."""



In [33]:
#Apply LexRank to produce an extractive summary of 5 sentences.
text_to_sum = TextSummary(input_text,5)
print(text_to_sum.output())

In a surprise blog post, Amazon said it will put the brakes on providing its facial recognition technology to police for one year, but refuses to say if the move applies to federal law enforcement agencies.The moratorium comes two days after IBM said in a letter it was leaving the facial recognition market altogether.Amazon's statement — just 102 words in length — did not say why it was putting the moratorium in place, but noted that Congress appears ready to work on stronger regulations governing the use of facial recognition — again without providing any details.But the statement did not say if the moratorium would apply to the federal government, the source of most of the criticism against Amazon's facial recognition technology.There are dozens of companies providing facial recognition technology to police, but Amazon is by far the biggest.


In [59]:
#End of Assignment 5 