## Importing libraries

In [22]:
import collections
import itertools
import logging

import networkx as nx
import operator
from cytoolz import itertoolz
from spacy.tokens.span import Span as SpacySpan
from spacy.tokens.token import Token as SpacyToken

import compat
import extract
import vsm

LOGGER = logging.getLogger(__name__)

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

import nltk
from keras.preprocessing.text import Tokenizer

from gensim.models.phrases import Phrases, Phraser

import numpy as np
import spacy 
import textacy
nlp = spacy.load('en_core_web_sm')

## Text used
Citigroup analysts are predicting a full-on bear market within months based on historical trends, according to a new note by equity strategist Robert Buckland. Here's how you can protect your portfolio. Most everyone will suffer losses in a bear market short-sellers are winners, of course, but investors can decide now for themselves how much they are willing to risk says certified financial planner Alexander G. Koury of Values Quest, Inc. The first thing to do is check the current risk of the portfolio, Koury said. This will help the investor determine what would be the worst case scenario if the market were to go into a bear market. That means an investor will know how much they're willing to lose of their portfolio, and they can determine whether or not that is comfortable for them. Investors who don't plan to make withdrawals from their portfolios for decades could leave their investments be until the next bull market, but investors planning on retiring soon might want to limit their exposure. Koury recommends that investors should seek the help of either a financial planner or software to see if a reallocation is necessary to help them meet their goals. Set Aside What You Need To Live. In addition to limiting their exposure to equities, retirees and other investors living off of their portfolio's returns also should prioritize their living expenses over investing when the market's down. If you are taking income from your portfolio, always be sure you have a couple year's worth of withdrawals in money market or short term bonds, said Edward Snyder, certified financial planner at Oak Tree Advisors. The rest of your portfolio should be diversified among major asset classes, including intermediate term bonds. This should allow you to ride out a down market without having to sell stock investments while the market is down. Mentally Prepare Yourself. Your own bad investment decisions can cost your portfolio as much as market losses, certified financial planner Patrick Amey thinks. Prepare yourself emotionally to ride it out and tune out the noise, Amey said. Yes, your portfolio will go down in value. But you have the cash you need so you can give your portfolio the time it needs to recover. Stay consistent with you allocation and don't make knee jerk decision.

In [7]:
doc = nlp(u"Citigroup analysts are predicting a full-on bear market within months based on historical trends, according to a new note by equity strategist Robert Buckland. Here's how you can protect your portfolio. Most everyone will suffer losses in a bear market short-sellers are winners, of course, but investors can decide now for themselves how much they are willing to risk says certified financial planner Alexander G. Koury of Values Quest, Inc. The first thing to do is check the current risk of the portfolio, Koury said. This will help the investor determine what would be the worst case scenario if the market were to go into a bear market. That means an investor will know how much they're willing to lose of their portfolio, and they can determine whether or not that is comfortable for them. Investors who don't plan to make withdrawals from their portfolios for decades could leave their investments be until the next bull market, but investors planning on retiring soon might want to limit their exposure. Koury recommends that investors should seek the help of either a financial planner or software to see if a reallocation is necessary to help them meet their goals. Set Aside What You Need To Live. In addition to limiting their exposure to equities, retirees and other investors living off of their portfolio's returns also should prioritize their living expenses over investing when the market's down. If you are taking income from your portfolio, always be sure you have a couple year's worth of withdrawals in money market or short term bonds, said Edward Snyder, certified financial planner at Oak Tree Advisors. The rest of your portfolio should be diversified among major asset classes, including intermediate term bonds. This should allow you to ride out a down market without having to sell stock investments while the market is down. Mentally Prepare Yourself. Your own bad investment decisions can cost your portfolio as much as market losses, certified financial planner Patrick Amey thinks. Prepare yourself emotionally to ride it out and tune out the noise, Amey said. Yes, your portfolio will go down in value. But you have the cash you need so you can give your portfolio the time it needs to recover. Stay consistent with you allocation and don't make knee jerk decision.")

In [23]:
def key_terms_from_semantic_network(doc, normalize='lemma',
                                    window_width=2, edge_weighting='cooc',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.
    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if None, use the form of terms as they appeared in
            ``doc``; if a callable, must accept a ``spacy.Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ('binary', 'cooc_freq'}): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            is converted to an integer by ``round(len(doc) * n_keyterms)``
    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms and
        their corresponding ranking scores
    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(len(doc) * n_keyterms))

    include_pos = {'NOUN', 'PROPN', 'ADJ'}
    if normalize == 'lemma':
        word_list = [word.lemma_ for word in doc]
        good_word_list = [word.lemma_ for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    elif normalize == 'lower':
        word_list = [word.lower_ for word in doc]
        good_word_list = [word.lower_ for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    elif not normalize:
        word_list = [word.text for word in doc]
        good_word_list = [word.text for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    else:
        word_list = [normalize(word) for word in doc]
        good_word_list = [normalize(word) for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]

    # HACK: omit empty strings, which happen as a bug in spacy as of v1.5
    # and may well happen with ``normalize`` as a callable
    # an empty string should never be considered a keyterm
    good_word_list = [word for word in good_word_list if word]
    
    
    ## the function is called as it is as it's in the same notebook
    ## terms_to_semantic_network() is modified
    ## TERMS_TO_SEMANTIC_NETWORK IS MODIFIED
    graph = terms_to_semantic_network( good_word_list, window_width = window_width, edge_weighting = edge_weighting)

    # rank nodes by algorithm, and sort in descending order
   

    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(  graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
          word_ranks = rank_nodes_by_bestcoverage(
               graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    
    
    if join_key_words is False:
        return [(word, score) for word, score in
                sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keyterms]]

    
    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {word: rank for word, rank in
                      sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:top_n]}

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append((term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=operator.itemgetter(1, 0), reverse=True)[:n_keyterms]

   

### keywords with embeddings

In [24]:
key_terms = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='embedding',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10)   

SUCCESSFUL


In [25]:
key_terms

[('portfolio', 0.07369445602364152),
 ('market', 0.0576838968632258),
 ('investor', 0.028771518488359696),
 ('case', 0.022916503859436804),
 ('value', 0.022668765270240723),
 ('equity', 0.022646900586409592),
 ('planner', 0.02264690058640958),
 ('term', 0.022341742486389593),
 ('investment', 0.020783941447813422),
 ('intermediate', 0.020511739216136447)]

### keywords with co occurence frequency

In [28]:
key_terms_new = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='cooc_freq',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10)   

SUCCESSFUL


In [29]:
key_terms_new

[('portfolio', 0.0667483389589195),
 ('market', 0.06015778159753895),
 ('investor', 0.04244749210697408),
 ('planner', 0.02634748492269935),
 ('financial', 0.024609262141485413),
 ('koury', 0.020062844208003718),
 ('investment', 0.019659376113500673),
 ('bear', 0.01867282743668402),
 ('equity', 0.016164106083899316),
 ('amey', 0.014749823019943726)]

## Helper Functions for terms_to_semantic_network which makes the graph

##### Helper function 1 :  Returns emedding_matrix with a dictionary that stores the index of every word

In [10]:
def get_word_embedding(terms):
    filename = 'C:/Users/hp/Word_embeddings/GoogleNews-vectors-negative300.bin'
    model = gensim.models.KeyedVectors.load_word2vec_format( filename , binary=True)

    words_to_index = {}
    i = 0;
    for word in terms:
        if not word in words_to_index:
            words_to_index[str(word)] = i
            i = i + 1
        else:
            continue
    
    embedding_matrix = np.zeros(( len(words_to_index) , 300 ))
    for word , i in words_to_index.items():
        try:
            embedding_vector = model[word]
        except KeyError:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        

    return (embedding_matrix , words_to_index)

##### Helper function 2 :  Returns a nested dictiornary with the cosine similarity between the words that are within the window size

In [17]:
def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    if norm_a == 0 or norm_b == 0 :
        return 0
    
    return dot_product / (norm_a * norm_b)


def get_cosine_mat(windows , embedding_matrix , words_to_index ):
    from sklearn.metrics.pairwise import cosine_similarity
    from scipy import spatial
    cosine_mat = collections.defaultdict(lambda: collections.defaultdict(float))
  
    for window in windows:
        for w1, w2 in itertools.combinations(sorted(window), 2):
            cosine_mat[w1][w2] = cos_sim(embedding_matrix[words_to_index[w1]] , embedding_matrix[words_to_index[w2]] )
          
            
    return cosine_mat 

##### Helper function 3 : Returns a nested dictiornary with the PMI between the words that are within the window size

In [12]:
from collections import defaultdict
import math

def count_of_single( Tokens , freqThreshold):          
    word_freq = defaultdict(int)
    fdist = defaultdict(int)
    for a in Tokens:
        fdist[a] += 1 
    
    for word , freq in sorted(fdist.items(), key=lambda k__v: (k__v[1],k__v[0])):
        if freq > freqThreshold:
            word_freq[word] = freq
    return word_freq

def count_of_bigrams( Tokens ,freqThreshold):
        bigram_freq = defaultdict(int)

        b = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
        b.apply_freq_filter(freqThreshold)
       
        for bigram, freq in b.ngram_fd.items():
                bigram=" ".join([bigram[0], bigram[1]])
                bigram_freq[bigram] = freq
        return bigram_freq
    
def pmi(w1, w2, unigram_freq , bigram_freq):

    prob_word1 = unigram_freq[w1] / float(sum(unigram_freq.values()))
    prob_word2 = unigram_freq[w2] / float(sum(unigram_freq.values()))
    prob_word1_word2 = bigram_freq[" ".join([w1, w2])] / float(sum(bigram_freq.values()))

    if prob_word1_word2 == 0 :
        return 0

    try:

        return math.log(prob_word1_word2/float(prob_word1*prob_word2),2)

    except: # Occurs when calculating PMI for Out-of-Vocab words.

        return 0

def get_pmi_mat(terms , windows ):
    pmi_mat = collections.defaultdict(lambda: collections.defaultdict(float))
    unigram_freq = count_of_single( terms , 0)
    bigram_freq = count_of_bigrams( terms , 0)
   
    for window in windows:
        for w1, w2 in itertools.combinations(sorted(window), 2):
            pmi_mat[w1][w2] = pmi( w1 , w2 ,  unigram_freq , bigram_freq )
         
            
    return pmi_mat        
    

##### Helper function 4 :  Returns a nested dictionary with the score between all the words within the window size 
       score = pmi * similarity

In [13]:
def get_final_weights( cosine_mat , pmi_mat ):
    
    final_weights = collections.defaultdict(lambda: collections.defaultdict(float))
    for (i , j) , (k , l) in zip(cosine_mat.items() ,pmi_mat.items()):
        for (a , b) , (c , d) in zip(j.items() , l.items()):
                final_weights[i][a] = b*d 
              
          
    return final_weights

#### Modified fucntion:
     1. Option " embedding " is added

In [27]:
def terms_to_semantic_network(terms, normalize='lemma', window_width=10, edge_weighting='cooc_freq'):
   
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.
    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or Callable): If 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``spacy.Token`` and return
            a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``spacy.Token``.
        window_width (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.
    Returns:
        ``networkx.Graph``: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            '`window_width` = {} is invalid; value must be >= 2'.format(window_width))
    if not terms:
        raise ValueError(
            '`terms` = {} is invalid; it must contain at least 1 term '
            'in the form of a string or spacy token'.format(terms))

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            '`terms` has fewer items (%s) than the specified `window_width` (%s); '
            'setting window width to %s',
            len(terms), window_width, len(terms))
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
        to_return = list(windows)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = ((tok.lemma_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = ((tok.lower_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = ((tok.text for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = ((normalize(tok) for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            'items in `terms` must be strings or spacy tokens, not {}'.format(type(terms[0])))

  
  
    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(lambda: collections.defaultdict(int))
        for window in to_return:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
               
                               
        graph.add_edges_from(
            (w1, w2, {'weight': weight})
            for w1, w2s in cooc_mat.items()
            for w2, weight in w2s.items())
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))
    
    ## option of embedding is added
    elif edge_weighting == 'embedding':
        embedding_matrix , word_to_index = get_word_embedding(terms)
        cosine_mat = get_cosine_mat(to_return , embedding_matrix  , word_to_index )
        pmi_mat = get_pmi_mat( terms , to_return )
        final_weights = get_final_weights( cosine_mat , pmi_mat )
        
        for window in to_return:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                graph.add_weighted_edges_from(
                    (w1, w2, weight)
                    for w1, w2s in final_weights.items()
                    for w2, weight in w2s.items())
   
    print("SUCCESSFUL")
    return graph
  
   
  
    

In [None]:
##return_graph = terms_to_semantic_network(terms, normalize='lemma', window_width=2, edge_weighting='embedding')