In [2]:
import collections
import itertools
import logging

import networkx as nx
from cytoolz import itertoolz
from spacy.tokens.span import Span as SpacySpan
from spacy.tokens.token import Token as SpacyToken

import compat
import extract
import vsm

LOGGER = logging.getLogger(__name__)


In [3]:
import operator
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors
import nltk
from keras.preprocessing.text import Tokenizer
from gensim.models.phrases import Phrases, Phraser
import spacy 
import textacy
nlp = spacy.load('en_core_web_sm')

Using Theano backend.
  from ._conv import register_converters as _register_converters


## Text used
Machine learning is a field of computer science that uses statistical techniques to give computer systems the ability to "learn" (e.g., progressively improve performance on a specific task) with data, without being explicitly programmed.The name machine learning was coined in 1959 by Arthur Samuel.Evolved from the study of pattern recognition and computational learning theory in artificial intelligence, machine learning explores the study and construction of algorithms that can learn from and make predictions on data – such algorithms overcome following strictly static program instructions by making data-driven predictions or decisions, through building a model from sample inputs. Machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible; example applications include email filtering, detection of network intruders, and computer vision.Machine learning is closely related to (and often overlaps with) computational statistics, which also focuses on prediction-making through the use of computers. It has strong ties to mathematical optimization, which delivers methods, theory and application domains to the field. Machine learning is sometimes conflated with data mining, where the latter subfield focuses more on exploratory data analysis and is known as unsupervised learning.Within the field of data analytics, machine learning is a method used to devise complex models and algorithms that lend themselves to prediction; in commercial use, this is known as predictive analytics. These analytical models allow researchers, data scientists, engineers, and analysts to "produce reliable, repeatable decisions and results" and uncover "hidden insights" through learning from historical relationships and trends in the data.


In [4]:
text = "Machine learning is a field of computer science that uses statistical techniques to give computer systems the ability to learn (e.g., progressively improve performance on a specific task) with data, without being explicitly programmed.The name machine learning was coined in 1959 by Arthur Samuel.Evolved from the study of pattern recognition and computational learning theory in artificial intelligence, machine learning explores the study and construction of algorithms that can learn from and make predictions on data – such algorithms overcome following strictly static program instructions by making data-driven predictions or decisions, through building a model from sample inputs. Machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible; example applications include email filtering, detection of network intruders, and computer vision.Machine learning is closely related to (and often overlaps with) computational statistics, which also focuses on prediction-making through the use of computers. It has strong ties to mathematical optimization, which delivers methods, theory and application domains to the field. Machine learning is sometimes conflated with data mining, where the latter subfield focuses more on exploratory data analysis and is known as unsupervised learning.Within the field of data analytics, machine learning is a method used to devise complex models and algorithms that lend themselves to prediction; in commercial use, this is known as predictive analytics. These analytical models allow researchers, data scientists, engineers, and analysts to produce reliable, repeatable decisions and results and uncover hidden insights through learning from historical relationships and trends in the data."

In [5]:
doc = nlp(u"Machine learning is a field of computer science that uses statistical techniques to give computer systems the ability to learn (e.g., progressively improve performance on a specific task) with data, without being explicitly programmed.The name machine learning was coined in 1959 by Arthur Samuel.Evolved from the study of pattern recognition and computational learning theory in artificial intelligence, machine learning explores the study and construction of algorithms that can learn from and make predictions on data – such algorithms overcome following strictly static program instructions by making data-driven predictions or decisions, through building a model from sample inputs. Machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible; example applications include email filtering, detection of network intruders, and computer vision.Machine learning is closely related to (and often overlaps with) computational statistics, which also focuses on prediction-making through the use of computers. It has strong ties to mathematical optimization, which delivers methods, theory and application domains to the field. Machine learning is sometimes conflated with data mining, where the latter subfield focuses more on exploratory data analysis and is known as unsupervised learning.Within the field of data analytics, machine learning is a method used to devise complex models and algorithms that lend themselves to prediction; in commercial use, this is known as predictive analytics. These analytical models allow researchers, data scientists, engineers, and analysts to produce reliable, repeatable decisions and results and uncover hidden insights through learning from historical relationships and trends in the data.")

### Function to calculate the keywords

In [26]:
def key_terms_from_semantic_network(doc, normalize='lemma',
                                    window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.
    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if None, use the form of terms as they appeared in
            ``doc``; if a callable, must accept a ``spacy.Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ('binary', 'cooc_freq'}): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the open interval (0, 1),
            is converted to an integer by ``round(len(doc) * n_keyterms)``
    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms and
        their corresponding ranking scores
    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(len(doc) * n_keyterms))

        
        
    if edge_weighting == 'binary':
        include_pos = {'NOUN', 'PROPN', 'ADJ'}
        if normalize == 'lemma':
            word_list = [word.lemma_ for word in doc]
            good_word_list = [word.lemma_ for word in doc
                              if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
        elif normalize == 'lower':
            word_list = [word.lower_ for word in doc]
            good_word_list = [word.lower_ for word in doc
                              if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
        elif not normalize:
            word_list = [word.text for word in doc]
            good_word_list = [word.text for word in doc
                              if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
        else:
            word_list = [normalize(word) for word in doc]
            good_word_list = [normalize(word) for word in doc
                                  if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
        
        # HACK: omit empty strings, which happen as a bug in spacy as of v1.5
        # and may well happen with ``normalize`` as a callable
        # an empty string should never be considered a keyterm
    
    
        good_word_list = [word for word in good_word_list if word]    
        
        graph = terms_to_semantic_network( good_word_list, window_width = window_width, edge_weighting = edge_weighting , phrases = join_key_words )
        
        
       
        
        if ranking_algo == 'pagerank':
            word_ranks = nx.pagerank_scipy(graph , max_iter = 100 ,  weight = 'weight')
        elif ranking_algo == 'divrank':
            word_ranks = rank_nodes_by_divrank(  graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
        elif ranking_algo == 'bestcoverage':
            word_ranks = rank_nodes_by_bestcoverage( graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))
            
            
        if edge_weighting == 'binary' and join_key_words is False:
            return [(word, score) for word, score in
                    sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keyterms]]
    
        elif edge_weighting == 'binary' and join_key_words is True:   
            top_n = int(0.25 * len(word_ranks))
            top_word_ranks = {word: rank for word, rank in sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:top_n]}
            # join consecutive key words into key terms
            seen_joined_key_terms = set()
            joined_key_terms = []
            for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
                if key is True:
                    words = list(group)
                    term = ' '.join(words)
                    if term in seen_joined_key_terms:
                        continue
                    seen_joined_key_terms.add(term)
                    joined_key_terms.append((term, sum(word_ranks[word] for word in words)))        
            return sorted(joined_key_terms, key=operator.itemgetter(1, 0), reverse=True)[:n_keyterms]
        
        
    elif edge_weighting == 'embedding':
        good_word_list_noun_chunks = extract.noun_chunks(doc , drop_determiners = True, min_freq = 1 )
        good_word_list_ner = extract.named_entities( doc, include_types = None, exclude_types = None, drop_determiners = True, min_freq = 1 )
        
        list_noun_chunks = []
        for a in good_word_list_noun_chunks :
            list_noun_chunks.append(str(a))
            
        list_ner = []
        for a in good_word_list_ner :
            list_ner.append(str(a)) 
        
        list_nc_ner = [] 
        for a in list_noun_chunks:
            list_nc_ner.append(str(a))
        for a in list_ner:
            list_nc_ner.append(str(a))    
        
        phrases = []
        words = []

        for a in list_nc_ner:
            if len([b  for b in a.split()]) == 1:
                words.append(a)
            if len([b  for b in a.split()]) > 1:
                phrases.append(a)        
        
        
        
        if edge_weighting == 'embedding' and join_key_words == False:    
            graph = terms_to_semantic_network( words , window_width = window_width, edge_weighting = edge_weighting , phrases = join_key_words)
    
        elif edge_weighting == 'embedding' and join_key_words == True:
            graph = terms_to_semantic_network( phrases , window_width = window_width, edge_weighting = edge_weighting , phrases = join_key_words)
    
         
        if ranking_algo == 'pagerank':
            word_ranks = nx.pagerank_scipy(graph , max_iter = 100 ,  weight = 'weight')
        elif ranking_algo == 'divrank':
            word_ranks = rank_nodes_by_divrank(  graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
        elif ranking_algo == 'bestcoverage':
            word_ranks = rank_nodes_by_bestcoverage( graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.))
            
        
        return [(word, score) for word, score in
                sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keyterms]]
        
            
            

### Keywords with embeddings and phrases

In [27]:
terms = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='embedding',
                                    ranking_algo='pagerank', join_key_words = True,
                                    n_keyterms=10)   
terms

[('machine learning', 0.061804724756885566),
 ('Machine learning', 0.05834873746330665),
 ('computer science', 0.05100664234397605),
 ('data scientists', 0.04700542195976324),
 ('data mining', 0.03611174636299395),
 ('computing tasks', 0.03405591738489863),
 ('sample inputs', 0.032206119162640906),
 ('Arthur Samuel', 0.032206119162640906),
 ('pattern recognition', 0.032206119162640906),
 ('historical relationships', 0.032206119162640906)]

### Keywords with binary and phrases

In [28]:
terms = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words = True,
                                    n_keyterms=10)   
terms

[('computational learning theory', 0.0712335712229255),
 ('machine learning', 0.0647414744052546),
 ('datum analytic', 0.06261682489695981),
 ('datum', 0.047329322036000925),
 ('learning', 0.03934194663842974),
 ('computer', 0.0327936669527043),
 ('method use', 0.03241966295654762),
 ('algorithm', 0.027124563106213542),
 ('prediction', 0.026868210400336164),
 ('model', 0.02331510184695999)]

### Keywords with embeddings and not phrases just words

In [29]:
terms = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='embedding',
                                    ranking_algo='pagerank', join_key_words = False,
                                    n_keyterms=10)   
terms

[('prediction', 0.08434577002010366),
 ('field', 0.07757674791782004),
 ('study', 0.07664986997651164),
 ('methods', 0.0641509069880738),
 ('algorithms', 0.06415090698807378),
 ('results', 0.06415090698807378),
 ('theory', 0.05265271718110291),
 ('detection', 0.046259448947606786),
 ('predictions', 0.045843984357979574),
 ('ability', 0.04395604395604395)]

### Keywords with binary and not phrases just words

In [30]:
terms = key_terms_from_semantic_network( doc, normalize='lemma',
                                    window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words = False,
                                    n_keyterms=10)   
terms

[('datum', 0.047329322036000925),
 ('learning', 0.03934194663842974),
 ('computer', 0.0327936669527043),
 ('algorithm', 0.027124563106213542),
 ('prediction', 0.026868210400336164),
 ('machine', 0.02539952776682486),
 ('model', 0.02331510184695999),
 ('performance', 0.018365129915958473),
 ('field', 0.01828901389385712),
 ('application', 0.017626060335524624)]

### Helper functions

In [25]:
def get_word_embedding(terms):
    filename = 'C:/Users/hp/Word_embeddings/GoogleNews-vectors-negative300.bin'
    model = gensim.models.KeyedVectors.load_word2vec_format( filename , binary=True)

    words_to_index = {}
    i = 0;
    for word in terms:
        if not word in words_to_index:
            words_to_index[str(word)] = i
            i = i + 1
        else:
            continue
   
   
    embedding_matrix = np.zeros(( len(words_to_index) , 300 ))
    for word , i in words_to_index.items():
        try:
            embedding_vector = model[word]
        except KeyError:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        

    return (embedding_matrix , words_to_index)
    

    

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

def cos_sim(a, b):
    """Takes 2 vectors a, b and returns the cosine similarity according 
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    if norm_a == 0 or norm_b == 0 :
        return 0
    
    return dot_product / (norm_a * norm_b)


def get_cosine_mat(windows , embedding_matrix , words_to_index ):
    from scipy import spatial
    cosine_mat = collections.defaultdict(lambda: collections.defaultdict(float))
  
    for window in windows:
        for w1, w2 in itertools.combinations(sorted(window), 2):
               cosine_mat[w1][w2] = cos_sim(embedding_matrix[words_to_index[w1]] , embedding_matrix[words_to_index[w2]] )
           
            
    return cosine_mat 

In [16]:
from collections import defaultdict
import math

def count_of_single( Tokens , freqThreshold):          
    word_freq = defaultdict(int)
    fdist = defaultdict(int)
    for a in Tokens:
        fdist[a] += 1 
    
    for word , freq in sorted(fdist.items(), key=lambda k__v: (k__v[1],k__v[0])):
        if freq > freqThreshold:
            word_freq[word] = freq
    return word_freq

def count_of_bigrams( Tokens ,freqThreshold):
        bigram_freq = defaultdict(int)

        b = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
        b.apply_freq_filter(freqThreshold)
       
        for bigram, freq in b.ngram_fd.items():
                bigram=" ".join([bigram[0], bigram[1]])
                bigram_freq[bigram] = freq
        return bigram_freq
    
def pmi(w1, w2, unigram_freq , bigram_freq):

    prob_word1 = unigram_freq[w1] / float(sum(unigram_freq.values()))
    prob_word2 = unigram_freq[w2] / float(sum(unigram_freq.values()))
    prob_word1_word2 = bigram_freq[" ".join([w1, w2])] / float(sum(bigram_freq.values()))

   
    #print("PMI FOR W1 AND W2 ")
    ##print(w1 , w2)
    #print("Probability of w1 ")
    #print( prob_word1)
    #print("Probability of w2 ")
    #print( prob_word2)
    #print("Probability of w1 and w2 joint ")
    #print( prob_word1_word2 )
    if prob_word1_word2 == 0 :
        return 0
    #print(math.log(prob_word1_word2/float(prob_word1*prob_word2),2))
    try:

        return math.log(prob_word1_word2/float(prob_word1*prob_word2),2)

    except: # Occurs when calculating PMI for Out-of-Vocab words.

        return 0

def get_pmi_mat(terms , windows ):
    pmi_mat = collections.defaultdict(lambda: collections.defaultdict(float))
    unigram_freq = count_of_single( terms , 0)
    bigram_freq = count_of_bigrams( terms , 0)
   
    for window in windows:
        #print(" IN GET_PMI WINDOW IN WINDOWS")
        for w1, w2 in itertools.combinations(sorted(window), 2):
            pmi_mat[w1][w2] = pmi( w1 , w2 ,  unigram_freq , bigram_freq )
            
            
    return pmi_mat        
    

In [15]:
def get_final_weights( cosine_mat , pmi_mat ):
    
    final_weights = collections.defaultdict(lambda: collections.defaultdict(float))
    for (i , j) , (k , l) in zip(cosine_mat.items() ,pmi_mat.items()):
        for (a , b) , (c , d) in zip(j.items() , l.items()):
                final_weights[i][a] = b*d 
              
          
    return final_weights

In [14]:
def get_word_embedding_phrases(terms):
    filename = 'C:/Users/hp/Word_embeddings/GoogleNews-vectors-negative300.bin'
    model = gensim.models.KeyedVectors.load_word2vec_format( filename , binary=True)

   
    words_to_index_1 = {}
    i = 0;
    for a in terms:
        for b in a.split():
            if not b in words_to_index_1:
                words_to_index_1[str(b)] = i
                i = i + 1
            else:
                continue   
    """   
    print("words_to_index in phrase embeddings")
    for k, v in words_to_index_1.items():
        print(k, v)
        
    print(" LENGTH OF WORS_TO_INDEX IS : ")
    print(len(words_to_index_1))
    """
    
    embedding_matrix = np.zeros(( len(words_to_index_1) , 300 ))
    for word , i in words_to_index_1.items():
        try:
            embedding_vector = model[word]
        except KeyError:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
       
    
    phrases_to_index = {}
    i = 0;
    for a in terms:
            if not a in phrases_to_index:
                phrases_to_index[str(a)] = i
                i = i + 1
            else:
                continue 
    """    
    print()    
    print("phrases_to_index in phrase embeddings")
    print()
    for k, v in phrases_to_index.items():
        print(k, v) 
        
    print(" LENGTH OF PHRASES_TO_INDEX IS : ")
    print(len( phrases_to_index ))    
    """
    
    embedding_matrix_phrases = np.zeros(( len(words_to_index_1) , 300 ))
    for phrase , i in phrases_to_index.items():
        embedding_vector_final = np.zeros(300)
        for a in phrase.split():
            try:
                embedding_vector = model[a] 
            except KeyError:
                embedding_vector = None    
            if embedding_vector is not None:
                embedding_vector_final = np.add( embedding_vector_final , embedding_vector )   
        embedding_matrix_phrases[i] = embedding_vector_final
        
           
    return  embedding_matrix ,  words_to_index_1 , embedding_matrix_phrases ,  phrases_to_index
        

### Fucntion for building the graph

In [13]:
def terms_to_semantic_network(terms, normalize='lemma', window_width=10, edge_weighting='cooc_freq' ,  phrases = False ):
   
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.
    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or Callable): If 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``spacy.Token`` and return
            a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``spacy.Token``.
        window_width (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.
    Returns:
        ``networkx.Graph``: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            '`window_width` = {} is invalid; value must be >= 2'.format(window_width))
    if not terms:
        raise ValueError(
            '`terms` = {} is invalid; it must contain at least 1 term '
            'in the form of a string or spacy token'.format(terms))

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            '`terms` has fewer items (%s) than the specified `window_width` (%s); '
            'setting window width to %s',
            len(terms), window_width, len(terms))
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
        to_return = list(windows)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = ((tok.lemma_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = ((tok.lower_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = ((tok.text for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = ((normalize(tok) for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            'items in `terms` must be strings or spacy tokens, not {}'.format(type(terms[0])))

    
    graph = nx.Graph()

    
    embedding_matrix_phrases = np.zeros(( len(terms) , 300 ))
    
    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(lambda: collections.defaultdict(int))
        for window in to_return:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1

                               
        graph.add_edges_from(
            (w1, w2, {'weight': weight})
            for w1, w2s in cooc_mat.items()
            for w2, weight in w2s.items())
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in to_return
            for w1_w2 in itertools.combinations(window, 2))
    
    elif edge_weighting == 'embedding' and phrases == False:
        embedding_matrix , word_to_index = get_word_embedding(terms)
        cosine_mat = get_cosine_mat(to_return , embedding_matrix  , word_to_index )
        pmi_mat = get_pmi_mat( terms , to_return )
        final_weights = get_final_weights( cosine_mat , pmi_mat )
        
        for window in to_return:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                graph.add_weighted_edges_from(
                    (w1, w2,  weight)
                    for w1, w2s in final_weights.items()
                    for w2, weight in w2s.items())
      
    elif edge_weighting == 'embedding' and phrases == True:
        embedding_matrix ,  words_to_index_1 , embedding_matrix_phrases ,  phrases_to_index =  get_word_embedding_phrases(terms)
        cosine_mat = get_cosine_mat( to_return , embedding_matrix_phrases  , phrases_to_index )
        pmi_mat = get_pmi_mat( terms , to_return )
        final_weights = get_final_weights( cosine_mat , pmi_mat )
   
        for window in to_return:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                graph.add_weighted_edges_from(
                    (w1, w2,  weight)
                    for w1, w2s in final_weights.items()
                    for w2, weight in w2s.items())
                
    
    #for (u , v , d) in graph.edges(data='weight'):
        #if d < 0:
            #d = d*(-1)
           # graph[u][v]['weight'] = d
            #print("CHANGED")
            #print()
    
  
   
    return graph
    