# Keywords (W+J)

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import spacy
import operator
nlp = spacy.load('en')
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk import word_tokenize
from termcolor import colored
import re
import string
from collections import defaultdict
from collections import OrderedDict
from collections import Counter
data=open('stopwords.txt','r')
data_read = data.read()
stop_words=data_read.replace('\n',' ').split()

## (0) funcion que limpia los textos
### Esta funcion es mas general y entrega el texto filtrado y un diccionario de palabras filtradas {word original: word modificada,...}

In [97]:
# noun_adj == True significa que filtramos unicamente sustantivos y adjetivos, en otro caso, dejamos todo salvo ...
def clean(text,noun_adj): ## filtramos unicamente los sustantivos y adjetivos
    text=re.sub("[\(\[].*?[\)\]]", "", text)
    sentences=sent_tokenize(text)
    sentences=[nlp(sentence.lower()) for sentence in sentences] ## lower() implica que se descartan los PROPN
    if noun_adj==True:
        sentences=[[(token.lemma_,token.text) for token in sentence if token.tag_=='NN' or token.tag_=='NNS' or token.tag_=='JJ'] for sentence in sentences]
    else:
        sentences=[[(token.lemma_,token.text) for token in sentence if token.lemma_ != '-PRON-' and token.is_punct==False and token.like_num==False] for sentence in sentences]
    TEXT=[item for sublist in sentences for item in sublist]
    TEXT=[(word_l,word_nl) for (word_l,word_nl) in TEXT if not word_l in stop_words or not word_nl in stop_words]
    TEXT=[(re.sub(r'[^a-zA-Z0-9]', '', word_l),word_nl) for (word_l,word_nl) in TEXT]
    TEXT=[(word_l,word_nl) for (word_l,word_nl) in TEXT if word_l.isdigit()==False and word_l!='' and word_nl.isdigit()==False and word_nl!='']
    dict_text=list(set(TEXT))
    dict_text={word_nl:word_l for (word_l,word_nl) in dict_text}
    return list(zip(*TEXT))[0],dict_text ## texto filtrado, diccionario de word original: word modificada

## ejemplo

In [98]:
text='Sequencing the Neanderthal genome (Green et al., 2010, Prüfer et al., 2014), the Denisovan genome (Reich et al., 2010), and several early modern human genomes from Eurasia (Fu et al., 2014, Fu et al., 2015) has confirmed that archaic hominins left their mark in the genomes of modern humans (Plagnol and Wall, 2006, Sankararaman et al., 2014, Vernot and Akey, 2014, Vernot et al., 2016). Present-day individuals in Eurasia inherited ∼2% of their genome from Neanderthals (Green et al., 2010), and individuals from Oceania inherited ∼5% of their genome from Denisovans (Reich et al., 2010). Suggestive evidence indicates that admixture from other unidentified hominin species occurred in Africa (Hammer et al., 2011, Hsieh et al., 2016, Lachance et al., 2012, Plagnol and Wall, 2006, Wall et al., 2009). To understand the functional, phenotypic, and evolutionary consequences of archaic admixture, it is necessary to identify the specific haplotypes and alleles that were inherited from archaic hominin ancestors (Huerta-Sánchez et al., 2014, Juric et al., 2016, Sankararaman et al., 2014, Simonti et al., 2016, Vernot and Akey, 2014). Approaches to identifying introgressed haplotypes include methods that specifically incorporate reference archaic hominin genome sequences and reference-free methods that do not utilize such information. An example of the former category is the method of Sankararaman et al. (2014), which identifies archaic haplotypes by comparing modern human haplotypes to a reference archaic sequence. The latter category of methods include the S∗ statistic (Plagnol and Wall, 2006), which searches for the mutational signature that ancient admixture leaves in the genomes of present-day humans. The S∗ approach is powerful for finding introgressed haplotypes in the absence of an archaic reference genome because it leverages the unusual mutational characteristics of introgressed haplotypes. Because of the long divergence time between Neanderthals and modern humans, Neanderthals carry many alleles that are specific to their lineage. Such alleles are present on introgressed haplotypes but are absent or rare in African genomes. Further, based on the recent timing of admixture, introgressed haplotypes are expected to be maintained without recombination over distances of approximately 50 kb on average (Sankararaman et al., 2012), resulting in high levels of linkage disequilibrium (LD) between Neanderthal-specific alleles in non-African human genomes. In this study, we develop an S∗-like method that has increased power and is suitable for large-scale genome-wide data. We apply the method to large sets of sequenced data from Eurasia and Oceania and identify putative archaic-specific alleles. We examine the rate at which these alleles match the sequenced archaic genomes and the role of the genes containing these alleles, to obtain insights into the history of the admixture events and their impact on modern human genomes.'
text_clean,dict_words=clean(text,True)
text_clean

('neanderthal',
 'genome',
 'denisovan',
 'genome',
 'early',
 'modern',
 'human',
 'genome',
 'eurasia',
 'archaic',
 'hominin',
 'mark',
 'genome',
 'modern',
 'human',
 'present',
 'day',
 'individual',
 'eurasia',
 'genome',
 'neanderthal',
 'individual',
 'oceania',
 'genome',
 'denisovan',
 'suggestive',
 'evidence',
 'unidentified',
 'hominin',
 'specie',
 'africa',
 'functional',
 'phenotypic',
 'evolutionary',
 'consequence',
 'archaic',
 'admixture',
 'specific',
 'haplotype',
 'allele',
 'archaic',
 'hominin',
 'ancestor',
 'approach',
 'introgressed',
 'haplotype',
 'method',
 'reference',
 'archaic',
 'hominin',
 'genome',
 'sequence',
 'reference',
 'free',
 'method',
 'information',
 'category',
 'method',
 'sankararaman',
 'haplotype',
 'modern',
 'human',
 'haplotype',
 'reference',
 'archaic',
 'sequence',
 'category',
 'method',
 's',
 'statistic',
 'mutational',
 'signature',
 'ancient',
 'admixture',
 'leaf',
 'genome',
 'present',
 'day',
 'human',
 's',
 'approac

## (1) grafo de palabras

In [99]:
# K es el largo de la ventana
# text_clean es una lista de palabras de un texto ya procesado por clean
# digraph indica el tipo de grafo- True = dirigido, False = no dirigido
def graph_weighted(text_clean,K,digraph):
    unique_words=list(set(text_clean))
    if digraph==True: ## grafo dirigido o no dirigido
        G=nx.DiGraph()
    else:
        G=nx.Graph()
    for word in unique_words:
        G.add_node(word)
    for word in unique_words: ## recorremos el texto y encontramos los indices de todas las aparicions de word (index_word)
        index_word=[index for index, value in enumerate(text_clean) if value == word]
        ## ahora buscamos las palabras vecinas en una ventana de largo K (hacia adelante)
        for index in index_word:
            for k in range(1,K+1):
                if index+k in range(len(text_clean)):
                    if G.has_edge(text_clean[index],text_clean[index+k])==False:
                        G.add_edge(text_clean[index],text_clean[index+k],weight=1)
                    else:
                        x=G[text_clean[index]][text_clean[index+k]]['weight']
                        G[text_clean[index]][text_clean[index+k]]['weight']=x+1
    
    return G

In [100]:
graph_weighted(text_clean,8,True)

<networkx.classes.digraph.DiGraph at 0x7f3d2f351c18>

## (2) keywords segun pagerank

In [101]:
# K es el largo de la ventana
# text_clean es una lista de palabras de un texto ya procesado por clean
# number_keywords indica el numero de keywords
# digraph indica si queremos un grafo dirigido (True) o no dirigido (False)
def keywords_pagerank(text_clean,number_keywords,K,digraph):
    n=len(text_clean)
    G=graph_weighted(text_clean,K,digraph)
    keywords=nx.pagerank(G, alpha=0.85, weight='weight')
    if n<number_keywords: ## en el caso de que el texto sea muy corto (incluso menor al numero de keywords)
        number_keywords=n
    return list(list(zip(*sorted(keywords.items(), key=operator.itemgetter(1),reverse=True)))[0][:number_keywords])

In [102]:
keywords_pagerank(text_clean,10,8,True)

['genome',
 'haplotype',
 'archaic',
 'human',
 'allele',
 'method',
 'modern',
 'introgressed',
 'specific',
 'neanderthal']

## (3) keywords segun main core

In [103]:
# K es el largo de la ventana
# text_clean es una lista de palabras de un texto ya procesado por clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
# en esta funcion, no es necesario indicar el numero de keywords. Fijamos un maximo de number_keywords
def keywords_kcore(text_clean,number_keywords,K,digraph):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) ## borramos ciclos para evitar que Networkx entregue un error
    main_core_nodes=list(nx.k_core(G).nodes())
    n=len(main_core_nodes)
    if n>number_keywords:
        return main_core_nodes[:number_keywords]
    else:
        return main_core_nodes

In [104]:
keywords_kcore(text_clean,10,8,True)

['present',
 'genome',
 'day',
 'neanderthal',
 'hominin',
 'mutational',
 'category',
 'method',
 'allele',
 's']

## (4) Main core rankeado segun Core Rank
### http://www.lix.polytechnique.fr/~anti5662/eacl_17_real_time_meladianos_tixier_nikolentzos_vazirgiannis.pdf

In [105]:
# K es el largo de la ventana
# text_clean es una lista de palabras de un texto ya procesado por clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
def keywords_kcore_rank(text_clean,number_keywords,K,digraph):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) ## borramos ciclos para evitar que Networkx entregue un error
    core_number=nx.core_number(G) ## core number de los nodos de G
    core_rank={word:sum([core_number[w] for w in list(dict(G[word]).keys())]) for word in list(core_number.keys())}
    main_core_nodes=list(nx.k_core(G).nodes())
    ## ahora vemos el core rank de los nodos del main core
    main_core_rank=[]
    for node in main_core_nodes:
        main_core_rank+=[(node,core_rank[node])]
    ## ordenamos segun core rank decreciente
    main_core_rank=sorted(main_core_rank, key=lambda tup: tup[1],reverse=True)
    main_core_rank=list(zip(*main_core_rank))[0]
    ## elegimos number_keywords keywords
    n=len(main_core_rank)
    if n>number_keywords:
        return list(main_core_rank[:number_keywords])
    else:
        return list(main_core_rank)

In [106]:
keywords_kcore_rank(text_clean,10,8,True)

['genome',
 'haplotype',
 'archaic',
 'human',
 'method',
 'introgressed',
 'allele',
 'hominin',
 'admixture',
 'neanderthal']

## (5) n-gramas de keywords
## Con el conjunto de palabras del main core, revisamos el texto original en busqueda de n-gramas de palabras adyacentes. De esta forma, se extraen ngramas 'importantes', que sean representativos de las ideas centrales del texto. 

In [107]:
# K es el largo de la ventana
# text es el texto original
# text_clean es una lista de palabras de un texto ya procesado por clean
# dict_words es el diccionario que entrega clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
# lemmatization es True o False
def main_core_ngrams(text,text_clean,dict_words,K,digraph,lemmatization):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) 
    d=nx.core_number(G) ## diccionario de palabras asociadas a un core
    max_core=max(d.values())
    text=[w.lower() for w in word_tokenize(text)]
    c=[]
    for word in text:
        if word in dict_words.keys():
            if d[dict_words[word]]==max_core:
                if lemmatization==True:
                    c+=[dict_words[word]]
                else:
                    c+=[word]
            else:
                c+=['X']
        else:
            c+=['X']
    s=' '.join(c)
    ngrams=[token.rstrip().lstrip() for token in s.split('X') if token!=' ' and token!='']
    L=[[len(word_tokenize(token)),token] for token in ngrams]
    o = OrderedDict()
    for x in L:
        o.setdefault(x[0], []).append(x[1])
    for key in o.keys():
        if len(o[key])>1:
            o[key]=list(list(zip(*Counter(o[key]).most_common()))[0])
        
    return dict(o)

In [108]:
main_core_ngrams(text,text_clean,dict_words,8,True,True)


{1: ['allele',
  'genome',
  'method',
  'admixture',
  'eurasia',
  'neanderthal',
  'category',
  'mutational',
  'hominin',
  'approach',
  's',
  'human',
  'specific',
  'present'],
 2: ['introgressed haplotype',
  'archaic hominin',
  'modern human',
  'neanderthal genome',
  'archaic admixture',
  'specific haplotype',
  'archaic haplotype',
  's approach',
  'human genome',
  'archaic genome'],
 3: ['modern human genome',
  'modern human haplotype',
  'reference archaic sequence',
  'archaic reference genome'],
 5: ['reference archaic hominin genome sequence']}

## (6) n-gramas de keywords para diferentes valores de k (cores)
## Para cada k (core), extraemos el equivalente de la funcion (5). Con esto, formamos un diccionario, en el cual las keys son k y los values son diccionarios al modo de la funcion (5).  

In [109]:
# K es el largo de la ventana
# text es el texto original
# text_clean es una lista de palabras de un texto ya procesado por clean
# dict_words es el diccionario que entrega clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
def k_core_ngrams(text,text_clean,dict_words,K,digraph,lemmatization):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) 
    d=nx.core_number(G)
    k_core={}
    text=[w.lower() for w in word_tokenize(text)]
    for core in d.values():
        c=[]
        for word in text: ## con este for rodeamos las palabras del k core con 'X'
            if word in dict_words.keys():
                if d[dict_words[word]]==core:
                    if lemmatization==True:
                        c+=[dict_words[word]]
                    else:
                        c+=[word]
                else:
                    c+=['X']
            else:
                c+=['X']
        s=' '.join(c)
        ngrams=[token.rstrip().lstrip() for token in s.split('X') if token!=' ' and token!=''] ## split con 'X'
        L=[[len(word_tokenize(token)),token] for token in ngrams]
        o = OrderedDict()
        for x in L:
            o.setdefault(x[0], []).append(x[1])
        for key in o.keys():
            if len(o[key])>1:
                o[key]=list(list(zip(*Counter(o[key]).most_common()))[0]) ## ordenamos los resultados x frecuencia
    
        k_core[core]=dict(o)
        
    return k_core

In [110]:
k_core_ngrams(text,text_clean,dict_words,8,True,True)

{10: {1: ['early']},
 11: {1: ['putative',
   'rate',
   'role',
   'gene',
   'insight',
   'history',
   'event',
   'impact']},
 13: {1: ['datum', 'mark', 'study', 'slike', 'power', 'suitable'],
  2: ['large set']},
 14: {1: ['sankararaman',
   'denisovan',
   'individual',
   'oceania',
   'unidentified',
   'specie',
   'africa',
   'functional',
   'phenotypic',
   'ancestor',
   'information',
   'powerful',
   'absence',
   'unusual',
   'characteristic',
   'lineage',
   'absent',
   'rare',
   'african',
   'recombination',
   'distance',
   'kb',
   'average'],
  2: ['suggestive evidence',
   'evolutionary consequence',
   'recent timing',
   'high level',
   'linkage disequilibrium'],
  3: ['long divergence time']},
 15: {1: ['statistic', 'signature', 'ancient', 'leaf']},
 17: {1: ['allele',
   'genome',
   'method',
   'admixture',
   'eurasia',
   'neanderthal',
   'category',
   'mutational',
   'hominin',
   'approach',
   's',
   'human',
   'specific',
   'present'],


## (7) ahora extraemos ngramas que unen k-core con (k-1)-core
## En esta nueva funcion, se extiende (6) con el fin de incluir n-gramas formados por palabras que aparezcan en el main-core o en el (main-1)-core. 

In [111]:
# text es el texto original
# text_clean es una lista de palabras de un texto ya procesado por clean
# dict_words es el diccionario que entrega clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
def main_core_combination_ngrams(text,text_clean,dict_words,K,digraph,lemmatization):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) 
    d=nx.core_number(G) ## diccionario de palabras asociadas a un core
    max_core=max(d.values())
    max_core_minus=sorted(list(set(d.values())),reverse=True)[1] ## el segundo core mas grande
    text=[w.lower() for w in word_tokenize(text)]
    c=[]
    for word in text:
        if word in dict_words.keys():
            if d[dict_words[word]]==max_core:
                if lemmatization==True:
                    c+=[dict_words[word]]
                else:
                    c+=[word]
            elif d[dict_words[word]]==max_core_minus:
                if lemmatization==True:
                    c+=[dict_words[word]]
                else:
                    c+=[word]
            else:
                c+=['X']
        else:
            c+=['X']
    s=' '.join(c)
    ngrams=[token.rstrip().lstrip() for token in s.split('X') if token!=' ' and token!='']
    L=[[len(word_tokenize(token)),token] for token in ngrams]
    o = OrderedDict()
    for x in L:
        o.setdefault(x[0], []).append(x[1])
    for key in o.keys():
        if len(o[key])>1:
            o[key]=list(list(zip(*Counter(o[key]).most_common()))[0])
        
    return dict(o)

In [112]:
main_core_combination_ngrams(text,text_clean,dict_words,8,True,True)

{1: ['allele',
  'genome',
  'method',
  'eurasia',
  'neanderthal',
  'admixture',
  'category',
  'hominin',
  'approach',
  'human',
  'mutational',
  'specific',
  'present'],
 2: ['introgressed haplotype',
  'archaic hominin',
  'modern human',
  'neanderthal genome',
  'archaic admixture',
  'specific haplotype',
  'archaic haplotype',
  's statistic',
  'mutational signature',
  's approach',
  'human genome',
  'archaic genome'],
 3: ['modern human genome',
  'modern human haplotype',
  'reference archaic sequence',
  'ancient admixture leaf',
  'archaic reference genome'],
 5: ['reference archaic hominin genome sequence']}