# Visualizacion de Keywords (W+J)

In [56]:
import matplotlib.pyplot as plt
import networkx as nx
import spacy
import operator
nlp = spacy.load('en')
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk import word_tokenize
from termcolor import colored
import re
import string
from yellowbrick.style.palettes import SEQUENCES
data=open('stopwords.txt','r')
data_read = data.read()
stop_words=data_read.replace('\n',' ').split()

## (0) funcion que limpia los textos
### Esta funcion es mas general y entrega el texto filtrado y un diccionario de word original: word modificada

In [57]:
def clean(text):
    text=re.sub("[\(\[].*?[\)\]]", "", text)
    sentences=sent_tokenize(text)
    sentences=[nlp(sentence.lower()) for sentence in sentences] ## lower() implica que se descartan los PROPN
    sentences=[[(token.lemma_,token.text) for token in sentence if token.tag_=='NN' or token.tag_=='NNS' or token.tag_=='JJ'] for sentence in sentences]
    TEXT=[item for sublist in sentences for item in sublist]
    TEXT=[(word_l,word_nl) for (word_l,word_nl) in TEXT if not word_l in stop_words or not word_nl in stop_words]
    TEXT=[(re.sub(r'[^a-zA-Z0-9]', '', word_l),word_nl) for (word_l,word_nl) in TEXT]
    TEXT=[(word_l,word_nl) for (word_l,word_nl) in TEXT if word_l.isdigit()==False and word_l!='' and word_nl.isdigit()==False and word_nl!='']
    dict_text=list(set(TEXT))
    dict_text={word_nl:word_l for (word_l,word_nl) in dict_text}
    return list(zip(*TEXT))[0],dict_text ## texto filtrado, diccionario de word original: word modificada



## ejemplo

In [58]:
text='Sequencing the Neanderthal genome (Green et al., 2010, Prüfer et al., 2014), the Denisovan genome (Reich et al., 2010), and several early modern human genomes from Eurasia (Fu et al., 2014, Fu et al., 2015) has confirmed that archaic hominins left their mark in the genomes of modern humans (Plagnol and Wall, 2006, Sankararaman et al., 2014, Vernot and Akey, 2014, Vernot et al., 2016). Present-day individuals in Eurasia inherited ∼2% of their genome from Neanderthals (Green et al., 2010), and individuals from Oceania inherited ∼5% of their genome from Denisovans (Reich et al., 2010). Suggestive evidence indicates that admixture from other unidentified hominin species occurred in Africa (Hammer et al., 2011, Hsieh et al., 2016, Lachance et al., 2012, Plagnol and Wall, 2006, Wall et al., 2009). To understand the functional, phenotypic, and evolutionary consequences of archaic admixture, it is necessary to identify the specific haplotypes and alleles that were inherited from archaic hominin ancestors (Huerta-Sánchez et al., 2014, Juric et al., 2016, Sankararaman et al., 2014, Simonti et al., 2016, Vernot and Akey, 2014). Approaches to identifying introgressed haplotypes include methods that specifically incorporate reference archaic hominin genome sequences and reference-free methods that do not utilize such information. An example of the former category is the method of Sankararaman et al. (2014), which identifies archaic haplotypes by comparing modern human haplotypes to a reference archaic sequence. The latter category of methods include the S∗ statistic (Plagnol and Wall, 2006), which searches for the mutational signature that ancient admixture leaves in the genomes of present-day humans. The S∗ approach is powerful for finding introgressed haplotypes in the absence of an archaic reference genome because it leverages the unusual mutational characteristics of introgressed haplotypes. Because of the long divergence time between Neanderthals and modern humans, Neanderthals carry many alleles that are specific to their lineage. Such alleles are present on introgressed haplotypes but are absent or rare in African genomes. Further, based on the recent timing of admixture, introgressed haplotypes are expected to be maintained without recombination over distances of approximately 50 kb on average (Sankararaman et al., 2012), resulting in high levels of linkage disequilibrium (LD) between Neanderthal-specific alleles in non-African human genomes. In this study, we develop an S∗-like method that has increased power and is suitable for large-scale genome-wide data. We apply the method to large sets of sequenced data from Eurasia and Oceania and identify putative archaic-specific alleles. We examine the rate at which these alleles match the sequenced archaic genomes and the role of the genes containing these alleles, to obtain insights into the history of the admixture events and their impact on modern human genomes.'
text_clean,dict_words=clean(text)
text_clean

('neanderthal',
 'genome',
 'denisovan',
 'genome',
 'early',
 'modern',
 'human',
 'genome',
 'eurasia',
 'archaic',
 'hominin',
 'mark',
 'genome',
 'modern',
 'human',
 'present',
 'day',
 'individual',
 'eurasia',
 'genome',
 'neanderthal',
 'individual',
 'oceania',
 'genome',
 'denisovan',
 'suggestive',
 'evidence',
 'unidentified',
 'hominin',
 'specie',
 'africa',
 'functional',
 'phenotypic',
 'evolutionary',
 'consequence',
 'archaic',
 'admixture',
 'specific',
 'haplotype',
 'allele',
 'archaic',
 'hominin',
 'ancestor',
 'approach',
 'introgressed',
 'haplotype',
 'method',
 'reference',
 'archaic',
 'hominin',
 'genome',
 'sequence',
 'reference',
 'free',
 'method',
 'information',
 'category',
 'method',
 'sankararaman',
 'haplotype',
 'modern',
 'human',
 'haplotype',
 'reference',
 'archaic',
 'sequence',
 'category',
 'method',
 's',
 'statistic',
 'mutational',
 'signature',
 'ancient',
 'admixture',
 'leaf',
 'genome',
 'present',
 'day',
 'human',
 's',
 'approac

## (1) grafo de palabras

In [59]:
# K es el largo de la ventana
# text_clean es una lista de palabras de un texto ya procesado por clean
# digraph indica el tipo de grafo- True = dirigido, False = no dirigido
def graph_weighted(text_clean,K,digraph):
    unique_words=list(set(text_clean))
    if digraph==True: ## grafo dirigido o no dirigido
        G=nx.DiGraph()
    else:
        G=nx.Graph()
    for word in unique_words:
        G.add_node(word)
    for word in unique_words: ## recorremos el texto y encontramos los indices de todas las aparicions de word (index_word)
        index_word=[index for index, value in enumerate(text_clean) if value == word]
        ## ahora buscamos las palabras vecinas en una ventana de largo K (hacia adelante)
        for index in index_word:
            for k in range(1,K+1):
                if index+k in range(len(text_clean)):
                    if G.has_edge(text_clean[index],text_clean[index+k])==False:
                        G.add_edge(text_clean[index],text_clean[index+k],weight=1)
                    else:
                        x=G[text_clean[index]][text_clean[index+k]]['weight']
                        G[text_clean[index]][text_clean[index+k]]['weight']=x+1
    
    return G

In [60]:
graph_weighted(text_clean,8,True)

<networkx.classes.digraph.DiGraph at 0x7f85448c21d0>

## Visualizamos el main core o el main crust
## (2) main core o main crust en el texto

In [61]:
# K es el largo de la ventana
# text es el texto original
# text_clean es una lista de palabras de un texto ya procesado por clean
# dict_words es el diccionario que entrega clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
# main indica si queremos visualizar el main core (True) o el main crust (False)
def main_crust_text(text,text_clean,dict_words,K,digraph,main):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) 
    d=nx.core_number(G)
    min_core=min(d.values())
    d={word:d[word] for word in d.keys()}
    max_core=max(d.values())
    text=[w.lower() for w in word_tokenize(text)]
    c=[]
    for word in text:
        if word in dict_words.keys():
            if main==True:
                if d[dict_words[word]]==max_core:
                    c+=[colored(word, 'red',attrs=['bold'])]
                else:
                    c+=[colored(word,attrs=['dark'])]
            else:
                if d[dict_words[word]]==max_core:
                    c+=[colored(word,attrs=['dark'])]
                else:
                    c+=[colored(word,'red',attrs=['bold'])]
        else:
            c+=[colored(word,attrs=['dark'])]
    s=' '.join(c)
    print(s)

In [62]:
main_crust_text(text,text_clean,dict_words,8,True,False)

[2msequencing[0m [2mthe[0m [2mneanderthal[0m [2mgenome[0m [2m([0m [2mgreen[0m [2met[0m [2mal.[0m [2m,[0m [2m2010[0m [2m,[0m [2mprüfer[0m [2met[0m [2mal.[0m [2m,[0m [2m2014[0m [2m)[0m [2m,[0m [2mthe[0m [1m[31mdenisovan[0m [2mgenome[0m [2m([0m [2mreich[0m [2met[0m [2mal.[0m [2m,[0m [2m2010[0m [2m)[0m [2m,[0m [2mand[0m [2mseveral[0m [1m[31mearly[0m [2mmodern[0m [2mhuman[0m [2mgenomes[0m [2mfrom[0m [2meurasia[0m [2m([0m [2mfu[0m [2met[0m [2mal.[0m [2m,[0m [2m2014[0m [2m,[0m [2mfu[0m [2met[0m [2mal.[0m [2m,[0m [2m2015[0m [2m)[0m [2mhas[0m [2mconfirmed[0m [2mthat[0m [2marchaic[0m [2mhominins[0m [2mleft[0m [2mtheir[0m [1m[31mmark[0m [2min[0m [2mthe[0m [2mgenomes[0m [2mof[0m [2mmodern[0m [2mhumans[0m [2m([0m [2mplagnol[0m [2mand[0m [2mwall[0m [2m,[0m [2m2006[0m [2m,[0m [1m[31msankararaman[0m [2met[0m [2mal.[0m [2m,[0m [2m2014[0m [2m,[0m 

## Ahora entregamos una lista de listas con info de palabra y core (main, crust o nada)
## (3) idem que (2) pero en forma de lista

In [63]:
# K es el largo de la ventana
# text es el texto original
# text_clean es una lista de palabras de un texto ya procesado por clean
# dict_words es el diccionario que entrega clean
# digraph es el tipo de grafo - True = grafo dirigido, False = grafo no dirigido
def main_crust_list_text(text,text_clean,dict_words,K,digraph):
    G=graph_weighted(text_clean,K,digraph)
    G.remove_edges_from(nx.selfloop_edges(G)) 
    d=nx.core_number(G)
    min_core=min(d.values())
    d={word:d[word] for word in d.keys()}
    max_core=max(d.values())
    text=[w.lower() for w in word_tokenize(text)]
    c=[]
    for word in text:
        if word in dict_words.keys():
            if d[dict_words[word]]==max_core:
                c+=[[word,1]]
            else:
                c+=[[word,0]]
        else:
            c+=[[word,-1]]
    return c

In [64]:
main_crust_list_text(text,text_clean,dict_words,8,True)

[['sequencing', -1],
 ['the', -1],
 ['neanderthal', 1],
 ['genome', 1],
 ['(', -1],
 ['green', -1],
 ['et', -1],
 ['al.', -1],
 [',', -1],
 ['2010', -1],
 [',', -1],
 ['prüfer', -1],
 ['et', -1],
 ['al.', -1],
 [',', -1],
 ['2014', -1],
 [')', -1],
 [',', -1],
 ['the', -1],
 ['denisovan', 0],
 ['genome', 1],
 ['(', -1],
 ['reich', -1],
 ['et', -1],
 ['al.', -1],
 [',', -1],
 ['2010', -1],
 [')', -1],
 [',', -1],
 ['and', -1],
 ['several', -1],
 ['early', 0],
 ['modern', 1],
 ['human', 1],
 ['genomes', 1],
 ['from', -1],
 ['eurasia', 1],
 ['(', -1],
 ['fu', -1],
 ['et', -1],
 ['al.', -1],
 [',', -1],
 ['2014', -1],
 [',', -1],
 ['fu', -1],
 ['et', -1],
 ['al.', -1],
 [',', -1],
 ['2015', -1],
 [')', -1],
 ['has', -1],
 ['confirmed', -1],
 ['that', -1],
 ['archaic', 1],
 ['hominins', 1],
 ['left', -1],
 ['their', -1],
 ['mark', 0],
 ['in', -1],
 ['the', -1],
 ['genomes', 1],
 ['of', -1],
 ['modern', 1],
 ['humans', 1],
 ['(', -1],
 ['plagnol', -1],
 ['and', -1],
 ['wall', -1],
 [',', -1]

In [53]:
#def visualization_kcore(text_clean,K,digraph):
#    G=graph_weighted(text_clean,K,digraph)
#    G.remove_edges_from(nx.selfloop_edges(G)) ## borramos ciclos para evitar que Networkx entregue un error
#    d=nx.core_number(G)
#    G=G.to_undirected()
#    number_colors=len(list(d.values()))
#    min_core=min(list(d.values()))
#    color_names=SEQUENCES['OrRd'][6]
#    colors={x:color_names[d[x]-min_core] for x in list(d.keys())}
#    pos = nx.spring_layout(G)
#    labels={i:i for i in G.nodes}
#    nx.draw_networkx_nodes(G, pos, G.nodes, node_size = 175,node_color=list(colors.values()))
#    nx.draw_networkx_edges(G, pos, alpha=0.1,width=0.,edge_color='k')
#    nx.draw_networkx_labels(G,pos,labels,font_size=5)
#    plt.axis('off')
#    plt.savefig('k_core.eps', format='eps', transparent=True, bbox_inches='tight',dpi=800)
#    plt.show()