# importancia de oraciones

## librerías varias

In [54]:
import numpy as np
import random
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Word tokenizer built from the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
import itertools
from scipy import spatial
import networkx as nx
import operator
import re
from nltk.corpus import stopwords # NLTK stop-word lists
# English stop words kept as a set for fast membership tests when filtering.
stop_words = set(stopwords.words('english'))

## vectores GLOVE
### https://nlp.stanford.edu/projects/glove/

In [55]:
# Load the pre-trained GloVe embeddings into a dict: word -> float32 vector.
# This is the 300-dimensional "6B" release; releases with larger vocabularies
# are available from the GloVe project page.
vectors = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform locale.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        vectors[word] = coefs

## vectores aleatorios para las palabras fuera de vocabulario (se mantienen iguales para el mismo texto)

In [61]:
def glove_text(text):
    """Map every content word of ``text`` to an embedding vector.

    The text is tokenized with the module-level ``tokenizer``, lowercased and
    de-duplicated; stop words and purely numeric tokens are discarded. Words
    present in the module-level ``vectors`` table reuse that entry. Any other
    word gets a random vector whose components are drawn uniformly from
    [-1.5, 1.5]; because each word appears only once in the result, such a
    word keeps one and the same vector for the whole text.

    Args:
        text: Raw text to process.

    Returns:
        dict mapping each remaining lowercase word to its vector.
    """
    # Size the random vectors like the loaded embeddings rather than assuming
    # 300, so a different embedding file cannot produce mixed-length vectors.
    # Falls back to 300 when no embeddings are loaded.
    sample = next(iter(vectors.values()), None)
    dim = len(sample) if sample is not None else 300

    words = {token.lower() for token in tokenizer.tokenize(text)}
    glove_vectors = {}
    for word in words:
        if word in stop_words or word.isdigit():
            continue
        if word in vectors:
            glove_vectors[word] = vectors[word]
        else:
            glove_vectors[word] = np.array(
                [random.uniform(-1.5, 1.5) for _ in range(dim)]
            )
    return glove_vectors

## función para construir el grafo de oraciones, donde el peso de las aristas indica la similitud

In [62]:
def simglove(vector_i,vector_j):
    """Return the cosine similarity of two vectors (1 - cosine distance).

    Both inputs are flattened to 1-D before the call because
    ``scipy.spatial.distance.cosine`` only accepts 1-D vectors; handing it
    ``(1, n)`` arrays raises ``ValueError`` on current SciPy releases.

    Args:
        vector_i: First vector (array-like of any shape; it is flattened).
        vector_j: Second vector, with the same number of elements.

    Returns:
        The cosine similarity as a float in [-1, 1].
    """
    return 1 - spatial.distance.cosine(np.ravel(vector_i), np.ravel(vector_j))

def sent_distance_graph(text):
    """Build the sentence-similarity graph of ``text``.

    Each sentence found by ``sent_tokenize`` becomes a node (numbered in
    order of appearance). A sentence is represented by the mean of the
    vectors of its distinct content words (lowercased; stop words and purely
    numeric tokens removed), and the edge between two sentences is weighted
    with the cosine similarity of their vectors. Every node also carries a
    self-loop of weight 0.

    A sentence without any content word has no vector; all of its edges get
    weight 0 instead of propagating NaN into the graph.

    Args:
        text: Raw text to analyse.

    Returns:
        networkx.Graph with one node per sentence and a ``weight`` attribute
        on every edge.
    """
    # Strip 'et al.' because its period trips up NLTK's sent_tokenize; there
    # is probably a better solution.
    text = text.replace('et al.', '')
    glove_vectors = glove_text(text)
    sentences = sent_tokenize(text)

    # One vector per sentence, or None when nothing is left after filtering.
    vector_sentences = []
    for sentence in sentences:
        words = {w.lower() for w in tokenizer.tokenize(sentence)}
        word_vectors = [
            glove_vectors[w] for w in words
            if w not in stop_words and not w.isdigit()
        ]
        vector_sentences.append(
            np.mean(word_vectors, axis=0) if word_vectors else None
        )

    # Similarity is symmetric and the graph is undirected, so visiting each
    # unordered pair once builds the same graph with half the computations.
    n = len(sentences)
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for node_i in range(n):
        G.add_edge(node_i, node_i, weight=0)
        vec_i = vector_sentences[node_i]
        for node_j in range(node_i + 1, n):
            vec_j = vector_sentences[node_j]
            if vec_i is None or vec_j is None:
                weight = 0
            else:
                weight = simglove(vec_i, vec_j)
            G.add_edge(node_i, node_j, weight=weight)
    return G

## función que entrega las K oraciones más importantes

In [88]:
## esta funcion entrega una lista de oraciones
def sentences_pagerank(text,K):
    """Return the K most important sentences of ``text``, best first.

    Importance is the PageRank score (damping factor 0.85, edges weighted by
    ``weight``) of each sentence in the graph built by
    ``sent_distance_graph``.

    Args:
        text: Raw text to summarise.
        K: Number of sentences to return.

    Returns:
        List of at most K sentences ordered from most to least important;
        an empty list when the text contains no sentence.
    """
    G = sent_distance_graph(text)
    # Apply the same 'et al.' removal as sent_distance_graph so that sentence
    # indices line up with the graph's node numbers.
    sentences = sent_tokenize(text.replace('et al.', ''))
    importance_sentences = nx.pagerank(G, alpha=0.85, weight='weight')
    # Node indices by decreasing score; sorting the keys directly also copes
    # with an empty result, where unpacking a zip of the items would fail.
    ranking = sorted(importance_sentences, key=importance_sentences.get, reverse=True)
    # Return `ranking` itself to get the full ordering of sentence indices
    # (most to least important) instead of the sentence texts.
    return [sentences[r] for r in ranking[:K]]
  

### ejemplo - intro sacada de CELL

In [91]:
# Example input: four consecutive paragraphs of an article introduction
# (taken from Cell, per the heading above), each summarised separately below.
text1='Sequencing the Neanderthal genome (Green et al., 2010, Prüfer et al., 2014), the Denisovan genome (Reich et al., 2010), and several early modern human genomes from Eurasia (Fu et al., 2014, Fu et al., 2015) has confirmed that archaic hominins left their mark in the genomes of modern humans (Plagnol and Wall, 2006, Sankararaman et al., 2014, Vernot and Akey, 2014, Vernot et al., 2016). Present-day individuals in Eurasia inherited ∼2% of their genome from Neanderthals (Green et al., 2010), and individuals from Oceania inherited ∼5% of their genome from Denisovans (Reich et al., 2010). Suggestive evidence indicates that admixture from other unidentified hominin species occurred in Africa (Hammer et al., 2011, Hsieh et al., 2016, Lachance et al., 2012, Plagnol and Wall, 2006, Wall et al., 2009).'
text2='To understand the functional, phenotypic, and evolutionary consequences of archaic admixture, it is necessary to identify the specific haplotypes and alleles that were inherited from archaic hominin ancestors (Huerta-Sánchez et al., 2014, Juric et al., 2016, Sankararaman et al., 2014, Simonti et al., 2016, Vernot and Akey, 2014). Approaches to identifying introgressed haplotypes include methods that specifically incorporate reference archaic hominin genome sequences and reference-free methods that do not utilize such information. An example of the former category is the method of Sankararaman et al. (2014), which identifies archaic haplotypes by comparing modern human haplotypes to a reference archaic sequence. The latter category of methods include the S∗ statistic (Plagnol and Wall, 2006), which searches for the mutational signature that ancient admixture leaves in the genomes of present-day humans.' 
text3='The S∗ approach is powerful for finding introgressed haplotypes in the absence of an archaic reference genome because it leverages the unusual mutational characteristics of introgressed haplotypes. Because of the long divergence time between Neanderthals and modern humans, Neanderthals carry many alleles that are specific to their lineage. Such alleles are present on introgressed haplotypes but are absent or rare in African genomes.'
text4='Further, based on the recent timing of admixture, introgressed haplotypes are expected to be maintained without recombination over distances of approximately 50 kb on average (Sankararaman et al., 2012), resulting in high levels of linkage disequilibrium (LD) between Neanderthal-specific alleles in non-African human genomes. In this study, we develop an S∗-like method that has increased power and is suitable for large-scale genome-wide data. We apply the method to large sets of sequenced data from Eurasia and Oceania and identify putative archaic-specific alleles. We examine the rate at which these alleles match the sequenced archaic genomes and the role of the genes containing these alleles, to obtain insights into the history of the admixture events and their impact on modern human genomes.'

In [92]:
# Print the single top-ranked sentence of each example paragraph, in order.
for paragraph in (text1, text2, text3, text4):
    print(sentences_pagerank(paragraph, 1))

['Sequencing the Neanderthal genome (Green , 2010, Prüfer , 2014), the Denisovan genome (Reich , 2010), and several early modern human genomes from Eurasia (Fu , 2014, Fu , 2015) has confirmed that archaic hominins left their mark in the genomes of modern humans (Plagnol and Wall, 2006, Sankararaman , 2014, Vernot and Akey, 2014, Vernot , 2016).']
['Approaches to identifying introgressed haplotypes include methods that specifically incorporate reference archaic hominin genome sequences and reference-free methods that do not utilize such information.']
['The S∗ approach is powerful for finding introgressed haplotypes in the absence of an archaic reference genome because it leverages the unusual mutational characteristics of introgressed haplotypes.']
['We examine the rate at which these alleles match the sequenced archaic genomes and the role of the genes containing these alleles, to obtain insights into the history of the admixture events and their impact on modern human genomes.']
