# GoW bipartite (W+J)

In [218]:
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import bipartite
import spacy
nlp = spacy.load('en')
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk import word_tokenize
import numpy as np
import operator
import re
import string
import itertools
import json
import statistics
import random
from collections import Counter
data=open('stopwords.txt','r')
data_read = data.read()
stop_words=data_read.replace('\n',' ').split()

## funcion que limpia los textos

In [219]:
def clean(text): 
    text=re.sub("[\(\[].*?[\)\]]", "", text)
    text=text.replace('al.','')
    sentences=sent_tokenize(text)
    sentences=[nlp(sentence.lower()) for sentence in sentences] 
    
    sentences=[[token.lemma_ for token in sentence if token.lemma_ != '-PRON-' and token.is_punct==False and token.like_num==False] for sentence in sentences]

    sentences=[[word for word in sentence if not word in stop_words] for sentence in sentences]
    sentences=[[(re.sub(r'[^a-zA-Z0-9]', '', word)) for word in sentence] for sentence in sentences]  
    sentences=[[word for word in sentence if word.isdigit()==False and word!=''] for sentence in sentences]
    sentences=[sentence for sentence in sentences if len(sentence)>1]
    
    return sentences

### ejemplo

In [255]:
text='Sequencing the Neanderthal genome (Green et al., 2010, Prüfer et al., 2014), the Denisovan genome (Reich et al., 2010), and several early modern human genomes from Eurasia (Fu et al., 2014, Fu et al., 2015) has confirmed that archaic hominins left their mark in the genomes of modern humans (Plagnol and Wall, 2006, Sankararaman et al., 2014, Vernot and Akey, 2014, Vernot et al., 2016). Present-day individuals in Eurasia inherited ∼2% of their genome from Neanderthals (Green et al., 2010), and individuals from Oceania inherited ∼5% of their genome from Denisovans (Reich et al., 2010). Suggestive evidence indicates that admixture from other unidentified hominin species occurred in Africa (Hammer et al., 2011, Hsieh et al., 2016, Lachance et al., 2012, Plagnol and Wall, 2006, Wall et al., 2009). To understand the functional, phenotypic, and evolutionary consequences of archaic admixture, it is necessary to identify the specific haplotypes and alleles that were inherited from archaic hominin ancestors (Huerta-Sánchez et al., 2014, Juric et al., 2016, Sankararaman et al., 2014, Simonti et al., 2016, Vernot and Akey, 2014). Approaches to identifying introgressed haplotypes include methods that specifically incorporate reference archaic hominin genome sequences and reference-free methods that do not utilize such information. An example of the former category is the method of Sankararaman et al. (2014), which identifies archaic haplotypes by comparing modern human haplotypes to a reference archaic sequence. The latter category of methods include the S∗ statistic (Plagnol and Wall, 2006), which searches for the mutational signature that ancient admixture leaves in the genomes of present-day humans. The S∗ approach is powerful for finding introgressed haplotypes in the absence of an archaic reference genome because it leverages the unusual mutational characteristics of introgressed haplotypes. Because of the long divergence time between Neanderthals and modern humans, Neanderthals carry many alleles that are specific to their lineage. Such alleles are present on introgressed haplotypes but are absent or rare in African genomes. Further, based on the recent timing of admixture, introgressed haplotypes are expected to be maintained without recombination over distances of approximately 50 kb on average (Sankararaman et al., 2012), resulting in high levels of linkage disequilibrium (LD) between Neanderthal-specific alleles in non-African human genomes. In this study, we develop an S∗-like method that has increased power and is suitable for large-scale genome-wide data. We apply the method to large sets of sequenced data from Eurasia and Oceania and identify putative archaic-specific alleles. We examine the rate at which these alleles match the sequenced archaic genomes and the role of the genes containing these alleles, to obtain insights into the history of the admixture events and their impact on modern human genomes.'
text_clean=clean(text)
text_clean

[['sequence',
  'neanderthal',
  'genome',
  'denisovan',
  'genome',
  'early',
  'modern',
  'human',
  'genome',
  'eurasia',
  'confirm',
  'archaic',
  'hominin',
  'leave',
  'mark',
  'genome',
  'modern',
  'human'],
 ['present',
  'day',
  'individual',
  'eurasia',
  'inherit',
  'genome',
  'neanderthal',
  'individual',
  'oceania',
  'inherit',
  'genome',
  'denisovan'],
 ['suggestive',
  'evidence',
  'admixture',
  'unidentified',
  'hominin',
  'specie',
  'occur',
  'africa'],
 ['understand',
  'functional',
  'phenotypic',
  'evolutionary',
  'consequence',
  'archaic',
  'admixture',
  'identify',
  'specific',
  'haplotype',
  'allele',
  'inherit',
  'archaic',
  'hominin',
  'ancestor'],
 ['approach',
  'identify',
  'introgressed',
  'haplotype',
  'include',
  'method',
  'specifically',
  'incorporate',
  'reference',
  'archaic',
  'hominin',
  'genome',
  'sequence',
  'reference',
  'free',
  'method',
  'utilize',
  'information'],
 ['category',
  'method'

## grafo de palabras bipartito

In [221]:
def bipartite_graph(text_clean):
    words=[item for sublist in text_clean for item in sublist]
    unique_words=list(set(words))
    B = nx.Graph() ## creamos el grafo!
    ## bipartito!
    B.add_nodes_from(range(len(text_clean)), bipartite=0) ## sentences
    B.add_nodes_from(unique_words, bipartite=1) ## words
    
    for i in range(len(text_clean)):
        for word in unique_words:
            sentence=text_clean[i]
            n=sentence.count(word)
            if n>0:
                if B.has_edge(i,word)==False:
                    B.add_edge(i,word,weight=n) ## el peso de la arista es el numero de veces que aparece la palabra
        
    return B

In [222]:
B=bipartite_graph(text_clean)

In [223]:
B.nodes

NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 'distance', 'time', 'sequence', 'powerful', 'develop', 'putative', 'divergence', 'inherit', 'kb', 'result', 'find', 'linkage', 'lineage', 'functional', 'unidentified', 'scale', 'statistic', 'sankararaman', 'gene', 'impact', 'leverage', 'average', 'insight', 'admixture', 'specie', 'role', 'category', 'recombination', 'unusual', 'signature', 'carry', 'recent', 'compare', 'absence', 'suggestive', 'phenotypic', 's', 'leave', 'understand', 'evolutionary', 'suitable', 'increase', 'present', 'consequence', 'evidence', 'hominin', 'incorporate', 'rate', 'apply', 'obtain', 'identify', 'disequilibrium', 'long', 'set', 'history', 'confirm', 'mutational', 'mark', 'occur', 'method', 'specific', 'africa', 'power', 'approach', 'absent', 'level', 'free', 'oceania', 'maintain', 'modern', 'characteristic', 'event', 'specifically', 'denisovan', 'ancient', 'include', 'slike', 'study', 'datum', 'search', 'wide', 'timing', 'individual', 'examine', 'eura

In [224]:
B.edges(data=True)

EdgeDataView([(0, 'sequence', {'weight': 1}), (0, 'leave', {'weight': 1}), (0, 'hominin', {'weight': 1}), (0, 'confirm', {'weight': 1}), (0, 'mark', {'weight': 1}), (0, 'modern', {'weight': 2}), (0, 'denisovan', {'weight': 1}), (0, 'eurasia', {'weight': 1}), (0, 'human', {'weight': 2}), (0, 'genome', {'weight': 4}), (0, 'archaic', {'weight': 1}), (0, 'early', {'weight': 1}), (0, 'neanderthal', {'weight': 1}), (1, 'inherit', {'weight': 2}), (1, 'present', {'weight': 1}), (1, 'oceania', {'weight': 1}), (1, 'denisovan', {'weight': 1}), (1, 'individual', {'weight': 2}), (1, 'eurasia', {'weight': 1}), (1, 'genome', {'weight': 2}), (1, 'neanderthal', {'weight': 1}), (1, 'day', {'weight': 1}), (2, 'unidentified', {'weight': 1}), (2, 'admixture', {'weight': 1}), (2, 'specie', {'weight': 1}), (2, 'suggestive', {'weight': 1}), (2, 'evidence', {'weight': 1}), (2, 'hominin', {'weight': 1}), (2, 'occur', {'weight': 1}), (2, 'africa', {'weight': 1}), (3, 'inherit', {'weight': 1}), (3, 'functional', 

## proyectamos el grafo sobre el conjunto de oraciones

In [225]:
def projected_bipartite_graph(text_clean): 
    n=len(text_clean)
    B=bipartite_graph(text_clean)
    return bipartite.weighted_projected_graph(B,range(n))

In [226]:
B_p=projected_bipartite_graph(text_clean)

In [227]:
B_p.edges(data=True)

EdgeDataView([(0, 1, {'weight': 4}), (0, 2, {'weight': 1}), (0, 3, {'weight': 2}), (0, 4, {'weight': 4}), (0, 5, {'weight': 4}), (0, 6, {'weight': 2}), (0, 7, {'weight': 2}), (0, 8, {'weight': 3}), (0, 9, {'weight': 1}), (0, 10, {'weight': 3}), (0, 11, {'weight': 1}), (0, 12, {'weight': 3}), (0, 13, {'weight': 5}), (1, 3, {'weight': 1}), (1, 4, {'weight': 1}), (1, 6, {'weight': 3}), (1, 7, {'weight': 1}), (1, 8, {'weight': 1}), (1, 9, {'weight': 2}), (1, 10, {'weight': 2}), (1, 11, {'weight': 1}), (1, 12, {'weight': 2}), (1, 13, {'weight': 1}), (2, 3, {'weight': 2}), (2, 4, {'weight': 1}), (2, 6, {'weight': 1}), (2, 10, {'weight': 1}), (2, 13, {'weight': 1}), (3, 4, {'weight': 4}), (3, 5, {'weight': 3}), (3, 6, {'weight': 1}), (3, 7, {'weight': 2}), (3, 8, {'weight': 2}), (3, 9, {'weight': 2}), (3, 10, {'weight': 4}), (3, 12, {'weight': 4}), (3, 13, {'weight': 3}), (4, 5, {'weight': 6}), (4, 6, {'weight': 3}), (4, 7, {'weight': 6}), (4, 9, {'weight': 3}), (4, 10, {'weight': 3}), (4, 11

## rankeamos las oraciones

In [253]:
def rank(text_clean):
    B_p=projected_bipartite_graph(text_clean)
    d=nx.pagerank(B_p, alpha=0.85, weight='weight')
    return sorted(d.items(), key=operator.itemgetter(1),reverse=True) ## mayor a menor importancia!

In [256]:
len(text_clean)

14

In [257]:
rank(text_clean)

[(4, 0.10090806784340152),
 (10, 0.09487792192982422),
 (0, 0.09006187168360215),
 (13, 0.08976139258612068),
 (5, 0.08396239373350745),
 (3, 0.07898088981927877),
 (12, 0.07820254613793864),
 (6, 0.07451327789414043),
 (7, 0.07056890131302163),
 (9, 0.06429445937476125),
 (1, 0.05327081263983303),
 (8, 0.0526889660313069),
 (11, 0.041765459961139895),
 (2, 0.026143039052123493)]

In [258]:
text_clean=text_clean+[['Lion','dog','cat']]
text=text+' '+' '.join(['Lion','dog','cat']) ## detectar una oracion rara

In [259]:
len(text_clean)

15

In [260]:
rank(text_clean)

[(4, 0.09983838272559105),
 (10, 0.09387216533657756),
 (0, 0.08910720117780081),
 (13, 0.08880984862244024),
 (5, 0.08307229471951444),
 (3, 0.07814366451476414),
 (12, 0.07737355109349507),
 (6, 0.0737234257269099),
 (7, 0.0698207981654502),
 (9, 0.06361289048233003),
 (1, 0.05270606889255221),
 (8, 0.052130407893090105),
 (11, 0.041322709183303497),
 (2, 0.025865884746438868),
 (14, 0.010600706719741941)]