# keywords

## datos - corpus introducciones revista CELL

In [433]:
import ast
import re
from nltk import sent_tokenize
import spacy
nlp = spacy.load('en')
import numpy as np
import networkx as nx
import random
import operator
import itertools
import matplotlib.pyplot as plt

In [2]:
# Load the corpus. The file is parsed with ast.literal_eval, so it is expected
# to contain a Python literal (presumably a list of dicts -- confirm against
# the data file); each item exposes its text fragments under 'introduction'.
with open('corpus_cell.json','r') as data:
    # The context manager closes the handle, which the original never did.
    data_cell=data.read()
data_cell = ast.literal_eval(data_cell)
# Join the fragments of each introduction into a single string.
introductions=[' '.join(item['introduction']) for item in data_cell]
# Strip every (...) or [...] span (non-greedy), presumably inline citations.
# Raw string: "\(" and "\[" are invalid escapes in a normal string literal.
introductions=[re.sub(r"[\(\[].*?[\)\]]", "", item) for item in introductions]
# Discard introductions that ended up empty.
introductions=[x for x in introductions if len(x)>0]

In [14]:
# Load the stop-word list used by clean(); words are whitespace-separated.
with open('stopwords.txt','r') as data:
    # The context manager closes the handle, which the original never did.
    data_read = data.read()
stop_words=data_read.replace('\n',' ').split()

## clean text

In [327]:
def clean(text):
    """Reduce raw text to a flat list of content-word lemmas.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is parsed with the module-level spaCy pipeline ``nlp``. In this version
    only nouns (common and proper) and adjectives are kept, as lemmas; lemmas
    listed in the module-level ``stop_words`` are then removed.

    Returns the remaining lemmas as one list, in text order.
    """
    kept_pos=('NOUN','ADJ','PROPN')
    lemmas=[]
    for sentence in sent_tokenize(text):
        for token in nlp(sentence):
            if token.pos_ in kept_pos:
                lemmas.append(token.lemma_)
    return [lemma for lemma in lemmas if lemma not in stop_words]


In [640]:
# Use the first introduction of the corpus as the running example and
# display it as the cell output.
text=introductions[0]
text


'The emerging outbreak of Zika virus in the Americas has brought this once obscure pathogen to the forefront of global healthcare. Mostly transmitted by Aedes aegypti and A.\xa0albopictus mosquitoes, Zika virus infections have been further spread by international travel and have expanded to large, heavily populated regions of South, Central, and North America . Correlations between the increase in Zika virus infections, the development of fetal microcephaly , and Guillain-Barré syndrome have resulted in the declaration of a public health emergency by the World Health Organization  and a call for fast-tracked development of Zika virus diagnostics . Synthetic biology is an emerging discipline that has great potential to respond to such pandemics. The increasing ability of synthetic biologists to repurpose and engineer natural biological components for practical applications has led to new opportunities for molecular diagnostics . We previously developed two biotechnologies that dramatica

## grafos!

In [641]:
def graph_weighted(text,K):
    """Build a weighted co-occurrence graph from the cleaned words of ``text``.

    Nodes are the distinct lemmas returned by ``clean(text)``. Two lemmas are
    linked by an undirected edge when one of them appears at most ``K``
    positions after the other in the cleaned sequence; the edge attribute
    ``weight`` counts how many times that happens.

    Args:
        text: raw text, handed to ``clean``.
        K: forward window length, measured in cleaned tokens.

    Returns:
        A ``networkx.Graph`` whose edges carry a ``weight`` attribute.
    """
    words=clean(text)
    G=nx.Graph()
    # Register every word first so a word without neighbours is still a node.
    G.add_nodes_from(words)
    last=len(words)-1
    # Single pass over positions. The earlier version looped over every token
    # and, for each one, revisited all occurrences of that word, so a word
    # seen n times had each of its co-occurrences counted n times (and the
    # scan was quadratic in the text length).
    for index,word in enumerate(words):
        for j in range(index+1,min(index+K,last)+1):
            neighbour=words[j]
            # A word repeated inside the window produces a self-loop, as before.
            if G.has_edge(word,neighbour):
                G[word][neighbour]['weight']+=1
            else:
                G.add_edge(word,neighbour,weight=1)

    return G

## extracción de keywords!

In [426]:
def adjacency(list_keywords,text):
    """Merge keywords that appear next to each other into two-word phrases.

    ``text`` is cleaned with ``clean``. For every pair of keywords that occur
    consecutively anywhere in the cleaned token sequence, both single words
    are removed from the result and the phrase ``"left right"`` (in text
    order) is appended instead.

    Unlike the earlier version, which compared only the first occurrence of
    each keyword (``list.index``), every occurrence is considered, and a
    keyword missing from the cleaned text is simply left untouched instead of
    raising ``ValueError``.

    Args:
        list_keywords: sequence of single-word keywords.
        text: raw text the keywords were extracted from.

    Returns:
        A new list with adjacent keywords collapsed; the input is not mutated.
    """
    tokens=clean(text)
    # All ordered pairs of consecutive tokens, for O(1) adjacency checks.
    bigrams=set(zip(tokens,tokens[1:]))
    new_list=list(list_keywords)
    # Unordered pairs suffice: both directions are tested explicitly below.
    for first,second in itertools.combinations(list_keywords,2):
        if first==second:
            continue
        for left,right in ((first,second),(second,first)):
            if (left,right) not in bigrams:
                continue
            for word in (first,second):
                if word in new_list:
                    new_list.remove(word)
            phrase=left+' '+right
            if phrase not in new_list:
                new_list.append(phrase)

    return new_list

### pagerank

In [428]:
def keywords_pagerank(text,number,adjacency_bool,K=4):
    """Return the ``number`` highest-ranked words of ``text`` by PageRank.

    The words are ranked with weighted PageRank (``alpha=0.85``) on the
    co-occurrence graph built by ``graph_weighted``.

    Args:
        text: raw text to analyse.
        number: how many top-ranked words to keep.
        adjacency_bool: when true, the top words are passed through
            ``adjacency`` so neighbouring keywords are merged into phrases.
        K: co-occurrence window handed to ``graph_weighted`` (default 4, the
            value that used to be hard-coded).

    Returns:
        A list when ``adjacency_bool`` is true (the result of ``adjacency``),
        otherwise a tuple of words, best first. The result is empty when the
        graph has no nodes; the earlier version raised ``IndexError`` then.
    """
    G=graph_weighted(text,K)
    scores=nx.pagerank(G, alpha=0.85, weight='weight')
    # Best score first; sorted() is stable, so ties keep the graph's node order.
    ranked=sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    top=[word for word,_ in ranked[:number]]
    if adjacency_bool:
        return adjacency(top,text)
    return tuple(top)

        
# Example: top 5 PageRank keywords of the sample text, without phrase merging.
keywords_pagerank(text,5,False)

('virus', 'zika', 'diagnostic', 'sensor', 'detection')

### betweenness centrality

In [429]:
def keywords_betweenness_centrality(text,number,adjacency_bool,K=4):
    """Return the ``number`` most central words of ``text`` by betweenness.

    The words are ranked with ``nx.betweenness_centrality`` (called without
    edge weights) on the co-occurrence graph built by ``graph_weighted``.

    Args:
        text: raw text to analyse.
        number: how many top-ranked words to keep.
        adjacency_bool: when true, the top words are passed through
            ``adjacency`` so neighbouring keywords are merged into phrases.
        K: co-occurrence window handed to ``graph_weighted`` (default 4, the
            value that used to be hard-coded).

    Returns:
        A list when ``adjacency_bool`` is true (the result of ``adjacency``),
        otherwise a tuple of words, best first. The result is empty when the
        graph has no nodes; the earlier version raised ``IndexError`` then.
    """
    G=graph_weighted(text,K)
    scores=nx.betweenness_centrality(G)
    # Best score first; sorted() is stable, so ties keep the graph's node order.
    ranked=sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    top=[word for word,_ in ranked[:number]]
    if adjacency_bool:
        return adjacency(top,text)
    return tuple(top)

        

In [430]:
# Example: top 5 betweenness keywords of the sample text, with adjacent
# keywords merged into phrases.
keywords_betweenness_centrality(text,5,True)

['diagnostic', 'sensor', 'detection', 'zika virus']

### visualización

In [642]:
def visualization(text,K,number_keywords):
    """Plot the co-occurrence subgraph of the top PageRank keywords of ``text``.

    Args:
        text: raw text to analyse.
        K: co-occurrence window handed to ``graph_weighted``.
        number_keywords: number of top-ranked words to draw.

    Node sizes are proportional to the PageRank score computed on the full
    graph; edges are those of the full graph restricted to the chosen words.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    G=graph_weighted(text,K)
    keywords=nx.pagerank(G, alpha=0.85, weight='weight')
    # Select the top words from the scores just computed. The earlier version
    # called keywords_pagerank(), which rebuilt the graph with a fixed window
    # of 4, so the selection ignored K (and the text was cleaned twice).
    list_keywords=sorted(keywords, key=keywords.get, reverse=True)[:number_keywords]
    G = G.subgraph(list_keywords)

    pos = nx.spring_layout(G)
    # Freeze the node order once so sizes and labels line up with the nodes.
    nodes=list(G.nodes)
    labels={node:node for node in nodes}
    nx.draw_networkx_nodes(G, pos, nodelist=nodes, node_size=[keywords[node] * 10000 for node in nodes])
    nx.draw_networkx_edges(G, pos, alpha=0.5,width=0.5)
    nx.draw_networkx_labels(G,pos,labels,font_size=12)
    plt.axis('off')
    # Uncomment to export the figure:
    #plt.savefig('keywords.eps', format='eps', transparent=True, bbox_inches='tight',dpi=800)
    plt.show()

from ipywidgets import *
# Shared slider style, passed as `style` to both sliders below (presumably so
# the long description labels are not truncated -- confirm).
style = {'description_width': 'initial'}

# Interactive front end for visualization(): one slider for the window size K
# (1-20, default 4) and one for the number of keywords (1-30, default 15).
interact(visualization,text=text,K=widgets.IntSlider(min=1,max=20,step=1,value=4,description='window size', style=style),number_keywords=widgets.IntSlider(min=1,max=30,step=1,value=15,description='number of keywords', style=style))


A Jupyter Widget

<function __main__.visualization>

# sentence detection!

In [628]:
def clean_verb(text):
    """Reduce raw text to a flat list of lemmas, verbs included.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is parsed with the module-level spaCy pipeline ``nlp``. Tokens whose lemma
    is ``'-PRON-'`` are skipped; of the rest, only verbs, nouns (common and
    proper) and adjectives are kept, as lemmas.

    No stop-word filtering is applied here.

    Returns the kept lemmas as one list, in text order.
    """
    kept_pos=('VERB','NOUN','ADJ','PROPN')
    lemmas=[]
    for sentence in sent_tokenize(text):
        for token in nlp(sentence):
            if token.lemma_=='-PRON-':
                continue
            if token.pos_ in kept_pos:
                lemmas.append(token.lemma_)
    return lemmas

def graph_weighted_detection(text,K):
    """Build a weighted co-occurrence graph from ``clean_verb(text)``.

    Nodes are the distinct lemmas returned by ``clean_verb(text)``. Two lemmas
    are linked by an undirected edge when one of them appears at most ``K``
    positions after the other in the cleaned sequence; the edge attribute
    ``weight`` counts how many times that happens.

    Args:
        text: raw text, handed to ``clean_verb``.
        K: forward window length, measured in cleaned tokens.

    Returns:
        A ``networkx.Graph`` whose edges carry a ``weight`` attribute.
    """
    words=clean_verb(text)
    G=nx.Graph()
    # Register every word first so a word without neighbours is still a node.
    G.add_nodes_from(words)
    last=len(words)-1
    # Single pass over positions. The earlier version looped over every token
    # and, for each one, revisited all occurrences of that word, so a word
    # seen n times had each of its co-occurrences counted n times (and the
    # scan was quadratic in the text length).
    for index,word in enumerate(words):
        for j in range(index+1,min(index+K,last)+1):
            neighbour=words[j]
            # The graph is undirected, so one has_edge check covers both
            # orientations. A repeated word in the window gives a self-loop.
            if G.has_edge(word,neighbour):
                G[word][neighbour]['weight']+=1
            else:
                G.add_edge(word,neighbour,weight=1)

    return G

In [637]:
# Sample sentence for the community-detection demo; alternative samples are
# kept commented out on the following lines.
texto='In the language as recorded in a modern English dictionary the great majority of words are borrowed; but the words we ordinarily use in speaking are largely of English origin, although for the most part somewhat changed in form since their first introduction into England.'
#texto='Although (as has been shown above) it would be incorrect to say that English was derived from Latin, or French, or Greek, of from anything else but the original language of the Teutonic branch of the Indo-European language, nevertheless Latin, French and Greek have not been without great and lasting influence on our vocabulary.'
#texto='The majority of words recorded in a modern English dictionary have been borrowed from other languages.'
#texto='The majority of words recorded in a modern English dictionary have been borrowed from other languages. However, the words ordinarily used in speaking are largely of English origin. Most words have somewhat changed in form since their first introduction into England.'
#texto='The emerging outbreak of Zika virus in the Americas has brought this once obscure pathogen to the forefront of global healthcare . Mostly transmitted by Aedes aegypti and A.   albopictus mosquitoes , Zika virus infections have been further spread by international travel and have expanded to large , heavily populated regions of South , Central , and North America . '
# Co-occurrence graph of the sample sentence with a 10-token window.
G=graph_weighted_detection(texto,10)
# NOTE(review): `community` is presumably the python-louvain package -- confirm.
import community
# `partition` is looked up by lemma below and its values index into `colors`.
partition=community.best_partition(G)

In [639]:
from termcolor import colored

# Palette indexed by community id. The last entry ('white', index 7) is also
# the colour used for tokens that are not nodes of the graph.
colors = ['blue', 'red', 'green', 'magenta','grey','cyan','yellow','white']
C=[]
texto_nlp=nlp(texto)
for token in texto_nlp:
    if token.lemma_ in partition:
        # Wrap around the palette: with more communities than colours the
        # plain index used before raised IndexError. Ids below len(colors)
        # map exactly as before (assumes ids are non-negative ints -- confirm).
        color=colors[partition[token.lemma_] % len(colors)]
    else:
        color=colors[7]
    C.append(colored(token.text, color, attrs=['bold']))

#print(' '.join(C))

def communities(texto,K):
    """Print ``texto`` with each token coloured by its word community.

    A co-occurrence graph is built with ``graph_weighted_detection(texto, K)``
    and partitioned with ``community.best_partition``. Every spaCy token of
    ``texto`` is then printed in bold: tokens whose lemma is a node of the
    graph get the colour of their community from the module-level ``colors``
    palette, all other tokens get ``colors[7]``.

    Args:
        texto: raw text to colour.
        K: co-occurrence window handed to ``graph_weighted_detection``.

    Returns:
        None; the coloured text is written to stdout.
    """
    G=graph_weighted_detection(texto,K)

    partition=community.best_partition(G)
    fallback=colors[7]
    C=[]
    # TODO: optionally smooth the partition so that a token whose two
    # neighbours share a community adopts that community.
    # The unused clean_verb(texto) call was dropped: its result was never
    # read and it cost an extra NLP pass.
    for token in nlp(texto):
        if token.lemma_ in partition:
            # Wrap around the palette: with more communities than colours the
            # plain index used before raised IndexError. Ids below len(colors)
            # map exactly as before.
            color=colors[partition[token.lemma_] % len(colors)]
        else:
            color=fallback
        C.append(colored(token.text, color, attrs=['bold']))

    print(' '.join(C))

# Example: colour the sample sentence using a 10-token co-occurrence window.
communities(texto,10)

[1m[37mIn[0m [1m[37mthe[0m [1m[34mlanguage[0m [1m[37mas[0m [1m[34mrecorded[0m [1m[37min[0m [1m[37ma[0m [1m[34mmodern[0m [1m[34mEnglish[0m [1m[34mdictionary[0m [1m[37mthe[0m [1m[34mgreat[0m [1m[34mmajority[0m [1m[37mof[0m [1m[34mwords[0m [1m[31mare[0m [1m[34mborrowed[0m [1m[37m;[0m [1m[37mbut[0m [1m[37mthe[0m [1m[34mwords[0m [1m[37mwe[0m [1m[37mordinarily[0m [1m[34muse[0m [1m[37min[0m [1m[31mspeaking[0m [1m[31mare[0m [1m[37mlargely[0m [1m[37mof[0m [1m[34mEnglish[0m [1m[31morigin[0m [1m[37m,[0m [1m[37malthough[0m [1m[37mfor[0m [1m[37mthe[0m [1m[31mmost[0m [1m[31mpart[0m [1m[37msomewhat[0m [1m[31mchanged[0m [1m[37min[0m [1m[31mform[0m [1m[37msince[0m [1m[37mtheir[0m [1m[31mfirst[0m [1m[31mintroduction[0m [1m[37minto[0m [1m[31mEngland[0m [1m[37m.[0m
