# graph of words
### Referencia: http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=907E7B3B0E507B2DDF531A314D215114?doi=10.1.1.348.6172&rep=rep1&type=pdf

In [6]:
import json
import re
from nltk import sent_tokenize
import spacy
# Module-level spaCy pipeline shared by the text-cleaning code below.
# NOTE(review): the 'en' shortcut name is only accepted by older spaCy
# releases -- confirm the installed version / model name.
nlp = spacy.load('en')
import numpy as np
import networkx as nx
import random
import operator
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords # NLTK stopword lists
# English stopwords as a set, used to filter lemmas.
stop_words = set(stopwords.words('english'))

### clean text - elegimos si filtramos los textos para dejar únicamente sustantivos (comunes y propios) y adjetivos, o si filtramos solo las stop words

In [3]:
# filter_nouns_adj selects the filtering mode:
#   truthy -> keep only nouns (common and proper) and adjectives
#   falsy  -> keep every token; only stopwords are removed

def clean(text, filter_nouns_adj):
    """Return the lemmas of *text*, in document order, without stopwords.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence is parsed with the module-level spaCy pipeline ``nlp``.

    Args:
        text: Raw input string.
        filter_nouns_adj: When truthy, only tokens whose part-of-speech tag
            is ``NOUN``, ``PROPN`` or ``ADJ`` are kept. When falsy, every
            token is kept. In both modes, lemmas present in ``stop_words``
            are dropped.

    Returns:
        A flat list of lemma strings (duplicates preserved).
    """
    kept_pos = {'NOUN', 'ADJ', 'PROPN'}
    lemmas = []
    for sentence in sent_tokenize(text):
        for token in nlp(sentence):
            if filter_nouns_adj and token.pos_ not in kept_pos:
                continue
            lemma = token.lemma_
            # The stopword test is applied to the lemma, not the surface form.
            if lemma not in stop_words:
                lemmas.append(lemma)
    return lemmas

In [113]:
# Two-sentence sample input, reused by the demo cells further down.
text='the cat killed the dog with a knife hello man woman crocodile. The lion killed the cat with a hummer'
# Demo: clean the sample keeping only nouns, proper nouns and adjectives.
clean(text,True)

['cat', 'dog', 'knife', 'man', 'woman', 'crocodile', 'lion', 'cat', 'hummer']

### graph of words

In [114]:
# K is the length of the co-occurrence window
# filter_nouns_adj selects the filtering mode used by clean()
def graph_weighted(text, K, filter_nouns_adj):
    """Build the weighted, undirected graph-of-words of *text*.

    Every distinct cleaned word becomes a node. Two words are linked when one
    appears at most ``K`` positions after the other in the cleaned token
    sequence, and the edge ``weight`` is the number of such co-occurrences.

    Each token position is visited exactly once, so a co-occurrence
    contributes 1 to its edge weight however often the words repeat
    elsewhere in the text.

    Args:
        text: Raw input string. Non-breaking spaces are replaced with ``-``
            before cleaning.
        K: Forward window length in tokens. Values below 1 produce a graph
            with nodes but no edges.
        filter_nouns_adj: Forwarded to ``clean``.

    Returns:
        A ``networkx.Graph`` whose edges carry an integer ``weight``.
    """
    text = text.replace('\xa0', '-')
    words = clean(text, filter_nouns_adj)
    total = len(words)

    G = nx.Graph()
    # Adding nodes in order of first appearance keeps the node order
    # reproducible across runs; re-adding an existing node is a no-op.
    for word in words:
        G.add_node(word)

    for index, word in enumerate(words):
        # Look only forward: the pair (i, j) with i < j is seen once, from i.
        last = min(index + K + 1, total)
        for neighbour_index in range(index + 1, last):
            neighbour = words[neighbour_index]
            if G.has_edge(word, neighbour):
                G[word][neighbour]['weight'] += 1
            else:
                G.add_edge(word, neighbour, weight=1)

    return G

In [115]:
# Demo: graph of the sample text with window K=2 and filter_nouns_adj=False.
graph_weighted(text,2,False)

<networkx.classes.graph.Graph at 0x7f5b9c219e80>

### Keywords a partir de graph of words. Se utiliza pagerank como algoritmo para encontrar nodos importantes

In [119]:
# K is the length of the co-occurrence window
# filter_nouns_adj selects the filtering mode used by clean()
# number_keywords is the number of keywords to return
def keywords_pagerank(text, number_keywords, filter_nouns_adj, K):
    """Return the top-ranked words of *text* according to weighted PageRank.

    The graph-of-words of the text is built with ``graph_weighted`` and its
    nodes are scored with ``nx.pagerank`` (damping factor 0.85, using the
    edge ``weight``).

    Args:
        text: Raw input string.
        number_keywords: Maximum number of keywords to return.
        filter_nouns_adj: Forwarded to ``graph_weighted``.
        K: Window length forwarded to ``graph_weighted``.

    Returns:
        A list of at most ``number_keywords`` words, highest score first.
        The list is empty when cleaning leaves no words at all.
    """
    G = graph_weighted(text, K, filter_nouns_adj)
    # With no nodes there is nothing to rank; return early instead of
    # failing while unpacking an empty ranking.
    if len(G) == 0:
        return []

    scores = nx.pagerank(G, alpha=0.85, weight='weight')
    # sorted() is stable, so words with equal scores keep their node order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:number_keywords]
        

In [120]:
# Demo: 4 keywords from the sample text, filter_nouns_adj=True, window K=2.
keywords_pagerank(text,4,True,2)

['cat', 'knife', 'woman', 'lion']

### corpus de keywords+papers

In [41]:
# Load the corpus; the context manager closes the file handle once the JSON
# has been parsed.
with open('corpus_cell.json') as corpus_file:
    data = json.load(corpus_file)

In [55]:
# Keep only the corpus entries that have both a non-empty introduction and
# non-empty keywords, keyed by their position in the corpus.
data_keywords = {
    position: [paper['introduction'], paper['keywords']]
    for position, paper in enumerate(data)
    if len(paper['introduction']) > 0 and len(paper['keywords']) > 0
}

In [147]:
# For each retained entry store [introduction text, lower-cased keywords].
introduction_keywords = {}

for key, (introduction, paper_keywords) in data_keywords.items():
    # Skip entries in which every keyword is unchanged by upper(), i.e. none
    # of the keywords contains a lower-case character.
    if not any(w.upper() != w for w in paper_keywords):
        continue
    # Join the introduction pieces with spaces, then strip every span
    # enclosed in parentheses or square brackets. The raw string keeps the
    # backslashes literal without relying on invalid escape sequences.
    # NOTE(review): assumes 'introduction' is a list of strings -- confirm.
    flat_text = re.sub(r"[\(\[].*?[\)\]]", "", ' '.join(introduction))
    introduction_keywords[key] = [flat_text, [w.lower() for w in paper_keywords]]

In [148]:
# Per entry: ['; '-joined predicted keywords, '; '-joined original keywords].
# Predictions use 15 keywords, noun/adjective filtering and a window of 4.
keyword_keyword = {}
for key, (intro_text, original_keywords) in introduction_keywords.items():
    predicted = keywords_pagerank(intro_text, 15, True, 4)
    keyword_keyword[key] = ['; '.join(predicted), '; '.join(original_keywords)]

In [150]:
# Show the original (lower-cased, '; '-joined) keywords of corpus entry 79.
keyword_keyword[79][1]

'cd8; t cell; exhaustion; dysfunction; cancer; single-cell; crispr/cas9; gata-3; metallothioneins; zinc; tils; tumor'

In [157]:
import pandas as pd

# One row per entry, in the iteration order of introduction_keywords.
lista_keywords = [keyword_keyword[key] for key in introduction_keywords]

# Columns: prediction, original. Written tab-separated without header or index.
df = pd.DataFrame(lista_keywords)
df.to_csv('keywords.csv', sep='\t', header=False, index=False)

In [160]:
# One introduction text per row, written tab-separated without header or index.
introductions = [entry[0] for entry in introduction_keywords.values()]
df = pd.DataFrame(introductions)
df.to_csv('introductions.csv', sep='\t', header=False, index=False)