# graph of words

## definimos los grafos

In [3]:
# Dependencies for the whole notebook, plus two module-level resources
# (the spaCy pipeline and the stop-word set) that clean() relies on.
import community
import ast
import re
from nltk import sent_tokenize
import spacy
# Load the spaCy pipeline once; clean() calls it on every sentence to get
# part-of-speech tags and lemmas.
nlp = spacy.load('en')
import numpy as np
import networkx as nx
import random
import operator
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords # NLTK stop-word lists
# English stop words as a set; clean() tests lemmas for membership in it.
stop_words = set(stopwords.words('english'))

In [34]:
def clean(text, filter_nouns_adj):
    """Turn raw text into a flat list of lemmas with stop words removed.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence is run through the module-level spaCy pipeline ``nlp``. The
    lemma of each kept token is collected in document order.

    Args:
        text: Raw text to process.
        filter_nouns_adj: Selects the filtering mode. When true, only tokens
            tagged NOUN (common noun), PROPN (proper noun) or ADJ are kept.
            When false, every token is kept. In both modes, lemmas found in
            the module-level ``stop_words`` set are dropped afterwards.

    Returns:
        A list of lemma strings.
    """
    kept_pos = ('NOUN', 'ADJ', 'PROPN')
    lemmas = []
    for sentence in sent_tokenize(text):
        for token in nlp(sentence):
            # The part-of-speech restriction only applies in noun/adjective mode.
            if filter_nouns_adj and token.pos_ not in kept_pos:
                continue
            lemma = token.lemma_
            if lemma not in stop_words:
                lemmas.append(lemma)
    return lemmas

In [35]:
def graph_weighted(text, K, filter_nouns_adj):
    """Build the weighted co-occurrence graph ("graph of words") of a text.

    The text is first reduced to a token list with ``clean``. Every distinct
    token becomes a node, and two tokens are linked when one appears within
    the ``K`` positions that follow the other. The ``'weight'`` attribute of
    an edge is the number of such co-occurrences.

    Args:
        text: Raw text of the document.
        K: Length of the forward-looking window, in tokens.
        filter_nouns_adj: Filtering mode forwarded to ``clean``.

    Returns:
        An undirected ``networkx.Graph``. A token that repeats inside its own
        window yields a self-loop on that node.
    """
    tokens = clean(text, filter_nouns_adj)
    G = nx.Graph()
    # dict.fromkeys removes duplicates; tokens that never get an edge
    # (e.g. a one-token text) are still present as isolated nodes.
    G.add_nodes_from(dict.fromkeys(tokens))

    total = len(tokens)
    # Visit each position exactly once so that every co-occurrence adds
    # exactly 1 to its edge weight, however often the word itself repeats.
    for index, word in enumerate(tokens):
        for k in range(1, K + 1):
            if index + k >= total:
                break  # the window runs past the end of the text
            neighbor = tokens[index + k]
            if G.has_edge(word, neighbor):
                G[word][neighbor]['weight'] += 1
            else:
                G.add_edge(word, neighbor, weight=1)

    return G

## datos de diferentes Q

### Q4

In [36]:
# Load the Q4 corpus. The context manager closes the file handle as soon as
# the content has been read.
# NOTE(review): the file is parsed with ast.literal_eval rather than json, so
# it presumably holds a Python literal and not strict JSON -- confirm.
with open('omics_temp.json','r') as data:
    data_omics = ast.literal_eval(data.read())

In [37]:
# For every Q4 paper build a {section title: [section text]} mapping,
# leaving out the Abstract, Keywords and References sections. A section with
# several paragraphs is collapsed into a single space-joined string (still
# wrapped in a list); otherwise its paragraph list is stored as it is.
DATA = []

for paper in data_omics:
    text_by_title = {}
    for section in paper['sections']:
        heading = section['title']
        if heading in ('Abstract', 'Keywords', 'References'):
            continue
        paragraphs = section['paragraphs']
        if len(paragraphs) > 1:
            paragraphs = [' '.join(paragraphs)]
        text_by_title[heading] = paragraphs
    DATA.append(text_by_title)
    

### papers completos Q4!

In [40]:
# Join all section texts of each Q4 paper into one string.
complete_papers_Q4 = [
    ' '.join(itertools.chain.from_iterable(paper.values()))
    for paper in DATA
]
# Remove spans that open with "(" or "[" and run up to the first ")" or "]".
# The pattern is a raw string so the backslashes reach the regex engine
# without relying on invalid string escape sequences.
complete_papers_Q4 = [
    re.sub(r"[\(\[].*?[\)\]]", "", paper) for paper in complete_papers_Q4
]
# Drop papers that are empty (or a single character) after cleaning.
complete_papers_Q4 = [paper for paper in complete_papers_Q4 if len(paper) > 1]

In [41]:
len(complete_papers_Q4)

2302

### Q1

In [4]:
# Load the Q1 corpus. The context manager closes the file handle as soon as
# the content has been read.
# NOTE(review): the file is parsed with ast.literal_eval rather than json, so
# it presumably holds a Python literal and not strict JSON -- confirm.
with open('cell_full_xml.json','r') as data:
    data_cell = ast.literal_eval(data.read())

In [5]:
def exctractparragraphs(data):
    """Collect every paragraph of the given papers into one flat list.

    Args:
        data: Iterable of paper dicts. Each paper has a ``'sections'`` list.
            Each section has a ``'paragraphs'`` list and, when that list is
            empty, a ``'subsections'`` list whose items each have their own
            ``'paragraphs'`` list.

    Returns:
        A list with the paragraphs in document order. A section's own
        paragraphs take precedence: its subsections are only read when the
        section itself has no paragraphs.

    Raises:
        KeyError: If a paper, section or subsection lacks a key that is read.
    """
    outputlist = []
    for paper in data:
        for section in paper['sections']:
            paragraphs = section['paragraphs']
            if paragraphs:
                outputlist.extend(paragraphs)
                continue
            # No direct paragraphs: fall back to the subsections' paragraphs.
            for subsection in section['subsections']:
                outputlist.extend(subsection['paragraphs'])
    return outputlist

In [6]:
# One flat paragraph list per Q1 paper; each paper is wrapped in a
# one-element list because exctractparragraphs expects an iterable of papers.
DATA_cell = [exctractparragraphs([paper]) for paper in data_cell]

### papers completos Q1

In [7]:
# Join the paragraphs of each Q1 paper into one string.
complete_papers_cell = [' '.join(paper) for paper in DATA_cell]
# Remove spans that open with "(" or "[" and run up to the first ")" or "]".
# The pattern is a raw string so the backslashes reach the regex engine
# without relying on invalid string escape sequences.
complete_papers_cell = [
    re.sub(r"[\(\[].*?[\)\]]", "", paper) for paper in complete_papers_cell
]
# Drop papers that are empty (or a single character) after cleaning.
complete_papers_cell = [paper for paper in complete_papers_cell if len(paper) > 1]
# Strip the reference placeholder markers left in the text.
complete_papers_cell = [
    paper.replace('__REF', '').replace('REF__', '')
    for paper in complete_papers_cell
]

In [8]:
len(complete_papers_cell)

2118

## Rasgos de los grafos

### primero calculamos los grafos!

In [None]:
from joblib import Parallel, delayed
import multiprocessing

# Use one joblib worker per CPU reported by the machine.
n = multiprocessing.cpu_count()

# Build one word graph per Q4 paper (window of 4 tokens, stop-word filtering
# only). Serial equivalent:
#   [graph_weighted(paper, 4, False) for paper in complete_papers_Q4]
omics_jobs = (delayed(graph_weighted)(paper, 4, False) for paper in complete_papers_Q4)
grafos_omics = Parallel(n_jobs=n)(omics_jobs)

In [None]:
# Build one word graph per Q1 paper with the same settings as for Q4
# (window of 4 tokens, stop-word filtering only), using n joblib workers.
cell_jobs = (delayed(graph_weighted)(paper, 4, False) for paper in complete_papers_cell)
grafos_cell = Parallel(n_jobs=n)(cell_jobs)

### rasgos de los grafos

In [None]:
def graph_features(graph):
    """Compute a set of structural features of a word graph.

    Args:
        graph: Undirected ``networkx.Graph`` whose edges carry a ``'weight'``
            attribute, such as the graphs returned by ``graph_weighted``.

    Returns:
        A dict with these entries:
          * ``'clustering'``, ``'betweenness_centrality'``,
            ``'closeness_centrality'``, ``'core_number'``,
            ``'degree_centrality'``: dicts keyed by node.
          * ``'diameter'``, ``'number_connected_components'``: scalars.
          * ``'number_communities'``: number of communities found by
            ``community.best_partition``, as a float.
          * ``'number_triangles'``: number of distinct triangles, as a float.

    Raises:
        networkx.NetworkXError: Propagated from ``nx.diameter`` when the graph
            is not connected.
    """
    features = {}
    features['clustering'] = nx.clustering(graph, weight='weight')
    # NOTE(review): networkx treats edge weights as distances when computing
    # weighted shortest paths, so frequently co-occurring words count as far
    # apart here -- confirm this is the intended reading of 'weight'.
    features['betweenness_centrality'] = nx.betweenness_centrality(
        graph, normalized=False, weight='weight')
    features['closeness_centrality'] = nx.closeness_centrality(graph)

    # nx.core_number refuses graphs with self-loops, and a word repeated
    # inside its own window produces one. Compute the core numbers on a
    # loop-free copy so the caller's graph is left untouched.
    self_loops = [(u, v) for u, v in graph.edges() if u == v]
    if self_loops:
        loop_free = graph.copy()
        loop_free.remove_edges_from(self_loops)
    else:
        loop_free = graph
    features['core_number'] = nx.core_number(loop_free)

    features['degree_centrality'] = nx.degree_centrality(graph)
    features['diameter'] = nx.diameter(graph)
    features['number_connected_components'] = nx.number_connected_components(graph)

    partition = community.best_partition(graph)
    features['number_communities'] = float(len(set(partition.values())))

    # nx.triangles returns a per-node count; each triangle is counted once at
    # each of its three nodes, hence the division by 3.
    features['number_triangles'] = sum(nx.triangles(graph).values()) / float(3)

    return features