# graph of words

## datos - corpus introducciones revista CELL

In [1]:
import ast
import re
from nltk import sent_tokenize
import spacy
nlp = spacy.load('en')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
import networkx as nx
import community

In [2]:
# Load the corpus; the context manager closes the file handle as soon as the
# contents have been read (the original left it open).
with open('corpus_cell.json','r') as data:
    data_cell=data.read()
# NOTE(review): the file is parsed as a Python literal, not with the json
# module, although it is named *.json -- confirm the on-disk format.
data_cell = ast.literal_eval(data_cell)
# Join the pieces stored under each item's 'introduction' key into one string.
introductions=[' '.join(item['introduction']) for item in data_cell]
# Remove every span running from an opening "(" or "[" to the nearest closing
# ")" or "]" (non-greedy). Raw string: same pattern, no invalid-escape warning.
introductions=[re.sub(r"[\(\[].*?[\)\]]", "", item) for item in introductions]
# Discard introductions that are empty after the substitution.
introductions=[x for x in introductions if x]

## clean text

In [8]:
# Read the stop-word list; `with` guarantees the file is closed afterwards
# (the original never closed the handle).
with open('stopwords.txt','r') as data:
    data_read = data.read()
# split() without arguments splits on any run of whitespace, newlines included,
# so no prior newline-to-space replacement is needed.
stop_words=data_read.split()

def clean(text):
    """Tokenize *text* and return its filtered, lowercase tokens.

    The text is split with the module-level ``tokenizer``; each token is
    lowercased, then dropped if it appears in the module-level ``stop_words``
    or if ``str.isnumeric()`` is true for it.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of the remaining tokens, in their original order (duplicates are
        kept).
    """
    # Build the lookup set once per call: membership tests against the
    # stop-word list would otherwise be a linear scan for every token.
    stop_set = set(stop_words)
    lowered = (token.lower() for token in tokenizer.tokenize(text))
    return [
        word
        for word in lowered
        if word not in stop_set and not word.isnumeric()
    ]

# Clean every introduction in parallel, using as many joblib workers as
# multiprocessing reports CPUs.
n = multiprocessing.cpu_count()
texts = Parallel(n_jobs=n)(
    delayed(clean)(introduction) for introduction in introductions
)

In [9]:
# Show the cleaned token list of the first introduction.
texts[0]


['emerging',
 'outbreak',
 'zika',
 'virus',
 'americas',
 'brought',
 'obscure',
 'pathogen',
 'forefront',
 'global',
 'healthcare',
 'transmitted',
 'aedes',
 'aegypti',
 'albopictus',
 'mosquitoes',
 'zika',
 'virus',
 'infections',
 'spread',
 'international',
 'travel',
 'expanded',
 'large',
 'heavily',
 'populated',
 'regions',
 'south',
 'central',
 'north',
 'america',
 'correlations',
 'increase',
 'zika',
 'virus',
 'infections',
 'development',
 'fetal',
 'microcephaly',
 'guillain',
 'barré',
 'syndrome',
 'resulted',
 'declaration',
 'public',
 'health',
 'emergency',
 'world',
 'health',
 'organization',
 'call',
 'fast',
 'tracked',
 'development',
 'zika',
 'virus',
 'diagnostics',
 'synthetic',
 'biology',
 'emerging',
 'discipline',
 'great',
 'potential',
 'respond',
 'pandemics',
 'increasing',
 'ability',
 'synthetic',
 'biologists',
 'repurpose',
 'engineer',
 'natural',
 'biological',
 'components',
 'practical',
 'applications',
 'led',
 'opportunities',
 'mol

## grafos!

In [10]:
def graph_unweighted(text,K):
    """Build an unweighted co-occurrence graph (graph of words) from *text*.

    The text is first passed through ``clean``. Every distinct token becomes a
    node, and each token occurrence is linked to each of the (up to) ``K``
    tokens that follow it. A token followed by another occurrence of itself
    inside the window produces a self-loop, as in the original implementation.

    Args:
        text: Raw text to convert.
        K: Length of the forward window, in tokens. Values below 1 produce a
            graph without edges.

    Returns:
        A ``networkx.Graph`` whose nodes are the distinct cleaned tokens.
        Nodes are inserted in order of first appearance in the text.
    """
    tokens = clean(text)
    n_tokens = len(tokens)

    # Group the positions of each word in a single pass. The original rescanned
    # the whole token list once per token (repeats included), which was
    # quadratic and re-added the same edges over and over.
    positions = {}
    for index, word in enumerate(tokens):
        positions.setdefault(word, []).append(index)

    G = nx.Graph()
    G.add_nodes_from(positions)
    for word, indices in positions.items():
        for index in indices:
            # Forward window, clipped at the end of the text.
            window_end = min(index + K, n_tokens - 1)
            for neighbour_index in range(index + 1, window_end + 1):
                # Adding an edge that already exists leaves the graph
                # unchanged, so no has_edge() guard is needed.
                G.add_edge(word, tokens[neighbour_index])

    return G

def graph_weighted(text,K):
    """Build a weighted co-occurrence graph (graph of words) from *text*.

    The text is first passed through ``clean``. Every distinct token becomes a
    node, and each token occurrence is linked to each of the (up to) ``K``
    tokens that follow it. The ``'weight'`` attribute of an edge is the number
    of times its two tokens co-occur inside such a window. A token followed by
    another occurrence of itself inside the window produces a self-loop.

    Each co-occurrence is counted exactly once. The original looped over every
    token of the text (repeats included) and, for each, processed *all*
    occurrences of that word, so a word appearing m times had each of its
    co-occurrences counted m times.

    Args:
        text: Raw text to convert.
        K: Length of the forward window, in tokens. Values below 1 produce a
            graph without edges.

    Returns:
        A ``networkx.Graph`` whose nodes are the distinct cleaned tokens and
        whose edges carry an integer ``'weight'`` attribute. Nodes are inserted
        in order of first appearance in the text.
    """
    tokens = clean(text)
    n_tokens = len(tokens)

    # Group the positions of each word in a single pass so every occurrence is
    # visited exactly once.
    positions = {}
    for index, word in enumerate(tokens):
        positions.setdefault(word, []).append(index)

    G = nx.Graph()
    G.add_nodes_from(positions)
    for word, indices in positions.items():
        for index in indices:
            # Forward window, clipped at the end of the text.
            window_end = min(index + K, n_tokens - 1)
            for neighbour_index in range(index + 1, window_end + 1):
                neighbour = tokens[neighbour_index]
                if G.has_edge(word, neighbour):
                    G[word][neighbour]['weight'] += 1
                else:
                    G.add_edge(word, neighbour, weight=1)

    return G

## Construimos los grafos y extraemos algunos rasgos de centralidad (ver la referencia de NetworkX):
### https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.centrality.html

In [11]:
def features(text, K=4):
    """Compute graph-of-words features for one text.

    An unweighted co-occurrence graph is built with ``graph_unweighted`` and
    summarised through three networkx centrality measures plus the number of
    communities found by ``community.best_partition``.

    Args:
        text: Raw text to analyse.
        K: Forward window length passed to ``graph_unweighted``. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        A dict with keys ``'degree'``, ``'closeness'`` and ``'betweenness'``
        (each the per-node result of the corresponding networkx centrality
        function) and ``'number_communities'`` (count of distinct community
        labels in the partition).
    """
    d={}
    G=graph_unweighted(text,K)
    d['degree']=nx.degree_centrality(G)
    d['closeness']=nx.closeness_centrality(G)
    d['betweenness']=nx.betweenness_centrality(G)
    # G is created as nx.Graph, i.e. already undirected, and is not used after
    # this call, so the extra to_undirected() copy is unnecessary.
    partition=community.best_partition(G)
    d['number_communities']=len(set(partition.values()))

    return d

# Extract the graph features of every introduction in parallel, one joblib
# worker per CPU reported by multiprocessing.
graph_features = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(features)(introduction) for introduction in introductions
)

In [7]:
# Show the feature dict computed for the first introduction.
graph_features[0]

{'betweenness': {'48': 0.0018815695650554925,
  '7': 0.0010555616626659811,
  'ability': 0.00031066851971415176,
  'abiotic': 0.0016076681106606634,
  'accurate': 0.001743278506868565,
  'acid': 0.012422858663428791,
  'active': 0.0008721118849199547,
  'addressed': 0.001171427638210541,
  'aedes': 0.004372496752111652,
  'aegypti': 0.006325085064876688,
  'albopictus': 0.004798808628448773,
  'algorithms': 0.000782158857817708,
  'america': 0.010221997846321038,
  'americas': 0.003010152806771594,
  'amplification': 0.05843237439774792,
  'antibody': 0.0009119979766195564,
  'applications': 0.002046380137342479,
  'approaches': 0.0007428219356763294,
  'assemble': 0.0038793228148341205,
  'attained': 0.00047592369082462787,
  'automate': 0.0014680100889753974,
  'barriers': 0.002054777727559894,
  'barré': 0.01038159235570388,
  'base': 0.0068301517611687465,
  'based': 0.12510229588990007,
  'bind': 0.0005301883076436343,
  'biological': 0.0007548340979687019,
  'biologists': 0.00077