In [1]:
import collections
import glob
import itertools
import re
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import community
import operator
%matplotlib inline

In [2]:
def popular(iterable, limit):
    """
    A little utility to compute popular values on an iterable.
    """
    return collections.Counter(iterable).most_common(limit)

In [3]:
def article_text_to_dict(article_text: str):
    """
    Translates an article text into a dict.
    """
    data = collections.defaultdict(list)
    field = ''
    for line in re.split(r'\n+', article_text):
        # Fix little bug with isi files
        if line.startswith('null'):
            line = line[4:]
        name = line[:2]
        value = line[3:]
        if not name.isspace():
            field = name
        if not field.isspace() and field != 'ER':
            data[field].append(value)
    return dict(data)

In [4]:
class WosToolsError(Exception):
    """
    All the errors go here.
    """
    pass


class Article(object):
    """
    Abstract a WOS article.
    """

    def __init__(self, article_text):
        self._article_text = article_text
        self._data = article_text_to_dict(article_text)

    def __getattr__(self, name):
        if name not in self._data and not hasattr(self._data, name):
            raise AttributeError('{self.__class__.__name__} does not have an attribute %s' %name)
        if name not in self._data:
            return getattr(self._data, name)
        return self._data[name]

    def __hasattr__(self, name):
        return name in self._data


class CollectionLazy(object):
    """
    A collection of WOS text files.
    """

    def __init__(self, *filenames):
        self.filenames = filenames

    @classmethod
    def from_glob(cls, pattern):
        """
        Creates a new collection from a pattern using glob.
        """
        return cls(*glob.glob(pattern))

    @property
    def files(self):
        """
        Iterates over all files in the collection
        """
        for filename in self.filenames:
            try:
                with open(filename) as filehandle:
                    yield filehandle
            except FileNotFoundError:
                raise WosToolsError('The file %s was not found' %filename)

    @property
    def article_texts(self):
        """
        Iterates over all the single article texts in the colection.
        """
        for filehandle in self.files:
            data = filehandle.read()
            for article_text in data.split('\n\n')[1:]:
                if article_text == 'EF':
                    continue
                yield article_text

    @property
    def articles(self):
        """
        Iterates over all articles.
        """
        for article_text in self.article_texts:
            yield Article(article_text)

    @property
    def authors(self):
        """
        Iterates over all article authors, including duplicates
        """
        authors = (
            article.AF
            for article in self.articles
            if hasattr(article, 'AF')
        )
        return itertools.chain(*authors)

    @property
    def coauthors(self):
        """
        Iterates over coauthor pairs.
        """
        authors_by_article = (
            article.AF
            for article in self.articles
            if hasattr(article, 'AF')
        )
        return itertools.chain(*(
            itertools.combinations(sorted(authors), 2)
            for authors in authors_by_article
        ))

    def completeness(self, key=None):
        """
        Computes the completeness of the collection by key.
        """
        counters = collections.defaultdict(int)
        total = 0
        for article in self.articles:
            total += 1
            for key in article.keys():
                counters[key] += 1
        return {key: val/total for key, val in counters.items()}

    
class Collection(CollectionLazy):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._article_texts = None

    @property
    def article_texts(self):
        """
        Iterates over all the single article texts in the colection.
        """
        if self._article_texts == None:
            self._article_texts = list(set(super().article_texts))
        return self._article_texts


# Building the graph

In [5]:
a = Collection.from_glob('../WoK/*.txt')
authors_list = list(set(list(a.authors)))
coauthors_list = list(a.coauthors)

In [6]:
from pprint import pprint
print(len(coauthors_list))
pprint(coauthors_list[:10])
pprint(authors_list[:10])

43276
[('Abe, T', 'Aoi, H'),
 ('Abe, T', 'Miura, K'),
 ('Abe, T', 'Muraoka, H'),
 ('Abe, T', 'Nakamura, Y'),
 ('Aoi, H', 'Miura, K'),
 ('Aoi, H', 'Muraoka, H'),
 ('Aoi, H', 'Nakamura, Y'),
 ('Miura, K', 'Muraoka, H'),
 ('Miura, K', 'Nakamura, Y'),
 ('Muraoka, H', 'Nakamura, Y')]
['Takano, Yuta',
 'Moon, Jaekyun',
 'Egawa, Genta',
 'Wu Hai-shun',
 'Smith, DJ',
 'Nguyen, H',
 'Kagami, T',
 'Rettner, C. T.',
 'Dahmen, K. A.',
 'Tang, YH']


In [27]:
from igraph import Graph

authors_key = {name: i for i, name in enumerate(authors_list)}
coauthor_edges = [
    (authors_key[source], authors_key[target])
    for source, target in coauthors_list
]

g = Graph()
g.add_vertices(len(authors_key))
g.add_edges(coauthor_edges)
g.vs['label'] = authors_list

In [28]:
from igraph import plot
_ = plot(g, 'something.png')

In [29]:
print(g.vs.degree())
g.simplify()
print(g.vs.degree())

[4, 4, 22, 4, 5, 11, 31, 3, 8, 1, 4, 6, 16, 15, 13, 2, 61, 2, 5, 9, 3, 1, 22, 10, 22, 6, 27, 10, 4, 13, 10, 5, 4, 4, 6, 4, 3, 6, 3, 17, 4, 4, 92, 14, 5, 6, 16, 4, 4, 8, 6, 7, 29, 5, 5, 3, 4, 14, 2, 4, 7, 16, 8, 4, 4, 2, 3, 14, 2, 8, 21, 9, 9, 7, 19, 5, 11, 10, 6, 5, 1, 5, 15, 9, 7, 6, 4, 2, 4, 6, 16, 4, 4, 9, 7, 22, 3, 9, 4, 11, 5, 2, 2, 1, 8, 13, 6, 1, 14, 4, 1, 12, 4, 3, 2, 1, 3, 13, 3, 21, 3, 6, 4, 7, 11, 5, 6, 4, 4, 0, 5, 5, 70, 4, 13, 8, 5, 4, 2, 7, 3, 6, 7, 3, 1, 17, 4, 13, 4, 1, 3, 5, 48, 17, 4, 4, 6, 3, 46, 12, 4, 4, 0, 5, 2, 6, 5, 9, 5, 5, 11, 48, 4, 22, 11, 21, 18, 12, 13, 12, 4, 6, 43, 4, 1, 3, 13, 9, 21, 2, 12, 8, 5, 2, 8, 7, 27, 10, 16, 13, 9, 9, 4, 3, 1, 4, 0, 6, 22, 5, 5, 2, 61, 5, 23, 3, 1, 3, 5, 6, 5, 7, 2, 10, 4, 14, 4, 9, 13, 27, 3, 1, 8, 13, 20, 6, 12, 6, 4, 5, 4, 3, 5, 7, 2, 43, 116, 4, 7, 7, 2, 14, 4, 11, 7, 6, 2, 4, 9, 2, 14, 6, 4, 1, 3, 4, 4, 6, 4, 16, 3, 61, 7, 4, 26, 61, 7, 4, 6, 5, 5, 7, 4, 1, 9, 7, 17, 4, 31, 9, 4, 4, 23, 8, 3, 4, 3, 6, 3, 2, 11, 4, 5, 2, 3,

# Clustering

In [31]:
components = g.components()
components.subgraphs()
plot?

In [33]:
for graph in components.subgraphs():
    if graph.vcount() > 20:
        plot(graph, f'graph-{graph.vcount()}.svg')
        pprint(list(zip(graph.vs.degree(), graph.vs['label'])))

[(4, 'Moon, Jaekyun'),
 (8, 'Egawa, Genta'),
 (3, 'Rettner, C. T.'),
 (5, 'Dahmen, K. A.'),
 (4, 'Critchley, Kevin'),
 (15, 'Watanabe, Katsuro'),
 (14, 'Jain, S.'),
 (21, 'Smith, Neil'),
 (10, 'Chang, Mu-Tung'),
 (5, 'Chan, Kheong-Sann'),
 (3, 'Myint, Lin Min Min'),
 (6, 'Moriya, Tomohiro'),
 (17, 'Ozatay, O.'),
 (72, 'Ikeda, Y.'),
 (5, 'Khatami, Seyed Mehrdad'),
 (11, 'Hu, Bing'),
 (4, 'Zhan, L.'),
 (5, 'Zuo, L.'),
 (3, 'Ise, Kazuyuki'),
 (7, 'Natsiopoulos, G.'),
 (13, 'Grosso, David'),
 (8, 'Kang, Ho Kwan'),
 (6, 'Zhang, C.'),
 (7, 'van de Veerdonk, Rene'),
 (19, 'Oenning, Travis'),
 (10, 'Bashir, Muhammad Asif'),
 (7, 'Radhakrishnan, Rathnakumar'),
 (8, 'Chen, Bingjin'),
 (20, 'Eleftheriou, Evangelos'),
 (9, 'Narisawa, T.'),
 (4, 'Tanaka, Shuji'),
 (12, 'Dill, Frederick'),
 (4, 'Ren, S. L.'),
 (3, 'Liu, E. J.'),
 (13, 'Zakai, Rehan'),
 (4, 'Thompson, G. B.'),
 (10, 'Heinonen, O.'),
 (5, 'Gan, F. X.'),
 (4, 'Murata, Yuto'),
 (4, 'Tabakovic, Ibro'),
 (43, 'Gao, He'),
 (11, 'Carlotti, 

 (14, 'Saito, Hitoshi'),
 (12, 'Nishio, Kazuyuki'),
 (6, 'Singh, Amritpal'),
 (8, 'Shiroyama, T.'),
 (9, 'Chang, Thomas'),
 (12, 'Nabavi, Sheida'),
 (13, 'Fernandez-de-Castro, Juan'),
 (4, 'Tahk, Young-Wook'),
 (18, 'Mao, S.'),
 (8, 'Mai, Ken'),
 (16, 'Kiely, James'),
 (9, 'Ishida, Yoichi'),
 (5, 'Ku, David'),
 (6, 'Nguyen, T. N. Anh'),
 (3, 'Sakai, Masanori'),
 (9, 'Long, H. H.'),
 (14, 'Marinero, Ernesto E.'),
 (18, 'Tanaka, Atsushi'),
 (9, 'Nolan, Thomas P.'),
 (5, 'Matsuyama, Kimihide'),
 (14, 'Angelakeris, M.'),
 (3, 'Ando, Ayano'),
 (7, 'Inaba, Hiroshi'),
 (4, 'Goh, Jing Qiang'),
 (30, 'Xiao, Shuaigang'),
 (6, 'Li, Yanbo'),
 (36, 'Hirano, Toshiki'),
 (6, 'Tada, Yasuhiko'),
 (18, 'Kaiser, C.'),
 (18, 'Srinivasan, K.'),
 (4, 'Li, Jianming'),
 (8, 'Hinata, Shintaro'),
 (7, 'Mutoh, H.'),
 (25, 'Childress, J. R.'),
 (4, 'Dai, Xiangyu'),
 (21, 'Takahashi, Migaku'),
 (5, 'Boone, C. T.'),
 (13, 'Nishiyama, K.'),
 (3, 'Chun, Dongwon'),
 (26, 'Wei, Dan'),
 (2, 'Li, Kwok Hung'),
 (5, 'Esash

 (28, 'Honda, N'),
 (19, 'Judy, JH'),
 (7, 'Zhu, FW'),
 (10, 'Oikawa, S'),
 (22, 'Sunder, A'),
 (4, 'Kodama, H'),
 (3, 'Piramanayagm, SN'),
 (4, 'Xia, W'),
 (5, 'Chen, DQ'),
 (7, 'Stoll, H'),
 (6, 'Jeong, S'),
 (2, 'Moon, J'),
 (4, 'Hee, CH'),
 (15, 'Tamaru, S'),
 (7, 'Ferre, J'),
 (14, 'Wang, HW'),
 (22, 'Swanson, B'),
 (33, 'Heinonen, O'),
 (6, 'Ganesan, S'),
 (6, 'Nakatani, Y'),
 (10, 'Ishimoto, T'),
 (4, 'Lee, TH')]
[(61, 'Ivanova, NA'),
 (61, 'Amaglobeli, NS'),
 (61, 'Zaitseva, OS'),
 (61, 'Samsonov, VA'),
 (61, 'Larichev, AN'),
 (61, 'Rukovichkin, VP'),
 (61, 'Vorobiev, AP'),
 (61, 'Gramenitsky, IM'),
 (61, 'Pleskach, AV'),
 (61, 'Leflat, AK'),
 (61, 'Orfanitsky, SV'),
 (61, 'Bogolyubsky, MY'),
 (61, 'Kurchaninov, LL'),
 (61, 'Lyutov, SI'),
 (61, 'Volkov, VY'),
 (61, 'Esakia, SM'),
 (61, 'Tolmachev, VT'),
 (61, 'Zapolsky, VN'),
 (61, 'Boguslavsky, IV'),
 (61, 'Selikov, AV'),
 (61, 'Nedev, S'),
 (61, 'Ermolov, PF'),
 (61, 'Chekulaev, SV'),
 (61, 'Zotkin, SA'),
 (61, 'Vischnevskaya

In [None]:
sub_graphs = list(nx.connected_component_subgraphs(G))

In [None]:
aut_limit = 20
for i,sg in enumerate(sub_graphs):
    sg_list = sorted(list(sg.degree()), key=lambda x: x[1], reverse=True)
    
    if (len(sg_list) >= aut_limit):
        
        print("The subgraph {} have {} nodes. The principal authors are {} with {} links and {} with {} links."
              .format(i,sg.number_of_nodes(),sg_list[0][0],sg_list[0][1],sg_list[1][0],sg_list[1][1]))
        
        print('The histogram of the authors production is ...')
        sg_data = np.array(sorted([sg_list[i][1] for i in range(aut_limit)], reverse=True))
        sg_data_val = [sg_list[i][0] for i in range(aut_limit)]
        plt.figure(figsize=(16,9))
        plt.bar(range(len(sg_data)), sg_data, align='center')
        plt.xticks(range(len(sg_data)), sg_data_val, size='small', rotation='vertical')
        plt.grid(True)
        plt.title("Subgraph {}".format(i))
        plt.show()
        
        print('The subgraph {} looks like ...'.format(i))
        elarge=[(u,v) for (u,v,d) in sg.edges(data=True) if d['weight'] >2]
        esmall=[(u,v) for (u,v,d) in sg.edges(data=True) if d['weight'] <=2]
        pos = nx.spring_layout(sg, iterations=50)
        #pos = nx.circular_layout(G)
        plt.figure(figsize=(8, 8))
        plt.title('Graph {}'.format(i))
        nx.draw_networkx_nodes(sg,pos,node_size=5, node_color='k')
        nx.draw_networkx_edges(sg,pos,edgelist=elarge, width=5, edge_color='g')
        nx.draw_networkx_edges(sg,pos,edgelist=esmall, width=1,alpha=0.5,edge_color='k',style='dashed')
        plt.axis('off')
        plt.axis('equal')
        plt.show()

I want to make a subplot that will not have a fixed value of subplots, but that automatically do the division. I already made this approximation but it is not what I want

In [None]:
def partitions(nodes, n):
    "Partitions the nodes into n subsets"
    nodes_iter = iter(nodes)
    while True:
        partition = tuple(itertools.islice(nodes_iter,n))
        if not partition:
            return
        yield partition

In [None]:
def btwn_pool(G_tuple):
    return nx.betweenness_centrality_source(*G_tuple)

In [None]:
def between_parallel(G, processes = None):
    p = Pool(processes=processes)
    part_generator = 4*len(p._pool)
    node_partitions = list(partitions(G.nodes(), int(len(G)/part_generator)))
    num_partitions = len(node_partitions)
 
    bet_map = p.map(btwn_pool,
                        zip([G]*num_partitions,
                        [True]*num_partitions,
                        [None]*num_partitions,
                        node_partitions))
 
    bt_c = bet_map[0]
    for bt in bet_map[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c

In [None]:
for sg in sub_graphs:
    if (sg.number_of_nodes() >= 20):

        bt = between_parallel(sg)
        top = 5

        max_nodes =  sorted(bt.items(), key = lambda v: -v[1])[:top]
        bt_values = [5]*len(sg.nodes())
        bt_colors = [0]*len(sg.nodes())
        for i in range(len(max_nodes)):
            bt_values[i] = 150
            bt_colors[i] = 2
        
        pos = nx.spring_layout(sg)
        plt.axis("off")
        nx.draw_networkx(sg, pos = pos, cmap = plt.get_cmap("rainbow"), node_color = bt_colors, node_size = bt_values, with_labels = False)
        plt.show()