In [None]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Load data
# Reads the publications table from a CSV located relative to the current
# working directory. The parsing cell below reads its 'article_id',
# 'main_topic' and 'all_topics' columns.
data = pandas.read_csv('publications.csv')

In [None]:
# Parse topics
import ast

# Build one record per article:
#   id     - the article identifier
#   main   - the part of `main_topic` before the first '.'
#   subs   - the sorted entries of `all_topics` after the first one
#   simstr - `subs` joined with ',' (the form the similarity step splits again)
frames = []
for article_id, main_topic, all_topics in zip(
    data['article_id'], data['main_topic'], data['all_topics']
):
    # `all_topics` is stored as the text of a Python literal sequence.
    # The first entry is skipped (presumably it repeats the main topic -
    # TODO confirm against the data).
    subs = sorted(ast.literal_eval(all_topics)[1:])
    frames.append({
        'id': article_id,
        'main': main_topic.split('.')[0],
        'subs': subs,
        'simstr': ','.join(subs),
    })

df = pandas.DataFrame(frames)

# Preview only the first rows instead of dumping every parsed record.
df.head()

In [None]:
from difflib import SequenceMatcher

def calculate_similarity(a,b):
    """Score how alike two comma-separated token strings are.

    Both strings are split on ',' and the resulting token lists (not the
    individual characters) are compared with ``difflib.SequenceMatcher``.

    Args:
        a: first comma-separated string.
        b: second comma-separated string.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two token lists.
        Note that an empty string splits to ``['']``, i.e. one empty token.
    """
    tokens_a, tokens_b = a.split(','), b.split(',')
    matcher = SequenceMatcher(None, tokens_a, tokens_b)
    return matcher.ratio()


# Calculate edge weights
def generate_edge_weights(df: pandas.DataFrame, min_similarity: float = 0.5,
                          min_links: int = 2, similarity=None):
    """Count, per pair of main topics, how many articles link the two topics.

    For every article with a non-empty ``simstr`` the function looks at all
    articles that have a *different* ``main`` topic, scores each of them
    against the article, and keeps those scoring at least ``min_similarity``.
    Every other topic that contributes at least ``min_links`` such articles is
    considered linked, and the undirected edge between the two topics is
    incremented by one.

    Args:
        df: frame with at least the columns ``main`` and ``simstr``.
        min_similarity: minimum score for two articles to count as similar.
        min_links: minimum number of similar articles a topic must contribute
            before it is linked to the current article's topic.
        similarity: callable ``(simstr_a, simstr_b) -> float``. Defaults to
            the notebook's ``calculate_similarity``.

    Returns:
        dict mapping ``(topic_a, topic_b)`` to an integer count. Each
        unordered pair appears once, in the orientation in which it was first
        encountered.
    """
    if similarity is None:
        # Resolved lazily so the default helper only has to exist at call time.
        similarity = calculate_similarity

    edges = {}
    for thing in df.itertuples():
        if len(thing.simstr) == 0:
            continue

        # Only articles from other topics can create an edge.
        candidates = df[df['main'] != thing.main]
        if candidates.empty:
            continue

        # Score just the candidates and keep the scores in their own Series:
        # no column is written onto a filtered slice (which triggered
        # SettingWithCopyWarning) and same-topic rows are never scored.
        weights = candidates['simstr'].apply(
            lambda other: similarity(thing.simstr, other)
        )
        links = candidates[weights >= min_similarity]

        topic_counts = links['main'].value_counts()
        linked_topics = topic_counts[topic_counts >= min_links].index

        for topic in linked_topics:
            pair = (thing.main, topic)
            # Edges are undirected: reuse the stored orientation if the
            # reversed pair was seen first.
            if pair not in edges and pair[::-1] in edges:
                pair = pair[::-1]
            edges[pair] = edges.get(pair, 0) + 1
    return edges

# Build the topic-to-topic edge counts from the parsed frame. The result is a
# dict keyed by (topic, topic) tuples, which the following cells read.
edges = generate_edge_weights(df)
print(edges)

In [None]:
# Generate adjacency matrix
# Collect every topic that takes part in at least one edge. The labels are
# sorted so the row/column order is the same on every run; iterating a set of
# strings does not guarantee a stable order between kernel restarts.
x = sorted({topic for pair in edges for topic in pair})
side = len(x)
matrix = np.zeros((side, side))
print(x)

# Fill the matrix directly from the edges instead of probing every cell.
# Each edge is undirected, so both (i, j) and (j, i) receive its weight.
# This assumes `edges` stores each unordered pair only once, which is what
# generate_edge_weights produces.
position = {topic: i for i, topic in enumerate(x)}
for (a, b), weight in edges.items():
    i, j = position[a], position[b]
    matrix[i, j] = weight
    matrix[j, i] = weight
adj_matrix = pandas.DataFrame(matrix, index=x, columns=x)

print(adj_matrix)
adj_matrix.to_excel('adj_matrix.xlsx')

In [None]:
%%script false --no-raise-error
# The cell magic above hands the cell body to the shell command `false`, so
# the Python below is NOT executed by the kernel: this cell is disabled.
# Normalize weights
# NOTE(review): the formula divides by `max_val`, not by the range
# (`max_val - min_val`), so the results span [0, (max - min) / max] rather
# than [0, 1] - confirm which was intended before re-enabling.
# NOTE(review): edges with the minimum weight become 0, and the loop
# overwrites `edges` in place, so running the cell twice rescales values that
# were already rescaled.
max_val = max(edges.values())
min_val = min(edges.values())

for pair, weight in edges.items():
    edges[pair] = (weight - min_val) / max_val

print(edges)

In [None]:
# Generate tuples
# One (topic_a, topic_b, weight) triple per edge. Built with a comprehension
# so that no variable named `tuple` is left behind to shadow the builtin for
# the rest of the kernel session.
tuples = [(a, b, weight) for (a, b), weight in edges.items()]
nodes = {topic for pair in edges for topic in pair}

# Create graph
graph = nx.Graph()
# Insert nodes in sorted order so node iteration (and anything derived from
# it, such as colour lists and layouts) is the same on every run. Assumes the
# topic labels are mutually comparable (they are strings when produced by the
# parsing cell).
graph.add_nodes_from(sorted(nodes))
graph.add_weighted_edges_from(tuples)

In [None]:
# Print key characteristics
__length = len(graph.nodes)
__connected = list(nx.connected_components(graph))
# Betweenness is computed once and reused for both the mean and the
# most-influential-node lookup.
__cent = nx.betweenness_centrality(graph)
_max_cent = max(__cent.values())
_inf_node = [node for node in __cent if __cent[node] == _max_cent]
print(graph)
print('Degree centrality {:.2f}'.format(sum(nx.degree_centrality(graph).values()) / __length))
print('Closeness {:.2f}'.format(sum(nx.closeness_centrality(graph).values()) / __length))
print('Betweenness {:.2f}'.format(sum(__cent.values()) / __length))
print('Clustering coefficient {:.2f}'.format(sum(nx.clustering(graph).values()) / __length))
print('Connected components', len(__connected))
if len(__connected) == 1:
    print('Diameter', nx.diameter(graph))
    print('Average shortest path length {:.2f}'.format(nx.average_shortest_path_length(graph)))
else:
    # Diameter and average shortest path length are undefined (networkx
    # raises) on a disconnected graph, so report them for the largest
    # connected component and say so in the label.
    _largest = graph.subgraph(max(__connected, key=len))
    print('Diameter (largest component)', nx.diameter(_largest))
    print('Average shortest path length (largest component) {:.2f}'.format(
        nx.average_shortest_path_length(_largest)))
# If several nodes tie for the highest betweenness, the first one in node
# order is reported.
print('Most influential node', _inf_node[0])
# diameter, pathlength, clustering coefficient, connected components

In [None]:
# Graph the result
# Compute the Kamada-Kawai node positions first, then draw the labelled graph.
layout = nx.kamada_kawai_layout(graph)
nx.draw(graph, pos=layout, with_labels=True)

In [None]:
# Generate communities
import itertools  # no longer used in this cell; kept in case another cell relies on it
# girvan_newman returns an iterator of successive partitions. Only the first
# partition it yields is used here. The previous `takewhile` wrapper was never
# consumed, so it applied no limit and has been removed.
comp = nx.community.girvan_newman(graph)
communities = [list(members) for members in next(comp)]

# Split the partition into isolated nodes and real (multi-node) communities.
singles = [members[0] for members in communities if len(members) == 1]
multis = [members for members in communities if len(members) != 1]
print('Singles', len(singles), ', Multis', len(multis))

In [None]:
colors = ['red', 'green', 'blue', 'orange', 'yellow', 'magenta', 'cyan']

# Map each node of a multi-node community to that community's colour. The
# palette is cycled with a modulo so more than len(colors) communities no
# longer raises IndexError (colours repeat in that case).
community_color = {
    node: colors[i % len(colors)]
    for i, members in enumerate(multis)
    for node in members
}

# One colour per node, in graph iteration order (the order nx.draw expects).
# Single-node communities, and any node not found in a community, are grey,
# so the list always has exactly one entry per node.
color_map = [community_color.get(node, 'grey') for node in graph]

nx.draw(graph, node_color=color_map, pos=nx.kamada_kawai_layout(graph), with_labels=True)
plt.title('Communities by color')
plt.show()

In [None]:
# Same community colouring, drawn with a force-directed (spring) layout.
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run.
nx.draw(graph, node_color=color_map, pos=nx.spring_layout(graph, seed=42), with_labels=True)
plt.title('Communities by color')
plt.show()

In [None]:
# Partition quality
# Modularity of the detected partition.
modularity = nx.community.modularity(graph, communities)
print("Graph modularity {:.2f}".format(modularity))

# partition_quality returns the values formatted below as coverage and performance.
quality = nx.community.partition_quality(graph, communities)
print('Graph partition quality: Coverage {0:.2f}, Performance {1:.2f}'.format(*quality))