In [87]:
import pandas as pd
import os
import networkx as nx
import itertools
from netwulf import visualize
import netwulf as nw
from collections import Counter
import matplotlib.pyplot as plt

In [88]:
author_df = pd.read_csv("C:\\Users\\mansj\\IQL\\data\\databank_deposit_version_2\\exrx_author_final.csv")

#### Add column full_name 

In [104]:
author_df["full_name"] = author_df['author_given_name'] + " " + author_df['author_surname'].fillna('')
author_df.head()

Unnamed: 0,ID,Title,author_given_name,author_surname,author_id,full_name
0,1,Physical activity in depressed elderly. a syst...,Gioia,Mura,7005321937,Gioia Mura
1,1,Physical activity in depressed elderly. a syst...,Mauro Giovanni,Carta,7006714673,Mauro Giovanni Carta
2,2,Exercise for depression,Gary. M.,Cooney,26028761700,Gary. M. Cooney
3,2,Exercise for depression,Kerry,Dwan,25027473800,Kerry Dwan
4,2,Exercise for depression,Carolyn A.,Greig,7003441733,Carolyn A. Greig


#### Author with same id have different names

In [105]:
author_df[author_df.author_id == 7201720357]

Unnamed: 0,ID,Title,author_given_name,author_surname,author_id,full_name
46,11,Moderators of response in exercise treatment f...,A. L.,Dunn,7201720357,A. L. Dunn
146,26,Physical activity and incident depression: a m...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
391,88,Exercise and depression: meeting standards to ...,A.,Dunn,7201720357,A. Dunn
429,95,The DOSE study: a clinical trial to examine ef...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
484,106,Exercise treatment for depression: Efficacy an...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
1094,219,Exercise as an augmentation treatment for nonr...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
2502,108,The effect of exercise dose on quality of life...,AL,Dunn,7201720357,AL Dunn


#### Grouping on author_id to get the different author names

In [116]:
list_authors  = pd.DataFrame(author_df.groupby('author_id')['full_name'].apply(list))
list_authors.loc[7201720357]

full_name    [A. L. Dunn, Andrea L. Dunn, A. Dunn, Andrea L...
Name: 7201720357, dtype: object

#### Convert it to a list and then to a dictionary to replace the author names with one common name


In [117]:
auth_list = list_authors.full_name.to_list()

auth_dict = dict()

for i in auth_list:
    i = list(set(i))
    if len(i) > 1:
        auth_dict[i[0]] = i[1:]

#### Function replaces the names of author with same id with the selected key in the dictionary
def replace(auth_name, final_dict):
    for k, v in final_dict.items():
        if auth_name in v:
            auth_name = k
            
    return auth_name
    

author_df.full_name = author_df.full_name.apply(lambda x : replace (x, auth_dict))

#### Check if the author names are changed to one name

In [118]:

author_df[author_df.author_id == 7201720357]

Unnamed: 0,ID,Title,author_given_name,author_surname,author_id,full_name
46,11,Moderators of response in exercise treatment f...,A. L.,Dunn,7201720357,Andrea L. Dunn
146,26,Physical activity and incident depression: a m...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
391,88,Exercise and depression: meeting standards to ...,A.,Dunn,7201720357,Andrea L. Dunn
429,95,The DOSE study: a clinical trial to examine ef...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
484,106,Exercise treatment for depression: Efficacy an...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
1094,219,Exercise as an augmentation treatment for nonr...,Andrea L.,Dunn,7201720357,Andrea L. Dunn
2502,108,The effect of exercise dose on quality of life...,AL,Dunn,7201720357,Andrea L. Dunn


#### Check if the author with same id are replaced with a common name

In [121]:
id_df_1 = author_df[['author_id','full_name']]
merged_df_1 = id_df_1.merge(id_df_1, how='left', left_on='author_id', right_on='author_id')
auth_dict_1= merged_df_1[merged_df_1['full_name_x'] != merged_df_1['full_name_y']]
auth_dict_1

Unnamed: 0,author_id,full_name_x,full_name_y


In [78]:
# Create the nodes and edges of the graph from the given dataframe by grouping on the paper ID
G = nx.Graph()
edges = pd.DataFrame(author_df.groupby('ID')['full_name'].apply(list))['full_name'].to_list()

In [122]:
# Counting the number of times the two authors have worked together on a paper
new  = []
for i in edges:
    if len(i) >= 2:
        for subset in itertools.combinations(i,2):
            if len(subset) == 2:
                new.append(subset)
sorted_list = []
for i in new:
    sorted_list.append(tuple(sorted(list(i))))
    
count_auth = Counter(sorted_list)

In [123]:
# Adding the edges to the graph with the weights (number of times the authors have worked together)
for k, v in count_auth.items():
    G.add_edge(k[0], k[1],  weight = v,  length = v)
G.remove_nodes_from(list(nx.isolates(G)))

### Visualized with the help of netwulf. The network data tuned by the visualization is posted back to Python. 
- The visualization function returns two dictionaries.
1. the first containing information about the stylized network.
2. The second containing information about the visualization control configuration which can be used to reproduce the same visualization.


In [124]:
#Uncomment and run the below code to configure the visualization in a different way (different from the set config)

# styled_network, config = nw.visualize(G)


In [125]:
## Config that remains same for the generated visualizations
config = {'zoom': 0.7417314814814815,
 'node_charge': -27.82260101010101,
 'node_gravity': 0.1011679292929293,
 'link_distance': 23.213087752525254,
 'link_distance_variation': 0,
 'node_collision': True,
 'wiggle_nodes': False,
 'freeze_nodes': False,
 'node_fill_color': '#212726',
 'node_stroke_color': '#1f1b1b',
 'node_label_color': '#063a5b',
 'display_node_labels': False,
 'scale_node_size_by_strength': True,
 'node_size': 10.230113636363637,
 'node_stroke_width': 1.356460437710438,
 'node_size_variation': 0.5,
 'link_color': '#7c7c7c',
 'link_width': 3.5610255260942756,
 'link_alpha': 0.29941708754208757,
 'link_width_variation': 0.22592803030303032,
 'display_singleton_nodes': True,
 'min_link_weight_percentile': 0,
 'max_link_weight_percentile': 1}

#### Run the below code to generate the visualization.
- The visualization will open in a new window
- To revert back to the code, select the post to python option in the menu, located on the right side of the window.

In [68]:
nw.visualize(G, config=config)

(None, None)

#### Visualizing the main connected component of the graph

In [126]:
giant = max(nx.connected_components(G), key=len)

giant_list = list(giant)

N = nx.Graph()

for node in G.nodes():
    if node in giant_list:
        for k, v in G.edges(node):
            N.add_edge(k,v, weight = G[k][v]["weight"])

### Visualized with the help of netwulf. The network data tuned by the visualization is posted back to Python. 
- The visualization function returns two dictionaries.
1. the first containing information about the stylized network.
2. The second containing information about the visualization control configuration which can be used to reproduce the same visualization.


In [38]:
#Uncomment and run the below code to configure the visualization in a different way
# styled_network, config = nw.visualize(N)

#### Below configuration can be used to reproduce the final visualization created for the co-author network (main component - largest connected component)

In [84]:
config_gaint = {'zoom': 1,
 'node_charge': -38.16603535353536,
 'node_gravity': 0.09254840067340067,
 'link_distance': 38.26709448653199,
 'link_distance_variation': 0,
 'node_collision': True,
 'wiggle_nodes': True,
 'freeze_nodes': False,
 'node_fill_color': '#1b1d1d',
 'node_stroke_color': '#555555',
 'node_label_color': '#0c5968',
 'display_node_labels': False,
 'scale_node_size_by_strength': True,
 'node_size': 10.661090067340067,
 'node_stroke_width': 1.2702651515151513,
 'node_size_variation': 0.4845138888888889,
 'link_color': '#7c7c7c',
 'link_width': 3.044026199494949,
 'link_alpha': 0.48904671717171716,
 'link_width_variation': 0.5,
 'display_singleton_nodes': True,
 'min_link_weight_percentile': 0,
 'max_link_weight_percentile': 1}

In [127]:
nw.visualize(N, config= config_gaint)

In [128]:
# plt.figure(figsize=(300, 300))
# edges = N.edges()
# weights = [N[u][v]['weight'] for u,v in edges]
# nx.draw(N, font_size = 30, with_labels = True, width = weights)
# plt.savefig('main_component.jpg')

#### Top 10 authors with most number of co-authors

In [135]:
c_a = Counter(author_df["full_name"].to_list())
c_a = sorted(c_a.items(), key = lambda x : x[1], reverse=True)
num_auth_df = pd.DataFrame(c_a, index = range(1,len(c_a)+1))
num_auth_df.rename(columns={0 : "author_name", 1 : "number_of_coauthors"}, inplace=True)
num_auth_df[:10]

Unnamed: 0,author_name,number_of_coauthors
1,Davy Vancampfort,27
2,S. Rosenbaum,27
3,B. Stubbs,26
4,Felipe Schuch,19
5,Philip B. Ward,17
6,Michel Probst,13
7,J. A. Blumenthal,12
8,Michael Babyak,11
9,Marc De Hert,10
10,Murali Doraiswamy,9


In [140]:
# function to get the results of centrality (degree, betweenness, closeness) in the dataframe
def df_table(centrality_name, column_name):
    c_list = sorted(centrality_name.items(), key = lambda x : x[1], reverse=True)

    c_dict = dict()
    for i in c_list[:10]:
        c_dict[i[0]] = round(i[1]*1000,2)
    final_df = pd.DataFrame(list(c_dict.items()), index = range(1,11))
    final_df.rename(columns= {0 : "author_name", 1: column_name}, inplace = True)
    return final_df

In [141]:
#degree centrality
degree_unwe = nx.degree_centrality(G)
degree_df = df_table(degree_unwe, "degree_centrality")
degree_df

Unnamed: 0,author_name,degree_centrality
1,B. Stubbs,27.99
2,S. Rosenbaum,27.99
3,Davy Vancampfort,25.92
4,J. A. Blumenthal,24.36
5,Michael Babyak,20.22
6,Michel Probst,19.18
7,Brenda W.J.H. Penninx,19.18
8,Felipe Schuch,17.63
9,Philip B. Ward,16.59
10,Andrea L. Dunn,16.07


In [142]:
#betweeness centrality
betweeness_cent = nx.betweenness_centrality(G)
between_df = df_table(betweeness_cent, "betweenness_centrality")
between_df

Unnamed: 0,author_name,betweenness_centrality
1,Andrea L. Dunn,11.81
2,B. Stubbs,10.05
3,Kenneth R. Fox,8.03
4,Melissa A. Napolitano,7.62
5,Debbie. A. Lawlor,4.9
6,P. W. Ku,4.1
7,L. J. Chen,4.1
8,J. A. Blumenthal,3.97
9,Davy Vancampfort,2.83
10,Fiona Gaughran,2.82


In [143]:
#closeness centrality
closeness_centrality = nx.closeness_centrality(G)
closeness_df = df_table(closeness_centrality, "closeness_centrality")
closeness_df

Unnamed: 0,author_name,closeness_centrality
1,B. Stubbs,57.02
2,Andrea L. Dunn,55.3
3,Davy Vancampfort,53.63
4,S. Rosenbaum,53.22
5,Felipe Schuch,52.1
6,Philip B. Ward,51.99
7,Mats Hallgren,51.99
8,Joseph Firth,51.72
9,André F. Carvalho,51.56
10,Marcelo P. Fleck,49.85


In [150]:
eigenvector_centrality = nx.eigenvector_centrality(G)
eigen_list = sorted(eigenvector_centrality.items(), key = lambda x : x[1], reverse=True)

eigen_dict = dict()
for i in eigen_list[:10]:
    eigen_dict[i[0]] = round(i[1]*100,2)
final_df = pd.DataFrame(list(eigen_dict.items()), index = range(1,11))
final_df.rename(columns= {0 : "author_name", 1: "eigenvector_centrality"}, inplace = True)
final_df

Unnamed: 0,author_name,eigenvector_centrality
1,B. Stubbs,26.25
2,Davy Vancampfort,25.79
3,S. Rosenbaum,25.69
4,Fiona Gaughran,22.65
5,Felipe Schuch,19.89
6,Philip B. Ward,19.76
7,Michel Probst,18.74
8,Joseph Firth,17.86
9,André F. Carvalho,17.82
10,Marc De Hert,16.17


In [157]:
effective_size = nx.effective_size(G)
effective_list = sorted(effective_size.items(), key = lambda x : x[1], reverse=True)

effective_dict = dict()
for i in effective_list[:10]:
    effective_dict[i[0]] = round(i[1],2)
final_ef_df = pd.DataFrame(list(effective_dict.items()), index = range(1,11))
final_ef_df.rename(columns= {0 : "author_name", 1: "effective_size"}, inplace = True)
final_ef_df


Unnamed: 0,author_name,effective_size
1,S. Rosenbaum,44.19
2,B. Stubbs,43.78
3,Davy Vancampfort,39.36
4,J. A. Blumenthal,34.83
5,Michel Probst,27.92
6,Brenda W.J.H. Penninx,27.32
7,Michael Babyak,26.18
8,Felipe Schuch,23.35
9,Andrea L. Dunn,22.29
10,Philip B. Ward,21.25


In [144]:
# # constraint = nx.constraint(G)
# c_list = sorted(constraint.items(), key = lambda x : x[1], reverse=True)

# c_dict = dict()
# for i in c_list[:10]:
#     c_dict[i[0]] = round(i[1],2)
# con_df = pd.DataFrame(list(c_dict.items()), columns={"author_name", "constraint"}, index = range(1,11))
