In [1]:
import csv
import os
import statistics

import networkx as nx
import pandas as pd

In [2]:
# Path of the topic/sub-topic workbook in the current working directory.
# os.path.join picks the separator of the host OS; the previous hard-coded
# backslash only produced a valid path on Windows.
file = os.path.join(os.getcwd(), 'topic_doc.xlsx')

In [3]:
# Read the workbook at `file`, keeping only the 'topic' and 'sub_topic' columns.
df = pd.read_excel(io=file, usecols=['topic', 'sub_topic'])

In [4]:
# Preview the first five rows of the loaded topic / sub_topic pairs.
df.head(n=5)

Unnamed: 0,topic,sub_topic
0,43,19
1,35,4
2,4,30
3,23,43
4,10,4


In [5]:
# Path of the topic-name workbook in the current working directory.
# os.path.join picks the separator of the host OS; the previous hard-coded
# backslash only produced a valid path on Windows.
namefile = os.path.join(os.getcwd(), 'topic names.xlsx')

In [6]:
# Read the workbook at `namefile`, keeping only the 'topic' and 'name' columns.
name_df = pd.read_excel(io=namefile, usecols=['topic', 'name'])

In [7]:
# Preview the first five rows of the topic-id -> name table.
name_df.head(n=5)

Unnamed: 0,topic,name
0,0,membrane protein
1,1,hormone receptors
2,2,viral vectors
3,3,hepatocarcinogen
4,4,gene muations in HCC


In [8]:
# Translate the ids in df['topic'] / df['sub_topic'] into their names.
#
# The lookup is keyed on name_df['topic'] rather than on row position, so it
# stays correct even if the name sheet is reordered or has gaps in its ids
# (a positional .iloc lookup would silently return the wrong name then).
topic_to_name = dict(zip(name_df['topic'], name_df['name']))

# Fail loudly on ids that have no name instead of producing NaN node labels.
unknown_ids = (set(df['topic']) | set(df['sub_topic'])) - set(topic_to_name)
if unknown_ids:
    raise KeyError('topic ids missing from name_df: {}'.format(sorted(unknown_ids)))

# Vectorised mapping; one name per row of df, aligned on df's index.
source = df['topic'].map(topic_to_name)
dest = df['sub_topic'].map(topic_to_name)

In [9]:
# Start from an empty frame; the edge columns are attached in the next cell.
edges_df = pd.DataFrame(data=None)

In [10]:
# Attach the mapped names as the two columns of the edge list.
# 'source' is assigned first, so it becomes the first column and 'dest' the second.
edges_df['source'] = source
edges_df['dest'] = dest

In [11]:
# Preview the first five (source, dest) name pairs.
edges_df.head(n=5)

Unnamed: 0,source,dest
0,cytochrome P450,angiogenesis in hepatocellular carcinoma
1,RFA,gene muations in HCC
2,gene muations in HCC,proliferation of HCC cells
3,TACE,cytochrome P450
4,apoptosis,gene muations in HCC


In [12]:
# Build an undirected graph whose edge 'weight' counts how many rows of
# edges_df connect the same pair of names.
G = nx.Graph()

for s, d in zip(edges_df['source'], edges_df['dest']):
    if not G.has_edge(s, d):
        # First occurrence of this pair: create the edge.
        G.add_edge(s, d, weight=1)
    else:
        # Pair already present: bump its counter.
        G[s][d]['weight'] += 1

In [13]:
# Export every (source, dest) row of edges_df as a header-less CSV edge list.
# os.path.join picks the separator of the host OS; the previous hard-coded
# backslash only produced a valid path on Windows.
save_file_name = os.path.join(os.getcwd(), 'topic_edges_for_gephi.csv')

# newline="" lets the csv module control line endings itself.
with open(save_file_name, mode='w', buffering=-1, encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    # One CSV row per edge, in the row order of edges_df.
    writer.writerows(zip(edges_df['source'], edges_df['dest']))

In [14]:
# Collect the 'weight' attribute of every edge in G, in G.edges iteration order.
edge_weight = [attrs['weight'] for _, _, attrs in G.edges(data=True)]

In [15]:
# Order the weights from largest to smallest, updating the same list object.
edge_weight[:] = sorted(edge_weight, reverse=True)

In [16]:
# Summary statistics of the edge weights.
if edge_weight:
    print('max:', max(edge_weight))
    print('mean:', sum(edge_weight) / len(edge_weight))
    # statistics.median averages the two middle values of an even-sized list
    # and does not require the list to be pre-sorted; picking the element at
    # index len/2 was only correct for odd-sized, already sorted input.
    print('median:', statistics.median(edge_weight))
    print('min:', min(edge_weight))
else:
    # max()/min() and the mean are undefined for an empty list.
    print('no edges: edge weight statistics are undefined')

max: 1681
mean: 59.33940556088207
medean: 12
min: 1


In [17]:
# Display the degree of every node in G.
G.degree()

DegreeView({'cytochrome P450': 49, 'angiogenesis in hepatocellular carcinoma': 42, 'RFA': 44, 'gene muations in HCC': 47, 'proliferation of HCC cells': 40, 'TACE': 49, 'apoptosis': 48, 'pathology': 49, 'management of HCC': 48, 'liver fibrosis - alcohol, fatty liver': 49, 'serum AFP': 29, 'iron metabolism': 32, 'protein kinase': 49, 'epithelial-to-mesenchymal transition': 46, 'varix': 45, 'hepatotoxicity': 44, 'hepatitis virus - carcinogenesis': 46, 'imaging with contrast dye': 49, 'hormone receptors': 48, 'survival analysis, prognosis': 44, 'ascites': 47, 'vascular procedures': 46, 'DM': 48, 'case reports': 48, 'chromosomal abnormality': 43, 'hepatitis virus infection': 34, 'animal study - carcinogenesis': 47, 'telomerase activity': 49, 'etc': 32, 'hepatectomy': 35, 'membrane protein': 43, 'bile duct': 43, 'role of mitochondria': 37, 'erythoropoietin': 48, 'lipoprotein metabolism': 45, 'gene & protein expression': 47, 'immunotherapy': 46, 'genotypes associated with HCC': 30, 'researche

In [18]:
# Directed co-occurrence counts: node_dict[source][dest] is the number of
# rows of edges_df that go from `source` to `dest`.
node_dict = {}
for src, dst in zip(edges_df['source'], edges_df['dest']):
    targets = node_dict.setdefault(src, {})
    targets[dst] = targets.get(dst, 0) + 1

In [19]:
# Read the 'topic_name' column of topic_year_matrix.xlsx (relative to the working directory).
topic_df = pd.read_excel(io='topic_year_matrix.xlsx', usecols=['topic_name'])

In [20]:
# Build the topic-by-topic matrix: one column per source topic, one row per
# target topic, each cell holding the count stored in node_dict[source][target]
# (0 when the pair never occurs).
topic_names = topic_df['topic_name'].tolist()

# Work on a copy of the name column only. A plain `dsm_df = topic_df` is just
# an alias, so every matrix column would also be added to topic_df and leak
# into whatever is derived from it later.
dsm_df = topic_df[['topic_name']].copy()

for s_name in topic_names:
    # A topic that never appears as a source has no entry in node_dict;
    # treat it as having no outgoing links instead of raising KeyError.
    outgoing = node_dict.get(s_name, {})
    dsm_df[s_name] = [outgoing.get(t_name, 0) for t_name in topic_names]

In [21]:
# Write the topic matrix to topic_matrix.xlsx in the working directory.
dsm_df.to_excel(excel_writer='topic_matrix.xlsx')

In [22]:
import time

In [23]:
# Degree centrality of every node in G, as computed by networkx.
# The result is indexed by node name in the cells below.
degree_centrality = nx.degree_centrality(G)

In [24]:
# Betweenness centrality of every node in G, as computed by networkx.
# NOTE(review): no `weight` argument is passed, so the edge 'weight' counts
# stored on G are not explicitly used here — confirm that is intended.
betweenness_centrality = nx.betweenness_centrality(G)

In [25]:
# Print one line per topic: name, degree centrality, betweenness centrality,
# separated by ' \t '.
for s_name in topic_df['topic_name']:
    fields = (s_name, degree_centrality[s_name], betweenness_centrality[s_name])
    print(' \t '.join(str(field) for field in fields))

proliferation of HCC cells 	 0.8163265306122448 	 0.0023779585086553677
survival analysis, prognosis 	 0.8979591836734693 	 0.003087189447773782
apoptosis 	 0.9795918367346939 	 0.004670564228878202
imaging with contrast dye 	 0.9999999999999999 	 0.005626950911955447
gene & protein expression 	 0.9591836734693877 	 0.004829711076738001
TACE 	 0.9999999999999999 	 0.005626950911955447
disease overview 	 0.6326530612244897 	 0.0003191145351940899
gene muations in HCC 	 0.9591836734693877 	 0.003947954422787178
risk factors 	 0.8979591836734693 	 0.002950406189012981
hepatitis virus infection 	 0.6938775510204082 	 0.0005651107166409035
pathology 	 0.9999999999999999 	 0.005626950911955447
hepatectomy 	 0.7142857142857142 	 0.0005004303067632041
chemotherapy 	 0.7142857142857142 	 0.001242691378357973
management of HCC 	 0.9795918367346939 	 0.005463460328749214
immunotherapy 	 0.9387755102040816 	 0.003638442508549884
vascular procedures 	 0.9387755102040816 	 0.003606765819608715
membr

In [26]:
# Table of per-topic network measures, starting from the topic names only.
# Take a copy of the 'topic_name' column: a plain `network_df = topic_df` is
# an alias, so any extra columns already present on topic_df would end up in
# this table and the columns added here would be written back to topic_df.
network_df = topic_df[['topic_name']].copy()

# Centrality values in the same row order as network_df.
degree_list = [degree_centrality[t_name] for t_name in network_df['topic_name']]
betweenness_list = [betweenness_centrality[t_name] for t_name in network_df['topic_name']]


In [27]:
# Attach the two centrality lists as columns of network_df.
# Both lists were built by iterating the 'topic_name' column, so they line up row by row.
network_df['degree_centrality'] = degree_list
network_df['betweenness_centrality'] = betweenness_list

In [28]:
# Write the network-measure table to network_measures.xlsx in the working directory.
network_df.to_excel(excel_writer='network_measures.xlsx')