# Imports

In [1]:
# Import pandas
import pandas as pd

# Import matplotlib
import matplotlib.pyplot as plt

# Import Network X
import networkx as nx

# Paths for in/out files

In [2]:
# Path of IN-labels
names_path = '../data/interim/id_name_tree_without_SCR.pkl'

# Path for IN-tags
tags_path = '../data/interim/df_geoid_meshid__no_omim_scr.pkl'

# Path of OUT-labels - for Gephi
gephi_node_labels = 'gephi_node_labels.csv'

# Path of OUT-edges - for Gephi
gephi_weighted_edges = 'gephi_weighted_edges.gexf'

# Read names_df

In [3]:
# Read
names_df = pd.read_pickle(names_path)

# Echo
names_df.head()

Unnamed: 0,mesh_id,mesh_heading,mesh_treenumbers,category
0,D000001,Calcimycin,D03.633.100.221.173,D
1,D000002,Temefos,D02.705.400.625.800,D
2,D000003,Abattoirs,J01.576.423.200.700.100,J
3,D000004,Abbreviations as Topic,L01.559.598.400.556.131,L
4,D000005,Abdomen,A01.923.047,A


# Construct Gephi labels

In [4]:
# Check all entries
print('Shape -Before-: ', names_df.shape)

# Check non-None entries
gephi_labels = names_df.dropna(axis=0)

# Check for duplicates
gephi_labels = gephi_labels.drop_duplicates(subset='mesh_id', keep='first')

# Check all entries - again
print('Shape  -After-: ', gephi_labels.shape)

# Copy only id and label
gephi_labels = gephi_labels.drop(columns='mesh_treenumbers category'.split())

# Rename for csv-file
gephi_labels = gephi_labels.rename(columns={'mesh_id':'id', 'mesh_heading':'label'})

# Save to .csv
gephi_labels.to_csv(gephi_node_labels, index=False)

# Echo
gephi_labels.head()

Shape -Before-:  (59746, 4)
Shape  -After-:  (29349, 4)


Unnamed: 0,id,label
0,D000001,Calcimycin
1,D000002,Temefos
2,D000003,Abattoirs
3,D000004,Abbreviations as Topic
4,D000005,Abdomen


# Read tags_df

In [5]:
# Read
tags_df = pd.read_pickle(tags_path)

# Echo
tags_df.head()

Unnamed: 0,geo_id,nsamples,date,mesh_id,category
0,200000001,38.0,2001/01/22,D014407,A
1,200000001,38.0,2001/01/22,D008297,Sex
2,200000001,38.0,2001/01/22,D008545,C
3,200000001,38.0,2001/01/22,D008875,M
4,200000001,38.0,2001/01/22,D009361,C


# Filter DataFrame - Do it here since it still looks like the original thing...

In [6]:
# Construct date filter
mask_date = tags_df['date']==tags_df['date']

# Construct category filter
mask_category = tags_df['category']=='D'#tags_df['category']

# Filter data
tags_df = tags_df[mask_date & mask_category]

# Eliminate filterning columns
tags_df.drop(columns='date category'.split(), inplace=True)

# Clean Data

In [7]:
# Check shape
print('Shape -before-: ', tags_df.shape)

# Drop NaNs
tags_df.dropna(axis=0,inplace=True)

# Delete duplicates
tags_df = tags_df.drop_duplicates()

# Only select summaries with +1 tag
tags_by_summary = tags_df['geo_id mesh_id'.split()].groupby('geo_id').count().reset_index() # Count tags per summary
good_summaries = tags_by_summary[tags_by_summary['mesh_id']>1] # Select abstracts with more than one tag
clean_tags = pd.merge(tags_df, good_summaries, on='geo_id') # Inner Join
clean_tags = clean_tags.drop(columns='mesh_id_y') # Drop column from inner join
clean_tags = clean_tags.rename(columns={'mesh_id_x':'mesh_id'}) # Rename key column
                             
# Check shape -again
print('Shape  -after-: ', tags_df.shape)

# Echo
tags_df.head()

Shape -before-:  (130753, 3)
Shape  -after-:  (130686, 3)


Unnamed: 0,geo_id,nsamples,mesh_id
9,200000001,38.0,D012333
28,200000010,4.0,D007501
29,200000010,4.0,D010100
44,200000010,4.0,D012333
50,200000014,765.0,D004272


# Construct DataFrame with Weighted Links

In [8]:
# Construct all-with-all links inside same geoid-nsample-date record
links = pd.merge(tags_df, tags_df, on='geo_id nsamples'.split())

# Echo info
print('     All links: ',links.shape[0])

# Rename to Source-Target
links.rename(columns={'mesh_id_x':'source', 'mesh_id_y':'target'}, inplace=True)

# Delete self-linkage
links.drop(links[links['source']==links['target']].index, inplace=True)

# Collapse repetitions while calculating weights
links_weights = links.groupby('source target'.split()).sum().reset_index()

# Rename sum(nsamples) to 'weight'
links_weights.rename(columns={'nsamples':'weight'}, inplace=True)

# Account for mirror-duplicates
links_weights['weight']/=2

# Normalize weights
links_weights['weight']/=links_weights['weight'].max()

# Echo info
print('Weighted links: ', links_weights.shape[0])

# Head
links_weights.head()

     All links:  806080
Weighted links:  200496


Unnamed: 0,source,target,weight
0,D000001,D002110,0.000253
1,D000001,D002118,0.000337
2,D000001,D003029,0.000337
3,D000001,D006657,0.000337
4,D000001,D007476,0.000337


# Construct Graph

In [9]:
# Construct Directed Graph
az = nx.from_pandas_edgelist(links_weights, 
                             source='source', 
                             target='target', 
                             edge_attr='weight', 
                             create_using=nx.DiGraph()
                            )

# Check for perfectly balanced links
suma = 0.0
n0='D000001'
for n1 in az.neighbors(n0):
    for n2 in az.neighbors(n1):
        suma += az[n1][n2]['weight'] - az[n2][n1]['weight']
print('Balanced graph: ', suma==0.0)

# Transform to undirected graph
azud = nx.to_undirected(az)

# Echo info
print('  Size (Nodes): ', azud.size())
print(' Order (Edges): ', azud.order())

Balanced graph:  True
  Size (Nodes):  200496
 Order (Edges):  4496


# Run statistics

In [10]:
# Eigenvector centrality
eigencentrality = nx.eigenvector_centrality(azud, max_iter=500, weight='weight')

In [11]:
# PageRank
pagerank = nx.pagerank(azud, alpha=0.9)

# Export to Gephi

In [12]:
# Export directly in GEXF format
nx.write_gexf(azud, gephi_weighted_edges)