In [8]:
!pip install pandas networkx matplotlib openpyxl



In [9]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os

# check the working directory
cwd = os.getcwd()
print(cwd)

C:\JHCloud\OneDrive - North Dakota University System\Desktop\Training\KatelynShared\AMIA2024


In [12]:
# Read the Excel files
all_AKI_genes = pd.read_excel('./20240901_NewDignetResults/AKI_6701-gene-pairs_20240901.xlsx', dtype=str)
all_CKD_genes = pd.read_excel('./20240901_NewDignetResults/CKD_79569-gene-pairs_20240901.xlsx', dtype=str)

# Remove " hits" and convert the fourth column to integers
all_AKI_genes.iloc[:, 3] = all_AKI_genes.iloc[:, 3].str.replace(' hits', '').astype(int)
all_CKD_genes.iloc[:, 3] = all_CKD_genes.iloc[:, 3].str.replace(' hits', '').astype(int)


In [13]:
# Filter by a minimum of 3 papers
all_AKI_genes = all_AKI_genes[all_AKI_genes.iloc[:, 3].astype(float) >= 3].iloc[:, 0:4]
all_CKD_genes = all_CKD_genes[all_CKD_genes.iloc[:, 3].astype(float) >= 3].iloc[:, 0:4]


In [14]:
# Remove rows with NaN in gene1 or gene2 for both AKI and CKD dataframes
all_AKI_genes = all_AKI_genes.dropna(subset=[all_AKI_genes.columns[1], all_AKI_genes.columns[2]])
all_CKD_genes = all_CKD_genes.dropna(subset=[all_CKD_genes.columns[1], all_CKD_genes.columns[2]])

In [18]:
# Create a graph
G = nx.Graph()

# Function to add edges to the graph
def add_edges(df, color, unique_set):
    for i in range(len(df)):
        gene1 = df.iloc[i, 1]
        gene2 = df.iloc[i, 2]
        weight = df.iloc[i, 3]
        G.add_edge(gene1, gene2, color=color, weight=weight, unique=unique_set)
        #print(f"Added edge: {gene1} - {gene2} with weight: {weight}")  # Print edge details

# Add edges for AKI and CKD
add_edges(all_AKI_genes, 'red', 'AKI')
add_edges(all_CKD_genes, 'blue', 'CKD')

In [20]:
# Add common edges with average weights
common_pairs = set(tuple(sorted([all_AKI_genes.iloc[i, 1], all_AKI_genes.iloc[i, 2]])) for i in range(len(all_AKI_genes))) & \
               set(tuple(sorted([all_CKD_genes.iloc[i, 1], all_CKD_genes.iloc[i, 2]])) for i in range(len(all_CKD_genes)))

In [22]:
# Add common edges with average weights
common_pairs = set(tuple(sorted([all_AKI_genes.iloc[i, 1], all_AKI_genes.iloc[i, 2]])) for i in range(len(all_AKI_genes))) & \
               set(tuple(sorted([all_CKD_genes.iloc[i, 1], all_CKD_genes.iloc[i, 2]])) for i in range(len(all_CKD_genes)))

# Calculate unique pairs for AKI and CKD
unique_AKI_pairs = set(tuple(sorted([all_AKI_genes.iloc[i, 1], all_AKI_genes.iloc[i, 2]])) for i in range(len(all_AKI_genes))) - common_pairs
unique_CKD_pairs = set(tuple(sorted([all_CKD_genes.iloc[i, 1], all_CKD_genes.iloc[i, 2]])) for i in range(len(all_CKD_genes))) - common_pairs


In [23]:
# check how many gene pairs we have
print(f"Number of common pairs: {len(common_pairs)}")
print(f"Number of unique AKI pairs: {len(unique_AKI_pairs)}")
print(f"Number of unique CKD pairs: {len(unique_CKD_pairs)}")


Number of common pairs: 561
Number of unique AKI pairs: 237
Number of unique CKD pairs: 14508


In [34]:
# Iterate through common pairs
for pair in common_pairs:
    gene1, gene2 = pair
    
    # Print the current genes being processed
    #print(f"Processing pair: gene1 = {gene1}, gene2 = {gene2}")
    
    try:
        # Check both possible orders in the AKI dataset
        aki_genes_filtered = all_AKI_genes[
            ((all_AKI_genes.iloc[:, 1] == gene1) & (all_AKI_genes.iloc[:, 2] == gene2)) |
            ((all_AKI_genes.iloc[:, 1] == gene2) & (all_AKI_genes.iloc[:, 2] == gene1))
        ]
        
        # Check both possible orders in the CKD dataset
        ckd_genes_filtered = all_CKD_genes[
            ((all_CKD_genes.iloc[:, 1] == gene1) & (all_CKD_genes.iloc[:, 2] == gene2)) |
            ((all_CKD_genes.iloc[:, 1] == gene2) & (all_CKD_genes.iloc[:, 2] == gene1))
        ]
        
        # Ensure both filters found a match
        if aki_genes_filtered.empty or ckd_genes_filtered.empty:
            raise IndexError("No matching pair found in one or both datasets.")
        
        # Calculate weights
        avg_weight = (aki_genes_filtered.iloc[0, 3] + ckd_genes_filtered.iloc[0, 3]) / 2

        # Get the max weights
        max_weight = max(aki_genes_filtered.iloc[0, 3],ckd_genes_filtered.iloc[0, 3])
        
        # Add edge to graph
        # Choose either ave_weight or max_weight
        G.add_edge(gene1, gene2, color='green', weight=max_weight, unique='common')
    
    except IndexError as e:
        print(f"Error processing pair: gene1 = {gene1}, gene2 = {gene2}")
        print(f"Error message: {e}")
        # Continue to the next pair
        continue


In [38]:
# Centrality analysis
degree_dict = dict(G.degree())
nx.set_node_attributes(G, degree_dict, 'degree')


In [40]:
# Draw the network
pos = nx.spring_layout(G)
node_sizes = [v * 100 for v in degree_dict.values()]  # Scale node sizes
edge_colors = [G[u][v]['color'] for u, v in G.edges()]


In [50]:
# Draw nodes and edges
nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightgray')
nx.draw_networkx_edges(G, pos, edge_color=edge_colors)
nx.draw_networkx_labels(G, pos)


{'A2M': Text(-0.011038615368306637, -0.010421626269817352, 'A2M'),
 'LCN2': Text(-0.011440332047641277, -0.010112927295267582, 'LCN2'),
 'ACE': Text(0.004545495845377445, -0.008118106983602047, 'ACE'),
 'CDKN2A': Text(-0.02254929207265377, 0.0014744296204298735, 'CDKN2A'),
 'MME': Text(-0.0003759513492695987, 0.0018708952702581882, 'MME'),
 'TMPRSS2': Text(0.01950192265212536, -0.012995793484151363, 'TMPRSS2'),
 'ACE2': Text(0.014173547737300396, -0.01193949580192566, 'ACE2'),
 'AGT': Text(0.0032964693382382393, -0.010507519356906414, 'AGT'),
 'AGTR1': Text(0.002724412828683853, -0.009137085638940334, 'AGTR1'),
 'HAVCR1': Text(-0.006240722257643938, -0.007160788867622614, 'HAVCR1'),
 'ACSL4': Text(0.024450266733765602, 0.014082374982535839, 'ACSL4'),
 'GPX4': Text(0.015117730014026165, 0.008215827867388725, 'GPX4'),
 'PTGS2': Text(0.004117936827242374, 0.0006729491869919002, 'PTGS2'),
 'ADA': Text(-0.037182994186878204, 0.003538951277732849, 'ADA'),
 'ABP1': Text(-0.04219888523221016, 

In [48]:
# # Save the network as an image
# plt.title("Gene Interaction Network")
# plt.axis('off')
# plt.savefig('./20240901_NewDignetResults/gene_network_updated.png', format='png')
# plt.close()
# plt.savefig('./20240901_NewDignetResults/gene_network_updated.pdf', format='pdf')
# plt.close()

In [52]:
# Save the graph to an XML file
nx.write_graphml(G, './20240901_NewDignetResults/gene_network_updated.graphml')