In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from glycowork.glycan_data.loader import glycan_binding, lectin_specificity

In [3]:
glycan_binding.head()

TAL6-4LysM
rCnSL-proA
AntigenI/IIA3VP1
TAL6-6LysM
SP15308A-bot-339-19-339
VLRB.aGPA.23-GCN4-biotin
Protein L(A-C2)
A/H16-2
A/H16
A/Gull/Maryland/704/1977(H13N6)
TIM1
A/California/04/2009(H1N1)
A/California/04/2009(H1N1)
A/CA/04/2009(pdmH1N1)
A/QHO5HA
A/Chicken/H5N1
A/H11
A/H6
A/H12
A/H15
A/H7-3
A/H7-1
A/Indonesia/05/2005(H5N1)
A/H8
A/H10
A/68H3HA
A/H4
A/AHHA
A/NCHA
StcE
A/Puerto Rico/8/1934(H1N1)
BTL-native
BC2L-A
DGL
ConA
Ricin
LF82
BabA
Exotoxin A
Aggrecan
Mannose Binding Protein
SSL6
Intelectin-1a
VacA p55 receptor binding domain
fimbriae
MIC1
DC-SIGN
GS7807F-bth-514-18-514
Nc3LysM-2
CrataBL
CTLY 1
CHT
Thrombomodulin
DBA
Thrombomodulin_Extracellular
Loc1
Mg3lysM
LTB
A/Ch-03V0358F3
A/NIV929893
A/PRG921
A/LLR15
A/CMH079/05
PhoSpL
RS-IIL
CV-IIL
ZG16p
ZG16p
ZG16p
GS7807G-bfa-506-18-506
GS7807Q-bfa-507-19-507
Fc-marm-DC-ASGPR-CRD
PX14351B-bth-512-36-512
Avaren
Avaren-Fc
SBD
Actinohivin
MRCL
SNLRP
PA-IIL
MAL-II
Pbx1
CFEM-PPm
CFEM-PPl
Cel5A
Pbx2
PA-IL
PX14343F-bfa-562-20-562
Tamarinin
PX14

In [2]:
www_columns = [col for col in glycan_binding.columns if col.startswith("www")]
glycan_lectin_dataframe = glycan_binding.drop(columns=www_columns)

In [3]:
def create_glycan_binding_network(df, directed=None):
    if directed is None:
        G = nx.Graph()
    else:
        G = nx.DiGraph()

    glycan_columns = df.columns[1:-2].tolist()

    for glycan in glycan_columns:
        G.add_node(glycan, type='glycan')

    proteins = df['protein'].tolist()
    for protein in proteins:
        G.add_node(protein, type='protein')

    for idx, row in df.iterrows():
        protein = row['protein']
        for glycan in glycan_columns:
            if pd.notna(row[glycan]) and (float(row[glycan]) >= 0):
                weight = float(row[glycan])
                if directed == 'glycan_to_protein':
                    G.add_edge(glycan, protein, weight=weight, abs_weight=abs(weight))
                elif directed == 'protein_to_glycan':
                    G.add_edge(protein, glycan, weight=weight, abs_weight=abs(weight))
                else:  # Undirected case
                    G.add_edge(protein, glycan, weight=weight, abs_weight=abs(weight))

    return G

In [5]:
def analyze_binding_network(G):
    """Analyzes the glycan binding network and returns basic statistics.
    
    Parameters:
    G (NetworkX Graph): Bipartite graph of glycan-protein binding
    
    Returns:
    dict: Dictionary containing network statistics
    """
    stats = {
        'num_glycans': len([n for n, d in G.nodes(data=True) if d['bipartite'] == 0]),
        'num_proteins': len([n for n, d in G.nodes(data=True) if d['bipartite'] == 1]),
        'num_binding_interactions': G.number_of_edges(),
        'average_binding_strength': sum(d['abs_weight'] for _, _, d in G.edges(data=True)) / G.number_of_edges(),
        'strongest_binding': max(d['abs_weight'] for _, _, d in G.edges(data=True)),
        'weakest_binding': min(d['abs_weight'] for _, _, d in G.edges(data=True))
    }

    # Add degree statistics
    glycan_degrees = [d for n, d in G.degree() if G.nodes[n]['type'] == 'glycan']
    protein_degrees = [d for n, d in G.degree() if G.nodes[n]['type'] == 'protein']

    stats.update({
        'avg_glycan_degree': sum(glycan_degrees) / len(glycan_degrees),
        'avg_protein_degree': sum(protein_degrees) / len(protein_degrees)
    })

    return stats

In [4]:
G = create_glycan_binding_network(glycan_lectin_dataframe, "glycan_to_lectin")

In [8]:
connected_components = list(nx.connected_components(G))
for component in connected_components:
    print(len(component))

1
1
1
1
3175
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

In [9]:
G0 = G.subgraph(max(connected_components, key=len))

In [10]:
order, size = G0.number_of_edges(), G0.number_of_nodes(); order, size

(137251, 3175)

In [24]:
diameter = nx.diameter(G0); diameter

7

In [11]:
average_degree = order / size; average_degree

43.22866141732283

In [12]:
density = nx.density(G0); density

0.02723923214702132

In [13]:
attribute_dictionary = nx.get_node_attributes(G0, 'type')

In [14]:
glycan_nodes = [node for node, node_type in attribute_dictionary.items() if node_type == 'glycan']; len(glycan_nodes)

1970

In [15]:
lectin_nodes = [node for node, node_type in attribute_dictionary.items() if node_type == 'protein']; len(lectin_nodes)

1205