In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from glycowork.glycan_data.loader import glycan_binding, lectin_specificity

In [2]:
glycan_binding.head()

Unnamed: 0,3-Anhydro-Gal(a1-3)Gal(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal2S(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3dGal(b1-3)[Fuc(a1-4)]Glc,3dGal(b1-4)Glc,4d8dNeu5Ac(a2-3)Gal(b1-4)Glc,4dNeu5Ac(a2-3)Gal(b1-4)Glc,7dNeu5Ac(a2-3)Gal(b1-4)Glc,...,wwwSflexneri5c,wwwSflexneriO2c,wwwSflexneriO5c,wwwSisomicin,wwwSmix,wwwTobramycin,wwwTyrS,wwwpHGGs,target,protein
0,,,,,,,,,,,...,,,,,,,,,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,TAL6-4LysM
1,,,,,,,,,,,...,,,,,,,,,AAFFSLVVLLALLPFGIHASALPSTELTPRVNPNLPGPNDVFVGFR...,rCnSL-proA
2,,,,,,,,,,,...,,,,,,,,,AANEADYQAKLTAYQTELARVQKANADAKAAYEAAVAANNAANAAL...,AntigenI/IIA3VP1
3,,,,,,,,,,,...,,,,,,,,,AASKLGVPQPAQRDQVNCQLYAVQPNDNCIDISSKNNITYAQLLSW...,TAL6-6LysM
4,,,,,,,,,,,...,,,,,,,,,ACNNEWEDEQYEQYISFKSPIPAGGEGVTDIYVRYKEDGKVTYRLP...,SP15308A-bot-339-19-339


In [3]:
www_columns = [col for col in glycan_binding.columns if col.startswith("www")]
glycan_lectin_dataframe = glycan_binding.drop(columns=www_columns)

In [16]:
def create_glycan_binding_network(df):
    """Creates a bipartite NetworkX graph from glycan binding data.
    
    Parameters:
    df (pandas.DataFrame): Dataframe with glycans as columns and proteins as rows.
                         Must have 'target' and 'protein' as the last two columns.
    
    Returns:
    NetworkX Graph: Bipartite graph with glycans and proteins as nodes and binding values as edge weights.
    The graph has the following attributes:
    - Node attribute 'bipartite': 0 for glycans, 1 for proteins
    - Node attribute 'type': 'glycan' or 'protein'
    - Edge attribute 'weight': binding value
    - Edge attribute 'abs_weight': absolute value of binding
    """
    G = nx.Graph()

    glycan_columns = df.columns[1:-2].tolist()

    # Add glycan nodes with bipartite attribute 0
    for glycan in glycan_columns:
        G.add_node(glycan, bipartite=0, type='glycan')

    # Add protein nodes with bipartite attribute 1
    proteins = df['protein'].tolist()
    for protein in proteins:
        G.add_node(protein, bipartite=1, type='protein')

    # Add edges with weights where binding values exist
    for idx, row in df.iterrows():
        protein = row['protein']
        for glycan in glycan_columns:
            if pd.notna(row[glycan]) and (float(row[glycan]) >= 0):
                weight = float(row[glycan])
                G.add_edge(protein, glycan,
                           weight=weight,
                           abs_weight=abs(weight))

    return G

In [5]:
def analyze_binding_network(G):
    """Analyzes the glycan binding network and returns basic statistics.
    
    Parameters:
    G (NetworkX Graph): Bipartite graph of glycan-protein binding
    
    Returns:
    dict: Dictionary containing network statistics
    """
    stats = {
        'num_glycans': len([n for n, d in G.nodes(data=True) if d['bipartite'] == 0]),
        'num_proteins': len([n for n, d in G.nodes(data=True) if d['bipartite'] == 1]),
        'num_binding_interactions': G.number_of_edges(),
        'average_binding_strength': sum(d['abs_weight'] for _, _, d in G.edges(data=True)) / G.number_of_edges(),
        'strongest_binding': max(d['abs_weight'] for _, _, d in G.edges(data=True)),
        'weakest_binding': min(d['abs_weight'] for _, _, d in G.edges(data=True))
    }

    # Add degree statistics
    glycan_degrees = [d for n, d in G.degree() if G.nodes[n]['type'] == 'glycan']
    protein_degrees = [d for n, d in G.degree() if G.nodes[n]['type'] == 'protein']

    stats.update({
        'avg_glycan_degree': sum(glycan_degrees) / len(glycan_degrees),
        'avg_protein_degree': sum(protein_degrees) / len(protein_degrees)
    })

    return stats

In [17]:
G = create_glycan_binding_network(glycan_lectin_dataframe)

In [18]:
connected_components = list(nx.connected_components(G))
for component in connected_components:
    print(len(component))

1
1
1
1
3175
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

In [19]:
G0 = G.subgraph(max(connected_components, key=len))

In [20]:
order, size = G0.number_of_edges(), G0.number_of_nodes(); order, size

(137251, 3175)

In [24]:
diameter = nx.diameter(G0); diameter

7

In [21]:
average_degree = order / size; average_degree

43.22866141732283

In [22]:
density = nx.density(G0); density

0.02723923214702132

In [23]:
attribute_dictionary = nx.get_node_attributes(G0, 'type')

In [24]:
glycan_nodes = [node for node, node_type in attribute_dictionary.items() if node_type == 'glycan']; len(glycan_nodes)

1970

In [25]:
lectin_nodes = [node for node, node_type in attribute_dictionary.items() if node_type == 'protein']; len(lectin_nodes)

1205

In [26]:
nx.write_gexf(G0, 'glycan_binding_network.gexf')

In [13]:
core_number = nx.core_number(G); core_number

# {'3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S': 4,
#  '3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S': 4,
#  '3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S': 4,

cliques = list(nx.enumerate_all_cliques(G)); cliques

# [['3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S'],
#  ['3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S'],
#  ['3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S'],

KeyboardInterrupt: 