<a href="https://colab.research.google.com/github/kattens/SASA-Calculation-For-LLMs/blob/main/BioGrid_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import requests
import zipfile
import io
import pandas as pd
import os

In [26]:
def download_and_extract_biogrid(url, output_folder, timeout=60):
    """Download a ZIP archive over HTTP and unpack it into ``output_folder``.

    The downloaded archive is held in an in-memory buffer; only the extracted
    members are written to disk.

    Args:
        url: Address of the ZIP archive to fetch.
        output_folder: Directory that receives the extracted files.
        timeout: Seconds to wait on the server, forwarded to ``requests.get``.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        list: Names of the archive members that were extracted, as reported
        by ``ZipFile.namelist()``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an HTTP error
        status, and whatever ``zipfile.ZipFile`` raises when the payload is
        not a readable ZIP archive.
    """
    # Bounded wait: fail instead of hanging the notebook on a dead connection
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Stop here if the download was not successful

    # Wrap the payload in a bytes buffer so the archive never touches the disk
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        zip_ref.extractall(output_folder)
        # Report what was unpacked so callers need not guess the file names
        return zip_ref.namelist()

def load_biogrid_file(file_path):
    """Read a tab-separated text file into a pandas DataFrame.

    Args:
        file_path: Location of the tab-separated file to read.

    Returns:
        pandas.DataFrame: The parsed table, with the first line used as the
        column header.
    """
    # low_memory=False turns off pandas' chunked parsing of the file
    return pd.read_csv(file_path, sep='\t', low_memory=False)

# BioGRID release to fetch. The version number appears both in the download
# URL and in the name of the extracted file, so it is defined once here and
# both strings are derived from it.
biogrid_version = '4.4.241'

# URL of the BioGRID ZIP archive for that release
biogrid_url = (
    'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/'
    f'BIOGRID-{biogrid_version}/BIOGRID-ALL-{biogrid_version}.tab3.zip'
)

# Folder where the archive is extracted
output_folder = './biogrid_data'

# Main data file inside the archive; assumes it is named after the release
# the same way the archive is -- update if the naming pattern changes
data_file_path = os.path.join(output_folder, f'BIOGRID-ALL-{biogrid_version}.tab3.txt')

# Download and extract only when the data file is not already on disk, so
# re-running this cell does not fetch the whole archive again
if not os.path.exists(data_file_path):
    download_and_extract_biogrid(biogrid_url, output_folder)

# Load the data
biogrid_data = load_biogrid_file(data_file_path)

# Print the first few rows of the DataFrame
print(biogrid_data.head())

# Optionally, analyze or process the data further


   #BioGRID Interaction ID Entrez Gene Interactor A Entrez Gene Interactor B  \
0                      103                     6416                     2318   
1                      117                    84665                       88   
2                      183                       90                     2339   
3                      278                     2624                     5371   
4                      418                     6118                     6774   

   BioGRID ID Interactor A  BioGRID ID Interactor B  \
0                   112315                   108607   
1                   124185                   106603   
2                   106605                   108625   
3                   108894                   111384   
4                   112038                   112651   

  Systematic Name Interactor A Systematic Name Interactor B  \
0                            -                            -   
1                            -                            -   

In [31]:
# Show the label of every column in the loaded BioGRID table
column_labels = biogrid_data.columns
print(column_labels)


Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [32]:
# Headline counts for the loaded table
entry_count = len(biogrid_data)
print("Number of entries:", entry_count)

interaction_ids = biogrid_data['#BioGRID Interaction ID']
print("Number of unique interactions:", interaction_ids.nunique())

# Pool both interactor columns so an ID is counted once, whichever side of an
# interaction it appears on
pooled_interactors = pd.concat(
    [
        biogrid_data['Entrez Gene Interactor A'],
        biogrid_data['Entrez Gene Interactor B'],
    ]
)
print("Number of unique proteins:", pooled_interactors.nunique())

# Distinct values found in the interaction-type column
interaction_types = biogrid_data['Experimental System Type'].unique()
print("Interaction types:", interaction_types)

# Row count per organism, keyed by organism name rather than numeric ID
organism_counts = biogrid_data['Organism Name Interactor A'].value_counts()
print("Interactions per organism:", organism_counts)


Number of entries: 2741240
Number of unique interactions: 2741240
Number of unique proteins: 90440
Interaction types: ['physical' 'genetic']
Interactions per organism: Organism Name Interactor A
Homo sapiens                        1186561
Saccharomyces cerevisiae (S288c)     857228
Escherichia coli (K12/W3110)         210469
Mus musculus                          99747
Drosophila melanogaster               96712
                                     ...   
Human papillomavirus (7)                  1
Sorghum bicolor                           1
Escherichia coli (K12)                    1
Myotis lucifugus                          1
Vitis vinifera                            1
Name: count, Length: 80, dtype: int64


In [28]:
import networkx as nx

# Build the interaction graph: every row contributes an edge between its two
# Entrez Gene interactor IDs (default graph type of from_pandas_edgelist)
G = nx.from_pandas_edgelist(
    biogrid_data,
    source='Entrez Gene Interactor A',
    target='Entrez Gene Interactor B',
)

# Size and connectivity of the network
node_count = G.number_of_nodes()
print("Number of nodes:", node_count)
edge_count = G.number_of_edges()
print("Number of edges:", edge_count)
component_count = nx.number_connected_components(G)
print("Connected components:", component_count)

# Degree of every node, taken from the (node, degree) pairs of the degree view
degrees = [degree for _node, degree in G.degree()]
print("Average degree:", sum(degrees) / len(degrees))

# Degree centrality per node; computed here but not displayed
degree_centrality = nx.degree_centrality(G)


Number of nodes: 90440
Number of edges: 2117621
Connected components: 769
Average degree: 46.82930119416188
