# Download Hetionet Null Graphs

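This notebook downloads the pre-computed null graphs (permutations) from the Hetionet repository, extracts the archive under `data/downloads/`, and copies the permutation directories it finds into `data/permutations/hetio200/`. All paths are resolved relative to the parent of the notebook's working directory.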

**Source**: https://github.com/hetio/hetionet/blob/a95ae76581af604e91d744680aee3f888fa18887/hetnet/permuted/matrix/hetionet-v1.0-permutations.zip



In [None]:
# Import all required libraries
import os
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm
import shutil

# Data manipulation
import pandas as pd

In [None]:
# Configuration: every URL and path used by the cells below is defined here.

# Source archive. The URL embeds a commit hash, so the same revision is requested on every run.
DOWNLOAD_URL = "https://github.com/hetio/hetionet/raw/a95ae76581af604e91d744680aee3f888fa18887/hetnet/permuted/matrix/hetionet-v1.0-permutations.zip"
ZIP_FILENAME = "hetionet-v1.0-permutations.zip"

# NOTE(review): everything hangs off the kernel's working directory; this assumes the
# notebook is run from a folder one level below the repository root - confirm.
REPO_DIR = Path.cwd().parent
DATA_DIR = REPO_DIR.joinpath("data")
PERMUTATIONS_DIR = DATA_DIR.joinpath("permutations")
HETIO_PERMUTATIONS_DIR = PERMUTATIONS_DIR.joinpath("hetio200")  # New directory for downloaded permutations
DOWNLOAD_DIR = DATA_DIR.joinpath("downloads")
ZIP_PATH = DOWNLOAD_DIR.joinpath(ZIP_FILENAME)

# Echo the resolved settings so a reader can check them before anything is downloaded.
for _label, _value in (
    ("Repository directory", REPO_DIR),
    ("Data directory", DATA_DIR),
    ("Permutations directory", PERMUTATIONS_DIR),
    ("Hetio permutations directory", HETIO_PERMUTATIONS_DIR),
    ("Download directory", DOWNLOAD_DIR),
    ("Download URL", DOWNLOAD_URL),
):
    print(f"{_label}: {_value}")

In [None]:
# Create necessary directories
_required_dirs = {
    "download": DOWNLOAD_DIR,
    "permutations": PERMUTATIONS_DIR,
    "hetio permutations": HETIO_PERMUTATIONS_DIR,
}

# Safe to re-run: missing parents are created and existing directories are left as they are.
for _directory in _required_dirs.values():
    _directory.mkdir(parents=True, exist_ok=True)

for _label, _directory in _required_dirs.items():
    print(f"✓ Created {_label} directory: {_directory}")

In [None]:
# Function to download file with progress bar
def download_file(url, filepath, timeout=(10, 60)):
    """
    Download a file from a URL with a progress bar.

    The body is streamed into a temporary "NAME.part" file next to ``filepath``
    and moved into place only after it has been written completely. An
    interrupted download therefore never leaves a truncated file at
    ``filepath``, and a file that was already there stays intact until the
    new one is complete.

    Parameters:
    -----------
    url : str
        URL to download from
    filepath : Path
        Local path to save the file
    timeout : float or (float, float), optional
        Passed to ``requests.get``: seconds to wait for the connection and for
        each read before giving up. Without it a stalled server blocks the
        kernel indefinitely.

    Returns:
    --------
    Path
        ``filepath``, once the download has completed.

    Raises:
    -------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.RequestException
        On connection problems or timeouts.
    OSError
        If the file cannot be written or moved into place.
    """
    filepath = Path(filepath)
    print(f"Downloading from: {url}")
    print(f"Saving to: {filepath}")

    partial_path = filepath.with_name(filepath.name + ".part")

    try:
        # The `with` block releases the connection even if writing fails part-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Content-Length can be absent; None makes tqdm show a running byte
            # count instead of a progress bar with a total of 0.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(partial_path, 'wb') as file, tqdm(
                desc=filepath.name,
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
                        pbar.update(len(chunk))

        # Only a complete body reaches this line; swap it in as the final file.
        partial_path.replace(filepath)
    except BaseException:
        # BaseException so that interrupting the cell (KeyboardInterrupt) also cleans up.
        if partial_path.exists():
            partial_path.unlink()
        raise

    print(f"✓ Download completed: {filepath}")
    return filepath

In [None]:
# Decide whether the archive has to be downloaded (sets SKIP_DOWNLOAD for the next cell).
#
# Set FORCE_DOWNLOAD = True to fetch the archive again even though a valid copy exists.
# A flag is used instead of an input() prompt so the notebook also runs unattended
# (Restart Kernel -> Run All, nbconvert, papermill) and gives the same result each time.
FORCE_DOWNLOAD = False

if ZIP_PATH.exists():
    file_size = ZIP_PATH.stat().st_size
    print(f"File already exists: {ZIP_PATH}")
    print(f"File size: {file_size / (1024*1024):.1f} MB")

    if FORCE_DOWNLOAD:
        print("FORCE_DOWNLOAD is set. Will re-download.")
        SKIP_DOWNLOAD = False
    elif not zipfile.is_zipfile(ZIP_PATH):
        # e.g. a truncated file from an interrupted download, or an error page saved as .zip
        print("Existing file is not a valid zip archive. Will re-download.")
        SKIP_DOWNLOAD = False
    else:
        print("Skipping download.")
        SKIP_DOWNLOAD = True
else:
    SKIP_DOWNLOAD = False
    print("File does not exist. Will download.")

In [None]:
# Download the permutations zip file
if not SKIP_DOWNLOAD:
    try:
        download_file(DOWNLOAD_URL, ZIP_PATH)
    except Exception as e:
        # Best-effort on purpose: report the problem and how to work around it; the
        # cells below print their own "not found" messages instead of stopping here.
        print(f"✗ Download failed: {e}")
        print("You may need to download manually from:")
        print(DOWNLOAD_URL)
    else:
        # Verify the result instead of trusting that the call returned: the file must
        # exist AND be a readable zip (a truncated body or an HTML page is neither).
        if not ZIP_PATH.exists():
            print("✗ Download failed - file not found")
        elif not zipfile.is_zipfile(ZIP_PATH):
            print("✗ Download failed - the downloaded file is not a valid zip archive")
        else:
            file_size = ZIP_PATH.stat().st_size
            print("✓ Download successful!")
            print(f"✓ File size: {file_size / (1024*1024):.1f} MB")

    # Never leave an unreadable archive behind: later cells only test ZIP_PATH.exists()
    # before extracting. A valid file that was already there is kept.
    if ZIP_PATH.exists() and not zipfile.is_zipfile(ZIP_PATH):
        print(f"✗ Removing invalid archive: {ZIP_PATH}")
        ZIP_PATH.unlink()
else:
    print("Using existing file.")

In [None]:
# Function to extract zip file
def extract_zip(zip_path, extract_to):
    """
    Unpack every member of a zip archive into a directory, with a progress bar.

    Parameters:
    -----------
    zip_path : Path
        Path to zip file
    extract_to : Path
        Directory to extract to

    Returns:
    --------
    None
    """
    print(f"Extracting: {zip_path}")
    print(f"To directory: {extract_to}")

    with zipfile.ZipFile(zip_path, mode='r') as archive:
        members = archive.infolist()
        # The count covers every entry listed in the archive.
        print(f"Found {len(members)} files in archive")

        # Members are written one by one so the bar advances per entry.
        progress = tqdm(members, desc="Extracting")
        for member in progress:
            archive.extract(member, path=extract_to)

    print(f"✓ Extraction completed to: {extract_to}")

In [None]:
# Extract the downloaded zip file
#
# Set FORCE_EXTRACT = True to unpack again over an existing extraction. A flag is used
# instead of an input() prompt so the cell also runs unattended (Restart Kernel -> Run All,
# nbconvert, papermill) and gives the same result each time.
FORCE_EXTRACT = False

if ZIP_PATH.exists():
    # NOTE(review): assumes the archive unpacks into a top-level folder of this name - confirm.
    extract_check_path = DOWNLOAD_DIR / "hetionet-v1.0-permutations"
    already_extracted = extract_check_path.exists()

    if already_extracted:
        print(f"Archive appears to be already extracted at: {extract_check_path}")

    if already_extracted and not FORCE_EXTRACT:
        print("Skipping extraction.")
        SKIP_EXTRACTION = True
    else:
        SKIP_EXTRACTION = False

    if not SKIP_EXTRACTION:
        extraction_ok = False
        try:
            extract_zip(ZIP_PATH, DOWNLOAD_DIR)
            extraction_ok = True
        except Exception as e:
            print(f"✗ Extraction failed: {e}")
        finally:
            # Runs on interruption too. A half-written folder would be taken for a
            # finished extraction on the next run, so remove it - but only when this
            # attempt created it; a folder that existed before is left alone.
            if not extraction_ok and not already_extracted and extract_check_path.exists():
                shutil.rmtree(extract_check_path, ignore_errors=True)
                print(f"✗ Removed incomplete extraction: {extract_check_path}")
else:
    print("✗ Zip file not found. Cannot extract.")

In [None]:
# Explore the extracted contents
extract_dir = DOWNLOAD_DIR / "hetionet-v1.0-permutations"

if not extract_dir.exists():
    print("✗ Extracted directory not found")
else:
    print(f"\nExploring extracted contents in: {extract_dir}")
    print("=" * 50)

    # Top-level entries only, in sorted order; at most the first 10 are listed.
    entries = sorted(extract_dir.iterdir())
    print(f"Found {len(entries)} items:")

    for entry in entries[:10]:
        if entry.is_dir():
            child_count = sum(1 for _ in entry.iterdir())
            print(f"📁 {entry.name}/ ({child_count} items)")
        else:
            print(f"📄 {entry.name} ({entry.stat().st_size / 1024:.1f} KB)")

    not_shown = len(entries) - 10
    if not_shown > 0:
        print(f"... and {not_shown} more items")

In [None]:
# Function to organize permutations into the expected directory structure
def _is_permutation_dir_name(name):
    """Return True if a directory name matches one of the permutation naming patterns."""
    return "permutation" in name.lower() or name.endswith(".hetmat") or name.isdigit()


def _find_permutation_dirs(source_dir):
    """
    Return the outermost matching directories below ``source_dir``.

    A match that lies inside another match is left out: copying the outer
    directory already brings it along, and copying it a second time to the top
    of the target would store the same data twice.
    """
    candidates = sorted(
        (p for p in source_dir.rglob("*") if p.is_dir() and _is_permutation_dir_name(p.name)),
        key=lambda p: (len(p.parts), str(p)),  # shallower first => parents precede children
    )
    selected = []
    selected_set = set()
    for path in candidates:
        if selected_set.isdisjoint(path.parents):
            selected.append(path)
            selected_set.add(path)
    return selected


def organize_permutations(source_dir, target_dir, dry_run=False):
    """
    Copy downloaded permutation directories into the expected directory structure.

    Every directory below ``source_dir`` whose name contains "permutation",
    ends with ".hetmat" or consists only of digits is copied to
    ``target_dir / name`` (matches nested inside another match travel with
    their parent). Directories that already exist in the target are not
    touched, so the function can be re-run safely.

    Parameters:
    -----------
    source_dir : Path
        Directory containing extracted permutations
    target_dir : Path
        Target permutations directory
    dry_run : bool, optional
        If True, only list what was found and copy nothing. This replaces the
        former input() confirmation, which cannot be answered when the
        notebook is executed unattended.

    Returns:
    --------
    None
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)

    print(f"Organizing permutations...")
    print(f"Source: {source_dir}")
    print(f"Target: {target_dir}")

    if not source_dir.exists():
        print(f"✗ Source directory not found: {source_dir}")
        return

    permutation_items = _find_permutation_dirs(source_dir)
    print(f"Found {len(permutation_items)} potential permutation items")

    # Preview the first few matches
    for item in permutation_items[:5]:
        print(f"Found: {item.relative_to(source_dir)}")

    if len(permutation_items) > 5:
        print(f"... and {len(permutation_items) - 5} more")

    if not permutation_items:
        print("No permutation items found to organize.")
        return

    if dry_run:
        print("Skipping organization.")
        return

    copied = 0
    skipped = 0
    for item in tqdm(permutation_items, desc="Organizing"):
        # NOTE(review): matches are flattened by name, so two matches with the same name
        # in different folders map to one target and the second is skipped - confirm the
        # archive contains no such duplicates.
        target_path = target_dir / item.name
        if target_path.exists():
            skipped += 1
            continue
        try:
            shutil.copytree(item, target_path)
        except BaseException:
            # A half-copied directory would be skipped as "already present" on the next
            # run; remove it (also when the cell is interrupted) before re-raising.
            shutil.rmtree(target_path, ignore_errors=True)
            raise
        copied += 1

    print(f"✓ Organized {copied} permutation items ({skipped} already present, skipped)")

In [None]:
# Copy the extracted permutations into the hetio200 directory
# (`extract_dir` is set by the exploration cell above, so that cell must run first).
if not extract_dir.exists():
    print("Cannot organize - extracted directory not found")
else:
    organize_permutations(extract_dir, HETIO_PERMUTATIONS_DIR)

In [None]:
# Summary and verification
_rule = "=" * 60

print("\n" + _rule, "DOWNLOAD SUMMARY", _rule, sep="\n")

# Download status: is the archive on disk?
if not ZIP_PATH.exists():
    print("✗ Download failed or not attempted")
else:
    size_mb = ZIP_PATH.stat().st_size / (1024 * 1024)
    print(f"✓ Downloaded: {ZIP_PATH.name} ({size_mb:.1f} MB)")

# Extraction status: count everything below the extracted folder
if not extract_dir.exists():
    print("✗ Extraction failed or not attempted")
else:
    extracted_items = sum(1 for _ in extract_dir.rglob("*"))
    print(f"✓ Extracted: {extracted_items} total items")

# Hetio permutations directory: item count plus the first five names
if not HETIO_PERMUTATIONS_DIR.exists():
    print("✗ Hetio permutations directory not found")
else:
    hetio_items = sorted(HETIO_PERMUTATIONS_DIR.iterdir())
    print(f"✓ Hetio permutations directory: {len(hetio_items)} items")

    for item in hetio_items[:5]:
        print(f"  - {item.name}")
    hetio_not_shown = len(hetio_items) - 5
    if hetio_not_shown > 0:
        print(f"  ... and {hetio_not_shown} more")

# Local permutations directory, for comparison (everything except hetio200)
if PERMUTATIONS_DIR.exists():
    local_count = sum(1 for item in PERMUTATIONS_DIR.iterdir() if item.name != "hetio200")
    print(f"✓ Local permutations directory: {local_count} items (excluding hetio200)")

# Illustrative layout only: these lines are fixed text, not a listing read from disk.
print("\n" + _rule, "DIRECTORY STRUCTURE", _rule, sep="\n")
print(
    f"📁 {PERMUTATIONS_DIR.name}/",
    "  📁 hetio200/           <- Downloaded Hetionet permutations",
    "  📁 000.hetmat/         <- Your local permutations",
    "  📁 001.hetmat/",
    "  📁 ...",
    sep="\n",
)

print("\n" + _rule, "NEXT STEPS", _rule, sep="\n")
for _step in (
    "1. Verify the permutation files are in the expected format",
    "2. Update your analysis notebooks to compare hetio200 vs local permutations",
    "3. Run edge prediction analysis on both sets of permutations",
    "4. Compare results between downloaded and locally generated permutations",
):
    print(_step)