# Protein Abundance Data Analysis with Embeddings

This notebook processes protein abundance data and merges it with PPI network embeddings and sequence embeddings from the STRING database for downstream machine learning analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuration: per-organism numeric ID (used in the embedding file names and
# matched against the 'organism_id' column) plus a short display name.
ORGANISM_CONFIG = {
    'human': {
        'id': 9606,
        'name': 'H.sapiens',
    },
    'mouse': {
        'id': 10090,
        'name': 'M.musculus',
    },
}

# Organism processed by the rest of the notebook; must be a key of
# ORGANISM_CONFIG (change this to switch between human/mouse).
TARGET_ORGANISM = 'human'  # Options: 'human', 'mouse'

# Echo the active selection so the run log records which organism was used.
print(f"Processing data for: {ORGANISM_CONFIG[TARGET_ORGANISM]['name']} (ID: {ORGANISM_CONFIG[TARGET_ORGANISM]['id']})")

Processing data for: H.sapiens (ID: 9606)


In [None]:
# Setup paths using relative paths from project root
project_root = Path('../data')

# Parquet file holding the abundance table that is filtered per organism later on
data_file = project_root / 'all_organisms_filtered_without_M.musculus_KIDNEY.parquet'


def load_protein_abundance_data(file_path):
    """Load the protein abundance table from a parquet file.

    Args:
        file_path: pathlib.Path pointing at the parquet file to read.

    Returns:
        pd.DataFrame: the table exactly as returned by ``pd.read_parquet``.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    print(f"Loading protein abundance data from: {file_path}")

    # Fail early with an explicit message instead of a parquet-engine error
    if not file_path.exists():
        raise FileNotFoundError(f"Data file not found: {file_path}")

    df = pd.read_parquet(file_path)
    print(f"Loaded {len(df):,} rows and {len(df.columns)} columns")

    return df


# Load main dataset. This must run: the organism filtering cell reads `df`,
# and with the loader commented out a fresh kernel fails there with NameError.
df = load_protein_abundance_data(data_file)

Loading protein abundance data from: ../all_organisms_filtered_without_M.musculus_KIDNEY.parquet


FileNotFoundError: Data file not found: ../all_organisms_filtered_without_M.musculus_KIDNEY.parquet

In [None]:
def filter_organism_data(df, organism_key):
    """Return a copy of the rows of ``df`` belonging to one organism.

    Args:
        df: DataFrame providing the 'organism_id', 'string_external_id'
            and 'sample_organ' columns read below.
        organism_key: key into ORGANISM_CONFIG (e.g. 'human').

    Returns:
        pd.DataFrame: independent copy of the rows whose 'organism_id'
        equals the configured ID.
    """
    config = ORGANISM_CONFIG[organism_key]
    organism_id, organism_name = config['id'], config['name']

    print(f"Filtering data for {organism_name} (ID: {organism_id})")

    # Boolean row mask; .copy() detaches the result from the source frame
    belongs_to_organism = df['organism_id'] == organism_id
    organism_df = df.loc[belongs_to_organism].copy()

    # Short summary of what survived the filter
    print(f"Found {len(organism_df):,} rows for {organism_name}")
    print(f"Unique proteins: {organism_df['string_external_id'].nunique():,}")
    print(f"Unique tissues: {organism_df['sample_organ'].nunique()}")

    return organism_df

# Restrict the abundance table to the configured organism, then preview it
organism_df = filter_organism_data(df=df, organism_key=TARGET_ORGANISM)
organism_df.head(5)

# Load Protein Embeddings

Load both the PPI network embeddings and the sequence embeddings from the STRING database.

In [7]:
def load_embeddings(organism_key, embedding_type='network'):
    """
    Read one protein embedding file (HDF5) for the given organism.

    The file is looked up under ``project_root`` as
    ``<organism id>.protein.<embedding_type>.embeddings.v12.0.h5``. Every
    attribute of the file's 'metadata' group is printed while loading.

    Args:
        organism_key: key of ORGANISM_CONFIG ('human' or 'mouse')
        embedding_type: 'network' or 'sequence'

    Returns:
        tuple: (embeddings_array, protein_list, metadata_dict)

    Raises:
        FileNotFoundError: if the embedding file does not exist.
        Exception: any error raised while reading the file is reported
            with a printed message and then re-raised unchanged.
    """
    organism_id = ORGANISM_CONFIG[organism_key]['id']

    # File name follows the pattern shown in the docstring
    filename = project_root / f"{organism_id}.protein.{embedding_type}.embeddings.v12.0.h5"

    if not filename.exists():
        raise FileNotFoundError(f"Embedding file not found: {filename}")

    print(f"Loading {embedding_type} embeddings from: {filename.name}")

    try:
        with h5py.File(filename, 'r') as f:
            # Copy the attributes of the 'metadata' group into a plain dict
            metadata = {}
            for key, value in f['metadata'].attrs.items():
                metadata[key] = value
                print(f"{key}: {metadata[key]}")

            # Read the full embedding matrix into memory
            embeddings = f['embeddings'][:]

            # Protein identifiers are stored as bytes; decode them to str
            proteins = []
            for raw_name in f['proteins'][:]:
                proteins.append(raw_name.decode('utf-8'))

        print(f"Successfully loaded {len(proteins):,} {embedding_type} embeddings")
        return embeddings, proteins, metadata

    except Exception as e:
        # Report which embedding type failed, then let the error propagate
        print(f"Error loading {embedding_type} embeddings: {e}")
        raise

# Load PPI network embeddings
ppi_embeddings, ppi_proteins, ppi_metadata = load_embeddings(
    organism_key=TARGET_ORGANISM,
    embedding_type='network',
)

FileNotFoundError: Embedding file not found: ../9606.protein.network.embeddings.v12.0.h5

In [5]:
# Load sequence embeddings
sequence_embeddings, sequence_proteins, sequence_metadata = load_embeddings(
    organism_key=TARGET_ORGANISM,
    embedding_type='sequence',
)

# Both embedding files should cover the same protein IDs; the comparison is
# set-based, so ordering differences between the two lists are ignored.
proteins_match = set(ppi_proteins) == set(sequence_proteins)
print(f"\nProtein lists match between PPI and sequence embeddings: {proteins_match}")

# On a mismatch, report how many proteins are exclusive to each file
if not proteins_match:
    ppi_set, seq_set = set(ppi_proteins), set(sequence_proteins)
    print(f"PPI only: {len(ppi_set - seq_set)} proteins")
    print(f"Sequence only: {len(seq_set - ppi_set)} proteins")
    print(f"Common proteins: {len(ppi_set & seq_set)} proteins")

Loading sequence embeddings from: 9606.protein.sequence.embeddings.v12.0.h5
embedding_dim: 1024
n_proteins: 19699
precision: 16
Successfully loaded 19,699 sequence embeddings

Protein lists match between PPI and sequence embeddings: True


In [None]:
def create_embedding_dataframes(embeddings, proteins, embedding_type):
    """Build a two-column DataFrame pairing protein IDs with their embeddings.

    Args:
        embeddings: iterable with one embedding per protein; each item
            becomes a single cell value.
        proteins: protein identifiers, in the same order as ``embeddings``.
        embedding_type: prefix of the embedding column name.

    Returns:
        pd.DataFrame: columns 'string_external_id' and
        '<embedding_type>_embeddings'.
    """
    vector_column = f'{embedding_type}_embeddings'
    columns = {
        'string_external_id': proteins,
        # list(...) splits the container so each row holds one whole vector
        vector_column: list(embeddings),
    }
    return pd.DataFrame(columns)

def merge_with_embeddings(abundance_df, ppi_embeddings, ppi_proteins, seq_embeddings, seq_proteins):
    """
    Merge abundance data with both PPI and sequence embeddings.

    Both merges are inner joins on 'string_external_id', so abundance rows
    whose protein is absent from either embedding set are dropped.

    Args:
        abundance_df: DataFrame with a 'string_external_id' column.
        ppi_embeddings: PPI embeddings, one entry per item of ``ppi_proteins``.
        ppi_proteins: protein IDs matching ``ppi_embeddings``.
        seq_embeddings: sequence embeddings, one entry per item of ``seq_proteins``.
        seq_proteins: protein IDs matching ``seq_embeddings``.

    Returns:
        pd.DataFrame: Merged dataframe with 'PPI_embeddings' and
            'sequence_embeddings' columns added
        dict: Merge statistics with keys 'original_rows', 'original_proteins',
            'final_rows', 'final_proteins', 'merge_success_rate' and
            'protein_coverage'. The two ratios are 0.0 when the abundance
            frame is empty.
    """
    def _fraction(part, whole):
        # An empty abundance frame (e.g. an organism filter that matched
        # nothing) would otherwise raise ZeroDivisionError.
        return part / whole if whole else 0.0

    print("Creating embedding dataframes...")

    # Create embedding dataframes
    ppi_df = create_embedding_dataframes(ppi_embeddings, ppi_proteins, 'PPI')
    seq_df = create_embedding_dataframes(seq_embeddings, seq_proteins, 'sequence')

    print("Merging abundance data with embeddings...")

    original_rows = len(abundance_df)
    original_proteins = abundance_df['string_external_id'].nunique()

    # Merge with PPI embeddings
    merged_ppi = abundance_df.merge(ppi_df, on='string_external_id', how='inner')
    print(f"After PPI merge: {len(merged_ppi):,} rows ({_fraction(len(merged_ppi), original_rows)*100:.1f}% of original)")

    # Merge with sequence embeddings
    final_df = merged_ppi.merge(seq_df, on='string_external_id', how='inner')
    print(f"After sequence merge: {len(final_df):,} rows ({_fraction(len(final_df), original_rows)*100:.1f}% of original)")

    final_rows = len(final_df)
    final_proteins = final_df['string_external_id'].nunique()

    # Statistics
    stats = {
        'original_rows': original_rows,
        'original_proteins': original_proteins,
        'final_rows': final_rows,
        'final_proteins': final_proteins,
        'merge_success_rate': _fraction(final_rows, original_rows),
        'protein_coverage': _fraction(final_proteins, original_proteins)
    }

    return final_df, stats

# Perform the merge
final_merged_df, merge_stats = merge_with_embeddings(
    abundance_df=organism_df,
    ppi_embeddings=ppi_embeddings,
    ppi_proteins=ppi_proteins,
    seq_embeddings=sequence_embeddings,
    seq_proteins=sequence_proteins,
)

# Display merge statistics: protein-level coverage first, then row-level
print(f"\n=== Merge Statistics ===")
print(f"Original proteins: {merge_stats['original_proteins']:,}")
print(f"Final proteins: {merge_stats['final_proteins']:,}")
print(f"Protein coverage: {merge_stats['protein_coverage']:.1%}")
print(f"Row merge success: {merge_stats['merge_success_rate']:.1%}")

# Preview the first rows of the merged table
final_merged_df.head(5)

In [None]:
# Data Quality Assessment
print("=== Data Quality Summary ===")
print(f"Final dataset shape: {final_merged_df.shape}")
# deep=True also counts the Python objects held in object-dtype columns
print(f"Memory usage: {final_merged_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Check for missing values: count nulls per column, keep only affected columns
missing_cols = final_merged_df.isnull().sum()
missing_cols = missing_cols[missing_cols > 0]
if len(missing_cols) > 0:
    print(f"\nColumns with missing values:")
    for col, count in missing_cols.items():
        print(f"  {col}: {count:,} ({count/len(final_merged_df)*100:.1f}%)")
else:
    print("\nNo missing values found in merged dataset")

# Display sample of the data.
# The sample size is clamped because sample(3) raises ValueError on a frame
# with fewer than 3 rows; random_state keeps the preview identical across runs.
print(f"\nSample data preview:")
final_merged_df.sample(n=min(3, len(final_merged_df)), random_state=0)[['organism_name', 'sample_organ', 'abundance', 'string_external_id']]

In [None]:
def save_processed_data(df, organism_key):
    """Write the merged dataframe to a parquet file under ``project_root``.

    The file name is derived from the organism's configured display name,
    lower-cased and with '.' replaced by '_'.

    Args:
        df: DataFrame to persist.
        organism_key: key into ORGANISM_CONFIG (e.g. 'human').

    Returns:
        pathlib.Path: location of the written parquet file.
    """
    display_name = ORGANISM_CONFIG[organism_key]['name']
    organism_name = display_name.replace('.', '_').lower()
    output_path = project_root / f"{organism_name}_abundance_PPI_seq_embeddings.parquet"

    print(f"Saving processed data to: {output_path}")

    # Report the in-memory footprint for reference (nothing is optimized here)
    print(f"Pre-save memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Write via pyarrow with snappy compression
    df.to_parquet(
        output_path,
        engine='pyarrow',
        compression='snappy',
    )

    # Size of the file on disk, in MB
    file_size = output_path.stat().st_size / 1024**2
    print(f"Saved successfully! File size: {file_size:.1f} MB")

    return output_path

# Clean up large variables before saving to free memory.
# The names are removed with pop() rather than `del` so that re-running this
# cell does not raise NameError once they are already gone.
for _name in ('ppi_embeddings', 'sequence_embeddings', 'ppi_proteins', 'sequence_proteins'):
    globals().pop(_name, None)

# Save the processed data
output_file = save_processed_data(final_merged_df, TARGET_ORGANISM)

# Final run summary
print(f"\n=== Processing Complete ===")
print(f"Target organism: {ORGANISM_CONFIG[TARGET_ORGANISM]['name']}")
print(f"Final dataset: {final_merged_df.shape[0]:,} rows, {final_merged_df.shape[1]} columns")
print(f"Output file: {output_file.name}")