# Analysis of homology mapping for mouse and rat to identify human orthologs.

In [1]:
# Import packages and load data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the ortholog-mapped tables for mouse and rat. The two files share a
# file name and differ only in the species directory; mouse is read first.
mouse_data, rat_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/mouse/human_ortholog_mapped.csv',
        '../data/processed/rat/human_ortholog_mapped.csv',
    )
)


In [7]:
# Analyze number of orthologs identified for each dataset.


def summarize_ortholog_coverage(sites: pd.DataFrame, species: str) -> tuple:
    """Print and return ortholog coverage counts for one species' table.

    Parameters
    ----------
    sites : pd.DataFrame
        Table with a ``human_ortholog_id`` column. Every row is counted as one
        phosphorylation site; a missing value in that column is counted as
        "no ortholog identified".
    species : str
        Label used in the printed header (e.g. ``"Mouse"``).

    Returns
    -------
    tuple
        ``(n_total, n_orthologs, n_no_orthologs)`` as plain Python ints.
    """
    n_total = len(sites)
    # Count the boolean mask directly instead of materialising filtered frames.
    n_orthologs = int(sites["human_ortholog_id"].notna().sum())
    n_no_orthologs = n_total - n_orthologs
    # An empty table would otherwise raise ZeroDivisionError; report NaN instead.
    pct_orthologs = n_orthologs / n_total * 100 if n_total else float("nan")

    print(f"{species} data:")
    print(f"Total phosphorylation sites: {n_total}")
    print(f"Orthologs identified: {n_orthologs}")
    print(f"No orthologs identified: {n_no_orthologs}")
    print(f"Percentage of phosphorylation sites with orthologs: {pct_orthologs:.2f}%")

    return n_total, n_orthologs, n_no_orthologs


# Mouse data numbers:
n_total_mouse, n_orthologs_mouse, n_no_orthologs_mouse = summarize_ortholog_coverage(
    mouse_data, "Mouse"
)

# Rat data numbers (the bare print() emits the blank line between the two reports):
print()
n_total_rat, n_orthologs_rat, n_no_orthologs_rat = summarize_ortholog_coverage(
    rat_data, "Rat"
)



Mouse data:
Total phosphorylation sites: 30442
Orthologs identified: 29354
No orthologs identified: 1088
Percentage of phosphorylation sites with orthologs: 96.43%

Rat data:
Total phosphorylation sites: 23004
Orthologs identified: 20235
No orthologs identified: 2769
Percentage of phosphorylation sites with orthologs: 87.96%


In [9]:
# Count protein IDs present in interface files

# Read the mouse and human interface tables.
mouse_interfaces = pd.read_csv('../data/interactome_insider/M_musculus_interfacesHQ.csv')
human_interfaces = pd.read_csv('../data/interactome_insider/H_sapiens_interfacesHQ.csv')

# Distinct, non-missing identifiers taken from the mouse table only.
unique_mouse_protein_ids = set(mouse_data['protein_id'].dropna().unique())
unique_mouse_human_ortholog_ids = set(mouse_data['human_ortholog_id'].dropna().unique())

# Distinct identifiers listed in either partner column (P1 or P2) of each
# interface table; the "all" sets pool both columns.
mouse_interface_p1, mouse_interface_p2 = (
    set(mouse_interfaces[partner].dropna().unique()) for partner in ('P1', 'P2')
)
mouse_interface_all = mouse_interface_p1 | mouse_interface_p2

human_interface_p1, human_interface_p2 = (
    set(human_interfaces[partner].dropna().unique()) for partner in ('P1', 'P2')
)
human_interface_all = human_interface_p1 | human_interface_p2

# Overlap: mouse protein_id values that occur in the mouse interface table.
mouse_protein_ids_in_interfaces = unique_mouse_protein_ids & mouse_interface_all
n_mouse_protein_ids_in_interfaces = len(mouse_protein_ids_in_interfaces)

# Overlap: human_ortholog_id values that occur in the human interface table.
human_ortholog_ids_in_interfaces = unique_mouse_human_ortholog_ids & human_interface_all
n_human_ortholog_ids_in_interfaces = len(human_ortholog_ids_in_interfaces)

# Report totals, matches and match percentage for each identifier set.
print("Interactome Insider Interface File Analysis")
print("\nMouse protein IDs (from human_ortholog_mapped.csv):")
print(f"  Total unique protein_id values: {len(unique_mouse_protein_ids)}")
print(f"  Found in M_musculus_interfacesHQ.csv (P1 or P2): {n_mouse_protein_ids_in_interfaces}")
print(f"  Percentage: {n_mouse_protein_ids_in_interfaces / len(unique_mouse_protein_ids) * 100:.2f}%")

print("\nHuman ortholog IDs (from human_ortholog_mapped.csv):")
print(f"  Total unique human_ortholog_id values: {len(unique_mouse_human_ortholog_ids)}")
print(f"  Found in H_sapiens_interfacesHQ.csv (P1 or P2): {n_human_ortholog_ids_in_interfaces}")
print(f"  Percentage: {n_human_ortholog_ids_in_interfaces / len(unique_mouse_human_ortholog_ids) * 100:.2f}%")


Interactome Insider Interface File Analysis

Mouse protein IDs (from human_ortholog_mapped.csv):
  Total unique protein_id values: 6015
  Found in M_musculus_interfacesHQ.csv (P1 or P2): 618
  Percentage: 10.27%

Human ortholog IDs (from human_ortholog_mapped.csv):
  Total unique human_ortholog_id values: 5682
  Found in H_sapiens_interfacesHQ.csv (P1 or P2): 4889
  Percentage: 86.04%
