# Environment Setup

In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
import boto3
import importlib.util
import ast
import numpy as np
import networkx as nx
from scipy.stats import poisson
import json

from dotenv import load_dotenv

In [2]:
def find_comp_gen_dir():
    """Find the computational_genetic_genealogy directory by searching up from current directory."""
    current = Path.cwd()
    
    # Search up through parent directories
    while current != current.parent:
        # Check if target directory exists in current path
        target = current / 'computational_genetic_genealogy'
        if target.is_dir():
            return target
        # Move up one directory
        current = current.parent
    
    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Find and load the .env file from the computational_genetic_genealogy directory."""
    try:
        # Find the computational_genetic_genealogy directory
        comp_gen_dir = find_comp_gen_dir()
        
        # Look for .env file
        env_path = comp_gen_dir / '.env'
        if not env_path.exists():
            print(f"Warning: No .env file found in {comp_gen_dir}")
            return None
        
        # Load the .env file
        load_dotenv(env_path, override=True)
        print(f"Loaded environment variables from: {env_path}")
        return env_path
        
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None

# Use the function
env_path = load_env_file()

working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

os.environ["WORKING_DIRECTORY"] = working_directory
os.environ["DATA_DIRECTORY"] = data_directory
os.environ["REFERENCES_DIRECTORY"] = references_directory
os.environ["RESULTS_DIRECTORY"] = results_directory
os.environ["UTILS_DIRECTORY"] = utils_directory

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

Loaded environment variables from: /home/lakishadavid/computational_genetic_genealogy/.env
Working Directory: /home/lakishadavid/computational_genetic_genealogy
Data Directory: /home/lakishadavid/computational_genetic_genealogy/data
References Directory: /home/lakishadavid/computational_genetic_genealogy/references
Results Directory: /home/lakishadavid/computational_genetic_genealogy/results
Utils Directory: /home/lakishadavid/computational_genetic_genealogy/utils
The current directory is /home/lakishadavid/computational_genetic_genealogy


## Prepare Data

Bonsai requires data on the age and sex of each individual. However, when we simulated data, we did not get an age. Bonsai also requires the the individual name to be an integer, which is not how our simulated data outputs names. This section of code will assign a random integer ID and age based on certain parameters and output this bioinfo variable as needed for Bonsai.

In [37]:
import random
import os
import pandas as pd

# Read the seg file and extract unique individual IDs
seg_file = os.path.join(data_directory, "class_data/ped_sim_run2.seg")
seg_df = pd.read_csv(seg_file, sep="\t", header=None)
seg_df.columns = ["sample1", "sample2", "chrom", "phys_start", "phys_end", "ibd_type", "gen_start", "gen_end", "gen_seg_len"]
unique_individuals = set(seg_df["sample1"]).union(set(seg_df["sample2"]))
unique_individuals = sorted(list(unique_individuals))
print("Number of individuals:", len(unique_individuals))

Number of individuals: 520


In [38]:
# Read the fam file
with open(os.path.join(data_directory, "class_data/ped_sim_run2-everyone.fam"), 'r') as file:
    fam_lines = file.readlines()

# Create a dictionary to store individual information and Bonsai IDs
individuals = {}
bonsai_ids = {}

# Process each line in the fam file
for line in fam_lines:
    fields = line.strip().split()
    individual_id = fields[1]

    # Skip individuals not present in the seg file
    if individual_id not in unique_individuals:
        continue

    father_id = fields[2]
    mother_id = fields[3]
    sex = 'M' if fields[4] == '1' else 'F'

    # Extract the generation number from the individual ID
    generation = int(individual_id.split('-')[0].split('_')[-1][1:])

    # Store the individual information in the dictionary
    individuals[individual_id] = {
        'father_id': father_id,
        'mother_id': mother_id,
        'sex': sex,
        'generation': generation
    }
    
print("First 10 individuals:")
print({k: v for k, v in list(individuals.items())[:10]})

First 10 individuals:
{'FAM1_g1-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 1}, 'FAM1_g1-b1-i1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 1}, 'FAM1_g2-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 2}, 'FAM1_g2-b1-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'M', 'generation': 2}, 'FAM1_g2-b2-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 2}, 'FAM1_g2-b2-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'F', 'generation': 2}, 'FAM1_g3-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 3}, 'FAM1_g3-b1-i1': {'father_id': 'FAM1_g2-b1-i1', 'mother_id': 'FAM1_g2-b1-s1', 'sex': 'F', 'generation': 3}, 'FAM1_g3-b2-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 3}, 'FAM1_g3-b2-i1': {'father_id': 'FAM1_g2-b1-i1', 'mother_id': 'FAM1_g2-b1-s1', 'sex': 'M', 'generation': 3}}


In [39]:
# Get the earliest and latest generation numbers
generation_numbers = [info['generation'] for info in individuals.values()]
earliest_generation = min(generation_numbers)
latest_generation = max(generation_numbers)
print(f"Earliest generation: {earliest_generation}, Latest generation: {latest_generation}")

Earliest generation: 1, Latest generation: 6


This code block retrieves the earliest and latest generation numbers from the `individuals` dictionary. It creates a list comprehension to extract the 'generation' values from the dictionary values. The `min` and `max` functions are used to find the earliest and latest generation numbers, respectively.

In [40]:
# Assign ages to individuals based on their generation
for generation in range(latest_generation, earliest_generation - 1, -1):
    for individual_id, info in individuals.items():
        if info['generation'] == generation:
            if generation == latest_generation:
                # Assign ages between 18 and 40 for the latest generation
                age = random.randint(18, 40)
            else:
                # Assign ages based on the descendants' ages
                child_ages = []
                for child_id, child_info in individuals.items():
                    if child_info['father_id'] == individual_id or child_info['mother_id'] == individual_id:
                        child_ages.append(child_info['age'])

                if child_ages:
                    min_child_age = min(child_ages)
                    age = min_child_age + random.randint(12, 40)
                else:
                    # If no child information is available, assign a random age based on the generation gap
                    age_gap = (latest_generation - generation) * random.randint(12, 40)
                    age = random.randint(18, 40) + age_gap

            individuals[individual_id]['age'] = age
            

print("First 10 individuals with ages:")
print({k: v for k, v in list(individuals.items())[:10]})
print("\n")
print("The age range is:", min([info['age'] for info in individuals.values()]), "-", max([info['age'] for info in individuals.values()]))

First 10 individuals with ages:
{'FAM1_g1-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 1, 'age': 151}, 'FAM1_g1-b1-i1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 1, 'age': 170}, 'FAM1_g2-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 2, 'age': 127}, 'FAM1_g2-b1-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'M', 'generation': 2, 'age': 150}, 'FAM1_g2-b2-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 2, 'age': 117}, 'FAM1_g2-b2-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'F', 'generation': 2, 'age': 132}, 'FAM1_g3-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 3, 'age': 106}, 'FAM1_g3-b1-i1': {'father_id': 'FAM1_g2-b1-i1', 'mother_id': 'FAM1_g2-b1-s1', 'sex': 'F', 'generation': 3, 'age': 116}, 'FAM1_g3-b2-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 3, 'age': 120}, 'FAM1_g3-b2-i1': {'father_id': 

This code block assigns ages to individuals based on their generation. It iterates over the generations in reverse order, starting from the latest generation and going backward to the earliest generation.

For each generation:

* If it's the latest generation, ages between 18 and 40 are randomly assigned using random.randint(18, 40).
* For earlier generations, ages are assigned based on the descendants' ages. It iterates over the individuals and checks if the current individual is the father or mother of any other individual. If so, the age of the child is appended to the child_ages list.
  * If the child_ages list is not empty, the minimum age among the children is found using min(child_ages), and the individual's age is assigned by adding a random value between 12 and 40 to the minimum child age.
  * If the child_ages list is empty (i.e., the individual has no children), a random age is assigned based on the generation gap. The generation gap is calculated by subtracting the current generation from the latest generation, and then multiplying it by a random value between 12 and 40. The individual's age is then assigned by adding this age gap to a base age range of 18 to 40.
* The assigned age is stored in the individuals dictionary under the 'age' key for each individual.

In [41]:
# Create Genotype IDs for individuals
for index, individual_id in enumerate(individuals.keys(), start=1000):
    bonsai_id = str(index)
    individuals[individual_id]['genotype_id'] = bonsai_id
    
print("First 10 individuals with Genotype IDs:")
print({k: v for k, v in list(individuals.items())[:10]})

First 10 individuals with Genotype IDs:
{'FAM1_g1-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 1, 'age': 151, 'genotype_id': '1000'}, 'FAM1_g1-b1-i1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 1, 'age': 170, 'genotype_id': '1001'}, 'FAM1_g2-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'F', 'generation': 2, 'age': 127, 'genotype_id': '1002'}, 'FAM1_g2-b1-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'M', 'generation': 2, 'age': 150, 'genotype_id': '1003'}, 'FAM1_g2-b2-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 2, 'age': 117, 'genotype_id': '1004'}, 'FAM1_g2-b2-i1': {'father_id': 'FAM1_g1-b1-i1', 'mother_id': 'FAM1_g1-b1-s1', 'sex': 'F', 'generation': 2, 'age': 132, 'genotype_id': '1005'}, 'FAM1_g3-b1-s1': {'father_id': '0', 'mother_id': '0', 'sex': 'M', 'generation': 3, 'age': 106, 'genotype_id': '1006'}, 'FAM1_g3-b1-i1': {'father_id': 'FAM1_g2-b1-i1', 'mother_id': 'FAM1_g2-b1-s1', 

This code block creates Bonsai IDs for each individual. It uses the enumerate function to iterate over the keys of the individuals dictionary, starting the index from 1000. For each individual ID, a corresponding Bonsai ID is created by converting the index to a string. The Bonsai ID is then stored in the bonsai_ids dictionary using the individual ID as the key.

In [42]:
# Write the individual information to a new file
with open(os.path.join(results_directory, 'individual_info.txt'), 'w') as file:
    file.write("Individual ID\tBonsai ID\tAge\tSex\n")
    for individual_id, info in individuals.items():
        genotype_id = info['genotype_id']
        age = info['age']
        sex = info['sex']
        file.write(f"{individual_id}\t{genotype_id}\t{age}\t{sex}\n")

This code block writes the individual information to a new file named "individual_info.txt" in the results_directory. It opens the file in write mode ('w').

The header line `"Individual ID\tBonsai ID\tAge\tSex\n"` is written to the file first.

Then, it iterates over the `individuals` dictionary items. For each individual:

* The corresponding Bonsai ID is retrieved from the `bonsai_ids` dictionary using the individual ID as the key.
* The age and sex information is retrieved from the `individuals` dictionary.
* The individual information is written to the file in the format `"Individual ID\tBonsai ID\tAge\tSex\n"` using an f-string.

Take a look at the `individual_info.txt` file in your results directory.

In [43]:
# Create the bioinfo value in the desired format
bioinfo = []
for individual_id, info in individuals.items():
    genotype_id = int(info['genotype_id'])
    age = info['age']
    sex = info['sex']
    bioinfo.append({'genotype_id': genotype_id, 'age': age, 'sex': sex})
    
print("The first 10 bioinfo values:")
for i in range(10):
    print(bioinfo[i])

The first 10 bioinfo values:
{'genotype_id': 1000, 'age': 151, 'sex': 'F'}
{'genotype_id': 1001, 'age': 170, 'sex': 'M'}
{'genotype_id': 1002, 'age': 127, 'sex': 'F'}
{'genotype_id': 1003, 'age': 150, 'sex': 'M'}
{'genotype_id': 1004, 'age': 117, 'sex': 'M'}
{'genotype_id': 1005, 'age': 132, 'sex': 'F'}
{'genotype_id': 1006, 'age': 106, 'sex': 'M'}
{'genotype_id': 1007, 'age': 116, 'sex': 'F'}
{'genotype_id': 1008, 'age': 120, 'sex': 'F'}
{'genotype_id': 1009, 'age': 112, 'sex': 'M'}


This code block creates the bioinfo value in the desired format. It initializes an empty list called `bioinfo`.

It iterates over the `individuals` dictionary items. For each individual:

* The corresponding Bonsai ID is retrieved from the `bonsai_ids` dictionary using the individual ID as the key.
* The Bonsai ID is converted to an integer and assigned to the `genotype_id` variable.
* The age and sex information is retrieved from the `individuals` dictionary.
* A dictionary containing the `genotype_id`, `age`, and `sex` is appended to the `bioinfo` list.

Remember that Bonsai is designed to read the individal names as integers. We already assigned integer IDs for each individual in our segments file in the earlier code. Let's use those assignments to update our segments file by replacing the individual names with their integer IDs.

**NOTE: The following code block can't run more than once unless you change the existing .seg_orig to .seg**

In [None]:
import pandas as pd
import os

seg_file_orig = seg_file + "_orig"

# Check for .seg_orig and .seg files in the results directory
if os.path.exists(seg_file_orig) and os.path.exists(seg_file):
    os.remove(seg_file)
    os.rename(seg_file_orig, seg_file)
elif os.path.exists(seg_file_orig):
    os.rename(seg_file_orig, seg_file)
    
segments = seg_df.copy()

# New file paths
seg_file_new = seg_file
dict_file = seg_file + "_dict.txt"

# Read the individual_info.txt file
individual_info = pd.read_csv(os.path.join(results_directory, 'individual_info.txt'), sep='\t')

# Create a dictionary to map individual IDs to Bonsai IDs
individual_to_bonsai = dict(zip(individual_info['Individual ID'], individual_info['Bonsai ID']))

# Replace sample names with their corresponding Bonsai IDs
segments['sample1'] = segments['sample1'].map(individual_to_bonsai)
segments['sample2'] = segments['sample2'].map(individual_to_bonsai)

# Save the modified segments as .seg
segments.to_csv(seg_file_new, sep='\t', index=False, header=False)

# Save the dictionary
with open(dict_file, 'w') as f:
    for individual, bonsai_id in individual_to_bonsai.items():
        f.write(f"{individual}\t{bonsai_id}\n")

print("Segments and dictionary saved successfully.")
display(segments.head())

Segments and dictionary saved successfully.


Unnamed: 0,sample1,sample2,chrom,phys_start,phys_end,ibd_type,gen_start,gen_end,gen_seg_len
0,1000,1003,1,817341,44617788,IBD1,0.0,68.343071,68.343071
1,1000,1003,1,44617789,205983275,IBD1,68.343113,200.153155,131.810042
2,1000,1003,1,205983276,242249428,IBD1,200.153157,250.580913,50.427756
3,1000,1003,1,242249429,248876512,IBD1,250.580914,261.713366,11.132452
4,1000,1003,2,118913,4929466,IBD1,0.0,8.741841,8.741841


Create the segment list

In [46]:
def create_unphased_ibd_seg_list(segments):
    """
    Creates an unphased IBD segment list from the given DataFrame.

    Parameters:
        segments_ibd (pd.DataFrame): DataFrame containing the IBD segments with columns:
                                     ['id1', 'id2', 'chromosome', 'physical_position_start',
                                      'physical_position_end', 'IBD_type', 'genetic_length'].
        numeric_ids (dict): Mapping of sample IDs (str) to numeric IDs (int).

    Returns:
        list: A list of unphased IBD segments in the specified format:
              [[id1, id2, chrom, start_bp, end_bp, is_full, len_cm], ...].
    """
    unphased_ibd_seg_list = []

    for _, row in segments.iterrows():
        try:
            id1 = int(row['sample1'])
            id2 = int(row['sample2'])
            chrom = str(row['chrom'])  # Convert chromosome to string if necessary
            start_bp = float(row['phys_start'])
            end_bp = float(row['phys_end'])
            is_full = row['ibd_type'] == 2  # Assuming IBD2 indicates "full"
            len_cm = float(row['gen_seg_len'])

            unphased_ibd_seg_list.append([id1, id2, chrom, start_bp, end_bp, is_full, len_cm])
        except KeyError as e:
            print(f"Error mapping ID: {e}")
        except ValueError as e:
            print(f"Error converting row data: {e}")

    return unphased_ibd_seg_list

unphased_ibd_seg_list = create_unphased_ibd_seg_list(segments)

print("First 10 unphased IBD segments:")
for i in range(10):
    print(unphased_ibd_seg_list[i])

First 10 unphased IBD segments:
[1000, 1003, '1', 817341.0, 44617788.0, False, 68.343071]
[1000, 1003, '1', 44617789.0, 205983275.0, False, 131.810042]
[1000, 1003, '1', 205983276.0, 242249428.0, False, 50.427756]
[1000, 1003, '1', 242249429.0, 248876512.0, False, 11.132452]
[1000, 1003, '2', 118913.0, 4929466.0, False, 8.741841]
[1000, 1003, '2', 4929467.0, 67922741.0, False, 77.365229]
[1000, 1003, '2', 67922742.0, 242101808.0, False, 162.599718]
[1000, 1003, '3', 66543.0, 42375917.0, False, 63.832452]
[1000, 1003, '3', 42375918.0, 198073373.0, False, 153.016539]
[1000, 1003, '4', 173807.0, 108049728.0, False, 107.960655]


## Run Bonsai

In [47]:
from utils.bonsaitree.bonsaitree.v3 import bonsai

up_dict_log_like_list = bonsai.build_pedigree(
    bio_info=bioinfo,
    unphased_ibd_seg_list=unphased_ibd_seg_list,
    min_seg_len=3
)

  log_term = np.log(1 - np.exp(-np.exp(log_mu_amt)))
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)


KeyboardInterrupt: 

## Louvain communities

Louvain communities is a community detection algorithm that helps identify groups of nodes that are more densely connected to each other than to nodes in other groups. In the context of our problem, using Louvain communities allows us to partition the large network of individuals into smaller, more manageable communities.

The Louvain algorithm is a hierarchical clustering algorithm that optimizes the modularity score of the network. Modularity is a measure of the strength of division of a network into communities. A high modularity score indicates that the nodes within a community have more connections among themselves than with nodes in other communities.

By applying the Louvain algorithm to our network of individuals, we can identify communities of individuals that are more closely related to each other based on their shared IBD segments. This allows us to focus our analysis on smaller subsets of the data, reducing the computational burden and memory requirements.

By leveraging Louvain communities, we can partition our large network into smaller communities and run Bonsai on each community separately. This approach enables us to work with larger datasets and overcome the memory limitations of the free version of Google Colab.

In [48]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

# Create a graph from the hapibd_df DataFrame
G = nx.Graph()

with tqdm(total=len(segments), desc="Adding edges to the graph") as pbar:
    for _, row in segments.iterrows():
        first_sample = row["sample1"]
        second_sample = row["sample2"]
        gen_seg_len = row["gen_seg_len"]
        G.add_edge(first_sample, second_sample, weight=gen_seg_len)
        pbar.update(1)

# Find Louvain communities
communities = nx.community.louvain_communities(G, weight='weight')

print(len(communities))

Adding edges to the graph: 100%|██████████| 183061/183061 [00:05<00:00, 35775.01it/s]

10





In [49]:
# Print the members of each community
for i, community in enumerate(communities[:5], start=1):
    print(f"Community {i}:")
    print(list(community))
    print()

Community 1:
[1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023]

Community 2:
[1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103]

Community 3:
[1152, 1153, 1154, 1155, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151]

Community 4:
[1156,

To make the Louvain communities smaller, you can adjust the `resolution` parameter in the `louvain_communities` function. The `resolution` parameter controls the size of the communities detected by the algorithm. By default, it is set to 1.0.

* Decreasing the `resolution` parameter (e.g., setting it to a value less than 1.0) will result in larger communities. The algorithm will favor merging smaller communities into larger ones.
* Increasing the `resolution` parameter (e.g., setting it to a value greater than 1.0) will result in smaller communities. The algorithm will favor splitting larger communities into smaller ones.

Here's how you can modify the code to make the communities smaller:

In [50]:
with tqdm(total=len(segments), desc="Adding edges to the graph") as pbar:
    for _, row in segments.iterrows():
        first_sample = row["sample1"]
        second_sample = row["sample2"]
        gen_seg_len = row["gen_seg_len"]
        G.add_edge(first_sample, second_sample, weight=gen_seg_len)
        pbar.update(1)

# Find Louvain communities with a smaller resolution value
resolution = 100  # Adjust this value to control the size of the communities
communities_v2 = nx.community.louvain_communities(G, resolution=resolution, weight='weight')
print(len(communities_v2))

Adding edges to the graph: 100%|██████████| 183061/183061 [00:04<00:00, 37638.45it/s]


358


In [51]:
# Print the members of each community
for i, community in enumerate(communities_v2[:5], start=1):
    print(f"Community {i}:")
    print(list(community))
    print()

Community 1:
[1000]

Community 2:
[1003]

Community 3:
[1005]

Community 4:
[1007]

Community 5:
[1009]



NOTE: It could be that you find Bonsai is more accurate for smaller, more related groups of individuals than for larger, more distantly related individuals. Bonsai has a `seed_pedigree_list` parameter that is an "optional [list] of seed pedigrees to use as starting points for building the pedigree". It also has a `validated_node_set_list` parameter where you can identify the nodes in the pedigrees in the `seed_pedigree_list` where you know the genealogical relationship.

# Run Bonsai

Rather than running Bonsai on our entire dataset, we can now run it on a Louvain community where we know there is relatedness among the members of the community. Because of this, we need to reduce our ibd_seg_list and bioinfo variables to only the inviduals in the community.

In [52]:
# Choose the community you want to focus on (e.g., community 0)
target_community = communities[0]

In [53]:
target_community

{1000,
 1001,
 1002,
 1003,
 1004,
 1005,
 1006,
 1007,
 1008,
 1009,
 1010,
 1011,
 1012,
 1013,
 1014,
 1015,
 1016,
 1017,
 1018,
 1019,
 1020,
 1021,
 1022,
 1023,
 1024,
 1025,
 1026,
 1027,
 1028,
 1029,
 1030,
 1031,
 1032,
 1033,
 1034,
 1035,
 1036,
 1037,
 1038,
 1039,
 1040,
 1041,
 1042,
 1043,
 1044,
 1045,
 1046,
 1047,
 1048,
 1049,
 1050,
 1051}

In [55]:
def filter_ibd_seg_list(ibd_seg_list, community_ids, both_in_community=True):
    if both_in_community:
        filtered_ibd_seg_list = [
            seg for seg in ibd_seg_list
            if seg[0] in community_ids and seg[1] in community_ids
        ]
    # else:
    #     filtered_ibd_seg_list = [
    #         seg for seg in ibd_seg_list
    #         if seg[0] in community_ids or seg[1] in community_ids
    #     ]
    return filtered_ibd_seg_list

# Filter the ibd_seg_list based on the community IDs
filtered_ibd_seg_list_v1 = filter_ibd_seg_list(unphased_ibd_seg_list, target_community, both_in_community=True)
# filtered_ibd_seg_list_v2 = filter_ibd_seg_list(ibd_seg_list, target_community, both_in_community=False)

# Filter the bioinfo based on the community IDs
filtered_bioinfo = [info for info in bioinfo if info['genotype_id'] in target_community]
filtered_bioinfo

[{'genotype_id': 1000, 'age': 151, 'sex': 'F'},
 {'genotype_id': 1001, 'age': 170, 'sex': 'M'},
 {'genotype_id': 1002, 'age': 127, 'sex': 'F'},
 {'genotype_id': 1003, 'age': 150, 'sex': 'M'},
 {'genotype_id': 1004, 'age': 117, 'sex': 'M'},
 {'genotype_id': 1005, 'age': 132, 'sex': 'F'},
 {'genotype_id': 1006, 'age': 106, 'sex': 'M'},
 {'genotype_id': 1007, 'age': 116, 'sex': 'F'},
 {'genotype_id': 1008, 'age': 120, 'sex': 'F'},
 {'genotype_id': 1009, 'age': 112, 'sex': 'M'},
 {'genotype_id': 1010, 'age': 122, 'sex': 'F'},
 {'genotype_id': 1011, 'age': 108, 'sex': 'M'},
 {'genotype_id': 1012, 'age': 96, 'sex': 'F'},
 {'genotype_id': 1013, 'age': 100, 'sex': 'M'},
 {'genotype_id': 1014, 'age': 70, 'sex': 'F'},
 {'genotype_id': 1015, 'age': 90, 'sex': 'M'},
 {'genotype_id': 1016, 'age': 79, 'sex': 'F'},
 {'genotype_id': 1017, 'age': 85, 'sex': 'M'},
 {'genotype_id': 1018, 'age': 77, 'sex': 'M'},
 {'genotype_id': 1019, 'age': 95, 'sex': 'F'},
 {'genotype_id': 1020, 'age': 95, 'sex': 'M'},


### Option 1

Run Bonsai without selecting a focal_id

In [None]:
from utils.bonsaitree.bonsaitree.v3 import bonsai

up_dict_log_like_list = bonsai.build_pedigree(
    bio_info=filtered_bioinfo,
    unphased_ibd_seg_list=filtered_ibd_seg_list_v1,
    min_seg_len=3
)

# Takes about 5 mintues

In [65]:
for element in up_dict_log_like_list:
    print(f"pedigree {element[0]}")
    print(f"likelihood {element[1]}")

pedigree {1050: {-2: 1, -5: 1}, -2: {-3: 1, -4: 1}, 1038: {-3: 1, -4: 1}, -5: {-6: 1, -7: 1}, 1049: {-10: 1, -13: 1}, -10: {-11: 1, -12: 1}, 1036: {-11: 1, -12: 1}, 1035: {-13: 1, -23: 1}, -13: {-6: 1, -18: 1}, 1020: {-18: 1, -22: 1}, 1037: {-18: 1, -6: 1}, 1048: {-26: 1, -33: 1}, -26: {-13: 1, -28: 1}, 1034: {-28: 1, -32: 1}, 1047: {-28: 1, -13: 1}, 1022: {-36: 1, -37: 1}, 1039: {-7: 1, -6: 1}, -7: {-36: 1, -37: 1}, 1025: {-44: 1, -47: 1}, -44: {-45: 1, -46: 1}, 1012: {-45: 1, -46: 1}, 1024: {-50: 1, -51: 1}, 1041: {-54: 1, -55: 1}, -54: {-50: 1, -51: 1}, 1040: {-58: 1, -59: 1}, 1051: {-58: 1, -60: 1}, -60: {-55: 1, -54: 1}, -55: {-47: 1, -44: 1}, 1017: {-63: 1, -64: 1}, 1033: {-67: 1, -68: 1}, 1018: {-67: 1, -69: 1}, -68: {-70: 1, -71: 1}, 1011: {-74: 1, -75: 1}, 1004: {-74: 1, -76: 1}, 1013: {-47: 1, -80: 1}, -47: {-75: 1, -74: 1}, 1028: {-83: 1, -84: 1}, 1044: {-87: 1, -88: 1}, -87: {-83: 1, -84: 1}, -88: {-89: 1, -90: 1}, 1026: {-93: 1, -94: 1}, 1043: {-93: 1, -90: 1}, 1042: {-98:

### Option 2

Run Bonsai looping through the sample where each sample becomes the focal_id

In [None]:
from utils.bonsaitree.bonsaitree.v3 import bonsai

for id in list(target_community):
    up_dict_log_like_list = bonsai.build_pedigree(
        bio_info=filtered_bioinfo,
        unphased_ibd_seg_list=filtered_ibd_seg_list_v1,
        focal_id=id,
        min_seg_len=3
    )

print(up_dict_log_like_list)

# Graph the Bonsai results

#### Create and Save the Graphs (without displaying)

In [None]:
import os
import glob
import igraph as ig
import pickle

# Get a list of files that begins with 'bonsai_normed_pedigree'
pedigree_files = glob.glob(os.path.join(results_directory, 'bonsai_normed_pedigree*.json'))

for file_path in pedigree_files:
    # Read the network file
    with open(file_path, 'r') as file:
        network = json.load(file)

    # Create a new graph
    g = ig.Graph(directed=True)

    # Add vertices (i.e., nodes)
    all_ids = set(map(str, network.keys())) | {str(item) for sublist in network.values() for item in sublist[1:]}
    g.add_vertices(sorted(all_ids))

    # Add edges and set child sex as a vertex attribute
    for child_id, (sex, parent1, parent2) in network.items():
        g.vs.find(name=str(child_id))['sex'] = sex
        if parent1 is not None:
            g.add_edge(str(parent1), str(child_id))
        if parent2 is not None:
            g.add_edge(str(parent2), str(child_id))

    # Save graph in pickle format
    pickle_filename = os.path.splitext(os.path.basename(file_path))[0] + '.pkl'
    with open(os.path.join(results_directory, pickle_filename), 'wb') as pickle_file:
        pickle.dump(g, pickle_file)

print("Graphs saved in pickle format.")

#### Plot (visualize) the graphs

In [None]:
import pygraphviz as pgv
import pickle
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

samples_str = [str(sample) for sample in list(target_community)]

def create_graph_image_pygraphviz(name):
    focus = name
    loaded_graph = pickle.load(open(f"{results_directory}/bonsai_normed_pedigree_{str(focus)}.pkl", 'rb'))

    # Create a new NetworkX graph
    G = nx.DiGraph()

    # Add nodes and edges
    for vertex in loaded_graph.vs:
        node_label = vertex["name"]
        G.add_node(node_label)

    for edge in loaded_graph.es:
        parent = loaded_graph.vs[edge.source]["name"]
        child = loaded_graph.vs[edge.target]["name"]
        G.add_edge(parent, child)

    # Create a new PyGraphviz graph for visualization
    A = nx.nx_agraph.to_agraph(G)

    # Set node colors
    for node in A.nodes():
        check_focus = focus.split("_")[0]
        if node == check_focus:
            node.attr['color'] = 'green'
        elif node in samples_str:
            node.attr['color'] = 'yellow'
        else:
            node.attr['color'] = 'white'

    # Set Graphviz layout options
    A.layout(prog='dot')

    # Save and display the graph
    graph_filename = f"{results_directory}/bonsai_normed_pedigree_{focus}_dot.png"
    A.draw(graph_filename, format='png')

    # Load and display the image
    img = mpimg.imread(graph_filename)
    plt.imshow(img)
    plt.axis('off')
    plt.show()

    return G

In [None]:
# Example usage
focus = "1025_v1"
predicted_graph = create_graph_image_pygraphviz(focus)

In [None]:
# Example usage
focus = "wo_focalid_v1"
predicted_graph = create_graph_image_pygraphviz(focus)

## Plot the true graph

In [None]:
# Creates the true graph (does not display)

import pandas as pd
import networkx as nx

# Load the fam file into a DataFrame
fam_df = pd.read_csv(f"{results_directory}/ped_sim_run-everyone.fam", sep=" ", header=None)
fam_df.columns = ["family_id", "individual_id", "father_id", "mother_id", "sex", "phenotype"]

# Load the seg_dict file into a DataFrame
seg_dict_df = pd.read_csv(f"{results_directory}/ped_sim_run.seg_dict.txt", sep="\t", header=None)
seg_dict_df.columns = ["individual_id", "bonsai_id"]

# Create a dictionary to map individual_id to bonsai_id
individual_to_bonsai = dict(zip(seg_dict_df["individual_id"], seg_dict_df["bonsai_id"]))

# Create a graph using the fam data
fam_graph = nx.DiGraph()
for _, row in fam_df.iterrows():
    individual_id = row["individual_id"]
    father_id = row["father_id"]
    mother_id = row["mother_id"]

    # Use Bonsai ID if available, otherwise use the original name
    individual_node = individual_to_bonsai.get(individual_id, individual_id)
    fam_graph.add_node(individual_node)

    if father_id != "0":
        father_node = individual_to_bonsai.get(father_id, father_id)
        fam_graph.add_edge(father_node, individual_node)

    if mother_id != "0":
        mother_node = individual_to_bonsai.get(mother_id, mother_id)
        fam_graph.add_edge(mother_node, individual_node)

In [None]:
# Check the number of nodes and edges in the graph
print("Number of nodes:", fam_graph.number_of_nodes())
print("Number of edges:", fam_graph.number_of_edges())

# Print a few nodes
print("\nNodes:")
for node in list(fam_graph.nodes())[:5]:
    print(node)

# Print a few edges
print("\nEdges:")
for edge in list(fam_graph.edges())[:5]:
    print(edge)

In [None]:
# Find the target community individuals
target_community_individual_ids = [seg_dict_df.loc[seg_dict_df["bonsai_id"] == int(bonsai_id), "individual_id"].values[0] for bonsai_id in target_community]

# Find all connected individuals (relatives and ancestors) of the target community
connected_individuals = set()
for individual_id in target_community_individual_ids:
    connected_individuals.update(nx.descendants(fam_graph, individual_to_bonsai.get(individual_id, individual_id)))
    connected_individuals.update(nx.ancestors(fam_graph, individual_to_bonsai.get(individual_id, individual_id)))
connected_individuals.update(target_community_individual_ids)

# Create the true graph for the target community and their connected individuals
true_graph = nx.subgraph(fam_graph, [individual_to_bonsai.get(individual_id, individual_id) for individual_id in connected_individuals])

# Check the number of nodes and edges in the graph
print("Number of nodes:", true_graph.number_of_nodes())
print("Number of edges:", true_graph.number_of_edges())

# Print a few nodes
print("\nNodes:")
for node in list(true_graph.nodes())[:5]:
    print(node)

# Print a few edges
print("\nEdges:")
for edge in list(true_graph.edges())[:5]:
    print(edge)

In [None]:
# Create a PyGraphviz graph
A = pgv.AGraph(directed=True)

# Add nodes and edges to the PyGraphviz graph
for node in true_graph.nodes():
    A.add_node(node)
    if node in [individual_to_bonsai.get(individual_id, individual_id) for individual_id in target_community_individual_ids]:
        A.get_node(node).attr['color'] = 'green'
    else:
        A.get_node(node).attr['color'] = 'white'

for edge in true_graph.edges():
    parent, child = edge
    A.add_edge(parent, child)

# Set Graphviz layout options
A.layout(prog='dot')

# Save and display the graph
graph_filename = f"{results_directory}/true_graph_target_community.png"
A.draw(graph_filename, format='png')

# Load and display the image
img = mpimg.imread(graph_filename)
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.axis('off')
plt.show()

## Exploring Bonsai

In [None]:
from use.bonsaitree import bonsai

results_v1 = bonsai.build_pedigree(filtered_ibd_seg_list_v1, filtered_bioinfo)

bonsai_normed_pedigree_wo_focalid_v1 = results_v1['normed_pedigree']

with open(f"{results_directory}/bonsai_normed_pedigree_wo_focalid_v1.json", 'w') as results_file:
    json.dump(bonsai_normed_pedigree_wo_focalid_v1, results_file)

#***********************************************
ped_obj_wo_focalid_v1 = results_v1['ped_obj']
#***********************************************

In [None]:
focus = "wo_focalid_v1"
predicted_graph = create_graph_image_pygraphviz(focus)

In [None]:
ped_obj_wo_focalid_v1

In [None]:
# up_pedigree_dict: (dict) Stores the topology of the inferred pedigree.
# Has the form {child_id : [child_sex, child_age, parent1_id, parent2_id]}.

ped_obj_wo_focalid_v1.up_pedigree_dict

In [None]:
# (dict) Stores the topology of the inferred pedigree.
# Has the form {parent_id : [parent_sex, parent_age, child1_id, child2_id, ...]}

ped_obj_wo_focalid_v1.down_pedigree_dict

In [None]:
# all_ids: (list) List of all ids in the pedigree

ped_obj_wo_focalid_v1.all_ids

In [None]:
# ibd_stats: (dict) Dict with keys of the form frozenset({id1, id2})
# and values giving summary statistics of the ibd sharing between the pair.
ped_obj_wo_focalid_v1.ibd_stats

In [None]:
# rel_dict: (dict) Dict of the form dict[id] = {'anc' : <Set of ancestor ids>, 'desc' : <Set of descendant ids>,
# 'rel' : <Set of relatives who are neither direct descendants nor ancestors>}.

ped_obj_wo_focalid_v1.rel_dict

In [None]:
# rels: (dict) Nested dict of the form dict[id1][id2] = deg,
# where deg is a three-element tuple representing the relationship between id1 and id2.
# Deg is of the form deg = (num_up, num_down, num_anc), where num_up is the number of
# meioses separating id1 from its common ancestor(s) with id2. num_down is the number of
# meioses separating id2 from its common ancestor(s) with id1. num_anc is the number of
# common ancestors shared between id1 and id2.

ped_obj_wo_focalid_v1.rels

In [None]:
# pairwise_log_likelihoods: (dict) Nested dict of the form dict[id1][id1] = log_like,
# where log_like is the pairwise log likelihood of the relationship between id and id2 based on IBD sharing and age.

ped_obj_wo_focalid_v1.pairwise_log_likelihoods

In [None]:
# get_connecting_path_set: (method) Find all ancestors on the path connecting two related nodes (id1 and id2).
# Usage: path_set = ped_obj.get_connecting_path_set(id1, id2).

ped_obj_wo_focalid_v1.get_connecting_path_set(1024, 1028)