In [None]:
import os
from collections import Counter
import logging
import sys
from pathlib import Path
import subprocess
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import IPython
import pandas as pd
from tqdm import tqdm

from dotenv import load_dotenv

In [None]:
def find_comp_gen_dir(start=None):
    """Find the computational_genetic_genealogy directory by searching upwards.

    Starting at ``start`` (default: the current working directory), every
    directory up to and including the filesystem root is checked for a child
    directory named ``computational_genetic_genealogy``.

    Args:
        start: Optional directory to begin the search from. Defaults to the
            current working directory.

    Returns:
        pathlib.Path: The first matching directory found while walking up.

    Raises:
        FileNotFoundError: If no directory on the way up contains it.
    """
    origin = Path.cwd() if start is None else Path(start).absolute()

    # ``parents`` ends with the filesystem root. The earlier
    # ``while current != current.parent`` loop stopped before examining it.
    for candidate_parent in (origin, *origin.parents):
        target = candidate_parent / 'computational_genetic_genealogy'
        if target.is_dir():
            return target

    raise FileNotFoundError("Could not find computational_genetic_genealogy directory")

def load_env_file():
    """Load environment variables from the project's ``.env`` file.

    The file is looked up directly inside the directory returned by
    ``find_comp_gen_dir()``. Variables already set in the environment are
    overridden by the values in the file.

    Returns:
        The path of the loaded ``.env`` file, or ``None`` when either the
        project directory or the ``.env`` file cannot be found (a message is
        printed in both cases).
    """
    try:
        project_dir = find_comp_gen_dir()
        dotenv_file = project_dir / '.env'

        if dotenv_file.exists():
            load_dotenv(dotenv_file, override=True)
            print(f"Loaded environment variables from: {dotenv_file}")
            return dotenv_file

        print(f"Warning: No .env file found in {project_dir}")
        return None

    except FileNotFoundError as missing:
        print(f"Error: {missing}")
        return None

# Load the .env file now; env_path is None when nothing was loaded
env_path = load_env_file()

In [None]:
# Project locations come from the environment (normally populated from the .env file).
working_directory = os.getenv('PROJECT_WORKING_DIR', default=None)
data_directory = os.getenv('PROJECT_DATA_DIR', default=None)
references_directory = os.getenv('PROJECT_REFERENCES_DIR', default=None)
results_directory = os.getenv('PROJECT_RESULTS_DIR', default=None)
utils_directory = os.getenv('PROJECT_UTILS_DIR', default=None)

print(f"Working Directory: {working_directory}")
print(f"Data Directory: {data_directory}")
print(f"References Directory: {references_directory}")
print(f"Results Directory: {results_directory}")
print(f"Utils Directory: {utils_directory}")

# os.chdir(None) fails with an opaque TypeError; stop with an actionable message instead.
if not working_directory:
    raise RuntimeError(
        "PROJECT_WORKING_DIR is not set. Check that the .env file was found and loaded "
        "and that it defines PROJECT_WORKING_DIR."
    )

os.chdir(working_directory)
print(f"The current directory is {os.getcwd()}")

In [None]:
def configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO"):
    """
    Attach a file handler and a console handler to the root logger.

    The root logger itself is set to DEBUG so that each handler's own level
    decides what it emits. Level names are case-insensitive; a name that the
    ``logging`` module does not define falls back to INFO.

    Args:
        log_filename (str): Path to the log file where logs will be written.
        log_file_debug_level (str): Logging level for the file handler.
        console_debug_level (str): Logging level for the console handler (stdout).
    """
    log_format = '%(asctime)s - %(levelname)s - %(message)s'

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Resolve both level names to their numeric values up front
    file_level, console_level = (
        getattr(logging, level_name.upper(), logging.INFO)
        for level_name in (log_file_debug_level, console_debug_level)
    )

    # File destination
    to_file = logging.FileHandler(log_filename)
    to_file.setLevel(file_level)
    to_file.setFormatter(logging.Formatter(log_format))

    # Console destination (stdout)
    to_console = logging.StreamHandler(sys.stdout)
    to_console.setLevel(console_level)
    to_console.setFormatter(logging.Formatter(log_format))

    for destination in (to_file, to_console):
        root_logger.addHandler(destination)
    
def clear_logger():
    """Remove all handlers from the root logger and close them.

    Closing matters for file handlers: a handler that is only detached keeps
    its log file open, so every reconfiguration would leak a file handle.
    """
    logger = logging.getLogger()
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

In [None]:
log_filename = os.path.join(results_directory, "lab8.log")
print(f"The Lab 8 log file is located at {log_filename}.")

# Ensure the results_directory exists (exist_ok avoids the check-then-create race)
os.makedirs(results_directory, exist_ok=True)

# Make sure the log file exists; append mode never truncates a log from an earlier run
with open(log_filename, 'a'):
    pass

clear_logger() # Clear the logger before reconfiguring it
configure_logging(log_filename, log_file_debug_level="INFO", console_debug_level="INFO")

Go to Lab0_Code_Environment and run the set of code cells for **Install R** and **Install liftover**.

Also, rerun `poetry install --no-root` to install new packages.

Ensure rpy2 Jupyter Extension is Loaded by running the next cell.

In [None]:
# Load the rpy2 IPython extension so that the R cell magic used in the following cells is available.
%load_ext rpy2.ipython

Now you can use cell magic for R.

In [None]:
%%R

# Smoke test: build a small numeric vector and print its mean to confirm that R cells run.
x <- c(1, 2, 3, 4, 5)
mean(x)

Don't worry about the warning message about libraries containing no packages. Install (if needed) and load packages.

In [None]:
%%R

# Install `pkg` from CRAN when it is missing, or reinstall it when CRAN offers a
# newer version than the installed one.
check_install <- function(pkg) {
  cran <- "https://cloud.r-project.org/"

  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = cran)
    return(invisible(NULL))
  }

  available <- available.packages(repos = cran)

  # Guard the lookup: indexing a package that is absent from the CRAN index
  # would otherwise stop with "subscript out of bounds".
  if (!(pkg %in% rownames(available))) {
    return(invisible(NULL))
  }

  installed_version <- packageVersion(pkg)
  # Compare version objects rather than relying on implicit string coercion.
  latest_version <- package_version(available[pkg, "Version"])

  if (installed_version < latest_version) {
    install.packages(pkg, repos = cran)
  }
  invisible(NULL)
}

# Check and install/update the pedsuite package
check_install("pedsuite")

# Load the package
library(pedsuite)
print("pedsuite loaded successfully!")


## Create Genetic Family Tree

Now that you have R setup on your system, let's proceed with creating a genetic family tree.

Generate a cousin pedigree of degree 4 (as defined by pedsuite)

We start with a base structure called cousinPed. (There are other base structures to choose from.)

In [None]:
%%R

# Base structure: a cousin pedigree of degree 4, stored in x (this replaces any earlier x).
x = cousinPed(degree = 4)

# Draw the pedigree; each individual is labelled with its numeric identifier.
plot(x)

Notice in the pedigree plot that each individual has an assigned number.

The number assigned to each individual is that individual's identifier.

Here, I am adding children using the identifiers.

Note that nch is number of children.

Note that sex = 1 is male and sex = 2 is female.

Note that males are squares and females are circles.

Now that you have pedigree x, you can make adjustments. In this first adjustment, I am calling individual number 3 the father. I am calling individual number 4 the mother. Then I am adding 1 female child.

In [None]:
%%R

# Give each of the eight couples below one daughter (nch = 1, sex = 2).
# NOTE: x is reassigned on every call, so re-running this cell adds the children a second time.
# Later cells refer to the new individuals by number — presumably 21 onwards; confirm in the plot.
x = addChildren(x, father = 3, mother = 4, nch = 1, sex = 2)

x = addChildren(x, father = 5, mother = 6, nch = 1, sex = 2)
x = addChildren(x, father = 7, mother = 8, nch = 1, sex = 2)
x = addChildren(x, father = 9, mother = 10, nch = 1, sex = 2)
x = addChildren(x, father = 11, mother = 12, nch = 1, sex = 2)
x = addChildren(x, father = 13, mother = 14, nch = 1, sex = 2)
x = addChildren(x, father = 15, mother = 16, nch = 1, sex = 2)
x = addChildren(x, father = 17, mother = 18, nch = 1, sex = 2)

# Redraw to see the added children.
plot(x)

Notice how the pedigree changed. Take a moment to understand the changes that you made.

I continue to make adjustments until I get the pedigree I want.

In [None]:
%%R

# Only the mother is named in each call; one child is added per call, alternating son (sex = 1) and daughter (sex = 2).
# NOTE(review): pedtools is expected to create the unnamed father as a new founder — confirm in the plot.
# Re-running this cell adds the children again, because x is reassigned on every call.
x = addChildren(x, mother = 21, nch = 1, sex = 1)
x = addChildren(x, mother = 22, nch = 1, sex = 2)
x = addChildren(x, mother = 23, nch = 1, sex = 1)
x = addChildren(x, mother = 24, nch = 1, sex = 2)

x = addChildren(x, mother = 25, nch = 1, sex = 1)
x = addChildren(x, mother = 26, nch = 1, sex = 2)

plot(x)

In [None]:
%%R

# Extend four lines by one child each: two calls name only the father (daughters), two name only the mother (sons).
# Re-running this cell adds the children again, because x is reassigned on every call.
x = addChildren(x, father = 30, nch = 1, sex = 2)
x = addChildren(x, father = 34, nch = 1, sex = 2)
x = addChildren(x, mother = 32, nch = 1, sex = 1)
x = addChildren(x, mother = 36, nch = 1, sex = 1)

plot(x)

With this final adjustment, I am satisfied with the pedigree.

In [None]:
%%R

# Final adjustment: one son for individual 42 (as mother) and one daughter for individual 46 (as father).
# Re-running this cell adds the children again, because x is reassigned on every call.
x = addChildren(x, mother = 42, nch = 1, sex = 1)
x = addChildren(x, father = 46, nch = 1, sex = 2)

plot(x)

Think about other adjustments that you could do. For example, how might you represent a man with three wives, each with a certain number of children?

In [None]:
%%R

# Print the pedigree table (one row per individual) to check the structure

print(x)

In the output (you may need to select `scrollable element` or `text editor` to see the full output), you should see 52 rows. Each row represents one person in the tree you built. `id` is the individual identifier, `fid` is the father identifier and `mid` is the mother identifier. `sex` is the chromosomal sex where `1` = male and `2` = female.

Extract the genetic family tree created by pedsuite to a pandas dataframe.

In [None]:
%%R -o fam_df

# Extract the pedigree columns from the ped object.
# FIDX and MIDX are integer positions into x$ID (0 = no parent recorded), not ID labels,
# so they are translated to labels here. Position 0 maps to "0", the missing-parent code.
# (The previous comparison with "*" never matched: "*" only appears in the printed table.)
id_lookup <- c("0", as.character(x$ID))

individual_id <- as.character(x$ID)
father_id <- id_lookup[x$FIDX + 1L]
mother_id <- id_lookup[x$MIDX + 1L]
sex <- as.character(x$SEX)

# Create data frame (exported to Python through the -o flag on the cell magic)
fam_df <- data.frame(
  individual_id = individual_id,
  father_id = father_id,
  mother_id = mother_id,
  sex = sex,
  stringsAsFactors = FALSE
)

In the next cell, you are back to using Python. Inspect the dataframe to make sure that it looks as intended.

In [None]:
# Rebind fam_df to a copy of the frame exported from R, then list its columns, dtypes and row count.
fam_df = fam_df.copy()
fam_df.info()

You should see `Index: 52 entries, 1 to 52`.

In [None]:
# Inspect the first 8 rows and compare them with the pedigree printed by pedsuite.
# A parent value of 0 marks an individual without recorded parents.
# (Note that this cell runs in Python, not R.)

fam_df.head(8)

Ped-sim needs a pedigree definition file, so let's go ahead and create that from the genetic family tree you created. (`FAM_ID` will be `FAM` for each person.)

`ped-sim` comes with example pedigree definition files for you to use. You can find them in your `~/use/pedsim/example` directory. For a more detailed explanation of how to create a pedigree definition file, see [Def file](https://github.com/williamslab/ped-sim?tab=readme-ov-file#def-file). You define the pedigree in a text file. When doing so, make sure the file uses UNIX/macOS (LF) line endings. I use [Notepad++](https://notepad-plus-plus.org/downloads/) for this purpose, but you are free to use whichever text editor you want.

In [None]:
# Build the .fam layout in one chain: rename the R column names, add the constant
# family ID and phenotype columns, and put everything in the required order:
# FAM_ID, INDIV_ID, FATHER_ID, MOTHER_ID, SEX, PHENO
fam_df = (
    fam_df
    .rename(columns={
        "individual_id": "INDIV_ID",
        "father_id": "FATHER_ID",
        "mother_id": "MOTHER_ID",
        "sex": "SEX",
    })
    .assign(FAM_ID="FAM", PHENO=-9)
    .loc[:, ["FAM_ID", "INDIV_ID", "FATHER_ID", "MOTHER_ID", "SEX", "PHENO"]]
)

# Show the first rows as a visual check
display(fam_df.head())

# Write tab-separated text with neither a header row nor the index column
fam_df.to_csv(os.path.join(results_directory, "pedigree.fam"), sep="\t", index=False, header=False)

In [None]:
# Convert pedigree.fam into pedigree.def with ped-sim's fam2def.py script (both files live in the results directory).
!{utils_directory}/ped-sim/fam2def.py -i {results_directory}/pedigree.fam -o {results_directory}/pedigree.def

Take a look at your pedigree.def file in the results directory.

How many simulated pedigrees do you want?

In a machine learning project, the answer would be as many as is needed to train your model well. For now, let's just try it out with `10`. When you run the next cell, a pop-up will appear for you to enter the number of pedigrees that you want.

In [None]:
# Get user input for the number of pedigrees (must be a positive whole number)
num_pedigrees = int(input("Enter the number of pedigrees to generate: "))
if num_pedigrees < 1:
    raise ValueError(f"The number of pedigrees must be at least 1 (got {num_pedigrees}).")

# Define input file path
input_def_file = os.path.join(results_directory, "pedigree.def")

# Read the file contents
with open(input_def_file, "r") as file:
    lines = file.readlines()

# Find the pedigree header line: "def <name> <#copies> <#generations> ...".
# It is not assumed to be the first line of the file, and an exact match on the
# first token avoids touching unrelated lines that merely start with "def".
def_line_updated = False
for line_number, line in enumerate(lines):
    parts = line.split()
    if len(parts) >= 3 and parts[0] == "def":
        parts[2] = str(num_pedigrees)  # third token = number of copies of the pedigree
        lines[line_number] = " ".join(parts) + "\n"
        def_line_updated = True
        break

if not def_line_updated:
    raise ValueError(f"No 'def' line found in {input_def_file}; the file was left unchanged.")

# Write back to the file
with open(input_def_file, "w") as file:
    file.writelines(lines)

print(f"Updated {input_def_file} with num_pedigrees = {num_pedigrees}")

Take another look at your pedigree.def file in the results directory. The def line has changed to reflect the number of pedigrees you said you wanted.

## Ped-sim method

The Pedigree Simulator (Ped-sim) is a powerful tool for simulating pedigree structures. It uses sex-specific genetic maps and takes the sex of each individual into account. To get it up and running on your system, you'll need to download and compile it in your computing space if it is not already there.

Here is the ped-sim GitHub page for your reference: [ped-sim](https://github.com/williamslab/ped-sim).

### Sex-specific Genetic Map

From [Map file](https://github.com/williamslab/ped-sim?tab=readme-ov-file#map-file-):

"The genetic map file contains three columns for a sex-averaged map and four columns if using male and female maps. The format of this file is:

`[chromosome] [physical_position] [map_position0] <map_position1>`

The chromosomes are expected to be listed in the same order as they are in any input VCF file, with the physical positions in increasing order. The chromosome names must also match the names in the input VCF file, and all chromosome names present in the map must also have corresponding records in the VCF.

[map_position0] is genetic position in centiMorgans, and should either be the sex-averaged genetic position if using only one map, or should be the male genetic position if using two maps. When using only one map, the simulator samples all crossovers from that one map and does not distinguish male and female parents.

<map_position1> is likewise a genetic position in centiMorgans and should correspond to the female genetic position if given.

A high resolution human sex-specific genetic map is available [here](https://github.com/cbherer/Bherer_etal_SexualDimorphismRecombination), and is described in [Bhérer et al. (2017)](https://www.nature.com/articles/ncomms14994). To generate an autosomal map file in the format the simulator requires with both male and female genetic positions, run the following bash commands:"

In [None]:
%%bash -s "$references_directory"

# Stop at the first failing command so that a failed download or extraction
# cannot leave a partial map file behind.
set -euo pipefail

# Receive directory variables from Python
references_directory=$1

map_url="https://github.com/cbherer/Bherer_etal_SexualDimorphismRecombination/raw/master/Refined_genetic_map_b37.tar.gz"
archive="$references_directory/Refined_genetic_map_b37.tar.gz"
map_dir="$references_directory/Refined_genetic_map_b37"
simmap="$references_directory/refined_mf_b37.simmap"

# genetic map for ped-sim
# -O overwrites a stale archive from an interrupted run instead of saving a ".1" copy next to it.
wget "$map_url" -O "$archive"
tar xvzf "$archive" -C "$references_directory"
printf "#chr\tpos\tmale_cM\tfemale_cM\n" > "$simmap"

# The paste command combines each line from the male chromosome file (male_chr$chr.txt)
# with the corresponding line from the female chromosome file (female_chr$chr.txt).
# They are combined side by side, separated by a tab.
# awk skips the header row and keeps only rows whose male and female positions agree;
# sed strips the leading "chr" from the chromosome name.
for chr in {1..22}; do
  paste "$map_dir/male_chr$chr.txt" "$map_dir/female_chr$chr.txt" \
    | awk -v OFS="\t" 'NR > 1 && $2 == $6 {print $1,$2,$4,$8}' \
    | sed 's/^chr//' >> "$simmap"
done

rm "$archive"
rm -r "$map_dir"

# When you get the check mark indicating that the cell successfully completed its run,
# go ahead and clear the cell output.

# For the moment, you can ignore:

# tar: Ignoring unknown extended header keyword 'SCHILY.dev'
# tar: Ignoring unknown extended header keyword 'SCHILY.ino'
# tar: Ignoring unknown extended header keyword 'SCHILY.nlink'

You should now have the `refined_mf_b37.simmap` file in your references directory. Manually take a look at the file. It is a text file, so you can use Notepad++ or your built-in text editor to view it.

The next two cells are used to convert the file from build 37 to build 38.

In [None]:
%%bash -s "$references_directory" "$utils_directory"

# Stop at the first failing command. Without this, a failed liftOver run is masked
# by the successful clean-up command at the end of the cell.
set -euo pipefail

# Receive directory variables from Python
references_directory=$1
utils_directory=$2  # passed in by the cell magic; not used below (liftOver is taken from PATH)

# --- Convert the build 37 map to build 38 using liftOver ---

# Create a BED file from the build 37 simmap.
# The BED file requires 0-based start coordinates, so subtract 1 from the position.
# The male and female cM values travel along as the 4th and 5th BED columns.
awk 'NR>1 {print "chr"$1, $2-1, $2, $3, $4}' OFS="\t" "$references_directory/refined_mf_b37.simmap" > "$references_directory/refined_mf_b37.bed"

# Run liftOver to convert BED coordinates to build 38.
# Arguments: input BED, chain file, output BED, file for records that could not be mapped.
liftOver "$references_directory/refined_mf_b37.bed" \
         "$references_directory/hg19ToHg38.over.chain.gz" \
         "$references_directory/refined_mf_b38.bed" \
         "$references_directory/refined_mf_b38.unmapped"

# (Optional) Clean up temporary files
rm "$references_directory/refined_mf_b37.bed"

In [None]:
# Read the BED file written by liftOver (no header row)
bed_file = os.path.join(references_directory, "refined_mf_b38.bed")
bed_data = pd.read_csv(bed_file, sep='\t', header=None)

# Rename columns for clarity
bed_data.columns = ['chr', 'start', 'pos', 'male_cM', 'female_cM']

# Clean chromosome names (remove 'chr' prefix)
bed_data['chr'] = bed_data['chr'].str.replace('chr', '', regex=True)

# Convert chromosome to numeric if possible; anything else becomes NaN
bed_data['chr'] = pd.to_numeric(bed_data['chr'], errors='coerce')

# Filter for chromosomes 1-22 and remove NaN values
mask = (bed_data['chr'] >= 1) & (bed_data['chr'] <= 22) & (bed_data['chr'].notna())
filtered_data = bed_data[mask].copy()  # Create a copy to avoid the warning

# Convert chromosome to integer after filtering out NaN values
filtered_data['chr'] = filtered_data['chr'].astype(int)

# Sort by chromosome (numerically) and position
sorted_data = filtered_data.sort_values(by=['chr', 'pos'])

# Keep only necessary columns
simmap_data = sorted_data[['chr', 'pos', 'male_cM', 'female_cM']]

# Sanity checks (warnings only; no rows are removed).
# liftOver can move or reorder markers, so after sorting by build 38 position the
# genetic positions are no longer guaranteed to be non-decreasing within a chromosome,
# and two markers can land on the same position.
n_duplicate_positions = int(simmap_data.duplicated(subset=['chr', 'pos']).sum())
if n_duplicate_positions:
    print(f"Warning: {n_duplicate_positions} row(s) repeat a (chr, pos) pair after liftOver.")
for cm_column in ('male_cM', 'female_cM'):
    n_decreasing = int((simmap_data.groupby('chr')[cm_column].diff() < 0).sum())
    if n_decreasing:
        print(f"Warning: {cm_column} decreases at {n_decreasing} row(s) after sorting by position; "
              "review the map before simulating.")

# Print sample of data to verify
print("Sample of data to be saved:")
print(simmap_data.head())

# Save to simmap file: write the header line first, then append the data rows
simmap_file = os.path.join(references_directory, "refined_mf_b38.simmap")
with open(simmap_file, 'w') as f:
    f.write("#chr\tpos\tmale_cM\tfemale_cM\n")
simmap_data.to_csv(simmap_file, sep='\t', index=False, header=False, mode='a')

print("Processing complete - chromosomes saved as integers.")

## Run Ped-sim

Now you are ready to run ped-sim.

In [None]:
%%bash -s "$references_directory" "$utils_directory" "$results_directory"

# Receive directory variables from Python
references_directory=$1
utils_directory=$2
results_directory=$3
pedigree_definition_file="pedigree.def"
ped_sim_filename="ped_sim_run"

# Run ped-sim with parameters:

# -d: Specifies the path to the pedigree definition file.
# -m: Specifies the path to the genetic map file.
# -o: Specifies the prefix for the output files.
# --intf: Uses the interference crossover model
# --seed: Sets the random number generator seed to ensure reproducibility.
# --fam: Outputs the simulated data in PLINK's .fam format.
# --mrca: Outputs the most recent common ancestor (MRCA) of all sampled individuals.


# Notice that this uses the human crossover interference parameters stored in the ped-sim/interfere directory

# All paths are quoted so that directories containing spaces still work.
"${utils_directory}/ped-sim/ped-sim" \
  -d "$results_directory/$pedigree_definition_file" \
  -m "$references_directory/refined_mf_b38.simmap" \
  -o "$results_directory/$ped_sim_filename" \
  --intf "$utils_directory/ped-sim/interfere/nu_p_campbell.tsv" \
  --seed 1234 \
  --fam \
  --mrca

# # using --pois instead of --intf
# # --pois: Uses the Poisson distribution to determine the number of crossovers.
# "${utils_directory}/ped-sim/ped-sim" \
#   -d "$results_directory/$pedigree_definition_file" \
#   -m "$references_directory/refined_mf_b38.simmap" \
#   -o "$results_directory/$ped_sim_filename" \
#   --pois \
#   --seed 1234 \
#   --fam \
#   --mrca

**Let's take a look at the results.** 

Familiarize yourself with the different result files. Those are the ped_sim_run files in your results directory. Notice from the output of the above cell that if you wanted to simulate genotype data for this configuration, you would need to rerun the above cell with an input VCF with 220 founders (if you selected 10 pedigrees and the predefined pedigree). For now, we do not actually need the genotype itself. Using the IBD segment data (the .seg file) is sufficient.

## Graphing the results

In this run, we will first define the graph function, then call the function to create our graph.

**Define the graph function.**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import os

def _generation_from_id(individual_id, role="ID"):
    """Return the generation number encoded in an ID shaped like ``<fam>_g<generation>-...``."""
    parts = individual_id.split("_")

    # Ensure the ID follows the expected format
    if len(parts) < 2:
        raise ValueError(f"Unexpected {role} format: {individual_id} -> {parts}")

    return int(parts[1].split("-")[0].lstrip("g"))


def graph_fam_file_simulated(filename, pedigree=None, figsize=(250, 100)):
    """Build a directed parent -> child graph from a simulated ``.fam`` file.

    The file is read from ``results_directory``. Every individual becomes a node
    whose ``time`` attribute is derived from the generation number in its ID and
    then reversed, so the latest generation gets ``time == 0``.

    A new matplotlib figure is also opened (and left as the current figure) so
    that the caller can draw the returned graph into it.

    Args:
        filename: Name of the space-separated ``.fam`` file inside ``results_directory``.
        pedigree: Optional pedigree number. When given, only rows whose family ID
            with ``"FAM"`` removed equals ``str(pedigree)`` are used.
        figsize: Size in inches of the figure that is opened. Defaults to the
            value that used to be hard-coded.

    Returns:
        networkx.DiGraph: Graph with an edge from each parent to each child.

    Raises:
        ValueError: If no rows are selected, or an ID does not contain ``"_"``.
    """
    # Read everything as text so that a parent value of 0 is always the string '0',
    # even when a parent column happens to contain nothing but zeros.
    fam_file = pd.read_csv(os.path.join(results_directory, filename), sep=" ", header=None, dtype=str)
    fam_file.columns = ["fam", "id", "parent0", "parent1", "sex", "phenotype"]

    if pedigree is not None:
        # Extract the pedigree number from the family ID
        pedigree_num = fam_file['fam'].str.replace("FAM", "", regex=False)
        fam_file = fam_file[pedigree_num == str(pedigree)]

    # Fail with a clear message instead of the bare max() error on an empty graph.
    if fam_file.empty:
        raise ValueError(f"No individuals found in {filename} for pedigree={pedigree!r}")

    G = nx.DiGraph()

    for row in fam_file.itertuples(index=False):
        child_id = row.id
        G.add_node(child_id, time=_generation_from_id(child_id))

        for parent_id in (row.parent0, row.parent1):
            if parent_id != '0':  # '0' marks a missing parent
                G.add_node(parent_id, time=_generation_from_id(parent_id, role="Parent"))
                G.add_edge(parent_id, child_id)

    # Reverse the order of generations
    max_gen = max(generation for _, generation in G.nodes(data='time'))
    for _, data in G.nodes(data=True):
        data['time'] = max_gen - data['time']

    # Open the figure only after the graph was built, so a failure above leaves no empty figure.
    plt.figure(num=None, figsize=figsize, dpi=100)

    return G

**Call the graph function.**

In [None]:
# Build the graph for every individual in the file. The call also opens the figure that is drawn into below.
# G1 is used again later in the notebook to look up common ancestors.
G1 = graph_fam_file_simulated("ped_sim_run-everyone.fam")
# Arrange the nodes in layers according to their "time" node attribute.
pos = nx.multipartite_layout(G1, subset_key="time", align="horizontal", scale=50)
nx.draw_networkx(G1, pos, node_size=3000, with_labels=True, arrows=False)
ped_sim_plot_filename = f"{results_directory}/diagram_ped_sim_pedigree.svg"
# plt.title("Ped-Sim Pedigree")
# Save the current figure as an SVG file in the results directory.
plt.savefig(ped_sim_plot_filename, bbox_inches = 'tight', pad_inches = 0)

In the output, you will see a depiction of all 10 pedigrees (also saved in your results directory as `diagram_ped_sim_pedigree.svg`). You should see six rows of blue dots, one for each of the six generations specified in the pedigree definition file. But perhaps it's a little difficult to see what's going on in the output. Let's take a look at just one of the pedigrees.

In [None]:
# Number of the single pedigree to draw; change it to look at a different one.
pedigree_num = 1
# Same steps as for the full plot, restricted to the selected pedigree.
G2 = graph_fam_file_simulated("ped_sim_run-everyone.fam", pedigree = pedigree_num)
pos = nx.multipartite_layout(G2, subset_key="time", align="horizontal", scale=50)
nx.draw_networkx(G2, pos, node_size=3000, with_labels=True, arrows=False)
# The pedigree number is part of the file name, so each pedigree gets its own SVG file.
ped_sim_plot_filename = f"{results_directory}/diagram_ped_sim_pedigree_{pedigree_num}.svg"
plt.savefig(ped_sim_plot_filename, bbox_inches='tight', pad_inches=0)

That's better. Now we can see what one of the pedigrees looks like.

Each node (blue circle) is a person. Each edge (line) represents one generation.

Can you identify:
- full siblings
- first cousins
- second cousins
- third cousins
- parents
- grandparents
- great grandparents
- 2x great grandparents
- aunt/uncle (avuncular)

## Segments

Now, let's look at the segment data and build our data table.

In [None]:
# Read the ped-sim segment file from the results directory; it is tab-separated and has no header row.
segments = pd.read_csv(os.path.join(results_directory, "ped_sim_run.seg"), sep="\t", header = None)
segments.head()

Notice the column headers are not very informative. Let's change that.

In [None]:
# Give the nine segment columns descriptive names (this fails if the file does not have exactly nine columns).
segments.columns = (["id1", "id2", "chromosome", "physical_position_start", "physical_position_end", "IBD_type", "genetic_position_start", "genetic_position_end", "genetic_length"])
segments.head()

In [None]:
# Show column dtypes, non-null counts and the number of segment rows.
segments.info()

In [None]:
# List the distinct IBD types that occur in the segment data
segments['IBD_type'].unique()

In [None]:
# List the chromosomes on which at least one segment was recorded.
segments['chromosome'].unique()

In [None]:
# Summary statistics (count, mean, std, quartiles, min, max) of the segment lengths
segments['genetic_length'].describe()

In [None]:
# Get the smallest genetic length among all simulated segments

min_segment_size = float(segments['genetic_length'].min())

# Report it with six decimals
print(f"The minimum segment size is {min_segment_size:.6f} cM.")

In reality, we would not work with DNA segments smaller than what we can accurately detect. The IBD detection algorithms we use can accurately detect IBD segments at a minimum of 2 cM (e.g., Refined IBD). Let's conservatively filter the segment such that we use only the segments with a minimum of 3 cM.

In [None]:
# Subset your data.
# Get only the segments with a genetic length of at least 3 cM.
# filtered_segments is the input of the relationships summary built later in the notebook.

filtered_segments = segments[segments['genetic_length'] >= 3]
filtered_segments.info()

In [None]:
# Summary statistics of the segment lengths that passed the 3 cM filter
filtered_segments['genetic_length'].describe()

Compare the descriptive output of the full dataset and the subsetted dataset.

You can use other filtering conditions if you want. For example, you could filter the segments between 10 and 50 cM inclusively.

In [None]:
# Alternative filter: keep segments between 10 and 50 cM, both bounds included.
filtered_segments_50_10 = segments[(segments['genetic_length'] >= 10) & (segments['genetic_length'] <= 50)]
filtered_segments_50_10.info()

In [None]:
# Summary statistics of the 10-50 cM subset.
filtered_segments_50_10['genetic_length'].describe()

Let's create a summary pandas dataframe called relationships based on the segment data. Instead of having a row for each segment, this dataframe will summarize the shared segments for each unique pair.

In [None]:
# Create the relationships dataframe for simulated data

def segment_position(x, position):
    """Return the ``position``-th largest value of ``x`` (1 = largest).

    Args:
        x: pandas Series of segment lengths.
        position: 1-based rank, counted from the largest value.

    Returns:
        The value at that rank, or 0 when ``x`` has fewer than ``position`` values.
    """
    if position > len(x):
        return 0
    largest_first = x.sort_values(ascending=False)
    return largest_first.iloc[position - 1]

# One row per (id1, id2) pair, summarising the pair's filtered segments:
#   num_seg      - number of segments
#   total_shared - sum of the segment lengths
#   max_seg      - longest segment
#   second_seg / third_seg / fourth_seg - next-longest segments, 0 when the pair has too few segments
relationships = filtered_segments.groupby(['id1', 'id2']).agg(
    num_seg=('id1', 'size'),
    total_shared=('genetic_length', 'sum'),
    max_seg=('genetic_length', 'max'),
    second_seg=('genetic_length', lambda x: segment_position(x, 2)),
    third_seg=('genetic_length', lambda x: segment_position(x, 3)),
    fourth_seg=('genetic_length', lambda x: segment_position(x, 4)),
).reset_index()

relationships.head()

In [None]:
# Show the columns, dtypes and number of pairs in the summary.
relationships.info()

In [None]:
updated_relationships = relationships.copy()  # Create a copy of the relationships DataFrame

def get_relationships(row, graph=None):
    """Describe how the two individuals in ``row`` are related in the pedigree graph.

    Args:
        row: Mapping with the node names of the pair under ``'id1'`` and ``'id2'``.
        graph: Directed parent -> child graph to search. Defaults to ``G1``.

    Returns:
        str: ``str((d1, d2))``, where ``d1`` and ``d2`` are the shortest path lengths
        from the lowest common ancestor to ``id1`` and ``id2``. ``str(None)`` when the
        two individuals have no common ancestor in the graph.
    """
    pedigree_graph = G1 if graph is None else graph
    common_ancestor = nx.lowest_common_ancestor(pedigree_graph, row['id1'], row['id2'])

    if common_ancestor is None:
        # Without this guard, shortest_path_length(source=None, ...) returns a dict of
        # distances from every node, and that dict would be written into the column.
        return str(None)

    path_length1 = nx.shortest_path_length(pedigree_graph, source=common_ancestor, target=row['id1'])
    path_length2 = nx.shortest_path_length(pedigree_graph, source=common_ancestor, target=row['id2'])
    return str((path_length1, path_length2))

# Compute every value first and assign the column once; writing cell by cell with .loc is slow.
updated_relationships['genealogical_relationship'] = [
    get_relationships(row)
    for _, row in tqdm(
        updated_relationships.iterrows(),
        total=len(updated_relationships),
        desc="Processing relationships",
    )
]

updated_relationships.head()

In the updated relationships dataframe, each row represents a unique pair. It contains the number of segments shared by the pair, the total amount of shared DNA (counting only segments of at least 3 cM), and the four largest segments (cM). You'll also see a genealogical_relationship column whose value is a tuple. In the tuple, the first value is the number of generations between id1 and the shared ancestor, and the second value is the number of generations between id2 and the shared ancestor. For example, (2, 1) means that there are 2 generations between id1 and the shared ancestor and 1 generation between id2 and the shared ancestor. A genealogical_relationship of (1, 1) is full siblings (i.e., siblings sharing both biological parents). A value of 0 means "self", indicating that this person is the ancestor of the other person. For example, (0, 1) means that id1 is the parent of id2. In another example, (2, 0) means that id1 is the grandchild of id2.

In [None]:
# Confirm that the genealogical_relationship column was added and check the dtypes.
updated_relationships.info()

In [None]:
# Summary statistics of the numeric columns.
updated_relationships.describe()

In the next cell, we will add column meiotic_distance. Meiotic distance is the number of meioses separating the members of the pair. It is calculated by summing the values in the genealogical_relationship tuple.

In [None]:
import ast

# Parse the stringified tuples back into Python objects. Values that are no longer
# strings are left as they are, so re-running this cell does not fail.
updated_relationships['genealogical_relationship'] = updated_relationships['genealogical_relationship'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# Meiotic distance = sum of the two tuple entries; non-tuple values get an empty string.
updated_relationships['meiotic_distance'] = updated_relationships['genealogical_relationship'].apply(lambda x: sum(x) if isinstance(x, tuple) else "")

updated_relationships.head()

In [None]:
# Save the table to the results directory. Note that the file is tab-separated despite the .csv extension.
updated_relationships.to_csv(os.path.join(results_directory, "relationships.csv"), sep="\t", index=False)

# Manually confirm that your relationships.csv file is in your results directory.

In [None]:
# Summary statistics again, after the meiotic_distance column was added.
updated_relationships.describe()

In [None]:
# Number of pairs per relationship tuple, followed by the number of distinct tuples.
print(updated_relationships['genealogical_relationship'].value_counts())
print("\n")
print(f"There are {len(updated_relationships['genealogical_relationship'].unique())} different relationship groups among the sampled individuals in this pedigree.")

Let's explore some descriptives of DNA sharing (min 3 cM segment) at the various relationship levels.

In [None]:
# Drop the identifier columns and the columns not summarised here, then compute
# count / mean / std / min / max of every remaining column per relationship tuple.
updated_relationships.drop([
    "id1", "id2",
    "second_seg", "third_seg",
    "meiotic_distance"
    ], axis=1).groupby("genealogical_relationship").agg(['count', 'mean', 'std', 'min', 'max'])

### Additional Exploration

In [None]:
# Read the saved (tab-separated) file as a Pandas dataframe, replacing the previously saved `relationships` value.
# NOTE(review): after the round trip through the file, genealogical_relationship is presumably text again rather than tuples — confirm with .info().
relationships = pd.read_csv(os.path.join(results_directory, "relationships.csv"), sep = "\t")

relationships.head()

In [None]:
# Check the columns and dtypes of the reloaded table.
relationships.info()

In [None]:
# One point per pair: longest shared segment (x) against meiotic distance (y).
sns.scatterplot(data=relationships, y='meiotic_distance', x='max_seg')
plt.xlabel('Max Segment Length')
plt.ylabel('Genealogical Distance')
plt.title('Scatter Plot: Genealogical Distance vs Max Segment Length')
plt.show()

In [None]:
# One point per pair: number of shared segments (x) against meiotic distance (y).
sns.scatterplot(data=relationships, y='meiotic_distance', x='num_seg')
plt.xlabel('Number of Segments')
plt.ylabel('Genealogical Distance')
plt.title('Scatter Plot: Genealogical Distance vs Number of Segments')
plt.show()

In [None]:
# 2-D density of longest shared segment (x) against meiotic distance (y).
sns.kdeplot(data=relationships, x='max_seg', y='meiotic_distance', cmap='viridis', fill=True)
plt.xlabel('Max Segment Length')
# The y values are meiotic distances (number of meioses), not generations.
plt.ylabel('Meiotic Distance (meioses)')
plt.title('Density Plot: Max Segment Length vs Meiotic Distance')

# Set the y-axis tick labels based on the range of meiotic_distance
gg_distance_values = sorted(relationships['meiotic_distance'].unique())
plt.yticks(gg_distance_values, gg_distance_values)
plt.show()

In [None]:
# 2-D density of number of shared segments (x) against meiotic distance (y).
sns.kdeplot(data=relationships, x='num_seg', y='meiotic_distance', cmap='viridis', fill=True)
plt.xlabel('Number of Segments')
# The y values are meiotic distances (number of meioses), not generations.
plt.ylabel('Meiotic Distance (meioses)')
plt.title('Density Plot: Number of Segments vs Meiotic Distance')

# Set the y-axis tick labels based on the range of meiotic_distance
gg_distance_values = sorted(relationships['meiotic_distance'].unique())
plt.yticks(gg_distance_values, gg_distance_values)
plt.show()

In [None]:
# Get the average segment size for each pair: total shared length divided by the number of segments.
# This adds a column to `relationships` in place.
relationships["average_segment_size"] = relationships["total_shared"] / relationships["num_seg"]

relationships.head()

Let's take a look at the average segment size for each meiotic distance.

In [None]:
# Mean of the per-pair average segment size at each meiotic distance.
relationships.groupby('meiotic_distance')['average_segment_size'].mean()

Notice that the average segment size decreases as the meiotic distance between members of a pair increases.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 2-D density of per-pair average segment size (x) against meiotic distance (y).
sns.kdeplot(
    data=relationships,
    x='average_segment_size',
    y='meiotic_distance',
    cmap='viridis',
    fill=True
)

plt.xlabel('Average Segment Size')
# The y values are meiotic distances (number of meioses), not generations.
plt.ylabel('Meiotic Distance (meioses)')
plt.title('KDE Plot: Average Segment Size vs Meiotic Distance')
plt.show()

You should notice that, overall, the measures we have used in our exploration are not reliable predictors of genetic genealogical relationships beyond 1 or 2 generations. For that, we turn to machine learning.