# Perform lineage tracking

In [99]:
import cv2
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy

import skimage
from skimage import measure, transform # to get contours from masks

# btrack module and configuration file
import btrack
from btrack.dataio import localizations_to_objects
from btrack.constants import BayesianUpdates
from btrack.render import plot_tracks

import re
import imagecodecs
import napari

# 1. Load segmentation and tracking results for {Whole Cell, ana}

In [104]:
os.chdir("D:\Hugo\Anaphase/Inter_Div_Correlation\H449.1")  # root dir containg Predictions and Images Path

# read images from dir
def alphanumeric_sort( l ): 
    """ Sort the given iterable in the way that humans expect.""" 
    convert = lambda text: int(text) if text.isdigit() else text 
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(l, key = alphanum_key)

# predictions
wc_prediction_path = "Sd32_H449.1_f0_1-400_modified.tif"
wc_tracking_path = "H449.1_f0_1-400.csv"
wc_contours_path = "H449.1_f0_1-400.npy"
ana_prediction_path = "BS120.291_H449.1_f0_1-400_modified1.tif"
ana_tracking_path = "H449.1_f0_1-400_anaphase.csv"
ana_contours_path = "H449.1_f0_1-400_anaphase.npy"

corresponding_imgs_path = ["D:\Hugo\Data\H449.1/f0_RFP", "D:\Hugo\Data\H449.1/f0_BF"]

wc_masks = imageio.volread(wc_prediction_path)
wc_tracking = pd.read_csv(wc_tracking_path)
wc_contours = np.load(wc_contours_path, allow_pickle=True)
ana_masks = imageio.volread(ana_prediction_path)
ana_tracking = pd.read_csv(ana_tracking_path)
ana_contours = np.load(ana_contours_path, allow_pickle=True)

wc_tracking["Contours"] = wc_contours
ana_tracking["Contours"] = ana_contours

wc_tracking["Parent"] = wc_tracking["ID"]  # default parent value is ID.

bf, target_shape, max_id = [], (512, 512), 400
for i, im in zip(range(max_id), alphanumeric_sort(os.listdir(corresponding_imgs_path[1]))):
    bf.append(cv2.resize(imageio.imread(os.path.join(corresponding_imgs_path[1], im)), target_shape))
rfp, target_shape, max_id = [], (512, 512), 400
for i, im in zip(range(max_id), alphanumeric_sort(os.listdir(corresponding_imgs_path[0]))):
    rfp.append(cv2.resize(imageio.imread(os.path.join(corresponding_imgs_path[0], im)), target_shape))
imgs = np.concatenate([np.expand_dims(bf, axis=-1), np.expand_dims(rfp, axis=-1)], axis=-1)

if imgs.shape[1] != wc_masks.shape[1] or imgs.shape[2] != wc_masks.shape[2]:
    imgs = skimage.transform.resize(imgs, (imgs.shape[0], wc_masks.shape[1], wc_masks.shape[2]))
if ana_masks.shape[1] != wc_masks.shape[1] or ana_masks.shape[2] != wc_masks.shape[2]:
    ana_masks = skimage.transform.resize(ana_masks, (ana_masks.shape[0], wc_masks.shape[1], wc_masks.shape[2]))
    
print(wc_masks.shape, ana_masks.shape, imgs.shape)

# Cast to 8-bit depth images prior to thresholding
wc_masks_8b = ((wc_masks - wc_masks.min(axis=(0, 1))) / (wc_masks.max(axis=(0, 1)) - wc_masks.min(axis=(0, 1)))).astype("uint8")
ana_masks_8b = ((ana_masks - ana_masks.min(axis=(0, 1))) / (ana_masks.max(axis=(0, 1)) - ana_masks.min(axis=(0, 1)))).astype("uint8")

(400, 512, 512) (400, 512, 512) (400, 512, 512, 2)


  wc_masks_8b = ((wc_masks - wc_masks.min(axis=(0, 1))) / (wc_masks.max(axis=(0, 1)) - wc_masks.min(axis=(0, 1)))).astype("uint8")
  ana_masks_8b = ((ana_masks - ana_masks.min(axis=(0, 1))) / (ana_masks.max(axis=(0, 1)) - ana_masks.min(axis=(0, 1)))).astype("uint8")


In [105]:
# clean a bit the noise
for idx in wc_tracking["ID"].unique():
    if wc_tracking[wc_tracking["ID"] == idx].shape == 1:
        wc_tracking = wc_tracking.drop(wc_tracking[wc_tracking["ID"] == idx].index)

In [119]:
def visualize_data_and_predictions(bf, predictions1, predictions2, nc_ims=1, nc_masks=1):
    
    if nc_ims == 1:
        viewer = napari.view_image(bf[:, :, :])
    else:
        viewer = napari.view_image(bf[:, :, :, 0])  # bf
        for k in range(1, nc_ims):
            viewer.add_image(bf[:, :, :, k], blending="additive")
    
    if predictions1 is not None:
        pred1, track1 = predictions1
        if nc_masks == 1:
            viewer.add_labels(pred1[:, :, :], blending="additive")#, colormap="blue")
        else:
            cmaps = ["bop blue", "red", "bop_orange", "blue", "bop purple"]
            for k in range(0, nc_masks):
                viewer.add_labels(pred1[:, :, :, k], blending="additive")#, colormap=cmaps[k])
        viewer.add_tracks(track1.drop(["Contours", "Parent", "Anaphase_Frame"], axis=1))
    
    if predictions2 is not None:
        pred2, track2 = predictions2
        if nc_masks == 1:
            viewer.add_labels(pred2[:, :, :], blending="additive")#, colormap="bop purple")
        else:
            cmaps = ["bop blue", "red", "bop_orange", "blue", "bop purple"]
            for k in range(0, nc_masks):
                viewer.add_labels(pred2[:, :, :, k], blending="additive")#, colormap=cmaps[len(cmaps) - 1 - k])
        viewer.add_tracks(track2.drop(["Contours"], axis=1))
    

visualize_data_and_predictions(imgs, (wc_masks, wc_tracking), (ana_masks, ana_tracking), nc_ims=2, nc_masks=1)

In [118]:
print(wc_tracking.columns)

Index(['ID', 'Frame', 'X', 'Y', 'Contours', 'Parent', 'Anaphase_Frame'], dtype='object')


# 2. Link the tracking results between both modules

For each anading event (considered as a true positive), we want to retrieve the two maximum matching whole cell segmentation : they are the mother and daughter cell.

In [115]:
def get_relative_surface(mom_contours, dau_contours):
    mom_surf = cv2.contourArea(mom_contours)
    if dau_contours is None or dau_contours.size == 0:
        return mom_surf, np.nan, np.nan
    else:
        daugh_surf = cv2.contourArea(dau_contours)
    return mom_surf, daugh_surf, daugh_surf / mom_surf

def get_relative_perimeter(mom_contours, dau_contours):
    mom_per = cv2.arcLength(mom_contours, closed=True)
    if dau_contours is None or dau_contours.size == 0:
        return mom_per, np.nan, np.nan
    else:
        daugh_per = cv2.arcLength(dau_contours, closed=True)
    return mom_per, daugh_per, daugh_per / mom_per

def get_eccentricity(mom_ellipsis, dau_ellipsis):
    mom_ecc = mom_ellipsis[1][0] / mom_ellipsis[1][1]  # c'est vraiment de la daube opencv : width / hieght
    if dau_ellipsis is None:
        return mom_ecc, np.nan, np.nan
    else:
        daugh_ecc = dau_ellipsis[1][0] / dau_ellipsis[1][1]
    return mom_ecc, daugh_ecc, daugh_ecc / mom_ecc

def get_circularity(mom_contours, dau_contours):
    mom_circ = 4 * np.pi * cv2.contourArea(mom_contours) / (cv2.arcLength(mom_contours, closed=True) ** 2)
    if dau_contours is None or dau_contours.size == 0:
        return mom_circ, np.nan, np.nan
    else:
        daugh_circ = 4 * np.pi * cv2.contourArea(dau_contours) / (cv2.arcLength(dau_contours, closed=True) ** 2)
    return mom_circ, daugh_circ, daugh_circ / mom_circ

def get_inter_centroid_distances(mom_contours, dau_contours, mom_ellipsis, dau_ellipsis):
    M = cv2.moments(mom_contours)
    mom_cx, mom_cy = int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])
    if dau_contours is None or dau_contours.size == 0 or dau_ellipsis is None:
        return np.nan, np.nan
    else:
        M = cv2.moments(dau_contours)
        dau_cx, dau_cy = int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])
        icd = np.sqrt((dau_cx - mom_cx) ** 2 + (mom_cy - dau_cy) ** 2)
    return icd, icd / (2 * (mom_ellipsis[1][1] + dau_ellipsis[1][1]))

def get_barycenter(contours):
    return int(np.sum(contours[:, 0] / contours.shape[0])), int(np.sum(contours[:, 1] / contours.shape[0]))


def get_features(mom_df, dau_df, frame, f, time_step, movie_name):
    anaphase_now, time_to_anaphase = (f == frame),  frame - f
    
    try:
        mom_contours = mom_df[mom_df["Frame"] == f]["Contours"].to_numpy()[0].astype("float32")
    except:
        return {"ana_ID": ana_traj_id,
                "frame": f, "time": f * time_step,
                "mom_ID": np.nan, "daugh_ID": np.nan, "mom_x": np.nan, "mom_y": np.nan, "daugh_x": np.nan, "daugh_y": np.nan, "mom_surf": np.nan, "daugh_surf": np.nan, "relat_surf": np.nan, 
                "mom_per": np.nan, "daugh_per": np.nan, "relat_per": np.nan, "mom_ecc": np.nan, "daugh_ecc": np.nan, "relat_ecc": np.nan, "mom_circ": np.nan, "daugh_circ": np.nan, "relat_circ": np.nan,
                "inter_centroid_dist": np.nan, "relat_inter_centroid_distance": np.nan, 
                "anaphase": anaphase_now,
                "time_to_anaphase": time_to_anaphase,
                "movie_name": movie_name}
    try:
        dau_contours = dau_df[dau_df["Frame"] == f]["Contours"].to_numpy()[0].astype("float32")
    except:
        return {"ana_ID": ana_traj_id,
                "frame": f, "time": f * time_step,
                "mom_ID": np.nan, "daugh_ID": np.nan, "mom_x": np.nan, "mom_y": np.nan, "daugh_x": np.nan, "daugh_y": np.nan, "mom_surf": np.nan, "daugh_surf": np.nan, "relat_surf": np.nan, 
                "mom_per": np.nan, "daugh_per": np.nan, "relat_per": np.nan, "mom_ecc": np.nan, "daugh_ecc": np.nan, "relat_ecc": np.nan, "mom_circ": np.nan, "daugh_circ": np.nan, "relat_circ": np.nan,
                "inter_centroid_dist": np.nan, "relat_inter_centroid_distance": np.nan, 
                "anaphase": anaphase_now,
                "time_to_anaphase": time_to_anaphase,
                "movie_name": movie_name}
    
    mom_x, mom_y = get_barycenter(mom_contours)
    dau_x, dau_y = get_barycenter(dau_contours)
    mom_surf, dau_surf, relat_surf = get_relative_surface(mom_contours, dau_contours)
    mom_per, dau_per, relat_per = get_relative_perimeter(mom_contours, dau_contours)
    mom_ellipsis = cv2.fitEllipse(mom_contours)
    if dau_contours is not None and dau_contours.size != 0:
        dau_ellipsis = cv2.fitEllipse(dau_contours)
    else:
        dau_ellipsis = None
    mom_ecc, dau_ecc, relat_ecc = get_eccentricity(mom_ellipsis, dau_ellipsis)  # scale invariant : no relat
    mom_circ, dau_circ, relat_circ = get_circularity(mom_contours, dau_contours)  # scale invariant
    inter_centroid_distance, relat_inter_centroid_distance = get_inter_centroid_distances(mom_contours, dau_contours, mom_ellipsis, dau_ellipsis)  # scale by the sum of diametersr axis length of the daughter cell

    return {"ana_ID": ana_traj_id,
            "frame": f, "time": f * time_step,
            "mom_ID": mom_df["ID"].values[0], "daugh_ID": dau_df["ID"].values[0],
            "mom_x": mom_x, "mom_y": mom_y,  # the position keeps track of which cell it is
            "daugh_x": dau_x, "daugh_y": dau_y,
            "mom_surf": mom_surf, "daugh_surf": dau_surf, "relat_surf": relat_surf, 
            "mom_per": mom_per, "daugh_per": dau_per, "relat_per": relat_per, 
            "mom_ecc": mom_ecc, "daugh_ecc": dau_ecc, "relat_ecc": relat_ecc,
            "mom_circ": mom_circ, "daugh_circ": dau_circ, "relat_circ": relat_circ,
            "inter_centroid_dist": inter_centroid_distance, "relat_inter_centroid_distance": relat_inter_centroid_distance, 
            "anaphase": anaphase_now,
            "time_to_anaphase": time_to_anaphase,
            "movie_name": movie_name}

In [108]:
import scipy

wc_tracking["Parent"] = wc_tracking["ID"]  # default parent value is ID.
time_step = 1
movie_name = "H449.1_f0"

print(wc_tracking.shape)

anaphase_events_features = []  # event_ID, Frame, mom_ID, dau_ID, mom_features[t], dau_features[t] 

def distance_criterion(ana_object, whole_cell_objects):
    """
    To find the best match between whole cell objects and ana objects, computes the distance
    between the centroid of the ana object and all the whole cell objects at the same frame
    and takes the two closest whole cell objects.
    ana_object (pd.Series): ana point at one frame
    whole_cell_objects (pd.DataFrame): all the whole cell objects in a dataframe
    Return: the two best match indices
    """
    def euclidean_distance(x1, y1, x2, y2):
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    
    x, y, frame = ana_object["X"], ana_object["Y"], ana_object["Frame"]
    
    # get the whole cell tracklets at this frame and retrieve the two maximum matching points (e.g. the two closest)
    at_this_frame = whole_cell_objects[whole_cell_objects["Frame"] == frame]
    barycenters = at_this_frame[["X", "Y"]]
    distances = euclidean_distance(x, y, barycenters.values[:, 0], barycenters.values[:, 1])
    sorted_distances = np.sort(distances)
    
    if sorted_distances[0] > 20 or sorted_distances[1] > 20:
        return None
    
    min_idx, min_idx1 = np.argwhere(distances == sorted_distances[0])[0, 0], np.argwhere(distances == sorted_distances[1])[0, 0]
    
    closest_indices = at_this_frame.iloc[[min_idx, min_idx1]]["ID"].values # mom and daughter index
    
    return closest_indices

def intersection_over_union_criterion(ana_object, whole_cell_objects):
    """
    To find the best match between whole cell objects and ana objects, computes the intersection between
    the ana object and all the whole cell objects at the same frame and takes the largest overlaps.
    ana_object (pd.Series): ana point at one frame
    whole_cell_objects (pd.DataFrame): all the whole cell objects in a dataframe
    Return: the two best match indices
    """
    # get the whole cell tracklets at this frame and retrieve the two maximum matching points (e.g. the two closest)
    at_this_frame = whole_cell_objects[whole_cell_objects["Frame"] == frame]
    
    x, y = ana_object["X"], ana_object["Y"]
        
    def euclidean_distance(x1, y1, x2, y2):
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    
    def get_mask(series):
        mask = np.zeros((512, 512))
        if np.isnan(series["Contours"]).any():
            return mask
        if euclidean_distance(x, y, series["X"], series["Y"]) > 20:  # filter the candidates, avoid useles computing -> speed up
            return mask
        contours = series["Contours"].astype("uint8")
        mask[contours[:, 1], contours[:, 0]] = 1
        masked_image = scipy.ndimage.morphology.binary_fill_holes(mask)
        return masked_image
    
    # make masks array from contours for ana object
    ana_mask = get_mask(ana_object)

    # make masks for the whole cell objects also
    wc_masks = np.array([get_mask(row) for i, row in at_this_frame.iterrows()])
    
    # compute intersections
    intersections = np.array([np.sum(np.logical_and(ana_mask, m)) for m in wc_masks])
    unions = np.array([np.sum(np.logical_or(ana_mask, m)) for m in wc_masks])
    
    iou = (intersections + 1e-8) / (unions + 1e-8 )
        
    intersection_indices = np.argwhere(iou > 5e-3).squeeze(-1)  # more stringent criterion
    
    if intersection_indices.size < 2:
        print(intersection_indices)
        return None

    return at_this_frame.iloc[intersection_indices]["ID"].values


# distance or intersection
import time
criterion = "intersection"
t0 = time.time()

retrieved_mothers, non_retrieved = 0, 0
for i, ana_traj_id in enumerate(ana_tracking["ID"].unique()):
    ana_traj = ana_tracking[ana_tracking["ID"] == ana_traj_id].iloc[0]
    
    frame = ana_traj["Frame"]
        
    if ana_traj_id % 50 == 0:
        print(f"ana ID : {ana_traj_id}")
        
    # distance criterion
    if criterion == "distance":
        closest_indices = distance_criterion(ana_traj, wc_tracking)
    elif criterion == "intersection":
        closest_indices = intersection_over_union_criterion(ana_traj, wc_tracking)
                
    if closest_indices is None:
        non_retrieved += 1
        continue
    
    # now determine who is the mother and who is the daughter. # criterion: mother should be bigger and older
    candidates = [wc_tracking[wc_tracking["ID"] == closest_indices[0]], wc_tracking[wc_tracking["ID"] == closest_indices[1]]]
    
    # no contours at this frame
    if np.isnan(candidates[0][candidates[0]["Frame"] == frame]["Contours"].values[0]).any() or np.isnan(candidates[1][candidates[1]["Frame"] == frame]["Contours"].values[0]).any():
        non_retrieved += 1
        continue
        
    sizes = [cv2.contourArea(candidates[0][candidates[0]["Frame"] == frame]["Contours"].values[0].astype("float32")), cv2.contourArea(candidates[1][candidates[1]["Frame"] == frame]["Contours"].values[0].astype("float32"))]
    first_frames = [np.min(candidates[0]["Frame"].values), np.min(candidates[1]["Frame"].values)]
    older_idx, bigger_idx = np.argmin(first_frames), np.argmax(sizes)
    
    if older_idx == bigger_idx:  # agreement between the two criteria
        mother_idx = older_idx
        daughter_idx = 1 - older_idx
        
        # save features
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Parent"] = closest_indices[mother_idx]
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Anaphase_Frame"] = frame
        
        mom_df, dau_df = wc_tracking[wc_tracking["ID"] == closest_indices[mother_idx]], wc_tracking[wc_tracking["ID"] == closest_indices[daughter_idx]]
        for f in range(dau_df["Frame"].values[0], frame+5):  # collect features between apparition of daughter cell and anaphase frame  
            anaphase_events_features.append(get_features(mom_df, dau_df, frame, f, time_step, movie_name))

        
        print(f"Frame {frame}, Track {closest_indices[mother_idx]} mother of track {closest_indices[daughter_idx]}")
        retrieved_mothers += 1
    elif first_frames[0] == first_frames[1] and sizes[0] != sizes[1]:
        mother_idx = bigger_idx
        daughter_idx = 1 - bigger_idx
        
        # save features
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Parent"] = closest_indices[mother_idx]
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Anaphase_Frame"] = frame
        
        mom_df, dau_df = wc_tracking[wc_tracking["ID"] == closest_indices[mother_idx]], wc_tracking[wc_tracking["ID"] == closest_indices[daughter_idx]]
        for f in range(dau_df["Frame"].values[0], frame+5):  # collect features between apparition of daughter cell and anaphase frame  
            anaphase_events_features.append(get_features(mom_df, dau_df, frame, f, time_step, movie_name))
            
        print(f"Frame {frame}, Track {closest_indices[mother_idx]} mother of track {closest_indices[daughter_idx]}")
        retrieved_mothers += 1
    elif first_frames[0] != first_frames[1] and sizes[0] == sizes[1]:
        mother_idx = older_idx
        daughter_idx = 1 - older_idx
        
        # save features
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Parent"] = closest_indices[mother_idx]
        wc_tracking.loc[wc_tracking["ID"] == closest_indices[daughter_idx], "Anaphase_Frame"] = frame
        
        mom_df, dau_df = wc_tracking[wc_tracking["ID"] == closest_indices[mother_idx]], wc_tracking[wc_tracking["ID"] == closest_indices[daughter_idx]]
        anaphase_events_features.append(get_features(mom_df, dau_df, frame, f, time_step, movie_name))
        for f in range(dau_df["Frame"].values[0], frame+5):  # collect features between apparition of daughter cell and anaphase frame
            anaphase_events_features.append(get_features(mom_df, dau_df, frame, f, time_step, movie_name))
        
        print(f"Frame {frame}, Track {closest_indices[mother_idx]} mother of track {closest_indices[daughter_idx]}")
        retrieved_mothers += 1
    else:
        non_retrieved += 1
        continue  # mother and daughter could not be determined
            
n_trajectories_total = wc_tracking["ID"].unique().shape[0] - wc_tracking[wc_tracking["Frame"] == 0]["ID"].unique().shape[0] # number of cells that appeared = total # cells (including noise...) - initial # cells
print(f"Retrieved {retrieved_mothers} lineages over {n_trajectories_total}.")
print(f"Not retrieved {non_retrieved}.")

print(f"Duration: {round(time.time() - t0, 2)} sec.")

(56955, 6)
Frame 19, Track 25 mother of track 4
Frame 27, Track 16 mother of track 35
Frame 37, Track 17 mother of track 36
Frame 39, Track 26 mother of track 33
Frame 45, Track 21 mother of track 45
Frame 45, Track 23 mother of track 53
Frame 47, Track 10 mother of track 47
Frame 49, Track 28 mother of track 40
Frame 51, Track 8 mother of track 54
Frame 51, Track 13 mother of track 37
Frame 54, Track 9 mother of track 48
Frame 59, Track 19 mother of track 52
[40]
[40]
Frame 63, Track 22 mother of track 64
Frame 67, Track 7 mother of track 66
Frame 68, Track 20 mother of track 67
Frame 69, Track 24 mother of track 56
Frame 71, Track 15 mother of track 63
Frame 72, Track 12 mother of track 57
Frame 79, Track 2 mother of track 68
Frame 100, Track 14 mother of track 74
[26]
[23]
Frame 109, Track 25 mother of track 84
Frame 110, Track 17 mother of track 108
Frame 116, Track 27 mother of track 107
Frame 119, Track 28 mother of track 110
Frame 120, Track 23 mother of track 124
Frame 123, Tra

In [109]:
anaphase_events_df = pd.DataFrame(anaphase_events_features)

anaphase_events_df.head()

Unnamed: 0,ana_ID,frame,time,mom_ID,daugh_ID,mom_x,mom_y,daugh_x,daugh_y,mom_surf,...,daugh_ecc,relat_ecc,mom_circ,daugh_circ,relat_circ,inter_centroid_dist,relat_inter_centroid_distance,anaphase,time_to_anaphase,movie_name
0,1,0,0,25.0,4.0,113.0,211.0,126.0,219.0,189.970144,...,0.862119,1.133271,0.841267,0.86245,1.02518,15.264338,0.292929,False,19,H449.1_f0
1,1,1,1,25.0,4.0,113.0,213.0,127.0,220.0,191.970152,...,0.786918,1.02071,0.824286,0.771451,0.935902,14.764823,0.277835,False,18,H449.1_f0
2,1,2,2,25.0,4.0,113.0,212.0,125.0,219.0,152.910154,...,0.940186,1.283521,0.830594,0.78715,0.947696,13.892444,0.316435,False,17,H449.1_f0
3,1,3,3,25.0,4.0,113.0,212.0,127.0,220.0,192.970144,...,0.974455,1.251862,0.854552,0.893082,1.045087,15.264338,0.285596,False,16,H449.1_f0
4,1,4,4,25.0,4.0,113.0,213.0,126.0,219.0,161.930151,...,0.850709,1.094329,0.838256,0.775693,0.925365,14.317821,0.313404,False,15,H449.1_f0


In [110]:
print(wc_tracking.shape)

wc_tracking.drop("Contours", axis=1).to_csv("D:/Hugo/Anaphase/Inter_Div_Correlation/H449.1/H449.1_lineage_tracking.csv", sep=",", index=False)
anaphase_events_df.to_csv("D:/Hugo/Anaphase/Inter_Div_Correlation/H449.1/H449.1_anaphase_events.csv", sep=",", index=False)

(56955, 7)


# 3. Build lineage trees

In [111]:
anaphase_events_df = pd.read_csv("D:\Hugo\Anaphase\Inter_Div_Correlation\H449.1/H449.1_anaphase_events.csv", sep=",")
print(anaphase_events_df.shape)

(10196, 26)


In [113]:
import ete3

class Tree:
    
    def __init__(self, idx=None, left=None, right=None, parent=None, mom_ID=None, daugh_ID=None):
        self.idx = idx
        self.left = left
        self.right = right
        self.parent = parent
        
        self.mom_ID = mom_ID
        self.daugh_ID = daugh_ID
        
    def __str__(self):
        if self.left is None and self.right is None:
            return f"{self.idx} \\"
        return f"{self.idx}, left: {self.left}, right: {self.right}"
    
    def __repr__(self):
        if self.left is None and self.right is None:
            return f"{self.idx} \\"
        return f"{self.idx}, left: {self.left}, right: {self.right}"
        
    def get_newick(self):
        """
        Returns the tree structure in the Newick format (format 8) to import it
        in ete3 later.
        """
        if self.left is None and self.right is None:
            return f"{str(self.idx)}"
        elif self.left is None:
            return f"({self.right.get_newick()}){str(self.idx)}"
        elif self.right is None:
            return f"({self.left.get_newick()}){str(self.idx)}"
        return f"({self.left.get_newick()},{self.right.get_newick()}){str(self.idx)}"
    
    def write_newick(self, file_path):
        newick = self.get_newick()
        with open("file_path", "w") as ofl:
            ofl.write(newick)
            ofl.write(";\n")
    
    def get_height(self):
        if self.left is None and self.right is None:
            return 1
        elif self.left is None:
            return 1 + self.right.get_height()
        elif self.right is None:
            return 1 + self.left.get_height()
        return 1 + np.max((self.left.get_height(), self.right.get_height()))
    
    def get_nodes(self, acc=[]):
        acc.append(self.idx)
        if self.left is None and self.right is None:
            return acc
        elif self.left is None:
            return self.right.get_nodes(acc)
        elif self.right is None:
            return self.left.get_nodes(acc)
        return self.right.get_nodes(self.left.get_nodes(acc))
    
    def show(self):
        t = ete3.Tree(self.get_newick() + ";")
        ts = ete3.TreeStyle()
        ts.show_leaf_name = True
        def show_nodes_name(node):
            F = ete3.TextFace(node.name, tight_text=True)
            ete3.add_face_to_node(F, node, column=0, position="branch-right")
        ts.layout_fn = show_nodes_name
        t.show(tree_style=ts)
            
        
def get_previous_event(cell_idx, current_ana_idx):
    for ana_idx in anaphase_events_df[anaphase_events_df["mom_ID"] == cell_idx]["ana_ID"].unique()[::-1]:
        if ana_idx < current_ana_idx:
            return ana_idx
    return None
        
def get_next_event(cell_idx, current_ana_idx):
    for ana_idx in anaphase_events_df[anaphase_events_df["mom_ID"] == cell_idx]["ana_ID"].unique():
        if ana_idx > current_ana_idx:
            return ana_idx
    return None

def build_tree(current_ana_idx):
    
    if current_ana_idx is None:
        return None
        
    mom_idx = anaphase_events_df[anaphase_events_df["ana_ID"] == current_ana_idx]["mom_ID"].unique()
    dau_idx = anaphase_events_df[anaphase_events_df["ana_ID"] == current_ana_idx]["daugh_ID"].unique()
        
    if mom_idx.size > 0:
        mom_idx = mom_idx[0]
    else:
        raise KeyError("Mother cell not found in the anaphase dataframe!")
    if dau_idx.size > 0:
        dau_idx = dau_idx[0]
    else:
        raise KeyError("Mother cell not found in the anaphase dataframe!")
        
    if mom_idx in all_trees:
        left_tree = all_trees[mom_idx]
    else:
        left_tree = build_tree(get_next_event(mom_idx, current_ana_idx))
    if dau_idx in all_trees:
        right_tree = all_trees[dau_idx]
    else:
        right_tree = build_tree(get_next_event(dau_idx, current_ana_idx))
        
    parent = get_previous_event(mom_idx, current_ana_idx)
        
    if get_next_event(mom_idx, current_ana_idx) is None and get_next_event(dau_idx, current_ana_idx) is None:
        t = Tree(current_ana_idx, left=None, right=None, parent=parent, mom_ID=mom_idx, daugh_ID=dau_idx)
        all_trees[current_ana_idx] = t
        return t
    else:
        t = Tree(current_ana_idx, left_tree, right_tree, parent=parent, mom_ID=mom_idx, daugh_ID=dau_idx)
        all_trees[current_ana_idx] = t
        return t

In [114]:
all_trees, roots = {}, {}

for ana_idx in anaphase_events_df["ana_ID"].unique():
    
    if not ana_idx in all_trees:
        t = build_tree(ana_idx)
    else:
        t = all_trees[ana_idx]
        
    if t.parent is None:
        roots[ana_idx] = t

print(f"Total # division events : {len(all_trees.keys())}")
print(f"# root division events : {len(roots.keys())}")

with open("D:\Hugo\Anaphase\Inter_Div_Correlation\H449.1/division_events_lineages.nh", 'w') as ifl:
    for ana_idx, tree in roots.items():
        ifl.write(tree.get_newick())
        ifl.write(";\n")        

Total # division events : 238
# root division events : 116
