# 1.0 Preprocessing

## Set up

In [1]:
from cell_viewer.utils.paths import here

%run {here("notebooks", "0.2-jvs-before_to_start.ipynb")}

## Libraries

In [2]:
import janitor
import numpy as np
import pandas as pd
import scipy.stats as ss
from cell_viewer.UJ import UJExperiment


## Load data

### Data directory

In [3]:
UJ_dir = data_raw_dir("uJ_data")
UJ_dir.exists()

True

### Read

In [4]:
full_lineages_df = pd.concat(
    (
        pd.read_csv(filepath_or_buffer=file).add_column(
            column_name="file_id", value=file
        )
        for file in UJ_dir.rglob("*_lineages_all.csv")
    )
)

full_lineages_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330067 entries, 0 to 4498
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   lineageID       330067 non-null  int64  
 1   trackID         330067 non-null  object 
 2   cellID          330067 non-null  float64
 3   motherID        330067 non-null  float64
 4   frame           330067 non-null  int64  
 5   roiID           330067 non-null  object 
 6   length          330067 non-null  float64
 7   division        330067 non-null  int64  
 8   state           188127 non-null  float64
 9   tracking_score  164490 non-null  float64
 10  GFP             330067 non-null  float64
 11  DsRed           330067 non-null  float64
 12  file_id         330067 non-null  object 
 13  dead            141940 non-null  float64
dtypes: float64(8), int64(3), object(3)
memory usage: 37.8+ MB


## Preprocessing

### Experiment specifications

In [5]:
ANTIBIOTIC_FRAMES = pd.DataFrame(
    dict(
        experiment_id = ["20190325_Bruno_pBGT_quimiostato", "20190828_Bruno_MGGT_quimiostato"],
        time_unit = 10,
        centered_antibiotic_start_frame = [6, 6],
        centered_antibiotic_end_frame = [14, 10]
    )
)

ANTIBIOTIC_FRAMES

Unnamed: 0,experiment_id,time_unit,centered_antibiotic_start_frame,centered_antibiotic_end_frame
0,20190325_Bruno_pBGT_quimiostato,10,6,14
1,20190828_Bruno_MGGT_quimiostato,10,6,10


### Clean names and center frames by experiment

In [6]:
processed_lineages_df = (
    full_lineages_df
    .clean_names(case_type="snake")
    .assign(
        experiment_id=lambda df: df.file_id.apply(lambda file: file.relative_to(UJ_dir).parts[1]),
        trap_id=lambda df: df.file_id.astype("str").str.extract(r"(xy\d+)")
    )
    .groupby("experiment_id")
    .apply(lambda group: group.assign(centered_frame=lambda x: x.frame - x.frame.min()))
    .reset_index(drop=True)
    .merge(ANTIBIOTIC_FRAMES, on="experiment_id")
    .assign(time=lambda df: df.centered_frame * df.time_unit)
    .drop(columns=["file_id", "state", "dead", "time_unit"])
)

processed_lineages_df.head()

Unnamed: 0,lineage_id,track_id,cell_id,mother_id,frame,roi_id,length,division,tracking_score,gfp,ds_red,experiment_id,trap_id,centered_frame,centered_antibiotic_start_frame,centered_antibiotic_end_frame,time
0,1,20.038-44.000,20.038,20.038,20,roi_f20_n38_x463_y290,28.071253,0,171.055377,202.571,98.776,20190325_Bruno_pBGT_quimiostato,xy01,0,6,14,0
1,1,20.038-44.000,21.025,20.038,21,roi_f21_n25_x465_y290,27.384603,0,254.229721,192.205,98.439,20190325_Bruno_pBGT_quimiostato,xy01,1,6,14,10
2,1,20.038-44.000,22.024,20.038,22,roi_f22_n24_x466_y290,27.714517,0,264.486122,188.813,98.337,20190325_Bruno_pBGT_quimiostato,xy01,2,6,14,20
3,1,20.038-44.000,23.026,20.038,23,roi_f23_n26_x467_y291,28.855827,0,261.012088,186.514,98.405,20190325_Bruno_pBGT_quimiostato,xy01,3,6,14,30
4,1,20.038-44.000,24.025,20.038,24,roi_f24_n25_x467_y291,29.005384,0,206.85216,188.081,98.698,20190325_Bruno_pBGT_quimiostato,xy01,4,6,14,40


### Create control stats table

In [7]:
control_stats_df = (
    processed_lineages_df
    .filter_on("centered_frame < centered_antibiotic_start_frame")
    .select_columns(["experiment_id", "length", "gfp", "ds_red"])
    .groupby("experiment_id")
    .aggregate([
        ("mean", "mean"),
        ("median", "median"),
        ("std", "std"),
        ("skew", "skew"),
        ("variation", ss.variation)
    ])
)

control_stats_df

Unnamed: 0_level_0,length,length,length,length,length,gfp,gfp,gfp,gfp,gfp,ds_red,ds_red,ds_red,ds_red,ds_red
Unnamed: 0_level_1,mean,median,std,skew,variation,mean,median,std,skew,variation,mean,median,std,skew,variation
experiment_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
20190325_Bruno_pBGT_quimiostato,28.124802,25.8339,11.389836,3.446234,0.404973,268.202351,258.871,62.186659,1.616867,0.231863,98.043428,98.017,0.32117,0.552858,0.003276
20190828_Bruno_MGGT_quimiostato,26.041063,25.005222,6.450021,1.406127,0.247676,121.46576,120.51,3.17663,1.923072,0.026151,107.710425,107.656,0.422095,0.749905,0.003919


### Create columns for filamentation thresholds

In [8]:
FILAMENTATION_THRESHOLDS = (
    control_stats_df["length"]
    .apply(lambda df: df["mean"] + 2 * df["std"], axis=1)
    .reset_index(name="filamentation_threshold")
)

FILAMENTATION_THRESHOLDS

Unnamed: 0,experiment_id,filamentation_threshold
0,20190325_Bruno_pBGT_quimiostato,50.904474
1,20190828_Bruno_MGGT_quimiostato,38.941106


In [9]:
final_processed_df = (
    processed_lineages_df
    .merge(FILAMENTATION_THRESHOLDS, on="experiment_id")
    .assign(filamentaded_at_frame=lambda df: df.length >= df.filamentation_threshold)
    .assign(
        filamentaded_lineage = lambda df: 
            df.groupby(["experiment_id", "trap_id", "lineage_id"])["filamentaded_at_frame"].transform("any"),
        filamentaded_track = lambda df:
            df.groupby(["experiment_id", "trap_id", "track_id"])["filamentaded_at_frame"].transform("any"),
        filamentaded_cell = lambda df:
            df.groupby(["experiment_id", "trap_id", "cell_id"])["filamentaded_at_frame"].transform("any")
    )
)

final_processed_df

Unnamed: 0,lineage_id,track_id,cell_id,mother_id,frame,roi_id,length,division,tracking_score,gfp,...,trap_id,centered_frame,centered_antibiotic_start_frame,centered_antibiotic_end_frame,time,filamentation_threshold,filamentaded_at_frame,filamentaded_lineage,filamentaded_track,filamentaded_cell
0,1,20.038-44.000,20.038,20.038,20,roi_f20_n38_x463_y290,28.071253,0,171.055377,202.571,...,xy01,0,6,14,0,50.904474,False,False,False,False
1,1,20.038-44.000,21.025,20.038,21,roi_f21_n25_x465_y290,27.384603,0,254.229721,192.205,...,xy01,1,6,14,10,50.904474,False,False,False,False
2,1,20.038-44.000,22.024,20.038,22,roi_f22_n24_x466_y290,27.714517,0,264.486122,188.813,...,xy01,2,6,14,20,50.904474,False,False,False,False
3,1,20.038-44.000,23.026,20.038,23,roi_f23_n26_x467_y291,28.855827,0,261.012088,186.514,...,xy01,3,6,14,30,50.904474,False,False,False,False
4,1,20.038-44.000,24.025,20.038,24,roi_f24_n25_x467_y291,29.005384,0,206.852160,188.081,...,xy01,4,6,14,40,50.904474,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330062,31,30.012-34.005,33.010,30.009,33,roi_f33_n10_x134_y213,57.083544,0,10.173721,119.925,...,xy01,23,6,10,230,38.941106,True,True,True,True
330063,31,30.012-34.005,34.005,30.009,34,roi_f34_n5_x137_y216,55.666790,0,,120.113,...,xy01,24,6,10,240,38.941106,True,True,True,True
330064,30,32.010-34.009,32.010,27.009,32,roi_f32_n10_x129_y174,113.085539,1,127.863529,120.030,...,xy01,22,6,10,220,38.941106,True,True,True,True
330065,30,32.010-34.009,33.013,32.005,33,roi_f33_n13_x116_y208,29.574817,0,229.450020,119.182,...,xy01,23,6,10,230,38.941106,False,True,True,False


### Save to file

In [10]:
final_processed_df.to_csv(
    path_or_buf=data_processed_dir("processed_lineages.tsv"),
    index=False,
    sep="\t"
)