# 1. Import libraries

In [None]:
import pandas as pd
import numpy as np

# 2. Load metadata

In [None]:
# Per-cell metadata table including the lineage barcodes (tab separated text file)
df_metaData_with_lineage = pd.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_metaData_with_lineage.txt',
    sep="\t",
)

# Metadata exported from the Seurat object (R data file); only used here to retrieve the majority fate
df_metadata_seurat = pd.read_csv("/data/benchmarks/scRNAseq_persisters/metadata_seuratobject.csv")

# Attach the majority fate as a new column. The values are copied by position (as a plain list),
# they are not matched on a cell identifier.
# NOTE(review): this assumes both files list the cells in the same order - TODO confirm
df_metaData_with_lineage = df_metaData_with_lineage.assign(
    majority_fate=df_metadata_seurat['majority_fate'].tolist()
)
df_metaData_with_lineage

# 3. Analyzing metadata

### 3.1 Analysis quality of cells

In [None]:
# One boolean mask per quality criterion; the number of True values is the number of flagged cells
high_mito_mask = df_metaData_with_lineage['percent.mito'] > 0.1  # mitochondrial fraction above 0.1
few_genes_mask = df_metaData_with_lineage['nGene'] < 1000  # fewer than 1000 genes
many_genes_mask = df_metaData_with_lineage['nGene'] > 4200  # more than 4200 genes

print('The number of cells with >0.1 mitochondrial fraction is =', int(high_mito_mask.sum()))
print('The number of cells with <1000 genes is =', int(few_genes_mask.sum()))
print('The number of cells with >4200 genes is =', int(many_genes_mask.sum()))

Since there are no cells with a mitochondrial fraction >0.1, nor cells with <1000 or >4200 genes, it appears that this data was already preprocessed by Oren et al. (2021).

### 3.2 Analysis cell lineages

##### 3.2.1 Get number of cells without lineage and of cells with multiple lineages

In [None]:
# Count cells per lineage barcode status. The counts use the vectorised Series.sum()
# instead of the Python builtin sum(), which would iterate over the column element by element.
nr_cells_total = len(df_metaData_with_lineage)
lineage_barcode_column = df_metaData_with_lineage['lineage_barcode']

# cells with NaN in the lineage barcode column
nr_cells_no_barcode = int(lineage_barcode_column.isna().sum())
# multiple lineage barcodes are separated by a comma, so a comma identifies these cells (NaN counts as "no comma")
nr_cells_multiple_barcodes = int(lineage_barcode_column.str.contains(',', na=False).sum())

print('The total number of cells =',nr_cells_total)
print('The number of cells without a lineage barcode =',nr_cells_no_barcode, 'This is equal to', round((nr_cells_no_barcode/nr_cells_total)*100,1),'%')
print('The number of cells with multiple lineage barcodes =', nr_cells_multiple_barcodes,'This is equal to', round((nr_cells_multiple_barcodes/nr_cells_total)*100,1),'%')

##### 3.2.2 Get unique lineages per group

In [None]:
# --- All cells -------------------------------------------------------------
all_barcodes = df_metaData_with_lineage['lineage_barcode'].dropna()  # cells without a barcode (NaN) are ignored
all_individual_barcodes = all_barcodes.str.split(',').explode()  # one row per individual barcode of a cell
unique_individual_barcodes_total = all_individual_barcodes.unique()  # distinct individual barcodes

nr_lineages_total_split = len(unique_individual_barcodes_total)
nr_lineages_total_unsplit = all_barcodes.nunique()  # a comma separated combination counts as one lineage here

print("Nr. unique lineages in total:", nr_lineages_total_split)
print("Nr. unique lineages in total when lineages barcodes are not splitted:", nr_lineages_total_unsplit)
print("")


# --- Day 14 cells ----------------------------------------------------------
is_day14_cell = df_metaData_with_lineage['time_point'] == 14
df_metaData_day14_cells = df_metaData_with_lineage.loc[is_day14_cell]  # only the cells measured at day 14
barcodes_persister_cells = df_metaData_day14_cells['lineage_barcode'].dropna()  # ignore NaN values
individual_barcodes_persister_cells = barcodes_persister_cells.str.split(',').explode()  # one row per individual barcode
unique_individual_barcodes_persister_cells = individual_barcodes_persister_cells.unique()  # distinct individual barcodes

nr_lineages_persister_split = len(unique_individual_barcodes_persister_cells)
nr_lineages_persister_unsplit = barcodes_persister_cells.nunique()

print("Nr. unique lineages from persister cells:", nr_lineages_persister_split)
print("Nr. unique lineages from persister cells when lineages barcodes are not splitted:", nr_lineages_persister_unsplit)
print("")

# --- Cells with multiple lineage barcodes ----------------------------------
multi_barcode_indices = df_metaData_with_lineage['lineage_barcode'].str.contains(',', na=False)  # boolean mask, True for multi lineage cells
barcodes_multilineage_cells = df_metaData_with_lineage.loc[multi_barcode_indices, 'lineage_barcode']  # their (combined) barcodes
individual_barcodes_multilineage_cells = barcodes_multilineage_cells.str.split(',').explode()  # one row per individual barcode
unique_barcodes_multilineage_cells = individual_barcodes_multilineage_cells.unique()  # distinct individual barcodes

print("Nr. lineages from multi lineage cells:", len(unique_barcodes_multilineage_cells))
print("Nr. lineages from multi lineage cells when lineages barcodes are not splitted:", barcodes_multilineage_cells.nunique())
print("")

# --- Share of all lineages that is also observed at day 14 -----------------
print("Fraction of lineages that gave rise to persister cells:", nr_lineages_persister_split/nr_lineages_total_split)
print("Fraction of lineages that gave rise to persister cells (when barcodes are not splitted):", nr_lineages_persister_unsplit/nr_lineages_total_unsplit)

These numbers are very different from what is stated in the paper of Oren et al. (2021). They mentioned that there are 1,135 individual PC9 cell lineages and 77 persister lineages (= lineages that were still alive at day 14). They mentioned that only 8% of cell lineages gave rise to persisters, while in my analysis it is 63%.

##### 3.2.2.2a Check what lineages detected at day 14 are also present at day 0 

In [None]:
# Unique lineages of the day 0 cells
is_day0_cell = df_metaData_with_lineage['time_point'].eq(0)
df_metaData_day0_cells = df_metaData_with_lineage.loc[is_day0_cell]  # only the cells measured at day 0
barcodes_day0_cells = df_metaData_day0_cells['lineage_barcode'].dropna()  # ignore NaN values
individual_barcodes_day0_cells = barcodes_day0_cells.str.split(',').explode()  # one row per individual barcode
unique_individual_barcodes_day0_cells = individual_barcodes_day0_cells.unique()  # distinct individual barcodes

# Unique lineages of the day 3 cells
is_day3_cell = df_metaData_with_lineage['time_point'].eq(3)
df_metaData_day3_cells = df_metaData_with_lineage.loc[is_day3_cell]  # only the cells measured at day 3
barcodes_day3_cells = df_metaData_day3_cells['lineage_barcode'].dropna()  # ignore NaN values
individual_barcodes_day3_cells = barcodes_day3_cells.str.split(',').explode()  # one row per individual barcode
unique_individual_barcodes_day3_cells = individual_barcodes_day3_cells.unique()  # distinct individual barcodes

# Unique lineages of the day 7 cells
is_day7_cell = df_metaData_with_lineage['time_point'].eq(7)
df_metaData_day7_cells = df_metaData_with_lineage.loc[is_day7_cell]  # only the cells measured at day 7
barcodes_day7_cells = df_metaData_day7_cells['lineage_barcode'].dropna()  # ignore NaN values
individual_barcodes_day7_cells = barcodes_day7_cells.str.split(',').explode()  # one row per individual barcode
unique_individual_barcodes_day7_cells = individual_barcodes_day7_cells.unique()  # distinct individual barcodes

When not splitting the barcodes:

In [None]:
# Check whether every barcode of day 14 was also detected at day 0 - when not splitting the barcodes

# unique (unsplit) lineage barcodes of the day 14 and day 0 cells
list1 = barcodes_persister_cells.unique()
list2 = barcodes_day0_cells.unique()

# True only if every day 14 lineage also occurs among the day 0 lineages
print('Are the day14 lineages all present at day0:', set(list1) <= set(list2))

# day 14 lineages that are not detected in any day 0 cell (elements of list1 that are not in list2)
missing_elements = set(list1) - set(list2)
print('Number of day14 lineages that were not present at day0:', len(missing_elements))
print('Number of day14 lineages in total:', len(list1))

# The label announces the share of day 14 lineages that WERE detected at day 0, so report the
# complement of the missing lineages and express it as a percentage (0-100).
nr_day14_lineages_at_day0 = len(list1) - len(missing_elements)  # list1 holds unique values, so this is the overlap size
percentage_at_day0 = 100 * nr_day14_lineages_at_day0 / len(list1) if len(list1) else float('nan')  # nan when there are no day 14 lineages
print('So, the percentage of day14 lineages that were also detected at day0:', round(percentage_at_day0, 1))

When splitting the barcodes:

In [None]:
# Check whether every barcode of day 14 was also detected at day 0 - when splitting the barcodes

# unique individual (split) lineage barcodes of the day 14 and day 0 cells
list1 = unique_individual_barcodes_persister_cells
list2 = unique_individual_barcodes_day0_cells

# True only if every day 14 lineage also occurs among the day 0 lineages
print('Are the day14 lineages all present at day0:', set(list1) <= set(list2))

# day 14 lineages that are not detected in any day 0 cell (elements of list1 that are not in list2)
missing_elements = set(list1) - set(list2)
print('Number of day14 lineages that were not present at day0:', len(missing_elements))
print('Number of day14 lineages in total:', len(list1))

# The label announces the share of day 14 lineages that WERE detected at day 0, so report the
# complement of the missing lineages and express it as a percentage (0-100).
nr_day14_lineages_at_day0 = len(list1) - len(missing_elements)  # list1 holds unique values, so this is the overlap size
percentage_at_day0 = 100 * nr_day14_lineages_at_day0 / len(list1) if len(list1) else float('nan')  # nan when there are no day 14 lineages
print('So, the percentage of day14 lineages that were also detected at day0:', round(percentage_at_day0, 1))

Many lineage barcodes are detected at day 14 but not at day 0. This may be caused by sampling (not every lineage is captured at every time point), but it is still surprising that the number is this large.

In [None]:
# Determine the lineages that are present at all time points (barcodes are not split here)

# unique unsplit barcodes per time point, as sets so they can be intersected
unique_lineages_non_splitted_day0 = set(barcodes_day0_cells.unique())
unique_lineages_non_splitted_day3 = set(barcodes_day3_cells.unique())
unique_lineages_non_splitted_day7 = set(barcodes_day7_cells.unique())
unique_lineages_non_splitted_day14 = set(barcodes_persister_cells.unique())

# lineages that occur in every one of the four sets
lineages_at_all_times = (
    unique_lineages_non_splitted_day0
    & unique_lineages_non_splitted_day3
    & unique_lineages_non_splitted_day7
    & unique_lineages_non_splitted_day14
)

nr_unique_lineages_unsplit = len(all_barcodes.unique())

print("Number of unique lineages in total when lineages barcodes are not splitted:", nr_unique_lineages_unsplit)
print('Number of unique lineages present at all time points:',len(lineages_at_all_times))
# the label says "percentage", so scale the fraction to 0-100 before printing
print('So, the percentage of lineages that were detected at all days:', round(100 * len(lineages_at_all_times)/nr_unique_lineages_unsplit, 1))

##### 3.2.3 Check unique lineages of cells from replicate 1

In [None]:
# Restrict the analysis to replicate 1 to see whether that reproduces the number of lineages reported in the paper
rep2_indices = df_metaData_with_lineage['sample_name'].str.contains('rep2', na=False)  # boolean mask, True when the sample name contains 'rep2'
# keep every cell that is not flagged as replicate 2 (cells with a missing sample name are kept as well, because of na=False)
df_metaData_rep1 = df_metaData_with_lineage.loc[~rep2_indices]

# Unique lineages of the remaining cells
all_barcodes_rep1 = df_metaData_rep1['lineage_barcode'].dropna()  # ignore NaN values
all_individual_barcodes_rep1 = all_barcodes_rep1.str.split(',').explode()  # one row per individual barcode
unique_individual_barcodes_total_rep1 = all_individual_barcodes_rep1.unique()  # distinct individual barcodes

print("Nr. unique lineages:", len(unique_individual_barcodes_total_rep1))
print("Nr. unique lineages when lineages barcodes are not splitted:", all_barcodes_rep1.nunique())

Even when only the cells from a single replicate are considered, the number of lineages does not match the number stated in the paper.

##### 3.2.4 Check if there are cells with lineage that have no clone_size

In [None]:
# Are there cells with a lineage barcode but without a clone_size? (to check whether clone_size=NaN could mean that the cells are dead)
has_lineage_barcode = df_metaData_with_lineage['lineage_barcode'].notna()
filtered_df = df_metaData_with_lineage.loc[has_lineage_barcode]  # only the rows that have a lineage barcode
nr_cells_with_clone_size = filtered_df['clone_size'].count()  # count() skips NaN values
print('Nr. cells with lineage', len(filtered_df))
print('Nr. of cells from the ones that have a lineage that also have a clone_size which is not NaN:', nr_cells_with_clone_size)


All the cells with a lineage barcode also have a clone size (no NaN values), i.e. a clone size >= 1.

##### 3.2.5 Check lineages when we filter out cells with low clone size

In [None]:
# Keep only the cells whose clone_size is larger than the cut-off
# NOTE(review): the reason for a cut-off of 6 is not documented in this notebook - TODO confirm
clone_size_cutoff = 6.0
filtered_df_clone_size = df_metaData_with_lineage[df_metaData_with_lineage['clone_size'] > clone_size_cutoff]
# fraction of all cells that pass the clone size filter
print('multi-cellular prsister colonies', len(filtered_df_clone_size)/len(df_metaData_with_lineage))

# get unique barcodes
all_barcodes_filtered_clone_size = filtered_df_clone_size['lineage_barcode'].dropna() # neglect NaN values
all_individual_barcodes_filtered_clone_size = all_barcodes_filtered_clone_size.str.split(',').explode() # Split strings by comma and create a new series with individual sequences
unique_individual_barcodes_filtered_clone_size = all_individual_barcodes_filtered_clone_size.unique() # Find unique sequences

print("Nr. unique lineages in total:", len(unique_individual_barcodes_filtered_clone_size))
print("Nr. unique lineages in total when lineages barcodes are not splitted:",len(all_barcodes_filtered_clone_size.unique()))
print("")

# Get unique lineages of day 14 cells
filtered_df_clone_size_day14_cells = filtered_df_clone_size[filtered_df_clone_size['time_point']==14] # select only the day 14 cells
barcodes_filtered_persister_cells = filtered_df_clone_size_day14_cells['lineage_barcode'].dropna() # neglect NaN values
individual_barcodes_filtered_persister_cells = barcodes_filtered_persister_cells.str.split(',').explode() # split lineage barcodes for cells with multiple lineage barcodes
unique_individual_barcodes_filtered_persister_cells = individual_barcodes_filtered_persister_cells.unique() # get unique lineage barcodes

print("Nr. unique lineages from persister cells:", len(unique_individual_barcodes_filtered_persister_cells))
print("Nr. unique lineages from persister cells when lineages barcodes are not splitted:",len(barcodes_filtered_persister_cells.unique()))
print("")

# Get multi lineages
# NOTE: the four variables below reuse the names from the unfiltered analysis and overwrite those values
multi_barcode_indices = filtered_df_clone_size['lineage_barcode'].str.contains(',', na=False) # boolean mask of multi lineage cells
barcodes_multilineage_cells = filtered_df_clone_size[multi_barcode_indices]['lineage_barcode'] # get lineage barcodes of multi lineage cells
individual_barcodes_multilineage_cells = barcodes_multilineage_cells.str.split(',').explode() # split lineage barcodes for cells with multiple lineage barcodes
unique_barcodes_multilineage_cells = individual_barcodes_multilineage_cells.unique() # get unique lineage barcodes

print("Nr. lineages from multi lineage cells:", len(unique_barcodes_multilineage_cells))
print("Nr. lineages from multi lineage cells when lineages barcodes are not splitted:", len(barcodes_multilineage_cells.unique()))
print("")



# 4. Retrieve cell fate based on lineage barcode

In [None]:
copy_df = df_metaData_with_lineage.copy()  # work on a copy so the loaded metadata stays untouched

# Empty (all-NaN) column for the fate of the lineage at day 14.
# It is created with object dtype because it is filled with text labels: a float64 NaN column would
# have to be upcast on the first string assignment, which is deprecated in recent pandas versions.
copy_df['fate_day_14'] = np.full(len(copy_df), np.nan, dtype=object)

# Put the 'Multiple lineages' label in the fate_day_14 column for cells that have multiple (comma separated) lineage barcodes
multi_barcode_indices = df_metaData_with_lineage['lineage_barcode'].str.contains(',', na=False)
copy_df.loc[multi_barcode_indices, 'fate_day_14'] = 'Multiple lineages'

# Row labels of the day 14 cells per sample type
# NOTE(review): the mapping 14_high -> non cycling, 14_med -> moderate cyclers, 14_low -> cycling
# is taken from the variable names only - confirm against the sample annotation
index_non_cycling = df_metaData_with_lineage.index[df_metaData_with_lineage['sample_type']=='14_high']
index_moderate_cyclers = df_metaData_with_lineage.index[df_metaData_with_lineage['sample_type']=='14_med']
index_cycling = df_metaData_with_lineage.index[df_metaData_with_lineage['sample_type']=='14_low']

copy_df

In [None]:
# Show only the columns that are relevant for the fate assignment
copy_df[['time_point','sample_name','sample_type','lineage_barcode','fate_day_14']]

In [None]:
# Find barcodes of day 14 cells grouped per cell fate 
def get_unique_barcodes(df, indices_list):
    """Return the distinct lineage barcodes of the selected rows.

    Parameters
    ----------
    df : DataFrame that has a 'lineage_barcode' column.
    indices_list : row labels (used with ``df.loc``) of the cells to consider.

    Returns
    -------
    A pandas Index with the categories of the selected barcodes, i.e. every
    distinct non-missing barcode once. The barcode strings are used as stored;
    nothing is split on commas here.
    """
    selected_barcodes = df.loc[indices_list, 'lineage_barcode']
    # converting to a categorical and reading its categories yields the distinct values
    return selected_barcodes.astype('category').cat.categories

# Unique barcodes of the day 14 cells categorized as non-cycling, moderate cyclers and cycling (in that order)
barcodes_non_cycling, barcodes_moderate_cyclers, barcodes_cycling = (
    get_unique_barcodes(copy_df, group_indices)
    for group_indices in (index_non_cycling, index_moderate_cyclers, index_cycling)
)

In [None]:
# Barcodes shared by each pair of groups
common_noncycling_cycling = barcodes_non_cycling.intersection(barcodes_cycling)
common_noncycling_moderatecyclers = barcodes_non_cycling.intersection(barcodes_moderate_cyclers)
common_cycling_moderatecyclers = barcodes_cycling.intersection(barcodes_moderate_cyclers)

# Every barcode that occurs in more than one group --> multi fate lineages
multifate_barcodes = (
    common_noncycling_cycling
    .union(common_noncycling_moderatecyclers)
    .union(common_cycling_moderatecyclers)
)

# Barcodes that occur in exactly one group: drop the multi fate barcodes from each group.
# (Removing all multi fate barcodes from a group gives the same result as removing only the
# shared barcodes that involve that group, because the other ones are not in the group anyway.)
unique_barcodes_non_cycling = barcodes_non_cycling.difference(multifate_barcodes)
unique_barcodes_cycling = barcodes_cycling.difference(multifate_barcodes)
unique_barcodes_moderatecyclers = barcodes_moderate_cyclers.difference(multifate_barcodes)


In [None]:
# Boolean masks of the cells whose lineage barcode belongs to each group
all_non_cyclers_indices = df_metaData_with_lineage['lineage_barcode'].isin(unique_barcodes_non_cycling)
all_moderatecyclers_indices = df_metaData_with_lineage['lineage_barcode'].isin(unique_barcodes_moderatecyclers)
all_cyclers_indices = df_metaData_with_lineage['lineage_barcode'].isin(unique_barcodes_cycling)
all_multifate_indices = df_metaData_with_lineage['lineage_barcode'].isin(multifate_barcodes)

# enter fate in cell fate column
copy_df.loc[all_non_cyclers_indices, 'fate_day_14'] = 'Non-cycling'
copy_df.loc[all_moderatecyclers_indices, 'fate_day_14'] = 'Moderate cycler'
copy_df.loc[all_cyclers_indices, 'fate_day_14'] = 'Cycling'
copy_df.loc[all_multifate_indices, 'fate_day_14'] = 'Multi-fate'

# The barcode sets above are built from the stored barcode strings, so they also contain the comma
# separated combinations of day 14 cells with multiple barcodes. The assignments above therefore
# overwrite the 'Multiple lineages' label of such cells. Re-apply that label last so that every
# cell with more than one lineage barcode keeps it.
# NOTE(review): this assumes multi-barcode cells should never receive a fate label - confirm
cells_with_multiple_barcodes = df_metaData_with_lineage['lineage_barcode'].str.contains(',', na=False)
copy_df.loc[cells_with_multiple_barcodes, 'fate_day_14'] = 'Multiple lineages'

copy_df


In [None]:
# Show the columns relevant for the fate assignment, now with the fate labels filled in
copy_df[['time_point','sample_name','sample_type','lineage_barcode','fate_day_14']]

# Save data

In [None]:
# Write the annotated metadata as a tab separated text file (the row index is written as well)
copy_df.to_csv(
    path_or_buf='/data/benchmarks/scRNAseq_persisters/Processed_metaData_with_lineage.txt',
    sep="\t",
)