# 1. Import libraries

In [None]:
import pandas as pd
import numpy as np

# 2. Load metadata

In [None]:
# Read the tab-separated GEO metadata table (GSE150949 file) into a dataframe and display it
df_metaData_with_lineage = pd.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_metaData_with_lineage.txt',
    sep="\t",
)
df_metaData_with_lineage

In [None]:
# Read the metadata table that was exported from the Seurat object (comma-separated) and display it
df_metadata = pd.read_csv(
    "/data/benchmarks/scRNAseq_persisters/metadata_seuratobject.csv",
)
df_metadata

# 2. Analysis

### 2.1 Check alignment of the dataframes

In [None]:
# Compare the row labels of the GEO frame with the 'Unnamed: 0' column of the Seurat frame.
# Both are turned into plain lists, so the comparison is sensitive to order as well as content.
indices_agree = df_metaData_with_lineage.index.tolist() == df_metadata['Unnamed: 0'].tolist()
print("Indices match!" if indices_agree else "Indices do not match. Proceeding to align.")

In [None]:
# Compare the lineage_barcode columns of both frames as plain lists (entry by entry, in row order)
barcodes_agree = df_metaData_with_lineage['lineage_barcode'].tolist() == df_metadata['lineage_barcode'].tolist()
print("Lineage barcodes match!" if barcodes_agree else "Lineage barcodes do not match.")

In [None]:
# Investigate if/how many barcodes are the same for the metadata files.
# Rows are compared strictly by position: the index labels of both columns are dropped first,
# so the result does not depend on what either dataframe uses as its index.
geo_barcodes = df_metaData_with_lineage['lineage_barcode'].reset_index(drop=True)
seurat_barcodes = df_metadata['lineage_barcode'].reset_index(drop=True)

if len(geo_barcodes) != len(seurat_barcodes):
    raise ValueError(
        f"Cannot compare barcodes row by row: {len(geo_barcodes)} GEO rows "
        f"vs {len(seurat_barcodes)} Seurat rows."
    )

# Element-wise equality; a missing barcode (NaN) never compares equal, not even to another NaN,
# so rows without a barcode in both files are counted as "different" (same as a plain == check).
is_same = geo_barcodes.eq(seurat_barcodes)
both_missing = geo_barcodes.isna() & seurat_barcodes.isna()

same_barcodes = int(is_same.sum())
different_barcodes = int((~is_same).sum())
n_compared = same_barcodes + different_barcodes

# Guard against empty input instead of dividing by zero
print('ratio same:', same_barcodes / n_compared if n_compared else float('nan'))
# Reported separately so the ratio above can be interpreted correctly
print('rows without a barcode in both files (counted as different):', int(both_missing.sum()))
    

Interestingly, the lineage barcodes of the two dataframes do not align. Some cells that have 'NaN' as their lineage barcode in the dataframe obtained directly from the Gene Expression Omnibus (GEO) do have a lineage barcode in the metadata of the Seurat object (which was obtained from the GitHub page related to the paper).

I continued with aligning and combining the two dataframes, since the cell index is the same.

### 2.2 Combine metadata dataframes

In [None]:
# Add lineage barcodes of metadata of seurat object (from R data file) to the dataframe.
# The values are copied by row position, so first make sure that row i of both dataframes refers
# to the same cell; otherwise barcodes would silently be attached to the wrong cells.
if df_metaData_with_lineage.index.tolist() != df_metadata['Unnamed: 0'].tolist():
    raise ValueError(
        "Cell order differs between the GEO and Seurat metadata; align on cell ID before combining."
    )

df_metaData_with_lineage['lineage_barcode_Rdata'] = df_metadata['lineage_barcode'].tolist()
df_metaData_with_lineage

In [None]:
# Add majority fate of the cells retrieved from the metadata of seurat object (from R data file) to the dataframe.
# The values are copied by row position, so first make sure that row i of both dataframes refers
# to the same cell; otherwise fates would silently be attached to the wrong cells.
if df_metaData_with_lineage.index.tolist() != df_metadata['Unnamed: 0'].tolist():
    raise ValueError(
        "Cell order differs between the GEO and Seurat metadata; align on cell ID before combining."
    )

df_metaData_with_lineage['majority_fate'] = df_metadata['majority_fate'].tolist()
df_metaData_with_lineage

### 2.3 Check distribution of the data over specific groups

In [None]:
# Number of cells per majority_fate label (rows with a missing label are left out of the tally)
df_metaData_with_lineage['majority_fate'].value_counts(dropna=True)

In [None]:
# Number of cells per sample_type label (rows with a missing label are left out of the tally)
df_metaData_with_lineage['sample_type'].value_counts(dropna=True)

The classes are unevenly distributed: there are many more non-cycling cells than cycling cells or moderate cyclers. This may influence the majority fate of the lineages.

### 2.4 Check if the given majority fate is in line with the sample types of day 14 cells of that lineage

##### Use metadata file from GEO

In [None]:
# Number of cells per lineage barcode in the GEO metadata column (cells without a barcode are not tallied)
df_metaData_with_lineage['lineage_barcode'].value_counts(dropna=True)


In [None]:
# Majority-fate check for one lineage, selected via the GEO lineage_barcode column

# Keep only the cells that carry this lineage barcode
df_lineage_1 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode'].eq('AGTGTGTGAGACTCTCTGAGTCTGTGAGAG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_1_time14 = df_lineage_1.loc[df_lineage_1['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_1 = df_lineage_1_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_1 = df_lineage_1['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_1)
print(majority_fate_counts_lineage_1)

I selected a random lineage barcode (in this case it was the one with the highest prevalence) and checked the sample type of the day 14 cells with this lineage barcode. Most cells of this lineage were 14_low (= cycling); however, the majority fate of these cells is mostly non-cycling.
It is in any case remarkable that these cells do not all have the same majority fate (they share a lineage, so one would expect them to share a majority fate). This might be because the lineage barcodes of the Seurat object differ from the lineage barcodes in the metadata of the csv file. I should repeat the previous steps using the lineage barcodes of the Seurat object.

In [None]:
# Majority-fate check for a second lineage, again selected via the GEO lineage_barcode column

# Keep only the cells that carry this lineage barcode
df_lineage_2 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode'].eq('TGACAGTGTGTGTGTGTCACTGTCTGTGTG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_2_time14 = df_lineage_2.loc[df_lineage_2['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_2 = df_lineage_2_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_2 = df_lineage_2['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_2)
print(majority_fate_counts_lineage_2)

##### Use metadata file from Seurat object

In [None]:
# Number of cells per lineage barcode in the Seurat-derived column (cells without a barcode are not tallied)
df_metaData_with_lineage['lineage_barcode_Rdata'].value_counts(dropna=True)

In [None]:
# Majority-fate check for one lineage, selected via the Seurat-derived lineage_barcode_Rdata column

# Keep only the cells that carry this lineage barcode
df_lineage_1 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode_Rdata'].eq('AGTGTGTGAGACTCTCTGAGTCTGTGAGAG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_1_time14 = df_lineage_1.loc[df_lineage_1['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_1 = df_lineage_1_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_1 = df_lineage_1['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_1)
print(majority_fate_counts_lineage_1)

When using the lineage barcodes of the Seurat object, we see that the cells of the investigated lineage barcode are mostly 14_high (= non-cycling, assuming they have the same annotation as in the paper, where mCherry_high = non-cycling). The majority fate of those cells is 14_cycling for all of them, which indicates that the majority fate is indeed based on the lineages of the Seurat object. However, based on the most common sample_type (14_high), I would expect the majority fate to be non-cycling...

In [None]:
# Majority-fate check for another lineage, selected via the Seurat-derived lineage_barcode_Rdata column

# Keep only the cells that carry this lineage barcode
df_lineage_2 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode_Rdata'].eq('TGTGAGTCTCTCTCACACACACTCACTGAG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_2_time14 = df_lineage_2.loc[df_lineage_2['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_2 = df_lineage_2_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_2 = df_lineage_2['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_2)
print(majority_fate_counts_lineage_2)

In [None]:
# Majority-fate check for another lineage, selected via the Seurat-derived lineage_barcode_Rdata column

# Keep only the cells that carry this lineage barcode
df_lineage_2 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode_Rdata'].eq('ACTCACTCAGAGAGTGAGTGTCAGAGTGTG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_2_time14 = df_lineage_2.loc[df_lineage_2['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_2 = df_lineage_2_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_2 = df_lineage_2['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_2)
print(majority_fate_counts_lineage_2)

In [None]:
# Majority-fate check for another lineage, selected via the Seurat-derived lineage_barcode_Rdata column

# Keep only the cells that carry this lineage barcode
df_lineage_2 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode_Rdata'].eq('TGTCAGAGAGTCAGACTGTGAGTGTCTGTC')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_2_time14 = df_lineage_2.loc[df_lineage_2['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_2 = df_lineage_2_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_2 = df_lineage_2['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_2)
print(majority_fate_counts_lineage_2)

In [None]:
# Majority-fate check for another lineage, selected via the Seurat-derived lineage_barcode_Rdata column

# Keep only the cells that carry this lineage barcode
df_lineage_2 = df_metaData_with_lineage.loc[
    df_metaData_with_lineage['lineage_barcode_Rdata'].eq('TGACAGTGTGTGTGTGTCACTGTCTGTGTG')
]
# Of those, keep the rows whose time_point equals 14
df_lineage_2_time14 = df_lineage_2.loc[df_lineage_2['time_point'].eq(14)]

# Tally of sample_type among the time-point-14 cells of this lineage
sample_type_time14_counts_lineage_2 = df_lineage_2_time14['sample_type'].value_counts()
# Tally of majority_fate over all cells of this lineage
majority_fate_counts_lineage_2 = df_lineage_2['majority_fate'].value_counts()

print(sample_type_time14_counts_lineage_2)
print(majority_fate_counts_lineage_2)

The majority fate is not always equal to the most prevalent sample type. It looks like the definitions of 14_high and 14_low were switched (14_high should be non-cycling according to the paper). Additionally, there is at least one lineage for which the majority fate is cycling while the most prevalent sample type of the day 14 cells of that lineage is 14_med.

##### Conclusion: I cannot work out how the majority fate was derived. It looks like it is based on the lineage barcodes from the Seurat object, but it is not really in line with the most common sample type of the day 14 cells with that lineage barcode. Maybe the definitions of 14_high and 14_low were switched around, but even then there are cases where the majority fate is wrong for moderate cyclers. Additionally, why the lineage barcodes of the Seurat object differ from the ones in the GEO data file remains unclear to me.