# 0. Import libraries & settings

In [None]:
import pandas as pd
import numpy as np
import polars as pl
import anndata as ad

import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

import os

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["figure.figsize"] = [6, 4.5]

## 0.1. Make a folder to save graphs

In [None]:
# Folder names: one parent folder for all figures, one sub-folder for the current trial
save_folder_figures = "final_figures"
save_folder_current_trial = save_folder_figures+"/n_pcs_20"

# Create both folders (relative to the current working directory) if they do not exist yet
os.makedirs(save_folder_figures, exist_ok=True)
os.makedirs(save_folder_current_trial, exist_ok=True)

# 1. Load data

## 1.1 Load metadata

In [None]:
# Read the tab-separated metadata table and display it
df_metaData_with_lineage = pd.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_metaData_with_lineage.txt',
    sep="\t",
)
df_metaData_with_lineage

In [None]:
# Load metadata file (tab-separated)
df_metaData_with_lineage = pd.read_csv('/data/benchmarks/scRNAseq_persisters/GSE150949_metaData_with_lineage.txt', sep="\t")

# Load metadata exported from the Seurat object (R data file) to retrieve the majority fate
df_metadata_seurat = pd.read_csv("/data/benchmarks/scRNAseq_persisters/metadata_seuratobject.csv")

# Add the majority fate to the metadata dataframe.
# The values are copied by position (via a plain list), not matched on the index.
# NOTE(review): this assumes both files list the cells in the same order - confirm.
df_metaData_with_lineage['majority_fate'] = df_metadata_seurat['majority_fate'].tolist()

In [None]:
# Display the metadata table, which now includes the 'majority_fate' column added in the previous cell
df_metaData_with_lineage

### 1.1.1 Analyzing metadata

In [None]:
# Count all cells, cells without a lineage barcode, and cells whose barcode field
# contains a comma (i.e. more than one barcode)
barcode_column = df_metaData_with_lineage['lineage_barcode']
nr_cells_total = len(df_metaData_with_lineage)
nr_cells_no_barcode = int(barcode_column.isnull().sum())
nr_cells_multiple_barcodes = int(barcode_column.str.contains(',', na=False).sum())

print('The total number of cells =',nr_cells_total)
print(
    'The number of cells without a lineage barcode =',
    nr_cells_no_barcode,
    'This is equal to ',
    round((nr_cells_no_barcode/nr_cells_total)*100,1),
    '%',
)
print(
    'The number of cells with multiple lineage barcodes =',
    nr_cells_multiple_barcodes,
    'This is equal to ',
    round((nr_cells_multiple_barcodes/nr_cells_total)*100,1),
    '%',
)

In [None]:
# Count the cells that fall outside each quality-control threshold
# (mitochondrial fraction > 0.1, fewer than 1000 genes, more than 4200 genes)
print(
    'The number of cells with >0.1 mitochondrial fraction is =',
    int((df_metaData_with_lineage['percent.mito'] > 0.1).sum()),
)
print(
    'The number of cells with <1000 genes is =',
    int((df_metaData_with_lineage['nGene'] < 1000).sum()),
)
print(
    'The number of cells with >4200 genes is =',
    int((df_metaData_with_lineage['nGene'] > 4200).sum()),
)

Since there are no cells with >0.1 mitochondrial fraction or with <1000 or >4200 genes, it looks like the data were already preprocessed by Oren et al. (2021).

### 1.1.2 Preprocessing metadata

In [None]:
# Work on a copy of the metadata and rename the day-14 labels
# ('14_high' / '14_med' / '14_low') to descriptive names to avoid confusion.
# The replacement is applied to every column of the dataframe, not only to sample_type.
copy_df = (
    df_metaData_with_lineage
    .copy()
    .replace('14_high', 'Non-cycling')
    .replace('14_med', 'Moderate_cyclers')
    .replace('14_low', 'Cycling')
)

## 1.2 Load count matrix data & create AnnData object

In [None]:
# Read the count matrix with polars (faster than pandas for this file)
df_pc9_count_matrix = pl.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_pc9_count_matrix.csv'
)

In [None]:
# Preview the first ten rows of the count matrix
df_pc9_count_matrix.head(n=10)

In [None]:
# Load the previously preprocessed AnnData object; it provides the predicted cell fates
adata_preprocessed = sc.read_h5ad(
    '/home/jolien/Notebooks/data/preprocessed_data_v2_with_predicted_class_v2.h5ad'
)
adata_preprocessed

# 2. Definition of function for data preprocessing

In [None]:
# Colours shared by the time-point entries of both palettes
_time_point_colours = {
    0: '#F560A6',  # Pink
    3: '#91307F',  # Purple
    7: '#2D0059',  # Dark purple
}

# Palette keyed by sample type: time points as strings, followed by the cell-fate categories
sample_type_palette = {str(day): colour for day, colour in _time_point_colours.items()}
sample_type_palette.update({
    'Cycling': '#1f77b4',  # Blue
    'Moderate_cyclers': '#ff7f0e',  # Orange
    'Non-cycling': '#2ca02c',  # Green
})

# Palette keyed by the integer time point
sample_type_palette_time = dict(_time_point_colours)
sample_type_palette_time[14] = '#5b5b5b'  # Grey

# Temporary helper only; keep the notebook namespace unchanged
del _time_point_colours

In [None]:
def scRNAseq_preprocessing(df_pc9_count_matrix, df_metaData, adata_preprocessed, list_subset_days, selection_column, remove_category, plot_str):
    """
    Build an AnnData object from the count matrix, select a subset of the cells and run the
    preprocessing, clustering and embedding pipeline on that subset.

    Inputs:
    - df_pc9_count_matrix: scRNA-seq count matrix with genes as rows. The first column holds the
      gene names; every other column is one cell (the column names are used as cell names).
    - df_metaData: dataframe of the metadata with the columns 'time_point', 'sample_type' and
      'majority_fate'. The columns are assigned to adata.obs as pandas Series, so they are
      aligned on the index (presumably the metadata index holds the cell names - TODO confirm).
    - adata_preprocessed: AnnData object whose .obs contains the column 'Predicted_cell_fate'.
    - list_subset_days: list of time points; only cells whose 'time_point' is in this list are kept.
    - selection_column: name of the adata.obs column that is used to remove cells.
    - remove_category: list of values of `selection_column`; cells with one of these values are removed.
    - plot_str: suffix appended to the file names of the saved figures.

    Output:
    - adata_subset: AnnData object of the selected cells, restricted to the selected highly-variable
      genes, with the layers 'original_data_normalized_all_genes', 'raw_count' and 'log_transformed',
      and with PCA, diffusion map, Louvain clusters, PAGA, force-directed graph and UMAP computed.

    Side effects:
    - Prints intermediate AnnData summaries, draws the PAGA graph and saves two PNG figures.
    - Reads the notebook-level variables `sample_type_palette`, `sample_type_palette_time`
      and `save_folder_current_trial`.
    """

    ## Prepare data
    gene_names = df_pc9_count_matrix[:, 0].to_list()                        # Extract gene names (=first column)
    df_pc9_count_matrix_without_genenames = df_pc9_count_matrix[:, 1:]      # Exclude first column which contains the gene names
    cell_names = df_pc9_count_matrix_without_genenames.columns              # Extract names of the cells
    numpy_count_matrix = df_pc9_count_matrix_without_genenames.to_numpy()   # Convert to a numpy matrix to enable conversion to AnnData object

    ## Create AnnData object (transpose so that cells are observations and genes are variables)
    adata = ad.AnnData(X=numpy_count_matrix.T,
                    var=pd.DataFrame(index=gene_names),
                    obs=pd.DataFrame(index=cell_names))
    print('Created AnnData object:', adata)

    ## Enter relevant metadata to the AnnData object
    # Time points as categorical (for plotting later on)
    adata.obs['time_point'] = df_metaData.time_point.astype('category')
    # Sample types as categorical (= time points for cells from day 0 - 7 and cell fate categories for day-14 cells)
    adata.obs['sample_type'] = df_metaData.sample_type.astype('category')
    print('adata with relevant metadata:',adata)
    # Majority fate of the lineages as categorical
    adata.obs['majority_fate'] = df_metaData.majority_fate.astype('category')

    ## Add predicted cell fate
    # Add predicted cell fate (from RF classification model) to the not yet preprocessed data
    adata.obs['Predicted_cell_fate'] = adata_preprocessed.obs['Predicted_cell_fate']
    # Add an empty column. It is created with object dtype because it is filled with labels below;
    # writing labels into a float (NaN) column is an incompatible-dtype assignment in pandas.
    adata.obs['Sample_type_supplemented_with_predictions'] = pd.Series(np.nan, index=adata.obs.index, dtype=object)
    # Create masks to retrieve the rows for the different subgroups
    mask_predicted_labels = adata.obs['time_point']!=14
    mask_real_label = adata.obs['time_point']==14
    # Day-14 cells get their sample type; cells of all other time points get the predicted cell fate
    adata.obs.loc[mask_predicted_labels,'Sample_type_supplemented_with_predictions'] = adata.obs.loc[mask_predicted_labels,'Predicted_cell_fate']
    adata.obs.loc[mask_real_label,'Sample_type_supplemented_with_predictions'] = adata.obs.loc[mask_real_label,'sample_type']

    ## Subset the cells - Select the cells of interest
    mask_subset1 = adata.obs['time_point'].isin(list_subset_days)
    adata_subset = adata[mask_subset1]                                       # select cells from the requested days
    mask_not_subset2 = adata_subset.obs[selection_column].isin(remove_category)
    # Take an explicit copy: the functions below modify the object in place, which on a view
    # would only work through an implicit copy.
    adata_subset = adata_subset[~mask_not_subset2].copy()
    print('Subset of data:', adata_subset)

    ## Only keep genes with at least 1 count
    sc.pp.filter_genes(adata_subset, min_counts=1)
    print('Filtered adata:', adata_subset)

    ## Normalize gene expression matrix with total UMI count per cell
    adata_subset.X = adata_subset.X.astype('float64')                       # Convert the main data matrix to float64, because normalization was not possible with int64 values
    sc.pp.normalize_per_cell(adata_subset, key_n_counts='n_counts_all')

    # Keep the normalized data of all genes before selecting highly variable genes
    adata_subset.layers["original_data_normalized_all_genes"] = adata_subset.X.copy()

    ## Select top 2000 highly-variable genes
    filter_result = sc.pp.filter_genes_dispersion(adata_subset.X,
                                                flavor='cell_ranger',
                                                n_top_genes=2000,
                                                log=False)
    # Subset the genes (explicit copy for the same reason as above)
    adata_subset = adata_subset[:, filter_result.gene_subset].copy()
    # Renormalize after filtering - making the total expression per cell equal across the dataset
    sc.pp.normalize_per_cell(adata_subset)

    ## Log transformation
    # Keep the normalized data before log transformation in .raw and in a layer
    adata_subset.raw = adata_subset
    adata_subset.layers["raw_count"] = adata_subset.raw.X.copy()
    # Log transformation: natural logarithm of (1 + X), so zeros stay finite
    sc.pp.log1p(adata_subset)
    # Keep log-transformed data before scaling
    adata_subset.layers["log_transformed"] = adata_subset.X.copy()
    # Scaling
    sc.pp.scale(adata_subset)
    print('After logtransformation:',adata_subset)

    ## PCA
    sc.tl.pca(adata_subset, svd_solver='arpack')
    ## Diffusion map, based on a neighbourhood graph of the first 20 PCs
    sc.pp.neighbors(adata_subset, n_neighbors=4, n_pcs=20)
    sc.tl.diffmap(adata_subset)
    # Calculate neighbors again, now based on the diffusion map
    sc.pp.neighbors(adata_subset, n_neighbors=10, use_rep='X_diffmap')

    ## Cell clustering
    # Run Louvain clustering
    sc.tl.louvain(adata_subset)

    ## Dimensionality reduction
    # PAGA graph construction
    sc.tl.paga(adata_subset, groups='louvain')
    plt.rcParams["figure.figsize"] = [6, 4.5]
    sc.pl.paga(adata_subset)
    # Calculate force-directed graph with PAGA graph as initial cluster position
    sc.tl.draw_graph(adata_subset, init_pos='paga', random_state=123) # Random seed to ensure consistency of plot for different runs
    # Calculate UMAP
    sc.tl.umap(adata_subset,random_state=123)

    ## Visualization
    # Folder the figures are written to; create it if needed so that savefig cannot fail on a missing directory
    figure_dir = os.path.join("/home/jolien/Notebooks/data_preprocessing/subset/", save_folder_current_trial)
    os.makedirs(figure_dir, exist_ok=True)

    # Force-directed graph (PAGA-initialised), one panel per grouping
    fig, axes = plt.subplots(1, 4, figsize=(20, 5))
    sc.pl.draw_graph(adata_subset, color='louvain', legend_loc='on data', ax=axes[0], show=False)                       # Louvain clusters, labels drawn on the data
    sc.pl.draw_graph(adata_subset, color='time_point', ax=axes[1], palette=sample_type_palette_time, show=False)        # Time point
    sc.pl.draw_graph(adata_subset, color='sample_type', ax=axes[2], palette=sample_type_palette, show=False)            # Sample type
    sc.pl.draw_graph(adata_subset, color='Predicted_cell_fate', ax=axes[3], palette=sample_type_palette, show=False)    # Predicted cell fate
    # Save the combined plot
    plt.tight_layout()
    plot_title1 = "PAGA_all_groupings"+plot_str+".png"
    plt.savefig(os.path.join(figure_dir, plot_title1))

    # UMAP, one panel per grouping
    fig, axes = plt.subplots(1, 4, figsize=(20, 5))
    sc.pl.umap(adata_subset, color='louvain', legend_loc='on data', ax=axes[0], show=False)                       # Louvain clusters, labels drawn on the data
    sc.pl.umap(adata_subset, color='time_point', ax=axes[1], palette=sample_type_palette_time, show=False)        # Time point
    sc.pl.umap(adata_subset, color='sample_type', ax=axes[2], palette=sample_type_palette, show=False)            # Sample type
    sc.pl.umap(adata_subset, color='Predicted_cell_fate', ax=axes[3], palette=sample_type_palette, show=False)    # Predicted cell fate
    # Save the combined plot
    plt.tight_layout()
    plot_title2 = "UMAP_all_groupings"+plot_str+".png"
    plt.savefig(os.path.join(figure_dir, plot_title2))

    return adata_subset

In [None]:
# Functions for plotting PCA results

def plot_subplot_PCA(fig, axes, axs, adata, coloredby, colorbar_label, subplot_title,expl_var_pc1, expl_var_pc2):
    """
    Draw one PC1-vs-PC2 scatter panel in a grid of subplots.

    - fig, axes: figure and 2D grid of axes the panel belongs to
    - axs: [row, column] position of the panel in `axes`
    - adata: object whose .obs provides the 'PC1' and 'PC2' coordinates
    - coloredby: values used to colour the points (viridis colour map)
    - colorbar_label: label of the colour bar
    - subplot_title: currently unused (no title is drawn)
    - expl_var_pc1, expl_var_pc2: explained variance in percent, shown in the axis labels
    """
    panel = axes[axs[0], axs[1]]

    # Scatter plot of the cells, coloured by the given values
    points = panel.scatter(
        adata.obs['PC1'],
        adata.obs['PC2'],
        c=coloredby,
        cmap='viridis',
        s=5,
    )

    # Axis labels including the explained variance
    panel.set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1), fontsize=14)
    panel.set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2), fontsize=14)

    # Colour bar attached to this panel
    colour_bar = fig.colorbar(points, ax=panel)
    colour_bar.set_label(colorbar_label, fontsize=14)

    return None


def PCA_plots(adata, copy_df, color_palette):
    """Plot the PCA results that are already stored in `adata`.

    The PCA itself is not computed here: the function reads adata.obsm['X_pca'] and
    adata.uns['pca']['variance_ratio'].

    Figures:
    - an elbow plot of the explained variance per principal component
    - PC1 vs PC2
    - a 2x3 grid: PC1 vs PC2 as a contour plot coloured by the predicted cell fate, and as
      scatter plots coloured by several cell properties
    - PC1 vs PC2 as a separate contour plot coloured by the predicted cell fate

    Inputs:
    - adata: AnnData object with PCA results, an obs column 'Predicted_cell_fate' and a layer 'raw_count'
    - copy_df: metadata dataframe with the columns 'percent.mito', 'nUMI' and 'nGene'; they are
      assigned as pandas Series and therefore aligned with adata.obs on the index
    - color_palette: palette used for the predicted cell fate in the contour plots

    Side effects: adds the columns 'PC1', 'PC2', 'percent_mito', 'nUMI' and 'nGene' to adata.obs.
    """

    # Store PCs in adata.obs
    adata.obs['PC1'] = adata.obsm['X_pca'][:,0] # First principal component
    adata.obs['PC2'] = adata.obsm['X_pca'][:,1] # Second principal component

    # Variance explained by each PC
    variance_ratio = adata.uns['pca']['variance_ratio']
    # Explained variance percentage for PC1 and PC2
    # (index 1 is the second component; index 2 would be the value of PC3)
    expl_var_pc1 = variance_ratio[0]*100
    expl_var_pc2 = variance_ratio[1]*100

    # Create an elbow plot
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(variance_ratio) + 1), variance_ratio*100, marker='o')
    plt.xlabel('Principal Component', fontsize=14)
    plt.ylabel('Variance Explained', fontsize=14)
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)
    plt.show()

    # Plot PC1 vs PC2
    plt.figure(figsize=(8, 6))
    plt.scatter(adata.obs['PC1'], adata.obs['PC2'], s=5)
    plt.xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1), fontsize=14)
    plt.ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2), fontsize=14)
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)
    plt.show()

    # Store metadata in adata object
    adata.obs['percent_mito'] = copy_df['percent.mito']        # Fraction of mitochondrial genes
    adata.obs['nUMI'] = copy_df['nUMI']                        # Number of UMIs
    adata.obs['nGene'] = copy_df['nGene']                      # Number of genes

    # Plot PC1 vs PC2 with cells colored based on their properties
    fig, axes = plt.subplots(2, 3, figsize=(22, 12))
    # Contour plot in the first panel
    sns.kdeplot(data=adata.obs, x="PC1", y="PC2",  ax=axes[0,0], hue="Predicted_cell_fate", palette=color_palette)
    sns.move_legend(axes[0,0], "upper left")
    axes[0,0].set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1), fontsize=14)
    axes[0,0].set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2), fontsize=14)

    # Scatter panels; the mean is taken over adata.X and the non-zero count over the 'raw_count' layer,
    # so both only cover the genes that are present in `adata`
    plot_subplot_PCA(fig, axes, [0,1], adata, adata.obs['percent_mito'], 'percent_mito', 'Colored by percent mitochondrial genes',expl_var_pc1, expl_var_pc2)
    plot_subplot_PCA(fig, axes, [0,2], adata, adata.obs['nUMI'], 'nUMI','Colored by nUMI',expl_var_pc1, expl_var_pc2)
    plot_subplot_PCA(fig, axes, [1,0], adata, adata.obs['nGene'], 'nGene', 'Colored by nGene',expl_var_pc1, expl_var_pc2)
    plot_subplot_PCA(fig, axes, [1,1], adata, np.mean(adata.X, axis=1), 'mean expression', 'Colored by mean expression',expl_var_pc1, expl_var_pc2)
    plot_subplot_PCA(fig, axes, [1,2], adata, np.count_nonzero(adata.layers['raw_count'], axis=1), 'non-zero expression', 'Colored by non-zero expression',expl_var_pc1, expl_var_pc2)

    # Separate contour plot
    plt.figure(figsize=(8, 6))
    sns.kdeplot(data=adata.obs, x="PC1", y="PC2", hue="Predicted_cell_fate", palette=color_palette)
    plt.xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1), fontsize=14)
    plt.ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2), fontsize=14)
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)

    return

# 3. Data preprocessing

## 3.1 Day 7 and 14 cells only

In [None]:
# Cells of day 7 and 14; [''] is passed as the list of categories to remove
adata_day7_14 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7, 14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=[''],
    plot_str="day7_14",
)

In [None]:
# Cells of day 7 and 14 without the non-cycling cells.
# NOTE(review): the same variable is assigned again in section 3.8 with the same selection
# but a different figure suffix.
adata_day7_14_without_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7, 14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling'],
    plot_str="day7_14_without_noncycling",
)

## 3.2 Day 0 and 3 cells only 

Reason: we see two clusters in the day 3 cells, which might mean there is a bifurcation in the data.

In [None]:
# Cells of day 0 and 3
adata_day0_3 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=[''],
    plot_str="day0_3",
)


In [None]:
# Figures for the report: force-directed graph of the day 0 and 3 cells,
# coloured by time point and by predicted cell fate
sc.pl.draw_graph(
    adata_day0_3,
    color='time_point',
    palette=sample_type_palette_time,
    title="Cells day 0 and 3",
)
sc.pl.draw_graph(
    adata_day0_3,
    color='Predicted_cell_fate',
    palette=sample_type_palette,
    title="Cells day 0 and 3",
)

## 3.3 Day 0, 3, and 7 cells 

In [None]:
# Cells of day 0, 3 and 7
adata_day0_3_7 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=[''],
    plot_str="day0_3_7",
)

## 3.4 Day 3 cells only

To investigate subpopulations in day 3 cells

In [None]:
# Cells of day 3 only
adata_day3 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[3],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=[''],
    plot_str="day3",
)

### 3.4.1 PCA analysis

In [None]:
# PCA figures for the day 3 cells
PCA_plots(
    adata=adata_day3,
    copy_df=copy_df,
    color_palette=sample_type_palette,
)

If I want to show the mean or non-zero values of all genes, I need to preprocess the data again without the HVG selection (because now only the top 2000 genes are in the adata).

In [None]:
# plt.figure(figsize=(8, 6))

# sns.kdeplot(data=adata_day3.obs, x="PC1", y="PC2", hue="majority_fate") # contour plot

# plt.xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
# plt.ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
# plt.title('PCA density contour plot for sample types')

So, the cycling cells are only in one of the populations

## 3.5 Day 3, 7, and 14 cells - all measurements after treatment

Reason: day 0 cells are the only ones measured before treatment and are therefore probably largely different from the cells of the other time points. This biological difference between the samples might be the cause of the different clusters. So, maybe we see a development for the cells if we only look at the time points after treatment.

In [None]:
# Cells of day 3, 7 and 14 (all measurements after treatment)
adata_day3_7_14 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=[''],
    plot_str="day3_7_14",
)

## 3.6 Day 14 cells only

In [None]:
# All cells of day 14, selected on the measured sample type
adata_day14 = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[14],
    selection_column='sample_type',
    remove_category=[''],
    plot_str="day14_cells",
)

In [None]:
# Figure for the report: day 14 cells coloured by sample type
sc.pl.draw_graph(
    adata_day14,
    color='sample_type',
    palette=sample_type_palette,
)

In [None]:
# Save the preprocessed day 14 cells to disk
adata_day14.write(
    '/home/jolien/Notebooks/data/preprocessed_data_day14.h5ad'
)

## 3.7 Moderate cycling cells only

In [None]:
# Moderate cyclers of day 14 (non-cycling and cycling cells are removed)
adata_day14_moderate_cyclers = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[14],
    selection_column='sample_type',
    remove_category=['Non-cycling','Cycling'],
    plot_str="day14_mod_cycl",
)

In [None]:
# Moderate cyclers of all days (non-cycling and cycling cells are removed)
adata_moderate_cyclers = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling','Cycling'],
    plot_str="all_days_mod_cycl",
)

In [None]:
# Figure for the report: moderate cyclers coloured by time point
sc.pl.draw_graph(
    adata_moderate_cyclers,
    color='time_point',
    palette=sample_type_palette_time,
    title='Moderate cyclers',
)

In [None]:
# Moderate cyclers of day 7 and 14 (non-cycling and cycling cells are removed)
adata_day7_14_moderate_cyclers = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling','Cycling'],
    plot_str="day7_14_mod_cycl",
)

## 3.8 Cycling and moderate cycling cells

In [None]:
# Cycling cells and moderate cyclers of day 14 (non-cycling cells are removed)
adata_day14_without_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling'],
    plot_str="day14_without_non_cycling",
)

In [None]:
# Cycling cells and moderate cyclers of all days (non-cycling cells are removed)
adata_without_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling'],
    plot_str="all_days_without_non_cycling",
)

In [None]:
# Figures for the report: cycling cells and moderate cyclers of all days,
# coloured by time point and by sample type
sc.pl.draw_graph(
    adata_without_non_cycling,
    color='time_point',
    palette=sample_type_palette_time,
    title="Moderate cyclers and cycling persisters",
)
sc.pl.draw_graph(
    adata_without_non_cycling,
    color='sample_type',
    palette=sample_type_palette,
    title="Moderate cyclers and cycling persisters",
)

In [None]:
# Cycling cells and moderate cyclers of day 7 and 14 (non-cycling cells are removed).
# NOTE(review): this overwrites the variable of the same name assigned in section 3.1;
# only the figure suffix differs.
adata_day7_14_without_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling'],
    plot_str="day7_14_without_non_cycling",
)

In [None]:
# Figure for the report: day 7 and 14 cells without non-cycling cells,
# coloured by sample type supplemented with the predicted cell fate
sc.pl.draw_graph(
    adata_day7_14_without_non_cycling,
    color='Sample_type_supplemented_with_predictions',
    palette=sample_type_palette,
    title='Days 7 and 14',
)

## 3.9 Cycling cells only

In [None]:
# Cycling cells of day 14 (non-cycling cells and moderate cyclers are removed)
adata_day14_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[14],
    selection_column='sample_type',
    remove_category=['Non-cycling','Moderate_cyclers'],
    plot_str="day14_cycling",
)

In [None]:
# Cycling cells of all days (non-cycling cells and moderate cyclers are removed)
adata_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling','Moderate_cyclers'],
    plot_str="all_days_cycling",
)

In [None]:
# Figure for the report: cycling cells of all days coloured by time point
sc.pl.draw_graph(
    adata_cycling,
    color='time_point',
    palette=sample_type_palette_time,
    title="Cycling persisters",
)

In [None]:
# Cycling cells of day 7 and 14 (non-cycling cells and moderate cyclers are removed)
adata_day7_14_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Non-cycling','Moderate_cyclers'],
    plot_str="day7_14_cycling",
)

## 3.10 Non-cycling cells only

In [None]:
# Non-cycling cells of day 14 (cycling cells and moderate cyclers are removed)
adata_day14_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[14],
    selection_column='sample_type',
    remove_category=['Cycling','Moderate_cyclers'],
    plot_str="day14_non_cycling",
)

In [None]:
# Non-cycling cells of all days (cycling cells and moderate cyclers are removed)
adata_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Cycling','Moderate_cyclers'],
    plot_str="all_days_non_cycling",
)

In [None]:
# Figure for the report: non-cycling cells of all days coloured by time point
sc.pl.draw_graph(
    adata_non_cycling,
    color='time_point',
    palette=sample_type_palette_time,
    title="Non-cycling persisters",
)

In [None]:
# Non-cycling cells of day 7 and 14 (cycling cells and moderate cyclers are removed)
adata_day7_14_non_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Cycling','Moderate_cyclers'],
    plot_str="day7_14_non_cycling",
)

## 3.11 Moderate cycling and non-cycling cells

In [None]:
# Non-cycling cells and moderate cyclers of day 7 and 14 (cycling cells are removed)
adata_day7_14_without_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Cycling'],
    plot_str="day7_14_without_cycling",
)

In [None]:
# Non-cycling cells and moderate cyclers of all days (cycling cells are removed)
adata_all_days_without_cycling = scRNAseq_preprocessing(
    df_pc9_count_matrix=df_pc9_count_matrix,
    df_metaData=copy_df,
    adata_preprocessed=adata_preprocessed,
    list_subset_days=[0,3,7,14],
    selection_column='Sample_type_supplemented_with_predictions',
    remove_category=['Cycling'],
    plot_str="all_days_without_cycling",
)

In [None]:
# Figure for the report: non-cycling cells and moderate cyclers of all days coloured by sample type
sc.pl.draw_graph(
    adata_all_days_without_cycling,
    color='sample_type',
    palette=sample_type_palette,
    title="Moderate cyclers and non-cycling persisters",
)