# 0. Import libraries

In [None]:
import pandas as pd
import numpy as np
import anndata as ad

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats


# 1. Read preprocessed data

In [None]:
# Load the preprocessed AnnData object from disk.
# Earlier input file, kept for reference:
#   '/home/jolien/Notebooks/data/preprocessed_data_v2.h5ad'
preprocessed_h5ad_path = '/home/jolien/Notebooks/data/new_filtering_HVG/preprocessed_data_new_filtering_renormalization.h5ad'
adata = ad.read_h5ad(preprocessed_h5ad_path)

In [None]:
# Load the metadata table (tab-separated; the first column becomes the index).
metadata_path = '/data/benchmarks/scRNAseq_persisters/Processed_metaData_with_lineage.txt'
df_metadata = pd.read_csv(metadata_path, sep="\t", index_col=0)

In [None]:
# Show the metadata table as the cell output
df_metadata

# 2. PCA analysis

In [None]:
# Colour schemes used by the plots

# Colours shared by the day 0 / 3 / 7 groups in both palettes
_day_colors = {
    0: '#F560A6',  # Pink
    3: '#91307F',  # Purple
    7: '#2D0059',  # Dark purple
}

# Palette for sample_type variables (string keys)
sample_type_palette = {str(day): hex_code for day, hex_code in _day_colors.items()}
sample_type_palette.update({
    'Cycling': '#1f77b4',  # Blue
    'Moderate_cyclers': '#ff7f0e',  # Orange
    'Non-cycling': '#2ca02c',  # Green
})

# Palette for time_point variables (integer keys)
sample_type_palette_time = {
    **_day_colors,
    14: '#5b5b5b',  # Grey
}

In [None]:
# Copy the first six principal components from adata.obsm['X_pca'] into
# adata.obs: column k (0-based) of X_pca becomes the obs column 'PC<k+1>'.
for pc_index in range(6):
    adata.obs[f'PC{pc_index + 1}'] = adata.obsm['X_pca'][:, pc_index]

### 2.1 Explained variance

In [None]:
# Variance ratio per principal component, as stored in adata.uns['pca']
variance_ratio = adata.uns['pca']['variance_ratio']
component_numbers = range(1, len(variance_ratio) + 1)  # 1-based PC numbers

# Elbow plot: variance ratio against component number
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(component_numbers, variance_ratio, marker='o')
elbow_ax.set_xlabel('Principal Component')
elbow_ax.set_ylabel('Variance Explained')
elbow_ax.set_title('Elbow Plot for PCA')
elbow_ax.grid(True)
# plt.savefig('/home/jolien/Notebooks/data_preprocessing/figures/PCA_elbow_plot.png')
plt.show()

In [None]:
# Explained variance of PC1 and PC2, expressed as percentages.
# variance_ratio is 0-indexed: entry 0 belongs to PC1 and entry 1 to PC2
# (entry 2 would be PC3, which is not what the PC2 axis labels should show).
expl_var_pc1 = adata.uns['pca']['variance_ratio'][0]*100
expl_var_pc2 = adata.uns['pca']['variance_ratio'][1]*100

### 2.2 Plot PC1 vs PC2

In [None]:
# Scatter of all cells on the first two principal components
pca_fig, pca_ax = plt.subplots(figsize=(8, 6))
pca_ax.scatter(adata.obs['PC1'], adata.obs['PC2'], s=5)
pca_ax.set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
pca_ax.set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
pca_ax.set_title('PCA Plot of PC1 vs PC2')
plt.show()

In [None]:
# Copy selected metadata columns into adata.obs.
# Keys are the new adata.obs column names, values the df_metadata columns.
metadata_columns = {
    'percent_mito': 'percent.mito',  # percentage of mitochondrial genes
    'nUMI': 'nUMI',                  # number of UMIs
    'nGene': 'nGene',                # number of genes
}
for obs_name, metadata_name in metadata_columns.items():
    adata.obs[obs_name] = df_metadata[metadata_name]


# Function for plotting
def plot_subplot_PCA(fig, axes, axs, adata, coloredby, colorbar_label, subplot_title):
    """
    Draw a PC1-vs-PC2 scatter in one panel of a subplot grid.

    Parameters
    ----------
    fig : figure owning the grid; used to attach the colorbar.
    axes : grid of axes, indexed as axes[row, col].
    axs : pair [row, col] selecting the panel inside `axes`.
    adata : object whose .obs provides the 'PC1' and 'PC2' columns.
    coloredby : values passed to scatter as `c` (drawn with the viridis colormap).
    colorbar_label : label shown next to the colorbar.
    subplot_title : title of the panel.

    The axis labels read the module-level variables expl_var_pc1 and
    expl_var_pc2. Nothing is returned.
    """
    # Select the target panel once and reuse it below
    panel = axes[axs[0], axs[1]]

    # Scatter the cells, coloured by the supplied values
    points = panel.scatter(
        adata.obs['PC1'],
        adata.obs['PC2'],
        c=coloredby,
        cmap='viridis',
        s=5,
    )

    # Axis labels and title
    panel.set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
    panel.set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
    panel.set_title(subplot_title)

    # Colorbar attached to this panel only
    colorbar = fig.colorbar(points, ax=panel)
    colorbar.set_label(colorbar_label)

# Grid of PC1-vs-PC2 panels, each coloured by a different quantity.
# Five of the six panels are filled; the bottom-right one is left empty.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))  # optionally: sharex='all', sharey='all'

# (grid position, colour values, colorbar label, panel title)
pca_panels = [
    ([0, 0], adata.obs['percent_mito'], 'percent_mito', 'Colored by percent mitochondrial genes'),
    ([0, 1], adata.obs['nUMI'], 'nUMI', 'Colored by nUMI'),
    ([0, 2], adata.obs['nGene'], 'nGene', 'Colored by nGene'),
    ([1, 0], np.mean(adata.X, axis=1), 'mean expression', 'Colored by mean expression'),
    ([1, 1], np.count_nonzero(adata.X, axis=1), 'non-zero expression', 'Colored by non-zero expression'),
]
for grid_position, color_values, cbar_label, panel_title in pca_panels:
    plot_subplot_PCA(fig, axes, grid_position, adata, color_values, cbar_label, panel_title)

fig.suptitle('PCA Plot of PC1 vs PC2')

# fig.savefig('PCA plots day 3 cells colored by variables')


In [None]:
# Extract PC1 and PC2 per sample_type group and plot all groups in one scatter.
#
# The group masks are built from adata.obs['sample_type'] so they line up
# row-for-row with adata.obsm['X_pca']. A mask taken from df_metadata is applied
# positionally and is only correct if df_metadata has exactly the same rows in
# the same order as adata, which is not guaranteed.
sample_type_labels = adata.obs['sample_type'].to_numpy()


def _pcs_for_sample_type(sample_type):
    """Return the (PC1, PC2) arrays of the cells whose sample_type equals `sample_type`."""
    in_group = sample_type_labels == sample_type
    return adata.obsm['X_pca'][in_group, 0], adata.obsm['X_pca'][in_group, 1]


pc1_0, pc2_0 = _pcs_for_sample_type('0')                     # time = 0
pc1_3, pc2_3 = _pcs_for_sample_type('3')                     # time = 3
pc1_7, pc2_7 = _pcs_for_sample_type('7')                     # time = 7
pc1_14l, pc2_14l = _pcs_for_sample_type('Cycling')           # cycling (14_low)
pc1_14m, pc2_14m = _pcs_for_sample_type('Moderate_cyclers')  # moderate cyclers (14_med)
pc1_14h, pc2_14h = _pcs_for_sample_type('Non-cycling')       # non-cycling (14_high)

# Plot PC1 vs PC2; each label is attached to its own scatter call so the
# legend cannot get out of step with the plotting order.
plt.figure(figsize=(8, 6))
plt.scatter(pc1_0, pc2_0, c='g', s=5, label='Day 0')
plt.scatter(pc1_3, pc2_3, c='r', s=5, label='Day 3')
plt.scatter(pc1_7, pc2_7, c='k', s=5, label='Day 7')
plt.scatter(pc1_14l, pc2_14l, c='b', s=5, label='Day 14 - cycling')
plt.scatter(pc1_14m, pc2_14m, c='c', s=5, label='Day 14 - moderate cyclers')
plt.scatter(pc1_14h, pc2_14h, c='m', s=5, label='Day 14 - non-cycling')

plt.xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
plt.ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
plt.title('PCA Plot of PC1 vs PC2 colored by sample type')
plt.legend()
# plt.savefig('/home/jolien/Notebooks/data_preprocessing/figures/PCA_colored_by_sample_type.png')
plt.show()

In [None]:
# Scatter plot of PC1 vs PC2, coloured by sample type.
# The marker size is set with `s` (forwarded to matplotlib's scatter). seaborn's
# `size` argument is a semantic mapping: given a constant it does not set the
# marker size and adds a spurious "5" entry to the legend.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=adata.obs, x="PC1", y="PC2", hue="sample_type", palette=sample_type_palette, s=5)

plt.xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
plt.ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
plt.title('PCA Plot of PC1 vs PC2 colored by sample type')

# plt.savefig('/home/jolien/Notebooks/data_preprocessing/figures/PCA_colored_by_sample_type_v2.png')

In [None]:
# Density contours of PC1 vs PC2, one set of contour lines per sample type
contour_fig, contour_ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(
    data=adata.obs,
    x="PC1",
    y="PC2",
    hue="sample_type",
    palette=sample_type_palette,
    ax=contour_ax,
)

contour_ax.set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
contour_ax.set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
contour_ax.set_title('PCA density contour plot for sample types')

# plt.savefig('/home/jolien/Notebooks/data_preprocessing/figures/PCA_contour_colored_by_sample_type.png')

In [None]:
# Filled, semi-transparent density contours of PC1 vs PC2 per sample type
filled_fig, filled_ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(
    data=adata.obs,
    x="PC1",
    y="PC2",
    hue="sample_type",
    palette=sample_type_palette,
    alpha=0.3,
    fill=True,
    ax=filled_ax,
)

filled_ax.set_xlabel('PC1 ({:.1f}%)'.format(expl_var_pc1))
filled_ax.set_ylabel('PC2 ({:.1f}%)'.format(expl_var_pc2))
filled_ax.set_title('PCA density contour plot for sample types')

# plt.savefig('/home/jolien/Notebooks/data_preprocessing/figures/PCA_contour_colored_by_sample_type_filled.png')

### 2.3 Check correlation PCs and time point

In [None]:
# Boxplots of PC1..PC6 grouped per sample type, one PC per panel.
# The 3x2 grid is filled in row-major order and all panels share the y-axis.
fig, axes = plt.subplots(3, 2, figsize=(20, 18), sharey=True)

# axes.flat walks the grid row by row; enumerate supplies the 1-based PC number
for pc_nr, ax in enumerate(axes.flat, start=1):
    pc_name = f'PC{pc_nr}'

    # Plot boxplots of this PC per sample type
    sns.boxplot(x='sample_type', y=pc_name, data=adata.obs, ax=ax)

    # Add title and axis label
    ax.set_title(f'{pc_name} grouped per sample type')
    ax.set_ylabel(pc_name)

# fig.savefig('/home/jolien/Notebooks/data_preprocessing/figures/boxplots_PCs_per_sample_type.png')

In [None]:
# Kruskal-Wallis H-test: does PC1 differ between sample types '0' and '3'?
in_group_0 = adata.obs['sample_type'] == '0'
in_group_3 = adata.obs['sample_type'] == '3'

pc1_group_0 = adata.obs.loc[in_group_0, 'PC1']
pc1_group_3 = adata.obs.loc[in_group_3, 'PC1']

kruskal_result = stats.kruskal(pc1_group_0, pc1_group_3)

print("Kruskal-Wallis p-value:", kruskal_result.pvalue)


I am not sure whether this test result can be used when the p-value is not reported with more digits.

In [None]:
# Covariance matrix and Pearson correlation between time point and each of
# PC1..PC6, printed per PC.
# Note: transposing the two 1-D inputs before calling np.cov gives the same output.
pc_names = [f'PC{pc_nr}' for pc_nr in range(1, 7)]
cov_by_pc = {}   # PC name -> 2x2 covariance matrix with time_point
corr_by_pc = {}  # PC name -> Pearson correlation coefficient with time_point

for pc_name in pc_names:
    cov_by_pc[pc_name] = np.cov(adata.obs['time_point'], adata.obs[pc_name])
    print(f'covariance {pc_name} and time point:')
    print(cov_by_pc[pc_name])

    # pearsonr returns (coefficient, p-value); only the coefficient is kept
    corr_by_pc[pc_name], _ = stats.pearsonr(adata.obs['time_point'], adata.obs[pc_name])
    print(f'Pearsons correlation {pc_name} and time point: {corr_by_pc[pc_name]:.3f}', '\n')

# Keep the individual variable names available for any later cells
cov_PC1, cov_PC2, cov_PC3, cov_PC4, cov_PC5, cov_PC6 = (cov_by_pc[name] for name in pc_names)
corr_PC1, corr_PC2, corr_PC3, corr_PC4, corr_PC5, corr_PC6 = (corr_by_pc[name] for name in pc_names)

PC1 and time point have a positive covariance (the off-diagonal values of the covariance matrix) and a Pearson correlation of 0.6, which indicates a dependency between PC1 and time point (a moderate positive relationship).