### Single-cell RNA sequencing reveals the effects of chemotherapy on human pancreatic adenocarcinoma and its tumor microenvironment. Nature Communications, 2023, 14(1): 797. doi: 10.1038/s41467-023-36296-4. GSE205013. Patient Samples.
### https://pubmed.ncbi.nlm.nih.gov/36781852/



In [29]:
import scanpy as sc
import scvi
import numpy as np
import pandas as pd
import os
import leidenalg
from scipy.sparse import csr_matrix

In [2]:
base_data_path = "/Users/klemkelab/Epithelial_and_Macrophage_specific_scRNAseq/data"

In [3]:
ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"
ribo_genes = pd.read_table(ribo_url, skiprows=2, header = None)
ribo_genes

Unnamed: 0,0
0,FAU
1,MRPL13
2,RPL10
3,RPL10A
4,RPL10L
...,...
83,RPS9
84,RPSA
85,RSL24D1
86,RSL24D1P11


In [4]:
import torch

if torch.backends.mps.is_available():
    print("MPS is available")
    print("Is MPS device in use?", torch.backends.mps.is_built())
else:
    print("MPS is not available")


MPS is available
Is MPS device in use? True


In [13]:
import importlib
import utilities
importlib.reload(utilities)
from utilities import pp2 , create_anndata_object, gene_check, write_anndata, read_anndata , remove_ribosomal_genes

In [None]:
adata_files1 = create_anndata_object(base_data_path, 'Treated', pp2, ribo_genes)

In [6]:
adata_files1

AnnData object with n_obs × n_vars = 55987 × 36601
    obs: 'Sample', 'doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'

In [None]:
adata_files2 = create_anndata_object(base_data_path, 'Untreated', pp2, ribo_genes)
adata_files2

In [9]:
adata_files2

AnnData object with n_obs × n_vars = 24579 × 36601
    obs: 'Sample', 'doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'

In [14]:
gene_check(adata_files1, "EPCAM")
gene_check(adata_files1, "AR")

Total EPCAM genes : 55987
Total AR genes : 55987


In [15]:
gene_check(adata_files2, "EPCAM")
gene_check(adata_files2, "AR")

Total EPCAM genes : 24579
Total AR genes : 24579


In [16]:
adata = sc.concat([adata_files1, adata_files2])
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 80566 × 36601
    obs: 'Sample', 'doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'

In [17]:
write_anndata(base_data_path, 'no_gene_filtered.h5ad', adata)

In [18]:
adata = read_anndata(base_data_path, 'no_gene_filtered.h5ad')

  utils.warn_names_duplicates("obs")


In [19]:
sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
sc.pp.filter_genes(adata, min_cells=10) #get rid of genes that are found in fewer than 10 cells
adata

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 73580 × 26911
    obs: 'Sample', 'doublet', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'n_genes'
    var: 'n_cells'

In [20]:
adata.X = csr_matrix(adata.X)
adata.X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 132124284 stored elements and shape (73580, 26911)>

In [21]:
write_anndata(base_data_path, 'all_combined_filtered.h5ad', adata)

In [22]:
adata = read_anndata(base_data_path, 'all_combined_filtered.h5ad')

  utils.warn_names_duplicates("obs")


In [23]:
adata = remove_ribosomal_genes(adata)

Remaining ribosomal genes: 0


In [25]:
adata.obs_names_make_unique()
adata.var_names_make_unique()

In [26]:
adata.obs

Unnamed: 0,Sample,doublet,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo,n_genes
AAACCCACAACATACC-1,Treated,False,733,1154.0,169.0,14.644713,124.0,10.745234,733
AAACCCACATCTATCT-1,Treated,False,578,928.0,120.0,12.931034,142.0,15.301723,578
AAACCCACATGATGCT-1,Treated,False,4646,20165.0,1110.0,5.504587,1974.0,9.789239,4646
AAACCCAGTATCGTAC-1,Treated,False,677,1050.0,129.0,12.285714,106.0,10.095238,677
AAACCCAGTCACCCTT-1,Treated,False,845,1466.0,217.0,14.802183,137.0,9.345157,845
...,...,...,...,...,...,...,...,...,...
TTTGGTTGTTCCACGG-1,Untreated,False,1671,3575.0,95.0,2.657343,477.0,13.342658,1671
TTTGTTGGTACCCGCA-1,Untreated,False,3727,14103.0,1531.0,10.855846,2064.0,14.635184,3727
TTTGTTGGTCGCATGC-1,Untreated,False,1750,3641.0,165.0,4.531722,404.0,11.095853,1750
TTTGTTGGTTCTCACC-1,Untreated,False,545,709.0,25.0,3.526093,73.0,10.296192,545


In [27]:
adata.X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 127115755 stored elements and shape (73580, 26811)>

In [28]:
write_anndata(base_data_path, 'data_combined.h5ad', adata)