In [19]:
import os
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import csv
import gzip
import anndata as ad
from pathlib import Path
import glob
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap


sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)

-----
anndata     0.10.8
scanpy      1.10.2
-----
MulticoreTSNE       NA
PIL                 10.4.0
absl                NA
anyio               NA
appnope             0.1.4
asttokens           NA
backoff             2.2.1
brotli              1.1.0
bs4                 4.12.3
certifi             2024.07.04
cffi                1.16.0
charset_normalizer  3.3.2
chex                0.1.86
click               8.1.7
colorama            0.4.6
comm                0.2.2
contextlib2         NA
croniter            NA
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0
debugpy             1.8.2
decorator           5.1.1
deepdiff            7.0.1
docrep              0.3.2
email_validator     2.2.0
etils               1.9.2
executing           2.0.1
fastapi             0.111.0
flax                0.8.5
fsspec              2024.6.1
gmpy2               2.1.5
google              NA
h5py                3.11.0
idna                3.7
importlib_resources NA
ipykernel           6.29.5


In [20]:
import scvi
import scanpy as sc
import os
import numpy as np
from matplotlib.pyplot import rc_context
import pandas as pd



In [9]:
# Define the directory path
directory = '/Users/lidiayung/PhD_project/project_UCD_blca/blca_DATA/blca_DATA_mouse_GSE174182_RAW'
# Get the list of files in the directory (non-recursive)
dirs = os.listdir(directory)

# Create an empty list to store the names
names_list = []

# Extract the unique names from the first 20 characters of the filenames
for x in dirs:
    name = x[:20]
    names_list.append(name)

# Remove duplicates by converting to a set and then back to a list
names_list = list(set(names_list))

# Print the unique names
print(names_list)


['GSM5288674_Sample-11', '.DS_Store', 'GSM5288670_Sample-5_', 'GSM5288672_Sample-7_', 'GSM5288673_Sample-8_', 'GSM5288668_Sample-3_', 'GSM5288671_Sample-6_', 'GSM5288669_Sample-4_', '.Rhistory']


In [10]:
os.chdir(directory)


In [11]:
names_list=['GSM5288669_Sample-4_', 'GSM5288670_Sample-5_']#, 'GSM5288671_Sample-6_', 
            #'GSM5288668_Sample-3_', 'GSM5288672_Sample-7_', 'GSM5288673_Sample-8_', 'GSM5288674_Sample-11']

In [12]:
adata_list = []

# Loop over each sample and read in the AnnData object
for name in names_list:

    mtx =f"{name}filtered_matrix.mtx.gz"
    adata = sc.read_mtx(mtx)
    cells=pd.read_csv(f'{name}filtered_barcodes.tsv.gz',header=None)
    features=pd.read_csv(f'{name}filtered_features.tsv.gz',header=None,sep='\t')
    adata= adata.T
    #check the columns first to make sure they are the ones you need 
    adata.obs['CellID']= cells[0].tolist()
    adata.var['Gene']= features[1].tolist()
    adata.var.index= adata.var['Gene']
    adata.var_names_make_unique() 

    sc.pp.filter_cells(adata, min_genes=300)
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .97)
    adata = adata[adata.obs.n_genes_by_counts < upper_lim]

    adata.obs['source'] = name[:10]
    adata_list.append(adata)

  adata.obs['source'] = name[:10]
  adata.obs['source'] = name[:10]


In [13]:
batch_names = [adata.obs['source'].iloc[0] for adata in adata_list]
adata = adata_list[0].concatenate(adata_list[1:], batch_key='source', batch_categories=batch_names)                                  

  adata = adata_list[0].concatenate(adata_list[1:], batch_key='source', batch_categories=batch_names)


In [14]:
adata

AnnData object with n_obs × n_vars = 14063 × 31053
    obs: 'CellID', 'n_genes', 'n_genes_by_counts', 'total_counts', 'source'
    var: 'Gene', 'n_cells_by_counts-GSM5288669', 'mean_counts-GSM5288669', 'pct_dropout_by_counts-GSM5288669', 'total_counts-GSM5288669', 'n_cells_by_counts-GSM5288670', 'mean_counts-GSM5288670', 'pct_dropout_by_counts-GSM5288670', 'total_counts-GSM5288670'

In [15]:
adata.layers['counts'] = adata.X.copy()

In [16]:
adata

AnnData object with n_obs × n_vars = 14063 × 31053
    obs: 'CellID', 'n_genes', 'n_genes_by_counts', 'total_counts', 'source'
    var: 'Gene', 'n_cells_by_counts-GSM5288669', 'mean_counts-GSM5288669', 'pct_dropout_by_counts-GSM5288669', 'total_counts-GSM5288669', 'n_cells_by_counts-GSM5288670', 'mean_counts-GSM5288670', 'pct_dropout_by_counts-GSM5288670', 'total_counts-GSM5288670'
    layers: 'counts'

In [17]:

scvi.model.SCVI.setup_anndata(adata, layer = "counts",
                             categorical_covariate_keys=["source"],
                             continuous_covariate_keys=['total_counts'])


model = scvi.model.SCVI(adata)

In [18]:
model


