In [2]:
import os
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
import seaborn as sns 
import matplotlib.pyplot as plt
import scipy
import csv
import gzip
import anndata as ad
from pathlib import Path
import glob
import scvi
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

-----
anndata     0.10.8
scanpy      1.10.3
-----
PIL                 10.4.0
absl                NA
asttokens           NA
attr                24.2.0
bottleneck          1.4.1
cffi                1.17.1
chex                0.1.87
comm                0.2.1
contextlib2         NA
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.6.7
decorator           5.1.1
distutils           3.12.7
docrep              0.3.2
etils               1.9.4
executing           0.8.3
filelock            3.16.1
flax                0.9.0
fsspec              2024.10.0
gmpy2               2.1.5
h5py                3.11.0
igraph              0.11.6
ipykernel           6.28.0
ipywidgets          8.1.3
jaraco              NA
jax                 0.4.34
jaxlib              0.4.34
jedi                0.18.1
joblib              1.4.2
kiwisolver          1.4.7
legacy_api_wrap     NA
leidenalg           0.10.2
lightning           2.4.0
lightning_utilities 0.11.7
llvmlit

In [3]:
directory = '/home/jing/Phd_project/project_GBM/gbm_DATA/gbm_DATA_GSE174554/gbm_DATA_scRNA_atlas'
os.chdir(directory)

In [4]:
outdir = '/home/jing/Phd_project/project_GBM/gbm_OUTPUT/gbm_OUTPUT_publication/'

In [90]:
names_list=['GSM5319518_SF2777','GSM5319548_SF2979','GSM5319519_SF2990',
                'GSM5319549_SF3073','GSM5319520_SF3076','GSM5319550_SF3243',
                'GSM5319521_SF3391','GSM5319551_SF3448','GSM5319511_SF11916',
                'GSM5319543_SF12382','GSM5319506_SF11082','GSM5319562_SF11488',
                'GSM5319530_SF9358','GSM5319568_SF9962','GSM5319559_SF9798','GSM5319532_SF9494']

### Annotations 

In [91]:
doublet_df = pd.read_csv(os.path.join(outdir,'doublet_predictions.csv'), index_col=0)
doublet_df

Unnamed: 0_level_0,Solo_Prediction
Index,Unnamed: 1_level_1
AAACCCAAGCGAACTG-1-SF9494,doublet
AAACCCAAGGCATTTC-1-SF9494,singlet
AAACCCAAGTACCGGA-1-SF9494,doublet
AAACCCATCATTCGGA-1-SF9494,singlet
AAACCCATCCTCTCGA-1-SF9494,singlet
...,...
TTTGTTGAGTACCATC-1-SF2777,singlet
TTTGTTGCACTGGCCA-1-SF2777,doublet
TTTGTTGGTACTCAAC-1-SF2777,doublet
TTTGTTGTCCCAAGTA-1-SF2777,doublet


In [92]:
singlet_mask = doublet_df[(doublet_df['Solo_Prediction'] == 'singlet')]
singlet_mask

Unnamed: 0_level_0,Solo_Prediction
Index,Unnamed: 1_level_1
AAACCCAAGGCATTTC-1-SF9494,singlet
AAACCCATCATTCGGA-1-SF9494,singlet
AAACCCATCCTCTCGA-1-SF9494,singlet
AAACCCATCGAGCCAC-1-SF9494,singlet
AAACCCATCGATTGAC-1-SF9494,singlet
...,...
TTTGGTTCAGCAATTC-1-SF2777,singlet
TTTGGTTTCAGCAATC-1-SF2777,singlet
TTTGTTGAGGTTCTTG-1-SF2777,singlet
TTTGTTGAGTACCATC-1-SF2777,singlet


In [129]:
adata_list = []

# Loop over each sample and read in the AnnData object
for name in names_list:

    mtx =f"{name}_matrix.mtx.gz"
    adata = sc.read_mtx(mtx)
    cells=pd.read_csv(f'{name}_barcodes.tsv.gz',header=None)
    features=pd.read_csv(f'{name}_features.tsv.gz',header=None,sep='\t')
    adata= adata.T
    #check the columns first to make sure they are the ones you need 
    adata.obs['CellID']= cells[0].tolist()
    adata.obs.index = adata.obs['CellID']

    adata.var['Gene']= features[0].tolist()
    adata.var.index= adata.var['Gene']
    adata.var_names_make_unique() 
    adata.obs['source'] = name[11:]
    adata.obs['ref'] = adata.obs.index+ '-'+ adata.obs['source']

    adata =adata[adata.obs['ref'].isin(singlet_mask.index)].copy()
    adata.obs.index = adata.obs['ref']
    adata.var['mt'] =adata.var_names.str.startswith('MT-')

    sc.pp.filter_cells(adata, min_genes=200)
    #sc.pp.filter_genes(adata, min_cells=3)
    sc.pp.calculate_qc_metrics(adata,qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata= adata[adata.obs.n_genes_by_counts <6000, :]
    adata= adata[adata.obs.pct_counts_mt< 5, :].copy()


    adata.layers["counts"] = adata.X.copy()    
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata  # keep full dimension safe
    adata_list.append(adata)

normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)


In [130]:
batch_names = [adata.obs['source'].iloc[0] for adata in adata_list]


In [131]:
adata = adata_list[0].concatenate(adata_list[1:], batch_key='source', batch_categories=batch_names)  

  adata = adata_list[0].concatenate(adata_list[1:], batch_key='source', batch_categories=batch_names)


In [132]:
adata

AnnData object with n_obs × n_vars = 50738 × 33694
    obs: 'CellID', 'source', 'ref', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'Gene', 'mt', 'n_cells_by_counts-SF11082', 'mean_counts-SF11082', 'pct_dropout_by_counts-SF11082', 'total_counts-SF11082', 'n_cells_by_counts-SF11488', 'mean_counts-SF11488', 'pct_dropout_by_counts-SF11488', 'total_counts-SF11488', 'n_cells_by_counts-SF11916', 'mean_counts-SF11916', 'pct_dropout_by_counts-SF11916', 'total_counts-SF11916', 'n_cells_by_counts-SF12382', 'mean_counts-SF12382', 'pct_dropout_by_counts-SF12382', 'total_counts-SF12382', 'n_cells_by_counts-SF2777', 'mean_counts-SF2777', 'pct_dropout_by_counts-SF2777', 'total_counts-SF2777', 'n_cells_by_counts-SF2979', 'mean_counts-SF2979', 'pct_dropout_by_counts-SF2979', 'total_counts-SF2979', 'n_cells_by_counts-SF2990', 'mean_counts-SF2990', 'pct_dropout_by_counts-SF2990', 'total_counts-SF2990', 'n_cells_by_counts-SF3073', 'mean_counts-SF3073', 'pct_d

In [133]:
adata.obs['Status'] = 'Recurrent'
for i in ['SF2770', 'SF2990', 'SF3076', 'SF3391', 'SF11916', 'SF11082', 'SF9358', 'SF9798']:
    adata.obs.loc[adata.obs['source'].str.contains(i, na=False), 'Status'] = 'Primary'

In [134]:
metadata= pd.read_csv('/home/jing/Phd_project/project_GBM/gbm_DATA/gbm_DATA_metadata/GSE174554_Tumor_normal_metadata_11916v2.txt',
                      sep=' ')
metadata['Barcode'] = metadata['Barcode']+ '-1'
metadata

Unnamed: 0,Sample,Barcode,Tumor_Normal_annotation
0,SF10022,CTATCTAAGCAAGCCA-1,Tumor
1,SF10022,AAACCCAGTCTACGAT-1,Normal
2,SF10022,AAAGGGCTCACCCTGT-1,Normal
3,SF10022,AACAACCAGACCCGCT-1,Normal
4,SF10022,AACAAGAGTGTAAACA-1,Normal
...,...,...,...
254283,SF9791v2,TTTCAGTCATCTTCGC-1,Tumor
254284,SF9791v2,TTTCGATAGTCATAGA-1,Tumor
254285,SF9791v2,TTTGGAGCACTGTCGG-1,Tumor
254286,SF9791v2,TTTGGAGTCACGTCCT-1,Tumor


In [135]:
metadata['Barcode'] = metadata['Barcode']+ '-' +metadata['Sample']

In [136]:
adata.obs = adata.obs.merge(
    metadata[["Sample", "Barcode", "Tumor_Normal_annotation"]],
    how="left",
    left_on=["source", adata.obs.index],
    right_on=["Sample", "Barcode"]
)

In [137]:
adata.obs

Unnamed: 0,CellID,source,ref,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,Status,Sample,Barcode,Tumor_Normal_annotation
0,AAACCCAAGGGATCAC-1,SF2777,AAACCCAAGGGATCAC-1-SF2777,468,468,579.0,9.0,1.554404,Recurrent,,AAACCCAAGGGATCAC-1-SF2777-SF2777,
1,AAACCCAGTCGATTTG-1,SF2777,AAACCCAGTCGATTTG-1-SF2777,311,311,402.0,4.0,0.995025,Recurrent,,AAACCCAGTCGATTTG-1-SF2777-SF2777,
2,AAACCCAGTCGTCAGC-1,SF2777,AAACCCAGTCGTCAGC-1-SF2777,864,864,1518.0,3.0,0.197628,Recurrent,,AAACCCAGTCGTCAGC-1-SF2777-SF2777,
3,AAACCCAGTTGTAAAG-1,SF2777,AAACCCAGTTGTAAAG-1-SF2777,632,632,891.0,12.0,1.346801,Recurrent,,AAACCCAGTTGTAAAG-1-SF2777-SF2777,
4,AAACCCATCTATCGGA-1,SF2777,AAACCCATCTATCGGA-1-SF2777,849,849,1338.0,43.0,3.213752,Recurrent,,AAACCCATCTATCGGA-1-SF2777-SF2777,
...,...,...,...,...,...,...,...,...,...,...,...,...
50733,TTTGGTTTCATTATCC-1,SF9494,TTTGGTTTCATTATCC-1-SF9494,2027,2027,4502.0,0.0,0.000000,Recurrent,,TTTGGTTTCATTATCC-1-SF9494-SF9494,
50734,TTTGGTTTCCCTCGTA-1,SF9494,TTTGGTTTCCCTCGTA-1-SF9494,1972,1972,4071.0,0.0,0.000000,Recurrent,,TTTGGTTTCCCTCGTA-1-SF9494-SF9494,
50735,TTTGTTGTCAGACAAA-1,SF9494,TTTGTTGTCAGACAAA-1-SF9494,2340,2340,5272.0,0.0,0.000000,Recurrent,,TTTGTTGTCAGACAAA-1-SF9494-SF9494,
50736,TTTGTTGTCCATTGGA-1,SF9494,TTTGTTGTCCATTGGA-1-SF9494,2137,2137,4709.0,6.0,0.127416,Recurrent,,TTTGTTGTCCATTGGA-1-SF9494-SF9494,


In [138]:
#adata.obs.index = adata.obs.index.astype(str)  # Convert index to string

In [140]:
adata.raw = adata  # keep full dimension safe
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    layer="counts",
    batch_key="source",
    subset=True,
)

extracting highly variable genes


AssertionError: Don’t call _normalize_index with non-categorical/string names