This is the file use to subsample data from the whole data set of the paper 'COVID-19 immune features revealed by a large-scale single-cell transcriptome atlas'.

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

results_file = '/storage/holab/linxy/DCATS/'  # the file that will store the analysis results

scanpy==1.6.0 anndata==0.7.5 umap==0.4.6 numpy==1.19.4 scipy==1.5.3 pandas==1.2.1 scikit-learn==0.23.2 statsmodels==0.12.1 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3


In [3]:
adata = sc.read_h5ad("/storage/holab/linxy/DCATS/Ren2021_all.h5ad")
adata

AnnData object with n_obs × n_vars = 1462702 × 27943
    obs: 'celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City', 'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'SARS-CoV-2', 'Single cell sequencing platform', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]', 'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'har_emb'
    obsp: 'connectivities', 'distances'

Subsample 5% cells

In [5]:
sub_adata = sc.pp.subsample(adata, fraction = 0.05, copy = True)
sub_adata

  res = method(*args, **kwargs)


AnnData object with n_obs × n_vars = 73135 × 27943
    obs: 'celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City', 'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'SARS-CoV-2', 'Single cell sequencing platform', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]', 'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'har_emb'
    obsp: 'connectivities', 'distances'

In [18]:
sub_adata.obsm['X_pca']   # a numpy.ndarry object
sub_adata.obsm['X_pca'].shape  # 73135, 50

(73135, 50)

In [32]:
sub_pca = sub_adata.obsm['X_pca']
pcaDF = pd.DataFrame(sub_pca)
pcaDF.head
#print(type(sub_adata.obs['sampleID']))

<bound method NDFrame.head of              0         1         2         3         4          5          6   \
0     -1.760253  3.386553  0.045719 -0.056496 -0.019635   0.112574  -1.183515   
1     -3.632395  7.855037  1.371150 -0.197809  1.011374  -0.008764   0.240486   
2     -2.659772 -2.739936 -2.674244  2.916973  0.519956  -2.309491   1.650157   
3     -0.793669 -1.983642 -0.820006  3.541181 -0.081824  22.126276  12.663067   
4     -4.218463  6.004827  0.558667 -0.447683  0.160857   0.672377  -1.080915   
...         ...       ...       ...       ...       ...        ...        ...   
73130 -4.004488  5.054613  0.141885 -0.676689 -0.561270   0.569979  -0.736629   
73131  7.303104  0.463246 -1.615991 -0.066300  3.957607  -1.163442   0.628077   
73132 -1.660278 -2.929883 -3.354737  3.296285 -0.177473  -1.527860   2.136412   
73133 -3.276169 -1.390568 -3.157488  2.727828  0.892777  -1.309038   0.945801   
73134 -1.953783 -5.673783  4.907259 -3.128402  0.351234   0.148699  -0.636174  

In [35]:
idDF = sub_adata.obs['sampleID'].to_frame()
idDF.index.name = 'barcode'
idDF.reset_index(inplace=True)
print(idDF)

                    barcode  sampleID
0      CCACTACAGTAGCCGA-280    S-S050
1      GTGCTTCCAGCTTCGG-162  S-S021-3
2       CGACCTTCATTAGCCA-34   S-HC012
3      CACGAATCAATCCAGT-176  S-M009-1
4      AACTGGTTCCAACCAA-154  S-M008-1
...                     ...       ...
73130  CGCGTTTAGTCGTACT-218  S-M040-2
73131  GTCAAGTAGACAGGCT-270    S-S041
73132  AAGTACCCAATTGCTG-189    S-S020
73133  CTTAACTGTTTACTCT-273    S-S043
73134   AGGTCCGCAGGTTTCA-35    S-M055

[73135 rows x 2 columns]


In [41]:
resDF = pd.concat([idDF, pcaDF], axis=1)
print(resDF.head)
resDF.to_csv('/storage/holab/linxy/DCATS/Ren2021_pca.csv')

<bound method NDFrame.head of                     barcode  sampleID         0         1         2         3  \
0      CCACTACAGTAGCCGA-280    S-S050 -1.760253  3.386553  0.045719 -0.056496   
1      GTGCTTCCAGCTTCGG-162  S-S021-3 -3.632395  7.855037  1.371150 -0.197809   
2       CGACCTTCATTAGCCA-34   S-HC012 -2.659772 -2.739936 -2.674244  2.916973   
3      CACGAATCAATCCAGT-176  S-M009-1 -0.793669 -1.983642 -0.820006  3.541181   
4      AACTGGTTCCAACCAA-154  S-M008-1 -4.218463  6.004827  0.558667 -0.447683   
...                     ...       ...       ...       ...       ...       ...   
73130  CGCGTTTAGTCGTACT-218  S-M040-2 -4.004488  5.054613  0.141885 -0.676689   
73131  GTCAAGTAGACAGGCT-270    S-S041  7.303104  0.463246 -1.615991 -0.066300   
73132  AAGTACCCAATTGCTG-189    S-S020 -1.660278 -2.929883 -3.354737  3.296285   
73133  CTTAACTGTTTACTCT-273    S-S043 -3.276169 -1.390568 -3.157488  2.727828   
73134   AGGTCCGCAGGTTTCA-35    S-M055 -1.953783 -5.673783  4.907259 -3.128402  