The data is stored in an H5AD file, an HDF5-based format that holds an annotated single-cell gene expression matrix
together with its metadata, such as cell annotations, quality-control metrics, and feature (gene) annotations.


In [2]:
import anndata #this python package is used to access data from H5AD files

In [3]:
# Location of the H5AD file on disk; this will differ from machine to machine.
h5ad_path = '/home/data/raw/coin-seq/dixit/DixitRegev2016.h5ad'

# read_h5ad loads the file into an AnnData object, which bundles the gene expression
# matrix (data.X), the cell metadata (data.obs), and the gene metadata (data.var)
data = anndata.read_h5ad(h5ad_path)

In [4]:
# working with the count matrix (data.X)

# data.X is the expression matrix of the AnnData object. It is laid out as
# cells (rows, n_obs) x genes (columns, n_vars) and is held in a sparse format.
X = data.X

# the repr lists the obs columns (information about each cell) and the var columns (information about each gene)
print(data)
print('number of genes: ', data.n_vars) # columns
print('number of cells: ', data.n_obs) # rows

# A sparse matrix prints one "(row, column)  value" line per stored entry, so the
# first index is the cell and the second index is the gene.
print('(a, b) means cell index a expresses gene index b')
subset = X[:20, :20] # first 20 cells (rows) x first 20 genes (columns)
# NOTE(review): an earlier note described these values as normalized, but every entry
# visible in this slice is 1.0, which looks more like raw counts; confirm with the data source.
print(subset) # only the non-zero entries are stored, so only those are printed

print('\nthis is what the matrix looks like:')
# dense view for illustration only; a 0 means no expression of that gene was recorded in that cell
print(subset.todense())


AnnData object with n_obs × n_vars = 51898 × 23529
    obs: 'perturbation', 'grna_lenient', 'target', 'moi', 'cell_line', 'celltype', 'perturbation_type', 'cancer', 'disease', 'guide_id', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'gene_id', 'mt', 'ribo', 'ncounts', 'ncells'
number of genes:  23529
number of cells:  51898
(a, b) means gene index a is expressed in cell index b
  (5, 5)	1.0
  (17, 5)	1.0
  (13, 7)	1.0
  (7, 11)	1.0
  (14, 13)	1.0
  (13, 19)	1.0

this is what the matrix looks like:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [5]:
# working with the cell metadata (data.obs)
# data.obs is a table with one row per cell (n_obs rows); the row labels look like
# cell barcodes, and the columns hold per-cell annotations such as 'perturbation',
# 'target', 'ncounts' and 'percent_mito'
cells = data.obs
print(cells) # each row is a different cell, and each column is one piece of metadata about that cell

                               perturbation                    grna_lenient  \
AAACATACAAGGTA_ph14d_B5          p-sgEGR1-2                      p-sgEGR1-2   
AAACATACCTGCAA_ph14d_B5         p-sgGABPA-9  p-sgGABPA-9;p-INTERGENIC393453   
AAACATACTCTCCG_ph14d_B5  p-INTERGENIC216151              p-INTERGENIC216151   
AAACATTGAACCAC_ph14d_B5          p-sgE2F4-6                      p-sgE2F4-6   
AAACATTGACCTGA_ph14d_B5          p-sgELF1-5   p-INTERGENIC393453;p-sgELF1-5   
...                                     ...                             ...   
TTTCTACTGATGAA_ph14d_F7         p-sgCREB1-2                     p-sgCREB1-2   
TTTGACTGACGGGA_ph14d_F7          p-sgE2F4-6                      p-sgE2F4-6   
TTTGACTGAGCATC_ph14d_F7          p-sgEGR1-3   p-sgEGR1-3;p-INTERGENIC393453   
TTTGACTGCTACCC_ph14d_F7          p-sgIRF1-2   p-sgIRF1-2;p-INTERGENIC393453   
TTTGCATGACCCTC_ph14d_F7          p-sgELF1-5   p-INTERGENIC393453;p-sgELF1-5   

                                   target   moi cel

In [6]:
# working with the gene metadata (data.var)
# data.var is a table with one row per gene (n_vars rows), labelled by gene name;
# its columns are 'gene_id', 'mt', 'ribo', 'ncounts' and 'ncells'
# NOTE(review): 'mt' / 'ribo' presumably flag mitochondrial / ribosomal genes, and
# 'ncounts' / 'ncells' presumably give total counts and number of expressing cells; confirm
genes = data.var
print(genes) # each row is a different gene, and each column is one piece of metadata about that gene

                      gene_id     mt   ribo  ncounts  ncells
MIR1302-10    ENSG00000243485  False  False      1.0       1
FAM138A       ENSG00000237613  False  False      1.0       1
OR4F5         ENSG00000186092  False  False      3.0       3
RP11-34P13.7  ENSG00000238009  False  False    253.0     249
RP11-34P13.8  ENSG00000239945  False  False      9.0       9
...                       ...    ...    ...      ...     ...
AL592183.1    ENSG00000220023  False  False  50960.0   28138
AC011841.1    ENSG00000212884  False  False     34.0      29
AL354822.1    ENSG00000215615  False  False   3890.0    3691
PNRC2-1       ENSG00000215700  False  False     20.0      20
SRSF10-1      ENSG00000215699  False  False   5297.0    4945

[23529 rows x 5 columns]


In [7]:
# dataset annotations
# data.uns holds unstructured, dataset-level annotations
info = data.uns
# The wrapped OrderedDict prints as empty, so no dataset-level annotations are stored here.
# NOTE(review): the 'neighbors' entry under "overloaded keys" appears to be a compatibility
# hook that anndata adds itself, not evidence that a neighbor graph was computed; confirm
# against the installed anndata version.
print(info)

OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].
