# Single Cell Sequencing
## Creates h5ad files 


Uses exon data extracted from `zUMIs_output\expression\*.dgecounts.rds` and gene data from `zUMIs_output\expression\*.gene_names.txt`.<br>
Creates h5ad files from AnnData objects. Each file holds the data from one plate (sample).<br>

In [1]:
# !pip install --quiet anndata
import numpy as np
import pandas as pd
import anndata as ad
import pyreadr
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): presumably done to keep cell output tidy; this also hides
# deprecation warnings raised by the imported libraries — confirm it is intended.
warnings.filterwarnings("ignore")

In [2]:
# Inputs: one RDS file per plate, already read and prepared in R.
rds_file_sample1 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample1\sample1_R.rds'
rds_file_sample2 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample2\sample2_R.rds'
rds_file_sample3 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample3\sample3_R.rds'

# Outputs: one h5ad file per plate, to be consumed by Melisa's script.
h5ad_file_sample1 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample1\sample1_R.h5ad'
h5ad_file_sample2 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample2\sample2_R.h5ad'
h5ad_file_sample3 = r'C:\Users\i\SingleCellSequencing\scs_analysis\my_data\sample3\sample3_R.h5ad'


In [3]:
# Load the RDS file of every plate with pyreadr, in sample order.
# Each result is a keyed container; the DataFrame is extracted from it afterwards.
rds_sample1, rds_sample2, rds_sample3 = (
    pyreadr.read_r(rds_path)
    for rds_path in (rds_file_sample1, rds_file_sample2, rds_file_sample3)
)

In [4]:
# Pull the first stored object out of every pyreadr result; these are the
# per-plate DataFrames (their sizes differ from plate to plate).
rds_sample1_df, rds_sample2_df, rds_sample3_df = (
    read_result[list(read_result)[0]]
    for read_result in (rds_sample1, rds_sample2, rds_sample3)
)

In [5]:
# Put the three plates side by side so that every plate ends up with the same
# set of genes; a gene absent from a plate yields NaN there, which is then
# replaced by a zero count.
unified_df = pd.concat(
    [rds_sample1_df, rds_sample2_df, rds_sample3_df],
    axis=1,
).fillna(0)
unified_df

Unnamed: 0_level_0,AACCATCGGCAACTACCACT,AACCATCGGCAAGTTATCGG,AACCATCGGCCACAATCCAC,AACCATCGGCCACTAACCGG,AACCATCGGCCCGTTCGTAT,AACCATCGGCCGACGGTTGT,AACCATCGGCCGAGTCATTA,AACCATCGGCGAAGCTAGCC,AACCATCGGCGTAACGTCTA,AACCATCGGCTCCAATGCAA,...,TTGTAATGCGGACCTGAAGA,TTGTAATGCGGGTATGGTGA,TTGTAATGCGTCCACAGCCA,TTGTAATGCGTGAATAGAGG,TTGTAATGCGTGCTGTTGTT,TTGTAATGCGTGGAGTTACA,TTGTAATGCGTTACTGTCGT,TTGTAATGCGTTACTTACCG,TTGTAATGCGTTCACGTCAG,TTGTAATGCGTTCGCCACAC
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001.4,0.0,1.0,4.0,0.0,2.0,4.0,2.0,2.0,1.0,0.0,...,0.0,0.0,3.0,0.0,0.0,3.0,2.0,1.0,4.0,0.0
ENSMUSG00000000028.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000031.16,16.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000037.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000049.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERCC-00025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERCC-00071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERCC-00111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERCC-00145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Cut the unified table back into the three per-plate tables. The split is
# purely positional (column order of the concat), and every plate now carries
# the same number of genes.
total_barcodes_df1, total_barcodes_df2, total_barcodes_df3 = (
    plate_df.shape[1]
    for plate_df in (rds_sample1_df, rds_sample2_df, rds_sample3_df)
)

# Column offsets: plate 3 starts where plate 2 ends, and ends at till_col.
from_col = total_barcodes_df1 + total_barcodes_df2
till_col = from_col + total_barcodes_df3

rds_sample1_df = unified_df.iloc[:, :total_barcodes_df1]
rds_sample2_df = unified_df.iloc[:, total_barcodes_df1:from_col]
rds_sample3_df = unified_df.iloc[:, from_col:till_col]

# Show the barcode count per plate as a sanity check.
rds_sample1_df.shape[1], rds_sample2_df.shape[1], rds_sample3_df.shape[1]

(384, 384, 384)

In [7]:
def _build_anndata(counts_df):
    """Return the transposed table and the AnnData object built from it.

    The table is transposed first so that barcodes become rows and gene ids
    become columns. Barcodes are stored in the ``barcode`` column of ``obs``
    and gene ids in the ``gene_id`` column of ``var``.
    """
    barcodes_by_genes = counts_df.transpose()
    adata = ad.AnnData(
        X=barcodes_by_genes.values,  # count matrix as a numpy array
        obs={'barcode': barcodes_by_genes.index},
        var={'gene_id': barcodes_by_genes.columns},
    )
    return barcodes_by_genes, adata


# One AnnData object per plate, created in sample order.
transposed_rds_sample1_df, adata1 = _build_anndata(rds_sample1_df)
transposed_rds_sample2_df, adata2 = _build_anndata(rds_sample2_df)
transposed_rds_sample3_df, adata3 = _build_anndata(rds_sample3_df)

In [8]:
# Display the summary of the second plate's AnnData object as a quick check.
adata2

AnnData object with n_obs × n_vars = 384 × 37241
    obs: 'barcode'
    var: 'gene_id'

In [9]:
# Write one h5ad file per plate; these files are required by Melisa's pipeline.
for _plate_adata, _h5ad_path in (
    (adata1, h5ad_file_sample1),
    (adata2, h5ad_file_sample2),
    (adata3, h5ad_file_sample3),
):
    _plate_adata.write(_h5ad_path)