# Single Cell Sequencing
## Creates h5ad files 


Uses exon data extracted from `zUMIs_output\expression\*.dgecounts.rds` and gene data from `zUMIs_output\expression\*.gene_names.txt`.<br>
Creates h5ad files from AnnData objects. Each file holds the data from one plate (sample).<br>

In [1]:
# !pip install --quiet anndata
import numpy as np
import pandas as pd
import anndata as ad
import pyreadr
import config as general_paths
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pull the project directories out of the local `config` module in one go.
figure_path, data_path, output_path, local_path = (
    general_paths.figures_path,
    general_paths.data_path,
    general_paths.output_path,
    general_paths.local_path,
)

In [3]:
def _sample_io_paths(sample_name):
    """Return the (RDS input, h5ad output) paths for one sample.

    The RDS file is the one read and prepared in R; the h5ad file is the one
    that will be used for Melisa's script. Both live under
    ``<local_path>/<sample_name>/counts_umi/``.
    """
    # The relative part is kept as one forward-slash string, so the joined
    # path is exactly "<local_path><sep>sampleN/counts_umi/sampleN_R.<ext>".
    relative_stem = f"{sample_name}/counts_umi/{sample_name}_R"
    return (
        os.path.join(local_path, relative_stem + ".rds"),
        os.path.join(local_path, relative_stem + ".h5ad"),
    )


rds_file_sample1, h5ad_file_sample1 = _sample_io_paths("sample1")
rds_file_sample2, h5ad_file_sample2 = _sample_io_paths("sample2")
rds_file_sample3, h5ad_file_sample3 = _sample_io_paths("sample3")
rds_file_sample4, h5ad_file_sample4 = _sample_io_paths("sample4")
rds_file_sample5, h5ad_file_sample5 = _sample_io_paths("sample5")
rds_file_sample6, h5ad_file_sample6 = _sample_io_paths("sample6")
rds_file_sample7, h5ad_file_sample7 = _sample_io_paths("sample7")

# file with gene ids and names from kallisto
t2g_file_path = os.path.join(local_path, "t2g.txt")


In [4]:
# Load the RDS export of every plate with pyreadr, in sample order.
# Each result is a dict-like container; the table itself is pulled out of it
# by key in a later step.
(
    rds_sample1,
    rds_sample2,
    rds_sample3,
    rds_sample4,
    rds_sample5,
    rds_sample6,
    rds_sample7,
) = [
    pyreadr.read_r(rds_path)
    for rds_path in (
        rds_file_sample1,
        rds_file_sample2,
        rds_file_sample3,
        rds_file_sample4,
        rds_file_sample5,
        rds_file_sample6,
        rds_file_sample7,
    )
]

In [5]:
def _first_table(rds_result):
    """Return the entry stored under the first key of a pyreadr result."""
    first_key = list(rds_result.keys())[0]
    return rds_result[first_key]


# One dataframe per plate (they are different sizes).
rds_sample1_df = _first_table(rds_sample1)
rds_sample2_df = _first_table(rds_sample2)
rds_sample3_df = _first_table(rds_sample3)
rds_sample4_df = _first_table(rds_sample4)
rds_sample5_df = _first_table(rds_sample5)
rds_sample6_df = _first_table(rds_sample6)
rds_sample7_df = _first_table(rds_sample7)

In [6]:
# Put all plates side by side (column-wise) so that every plate ends up with
# the same set of genes (rows). Genes missing from a plate come out as NaN
# and are turned into zero counts.
plate_frames = [
    rds_sample1_df,
    rds_sample2_df,
    rds_sample3_df,
    rds_sample4_df,
    rds_sample5_df,
    rds_sample6_df,
    rds_sample7_df,
]
unified_df = pd.concat(plate_frames, axis=1).fillna(0.0)
unified_df

Unnamed: 0_level_0,AACCATCGGCAACTACCACT,AACCATCGGCAAGTTATCGG,AACCATCGGCCACAATCCAC,AACCATCGGCCACTAACCGG,AACCATCGGCCCGTTCGTAT,AACCATCGGCCGACGGTTGT,AACCATCGGCCGAGTCATTA,AACCATCGGCGAAGCTAGCC,AACCATCGGCGTAACGTCTA,AACCATCGGCTCCAATGCAA,...,TTGTTACTCCCGGTTGGCTA,TTGTTACTCCGAATGAACGT,TTGTTACTCCGAGCGGAACT,TTGTTACTCCTGACAGAGGT,TTGTTACTCCTGCAGGCTAC,TTGTTACTCCTTACCGGACC,TTGTTACTCCTTATGCCGCT,TTGTTACTCCTTATTCGCCT,TTGTTACTCCTTCTCGCCTC,TTGTTACTCCTTGCTAAGGA
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001.4,0.0,1.0,4.0,0.0,2.0,4.0,2.0,2.0,1.0,0.0,...,2.0,6.0,2.0,2.0,1.0,3.0,2.0,4.0,5.0,4.0
ENSMUSG00000000028.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
ENSMUSG00000000031.16,16.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
ENSMUSG00000000037.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000049.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000118124.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118151.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118223.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000118284.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# t2g = pd.read_csv(t2g_file_path, header=None, names=["tid", "gene_id", "gene_name"], sep="\t")
# t2g.index = t2g.gene_id
# NOTE: for that we use the t2g variable, which holds the t2g file (the file linking gene code to gene name)

# t2g = t2g.loc[~t2g.index.duplicated(keep='first')]
# gene_ids_kallisto = t2g['gene_id'].tolist()
# gene_ids_scilifelab = list(unified_df.index)
# diffelence1 = list(set(gene_ids_kallisto) - set(gene_ids_scilifelab))
# diffelence2 = list(set(gene_ids_scilifelab) - set(gene_ids_kallisto))
# print(f"Gene ids that are in kallisto file but not in scilifelab: {len(diffelence1)}\nGene ids that are in scilifelab but not in kallisto: {len(diffelence2)}") 
# # there are 308 gene ids from scilifelab that are not in kallisto ids!


In [8]:
# Split the unified frame back into the 7 original per-plate dataframes.
# Each plate keeps its own barcodes (columns) but now has the full, shared
# set of genes (rows). Slicing is positional, using the column counts the
# plates had before the merge.
total_barcodes_df1 = rds_sample1_df.shape[1]
total_barcodes_df2 = rds_sample2_df.shape[1]
total_barcodes_df3 = rds_sample3_df.shape[1]
total_barcodes_df4 = rds_sample4_df.shape[1]
total_barcodes_df5 = rds_sample5_df.shape[1]
total_barcodes_df6 = rds_sample6_df.shape[1]
total_barcodes_df7 = rds_sample7_df.shape[1]

plate_widths = (
    total_barcodes_df1,
    total_barcodes_df2,
    total_barcodes_df3,
    total_barcodes_df4,
    total_barcodes_df5,
    total_barcodes_df6,
    total_barcodes_df7,
)

# Walk across the columns: each plate starts where the previous one ended.
plate_slices = []
till_col = 0
for n_barcodes in plate_widths:
    from_col, till_col = till_col, till_col + n_barcodes
    plate_slices.append(unified_df.iloc[:, from_col:till_col])

(
    rds_sample1_df,
    rds_sample2_df,
    rds_sample3_df,
    rds_sample4_df,
    rds_sample5_df,
    rds_sample6_df,
    rds_sample7_df,
) = plate_slices

# Show the number of barcodes per plate as a sanity check.
tuple(plate_slice.shape[1] for plate_slice in plate_slices)

(384, 384, 384, 384, 384, 384, 384)

In [9]:
def _plate_to_anndata(plate_df):
    """Transpose one plate's table and wrap it in an AnnData object.

    The table is transposed first so that barcodes become rows and gene ids
    become columns. Returns the transposed dataframe together with the
    AnnData object built from it.
    """
    barcodes_by_genes = plate_df.transpose()
    annotated = ad.AnnData(
        X=barcodes_by_genes.values,  # the counts as a numpy array
        obs={'barcode': barcodes_by_genes.index},
        var={'gene_id': barcodes_by_genes.columns},
    )
    return barcodes_by_genes, annotated


transposed_rds_sample1_df, adata1 = _plate_to_anndata(rds_sample1_df)
transposed_rds_sample2_df, adata2 = _plate_to_anndata(rds_sample2_df)
transposed_rds_sample3_df, adata3 = _plate_to_anndata(rds_sample3_df)
transposed_rds_sample4_df, adata4 = _plate_to_anndata(rds_sample4_df)
transposed_rds_sample5_df, adata5 = _plate_to_anndata(rds_sample5_df)
transposed_rds_sample6_df, adata6 = _plate_to_anndata(rds_sample6_df)
transposed_rds_sample7_df, adata7 = _plate_to_anndata(rds_sample7_df)

In [10]:
# Write one h5ad file per plate; these files are the input needed by
# Melisa's pipeline further downstream.
for plate_adata, h5ad_target in (
    (adata1, h5ad_file_sample1),
    (adata2, h5ad_file_sample2),
    (adata3, h5ad_file_sample3),
    (adata4, h5ad_file_sample4),
    (adata5, h5ad_file_sample5),
    (adata6, h5ad_file_sample6),
    (adata7, h5ad_file_sample7),
):
    plate_adata.write(h5ad_target)