Single-cell sequencing simulation program
Set up a working directory (WorkPath/) and create the following new folders in it: fragment_data, pcr_control_data, umi_data, Simulation_Path.
Put the data files (Test_Data.txt, Error_Profile.txt, Quality_Profile.txt) and the scripts (SSCRNA.py, MyProcess10.py, S_S2.py, simulation_in_preDatabase.py) in the working directory.
Then run the following steps, as recorded in the Main.py file:
import sys
sys.path.append('WorkPath/')
import SSCRNA as sc
import time
import os
import pickle
# Input table placed in the working directory (see the setup steps).
# The path is joined without a doubled slash between directory and file name.
filename = 'WorkPath/' + 'Test_Data.txt'
# Read_Gene_Table returns two values; downstream code uses the first as the
# per-cell profile data and the second as the list of gene names
# (presumably versioned gene IDs, since del_version is applied later -- confirm).
cell_pro, gene_na = sc.Read_Gene_Table(filename)
# The sequence data can be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.pc_transcripts.fa.gz (decompress it to gencode.v32.pc_transcripts.fa before use).
# Build the pre-simulation data (fragments, PCR control data, UMI bank) and
# store the barcode / gene-name / sequence bundle in the working directory.
# Relies on `sc` (SSCRNA), `pickle`, `cell_pro` and `gene_na` defined above.

# Uncompressed transcript FASTA file, resolved relative to the current directory.
f = r"gencode.v32.pc_transcripts.fa"

# Read the record names and split them into fields.
f_na = sc.read_names_from_file(f)
genes_in = sc.split_GENENAMES(f_na)
# The literal 2 selects one item per record -- presumably the gene-ID field;
# TODO confirm against SSCRNA.get_iterm_list.
gene_name_list = sc.get_iterm_list(genes_in, 2)

# Apply del_version to both name lists so they are compared in the same form.
gene_name_list = sc.del_version(gene_name_list)
sel_genes = sc.del_version(gene_na)

# Locate the selected genes in the FASTA file and load only those entries.
sel_index = sc.get_gene_index(gene_name_list, sel_genes)
f_seq = sc.read_seqs_from_file(f, sel_index)
# NOTE: f_na is rebound here from the raw record names to the selected gene names.
f_na = sc.get_sel_gene_name(gene_name_list, sel_index)
cell_pro_n = sc.get_map_gene_profile(cell_pro, sel_index)

# Collect cell names and their profiles in matching order.
cell_names = list(cell_pro_n)
cells_profile = [cell_pro_n[cell_name] for cell_name in cell_names]

# Fragment data for every cell profile.
frag_list_path = 'WorkPath/' + 'fragment_data/'
sc.multi_cell2(frag_list_path, cells_profile, f_na, f_seq)

# PCR control data built from the fragment data
# (meaning of the literal 3 is not visible here -- see SSCRNA.PCR_database).
pcr_list_path = 'WorkPath/' + 'pcr_control_data/'
sc.PCR_database(pcr_list_path, frag_list_path, 3)

# UMI bank built from the fragment data
# (the literal 8 is presumably the UMI length -- TODO confirm).
umi_path = 'WorkPath/' + 'umi_data/'
sc.get_UMI_bank(frag_list_path, umi_path, 8)

# Cell barcode bank (the arguments 27, 2, 22 are defined by SSCRNA.get_barcord_bank).
cell_barcord = sc.get_barcord_bank(27, 2, 22)

# Save barcodes, gene names and sequences together as one pickled list.
xc = [cell_barcord, f_na, f_seq]
with open('WorkPath/' + 'cell_barcord_gene_name_seq.pickle', 'wb') as handle:
    pickle.dump(xc, handle)
Run the simulation program (python simulation_in_preDatabase.py [number of reads] [working_path] [process]).
Make sure you are using a Linux system, and run the command as follows (number of reads = 200, process = 5):
python simulation_in_preDatabase.py 200 'WorkPath/' 5