In [1]:
import os
import pandas as pd

In [2]:
# List the parent directory one level deep to show where the input folders live.
!tree .. -L 1

[01;34m..[00m
├── README.md
├── [01;34mdataset[00m
├── [01;34mdocs[00m
├── [01;34mgtf[00m
├── [01;34mh5[00m
└── process-overview.pdf

4 directories, 2 files


In [3]:
# Input locations, all relative to the notebook's working directory.
path_gtf = "../gtf/annotations.gtf.gz"
path_data = "../dataset/"
# Both per-sample files live directly inside path_data.
path_aligned_bam, path_h5 = (
    os.path.join(path_data, file_name)
    for file_name in ("test_Aligned.out.bam", "test.h5")
)

## Load ReadArray (*.h5)

In [4]:
from seqc.read_array import ReadArray

In [5]:
# Load the ReadArray stored in the .h5 file that path_h5 points to.
ra = ReadArray.load(path_h5)

In [6]:
ra

<seqc.read_array.ReadArray at 0x7fbf50b2a310>

## Explore ReadArray

### ra.filter_codes

In [7]:
# Mapping of filter name -> integer code. Every code in the output is a distinct power
# of two; presumably they are OR-ed into the `status` field of ra.data (TODO confirm in seqc).
ra.filter_codes

{'no_gene': 1,
 'rmt_error': 2,
 'cell_error': 4,
 'low_polyt': 8,
 'gene_not_unique': 16,
 'primer_missing': 32,
 'lonely_triplet': 64}

### ra.data

In [8]:
# Per-read records as a NumPy structured array; the dtype in the output lists its
# fields: status, cell, rmt and n_poly_t.
ra.data

array([(1, 200983461452598, 38446774684, 0),
       (1, 232448540961013, 40377936093, 0),
       (0, 200570500499883, 41267181429, 0), ...,
       (1, 169038985719156, 33098951083, 0),
       (1, 231840668564334, 31598787309, 0),
       (1, 161332457818475, 32723551981, 0)],
      dtype=[('status', 'u1'), ('cell', '<i8'), ('rmt', '<i8'), ('n_poly_t', 'u1')])

In [9]:
len(ra.data)

252735

In [10]:
# Wrap the structured array in a DataFrame so that each field becomes a column.
df_data = pd.DataFrame(ra.data)
df_data

Unnamed: 0,status,cell,rmt,n_poly_t
0,1,200983461452598,38446774684,0
1,1,232448540961013,40377936093,0
2,0,200570500499883,41267181429,0
3,1,231772364696924,41564723102,0
4,1,227499945647347,39153163702,0
...,...,...,...,...
252730,1,169082619287347,49237838683,0
252731,1,164701771590046,57689455348,0
252732,1,169038985719156,33098951083,0
252733,1,231840668564334,31598787309,0


### ra.genes

In [11]:
ra.genes

array([     0,      0, 225067, ...,      0,      0,      0], dtype=int32)

In [12]:
len(ra.genes)

252735

In [13]:
# Single-column frame holding ra.genes (same length as ra.data), on the default index.
df_genes = pd.DataFrame({"genes": ra.genes})
df_genes

Unnamed: 0,genes
0,0
1,0
2,225067
3,0
4,0
...,...
252730,0
252731,0
252732,0
252733,0


### ra.positions

In [14]:
ra.positions

array([       0,        0, 15611873, ...,        0,        0,        0],
      dtype=int32)

In [15]:
len(ra.positions)

252735

In [16]:
# Single-column frame holding ra.positions (same length as ra.data), on the default index.
df_pos = pd.DataFrame({"pos": ra.positions})
df_pos

Unnamed: 0,pos
0,0
1,0
2,15611873
3,0
4,0
...,...
252730,0
252731,0
252732,0
252733,0


## Merge

In [17]:
# Put the read records, gene ids and positions side by side. All three frames use the
# same default integer index, so rows line up one-to-one.
df_merged = pd.concat(
    objs=[df_data, df_genes, df_pos],
    axis="columns",
)
df_merged

Unnamed: 0,status,cell,rmt,n_poly_t,genes,pos
0,1,200983461452598,38446774684,0,0,0
1,1,232448540961013,40377936093,0,0,0
2,0,200570500499883,41267181429,0,225067,15611873
3,1,231772364696924,41564723102,0,0,0
4,1,227499945647347,39153163702,0,0,0
...,...,...,...,...,...,...
252730,1,169082619287347,49237838683,0,0,0
252731,1,164701771590046,57689455348,0,0,0
252732,1,169038985719156,33098951083,0,0,0
252733,1,231840668564334,31598787309,0,0,0


## Translate Cell Barcode

In [18]:
from seqc.sequence.encodings import DNA3Bit

In [19]:
# Codec instance used below to decode the integer-encoded `cell` and `rmt` values.
dna3bit = DNA3Bit()

In [20]:
# Decode each integer `cell` value into its nucleotide string. The trailing .decode()
# converts what dna3bit.decode returns (apparently bytes) into str.
cb = df_merged["cell"].map(lambda code: dna3bit.decode(code).decode())
cb

0         GGGAGATTCACGGACC
1         CACCAAAAGGACATCG
2         GGACGTCGTTCAACGT
3         CAGAGCCGTGCCCGTA
4         CTGCATCGTAAGCTCT
                ...       
252730    ACTATCTGTGTGTACT
252731    AGTAGCTCAGGCACTC
252732    ACTTCGCAGAACCGCA
252733    CAGGGCTAGATTGGGC
252734    AAGTGAATCGGAAGGT
Name: cell, Length: 252735, dtype: object

In [21]:
# Decode each integer `rmt` value into its nucleotide string. The trailing .decode()
# converts what dna3bit.decode returns (apparently bytes) into str.
umi = df_merged["rmt"].map(lambda code: dna3bit.decode(code).decode())
umi

0         ATCTACCTACTA
1         AGACGGACATTG
2         ACTTGGCTTGCG
3         ACGGTACGTCTC
4         AATGGGTGCCCC
              ...     
252730    GGCCCTTAGGTT
252731    CGGCATCTGTCA
252732    TCCACCTCACGT
252733    TGTTTTCTGTGG
252734    TCTCTCTATTGG
Name: rmt, Length: 252735, dtype: object

In [22]:
# Attach the decoded strings as two new columns, `cb` first and then `umi`.
df_merged = df_merged.assign(cb=cb, umi=umi)

In [23]:
df_merged

Unnamed: 0,status,cell,rmt,n_poly_t,genes,pos,cb,umi
0,1,200983461452598,38446774684,0,0,0,GGGAGATTCACGGACC,ATCTACCTACTA
1,1,232448540961013,40377936093,0,0,0,CACCAAAAGGACATCG,AGACGGACATTG
2,0,200570500499883,41267181429,0,225067,15611873,GGACGTCGTTCAACGT,ACTTGGCTTGCG
3,1,231772364696924,41564723102,0,0,0,CAGAGCCGTGCCCGTA,ACGGTACGTCTC
4,1,227499945647347,39153163702,0,0,0,CTGCATCGTAAGCTCT,AATGGGTGCCCC
...,...,...,...,...,...,...,...,...
252730,1,169082619287347,49237838683,0,0,0,ACTATCTGTGTGTACT,GGCCCTTAGGTT
252731,1,164701771590046,57689455348,0,0,0,AGTAGCTCAGGCACTC,CGGCATCTGTCA
252732,1,169038985719156,33098951083,0,0,0,ACTTCGCAGAACCGCA,TCCACCTCACGT
252733,1,231840668564334,31598787309,0,0,0,CAGGGCTAGATTGGGC,TGTTTTCTGTGG


## Filtering

In [24]:
# reads that are mapped to a gene (i.e. status=0); keep only the first such read
x = df_merged.loc[df_merged["status"] == 0].head(1)
x

Unnamed: 0,status,cell,rmt,n_poly_t,genes,pos,cb,umi
2,0,200570500499883,41267181429,0,225067,15611873,GGACGTCGTTCAACGT,ACTTGGCTTGCG


In [25]:
# Cell barcode string of the selected read. This rebinds `cb` from the full Series of
# decoded barcodes to a single string, which the BAM lookup below interpolates.
cb = x.iloc[0]["cb"]
cb

'GGACGTCGTTCAACGT'

In [26]:
# UMI string of the selected read. This rebinds `umi` from the full Series of decoded
# UMIs to a single string.
umi = x.iloc[0]["umi"]
umi

'ACTTGGCTTGCG'

In [27]:
# Build the gene identifier: "ENSG" followed by the numeric gene id of the selected
# read, zero-padded to 11 digits.
gene_id = "ENSG{:011d}".format(x.iloc[0]["genes"])
gene_id

'ENSG00000225067'

### Look Up BAM

In [28]:
# Dump the aligned BAM as text and keep only records containing both the value of `cb`
# and the value of `umi` (IPython substitutes the Python variables before running the shell).
!samtools view "$path_aligned_bam" | grep "$cb" | grep "$umi"

:GGACGTCGTTCAACGT:ACTTGGCTTGCG:;A00228:279:HFWFVDMXX:1:1470:21450:5102	0	chr19	15611873	255	24S67M	*	0	0	AGCAGTGGTATCAACGCAGAGTACATGGGATCATCAAGTTTCCGCTGACCACTGAGTCTGCCATGAAGAAGATAGAAGACAACAACACACT	FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFF:FFFFFFFFFFFFFFF	NH:i:1	HI:i:1	AS:i:61	nM:i:2


### Look Up GTF

In [29]:
# Print every line of the gzipped GTF that mentions the gene id built above. The
# pattern is quoted so it always reaches grep as a single word.
!gunzip -c "$path_gtf" | grep "$gene_id"

chr19	HAVANA	gene	15611657	15612122	.	+	.	gene_id "ENSG00000225067.4"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "RPL23AP2"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000158039.3";
chr19	HAVANA	transcript	15611657	15612122	.	+	.	gene_id "ENSG00000225067.4"; transcript_id "ENST00000471227.3"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "RPL23AP2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "RPL23AP2-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000158039.3"; havana_transcript "OTTHUMT00000350064.3";
chr19	HAVANA	exon	15611657	15612122	.	+	.	gene_id "ENSG00000225067.4"; transcript_id "ENST00000471227.3"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "RPL23AP2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "RPL23AP2-001"; exon_number 1; exon_id "ENSE00001824478.3";