# split by innerBC

In [3]:
import gzip, time
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd
import seaborn as sns
from Bio.Seq import Seq
from collections import Counter
import os

In [2]:
def get_innerBC(r2_line):
    innerBC = r2_line[0:6]
    num_Ns = sum([c=='N' for c in innerBC])
    innerBC_ref = ['AATCCG','AATCGC','AAGTCG','AAGCTC','AACGTG','AACTGC','ACGTAG','ATAGCG',
                  'ATTCCG','ATGCCA','ATGTTC','ATCACG','ATCCAG','ACAGTG','ACTCTG','ACTTGA',
                  'ACGATC']
    if num_Ns > 1: return None 
    elif num_Ns == 1 and innerBC == 'ANTCCG':return None # will miss some reads
    elif num_Ns == 1 and np.any([innerBC.replace('N',c) in innerBC_ref for c in 'ACTG']): 
        return [innerBC.replace('N',c) for c in 'ACTG' if innerBC.replace('N',c) in innerBC_ref ][0]
    elif innerBC in innerBC_ref: return innerBC
    else: return None
    
def hamming(bc1,bc2): return np.sum([x1 != x2 for x1,x2 in zip(bc1,bc2)])

In [3]:
sample_paths = {
    'r1': {
        'target_fastq_paths':['/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz']
    },
    'r2': {
        'target_fastq_paths':['/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz']
    },
    'r3': {
        'target_fastq_paths':['/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz']
    }
}

In [4]:
innerBC_reads_count = {}
innerBC_reads_count['r1'] = {}
innerBC_reads_count['r2'] = {}

for sample,paths in sample_paths.items():
    for fastq_path in paths['target_fastq_paths']:
        R1 = gzip.open(fastq_path.format('R1'))
        R2 = gzip.open(fastq_path.format('R2'))
        counter = 0
        start_time = time.time()
        while True:
            counter += 1
            if counter % 1000000 == 0: print(fastq_path+ ': Processed {} reads in {} seconds'.format(counter, time.time()-start_time))
            try:
                r1_line = R1.readline().decode('utf-8')
                r2_line = R2.readline().decode('utf-8')
            except:
                print('ERROR extracting {}'.format(fastq_path))
                break
            if r2_line == '' : break
            if r2_line[0] == '@': 
                read2_name = r2_line
                read1_name = r1_line
                
                read1_seq = R1.readline().decode('utf-8')
                read2_seq = R2.readline().decode('utf-8')
                
                if get_innerBC(read2_seq)!=None:
                    innerBC = get_innerBC(read2_seq)
                    read2_QI = R2.readline().decode('utf-8')
                    read2_baseQ = R2.readline().decode('utf-8')
                    read2 = [read2_name , read2_seq , read2_QI , read2_baseQ]

                    read1_QI = R1.readline().decode('utf-8')
                    read1_baseQ = R1.readline().decode('utf-8')
                    read1 = [read1_name , read1_seq , read1_QI , read1_baseQ]

                    #print(read1)
                    
                    if innerBC in innerBC_reads_count['r1']:
                        innerBC_reads_count['r1'][innerBC] += read1
                        innerBC_reads_count['r2'][innerBC] += read2
                    else:
                        innerBC_reads_count['r1'][innerBC] = read1
                        innerBC_reads_count['r2'][innerBC] = read2
                else:
                    read2_QI = R2.readline().decode('utf-8')
                    read2_baseQ = R2.readline().decode('utf-8')
                    read2 = [read2_name , read2_seq , read2_QI , read2_baseQ]
                    
                    read1_QI = R1.readline().decode('utf-8')
                    read1_baseQ = R1.readline().decode('utf-8')
                    read1 = [read1_name , read1_seq , read1_QI , read1_baseQ]

                    if 'unclassified' in innerBC_reads_count['r1']:
                        innerBC_reads_count['r1']['unclassified'] += read1
                        innerBC_reads_count['r2']['unclassified'] += read2
                    else:
                        innerBC_reads_count['r1']['unclassified'] = read1
                        innerBC_reads_count['r2']['unclassified'] = read2
                #break     

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 1000000 reads in 10.442124128341675 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 2000000 reads in 20.857176542282104 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 3000000 reads in 31.1675865650177 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fa

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 32000000 reads in 334.9098677635193 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 33000000 reads in 345.64138007164 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 34000000 reads in 356.3042137622833 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fa

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 63000000 reads in 664.1743316650391 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 64000000 reads in 674.9351074695587 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 65000000 reads in 685.6398892402649 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 94000000 reads in 994.449259519577 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 95000000 reads in 1004.9691958427429 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 96000000 reads in 1015.5487353801727 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 125000000 reads in 1325.0292806625366 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 126000000 reads in 1335.5870776176453 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combined_{}.fastq.gz: Processed 127000000 reads in 1346.273476600647 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01148_MCS-20240724-L-03-2024-07-291230/Sample_SQ24054800-YJH-cas9-73-YJH-cas9-73/SQ24054800-YJH-cas9-73-YJH-cas9-73_combine

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 8000000 reads in 85.66318774223328 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 9000000 reads in 96.23988199234009 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 10000000 reads in 107.04585146903992 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.f

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 39000000 reads in 419.71070766448975 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 40000000 reads in 430.70273900032043 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 41000000 reads in 441.63268089294434 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-131106/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 24000000 reads in 257.7742564678192 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 25000000 reads in 268.5402684211731 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 26000000 reads in 279.43656754493713 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}

/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 55000000 reads in 594.6338248252869 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 56000000 reads in 605.4824240207672 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.fastq.gz: Processed 57000000 reads in 616.264365196228 seconds
/syn1/liangzhen/jinhua_jilab_project/data/DNA_Amplicon/multiome/RNA/s2164g01162_MCS-20240905-L-02-2024-09-141723/Sample_SQ24067738-YJH-cas9-76-YJH-cas9-76/SQ24067738-YJH-cas9-76-YJH-cas9-76_combined_{}.f

In [5]:
os.system('rm /syn1/liangzhen/jinhua_jilab_project/result/DNA_Amplicon/multiome/fastq/*')
for fq,pooled_reads in innerBC_reads_count.items():
    for innerBC,reads_set in pooled_reads.items():
        #print(innerBC+fq)
        with open('/syn1/liangzhen/jinhua_jilab_project/result/DNA_Amplicon/multiome/fastq'+'/'+innerBC+'_'+fq+'.fq','w+') as f:
            f.writelines(reads_set)

rm: cannot remove ‘/syn1/liangzhen/jinhua_jilab_project/result/DNA_Amplicon/multiome/fastq/*’: No such file or directory


In [6]:
os.system('ls /syn1/liangzhen/jinhua_jilab_project/result/DNA_Amplicon/multiome/fastq/*.fq|xargs -I {} gzip {}')

0