In [65]:
import os
import glob
import canine
import pandas as pd

In [66]:
!ls

anaconda3  canine  data  ref  res_to_bucket.sh	runs  test.py  Untitled.ipynb


In [67]:
data_set = "July2021_set"
root = "/home/kstewart"    # Top level directory
proj = f"/data/{data_set}" # Project directory
out = f"{root}/runs"       # Output directory
sheet = f"gs://brca-gray/data/{data_set}/sample_sheet.July2021.barcoded.csv" # Sample sheet
chrom = "gs://brca-gray/data/Nov12_set/chromium-shared-sample-indexes-plate.csv" # Chromium protocol

In [68]:
# Set the output directory
outDir=f"{out}/July2021_set"

In [69]:
# Set the FastQ Directory
# fastQs=[f"{root}{proj}/fastqs"]
fastQs = f"{root}/{proj}/fastqs"

In [70]:
# Set the Genome Reference
hgRef=f"{root}/ref/refdata-cellranger-GRCh38-3.0.0"

In [71]:
# Load the sample sheet
S=pd.read_csv(sheet)
if "Unnamed: 0" in S.columns:
    S = pd.read_csv(sheet, index_col=[0])
S

Unnamed: 0,patient,Target cell recovery,chemistry,Index,age,meno,condition,sample_num,barcode
0,MGH21004A,10000,novo,B7,37,pre,WT,210041,WT-8-3
1,MGH21004B,10000,novo,B8,37,pre,WT,210042,WT-8-4
2,MGH21005,800,novo,B9,59,post,WT,21005,WT-9-3
3,MGH21010A,10000,novo,B10,41,pre,BRCA1,210101,BRCA1-8-1
4,MGH21010B,10000,novo,B11,41,pre,BRCA1,210102,BRCA1-8-2
5,MGH21021,800,novo,B12,45,post,BRCA2,21021,BRCA2-11
6,MGH21023,1700,novo,C1,48,peri,BRCA2,21023,BRCA2-12
7,MGH21026,200,novo,C2,48,post,BRCA2,21026,BRCA2-13
8,MGH21016,2000,novo,B2,35,post,BRCA1,21016,BRCA1-9
9,MGH21017,650,novo,B3,53,post,WT,21017,WT-10


In [72]:
# Load the Chromium protocol sample indexes file
B=pd.read_csv(chrom, header=None)
B=B.rename(columns={0:'barcode'}).melt(id_vars='barcode',var_name='barcode_no',value_name='barcode_sequence')
B

Unnamed: 0,barcode,barcode_no,barcode_sequence
0,SI-GA-A1,1,GGTTTACT
1,SI-GA-A2,1,TTTCATGA
2,SI-GA-A3,1,CAGTACTG
3,SI-GA-A4,1,TATGATTC
4,SI-GA-A5,1,CTAGGTGA
...,...,...,...
379,SI-GA-H8,4,AGCGCCTA
380,SI-GA-H9,4,TTTACCCA
381,SI-GA-H10,4,TCGTCACG
382,SI-GA-H11,4,TTGCGCGC


In [73]:
B[B['barcode'].str.contains('SI-GA-B7')]

Unnamed: 0,barcode,barcode_no,barcode_sequence
18,SI-GA-B7,1,AAACCTCA
114,SI-GA-B7,2,GCCTTGGT
210,SI-GA-B7,3,CTGGACTC
306,SI-GA-B7,4,TGTAGAAG


In [74]:
# lst = ['.pdf', '.sh', 'Undet']
# lst = '\t'.join(lst)
# sample_names = list()
# fastq_file_names = list()
# paths = list()
# for sample in os.listdir('data/July2021_set/fastqs'):
#     if any(x in sample for x in lst):
#         continue
#     sample_names.append(sample)
#     folder = os.path.join('data/July2021_set/fastqs', sample)
#     for file in os.listdir(folder):
#         if '.txt' in file:
#             continue
#         elif '.gz' in file:
#             path = os.path.join(f'data/July2021_set/fastqs/{sample}', file)
#             paths.append(path)
#             fastq_file_names.append(file)

# for idx, f in enumerate(fastq_file_names):
#     if idx < 5:
#         print(f)

In [75]:
# Create a DF where the sampleIDs are sorted by their corresponding barcode sequence
L = pd.Series(glob.glob(fastQs + '/*.fastq.gz'))
groups = ['(x\d+\w_)', '(x\w+_\d_)']
subs = list()
for group in groups:
    extract = f'{group}(CKD.*)(_[HGKJKCCX2]+)'
    sub = L.str.extract(extract ,expand=True)
    sub = sub.drop_duplicates().dropna()
    subs.append(sub)
L = pd.concat(subs)
L[0] = L[0] + L[1] + L[2] # Create sample ID from substrings
L[1] = L[1].str.extract('(SI.*)')
L = L.drop(2, axis=1)
L.columns = ['sampleID', 'barcode']
L['barcode'] = L['barcode'].str.replace('_', '-')
L

Unnamed: 0,sampleID,barcode
0,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7
24,x21004B_CKDL210014973-1a-SI_GA_B8_HGKJKCCX2,SI-GA-B8
48,x21005_CKDL210014974-1a-SI_GA_B9_HGKJKCCX2,SI-GA-B9
72,x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2,SI-GA-B10
96,x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2,SI-GA-B11
120,x21011A_CKDL210014984-1a-SI_GA_C3_HGKJKCCX2,SI-GA-C3
144,x21011B_CKDL210014985-1a-SI_GA_C4_HGKJKCCX2,SI-GA-C4
168,x21012_CKDL210014986-1a-SI_GA_C5_HGKJKCCX2,SI-GA-C5
192,x21016_CKDL210014980-1a-SI_GA_B2_HGKJKCCX2,SI-GA-B2
216,x21017_CKDL210014981-1a-SI_GA_B3_HGKJKCCX2,SI-GA-B3


In [77]:
L = pd.merge(L, B, on='barcode', how='left')
L

Unnamed: 0,sampleID,barcode,barcode_no,barcode_sequence
0,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,1,AAACCTCA
1,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,2,GCCTTGGT
2,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,3,CTGGACTC
3,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,4,TGTAGAAG
4,x21004B_CKDL210014973-1a-SI_GA_B8_HGKJKCCX2,SI-GA-B8,1,AAAGTGCT
...,...,...,...,...
63,xGRBI_6_CKDL210014987-1a-SI_GA_C6_HGKJKCCX2,SI-GA-C6,4,CAGCGGAA
64,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,1,GTCTCTCG
65,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,2,AATCTCTC
66,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,3,CGGAGGGA


In [78]:
L.loc[:, "Index"] = L['barcode'].str.replace("SI-GA-", '')
L

Unnamed: 0,sampleID,barcode,barcode_no,barcode_sequence,Index
0,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,1,AAACCTCA,B7
1,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,2,GCCTTGGT,B7
2,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,3,CTGGACTC,B7
3,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,4,TGTAGAAG,B7
4,x21004B_CKDL210014973-1a-SI_GA_B8_HGKJKCCX2,SI-GA-B8,1,AAAGTGCT,B8
...,...,...,...,...,...
63,xGRBI_6_CKDL210014987-1a-SI_GA_C6_HGKJKCCX2,SI-GA-C6,4,CAGCGGAA,C6
64,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,1,GTCTCTCG,C7
65,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,2,AATCTCTC,C7
66,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,3,CGGAGGGA,C7


In [79]:
L = L.drop('barcode_no', axis=1)
L

Unnamed: 0,sampleID,barcode,barcode_sequence,Index
0,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,AAACCTCA,B7
1,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,GCCTTGGT,B7
2,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,CTGGACTC,B7
3,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2,SI-GA-B7,TGTAGAAG,B7
4,x21004B_CKDL210014973-1a-SI_GA_B8_HGKJKCCX2,SI-GA-B8,AAAGTGCT,B8
...,...,...,...,...
63,xGRBI_6_CKDL210014987-1a-SI_GA_C6_HGKJKCCX2,SI-GA-C6,CAGCGGAA,C6
64,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,GTCTCTCG,C7
65,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,AATCTCTC,C7
66,xGRBI_7_CKDL210014988-1a-SI_GA_C7_HGKJKCCX2,SI-GA-C7,CGGAGGGA,C7


In [80]:
L = L.groupby('Index').agg(" ".join).reset_index()

In [81]:
# Merge sample sheet with sorted barcodes
G = pd.merge(S, L, how='right', on='Index')
G

Unnamed: 0,patient,Target cell recovery,chemistry,Index,age,meno,condition,sample_num,barcode_x,sampleID,barcode_y,barcode_sequence
0,MGH21010A,10000,novo,B10,41,pre,BRCA1,210101,BRCA1-8-1,x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2 x...,SI-GA-B10 SI-GA-B10 SI-GA-B10 SI-GA-B10,ACCGTATG GATTAGAT CTGACTGA TGACGCCC
1,MGH21010B,10000,novo,B11,41,pre,BRCA1,210102,BRCA1-8-2,x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2 x...,SI-GA-B11 SI-GA-B11 SI-GA-B11 SI-GA-B11,GTTCCTCA AGGTACGC TAAGTATG CCCAGGAT
2,MGH21021,800,novo,B12,45,post,BRCA2,21021,BRCA2-11,x21021_CKDL210014977-1a-SI_GA_B12_HGKJKCCX2 x2...,SI-GA-B12 SI-GA-B12 SI-GA-B12 SI-GA-B12,TACCACCA CTAAGTTT GGGTCAAG ACTGTGGC
3,MGH21016,2000,novo,B2,35,post,BRCA1,21016,BRCA1-9,x21016_CKDL210014980-1a-SI_GA_B2_HGKJKCCX2 x21...,SI-GA-B2 SI-GA-B2 SI-GA-B2 SI-GA-B2,TACTCTTC CCTGTGCG GGACACGT ATGAGAAA
4,MGH21017,650,novo,B3,53,post,WT,21017,WT-10,x21017_CKDL210014981-1a-SI_GA_B3_HGKJKCCX2 x21...,SI-GA-B3 SI-GA-B3 SI-GA-B3 SI-GA-B3,GTGTATTA TGTGCGGG ACCATAAC CAACGCCT
5,MGH21031A,3500,novo,B4,34,pre,BRCA1,210311,BRCA1-9-1,x21031A_CKDL210014982-1a-SI_GA_B4_HGKJKCCX2 x2...,SI-GA-B4 SI-GA-B4 SI-GA-B4 SI-GA-B4,ACTTCATA GAGATGAC TGCCGTGG CTAGACCT
6,MGH21031B,3500,novo,B5,34,pre,BRCA1,210312,BRCA1-9-2,x21031B_CKDL210014983-1a-SI_GA_B5_HGKJKCCX2 x2...,SI-GA-B5 SI-GA-B5 SI-GA-B5 SI-GA-B5,AATAATGG CCAGGGCA TGCCTCAT GTGTCATC
7,MGH21004A,10000,novo,B7,37,pre,WT,210041,WT-8-3,x21004A_CKDL210014972-1a-SI_GA_B7_HGKJKCCX2 x2...,SI-GA-B7 SI-GA-B7 SI-GA-B7 SI-GA-B7,AAACCTCA GCCTTGGT CTGGACTC TGTAGAAG
8,MGH21004B,10000,novo,B8,37,pre,WT,210042,WT-8-4,x21004B_CKDL210014973-1a-SI_GA_B8_HGKJKCCX2 x2...,SI-GA-B8 SI-GA-B8 SI-GA-B8 SI-GA-B8,AAAGTGCT GCTACCTG TGCTGTAA CTGCAAGC
9,MGH21005,800,novo,B9,59,post,WT,21005,WT-9-3,x21005_CKDL210014974-1a-SI_GA_B9_HGKJKCCX2 x21...,SI-GA-B9 SI-GA-B9 SI-GA-B9 SI-GA-B9,CTGTAACT TCTAGCGA AGAGTGTG GACCCTAC


In [82]:
cellranger_cmd = 'cellranger count --id=${id} ' \
               + '--transcriptome=${ref} ' \
               + '--fastqs=${fastQs} ' \
               + '--sample=${sample} ' \
               + '--expect-cells=${expect} ' \
               + '--nosecondary ' \
               + '--localcores=${nthread}'

In [83]:
# Write the Canine .yaml
canine_conf = {'retry'        : 0,
	       'name'         : 'cellranger',
	       'inputs'       : {'fastQs'  : fastQs,
	                         'ref'     : hgRef,
	                         'id'      : list(G['patient']),
	                         'sample'  : list(G['sampleID'].str.replace(' ',',')),
	                         'expect'  : list(G['Target cell recovery']),
	                         'nthread' : 16},
	       'script'       : ['rm -f */_lock', cellranger_cmd],
	       'localization' : {'strategy'    :'NFS',
	                         'staging_dir' : outDir},
               'resources'   : {'mem':'100G','cpus-per-task':16,'nodes':1}
               }

In [84]:
canine_conf

{'retry': 0,
 'name': 'cellranger',
 'inputs': {'fastQs': '/home/kstewart//data/July2021_set/fastqs',
  'ref': '/home/kstewart/ref/refdata-cellranger-GRCh38-3.0.0',
  'id': ['MGH21010A',
   'MGH21010B',
   'MGH21021',
   'MGH21016',
   'MGH21017',
   'MGH21031A',
   'MGH21031B',
   'MGH21004A',
   'MGH21004B',
   'MGH21005',
   'MGH21023',
   'MGH21026',
   'MGH21011A',
   'MGH21011B',
   'MGH21012',
   'GRBI_6',
   'GRBI_7'],
  'sample': ['x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2,x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2,x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2,x21010A_CKDL210014975-1a-SI_GA_B10_HGKJKCCX2',
   'x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2,x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2,x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2,x21010B_CKDL210014976-1a-SI_GA_B11_HGKJKCCX2',
   'x21021_CKDL210014977-1a-SI_GA_B12_HGKJKCCX2,x21021_CKDL210014977-1a-SI_GA_B12_HGKJKCCX2,x21021_CKDL210014977-1a-SI_GA_B12_HGKJKCCX2,x21021_CKDL210014977-1a-SI_GA_B12_HGKJKCCX2',
   'x