In [None]:
import os
import glob
import canine
import pandas as pd

In [None]:
# Interactive sanity check: list the contents of the current working directory.
!ls

In [None]:
# ---- Run configuration ----
# Name of the data set; reused below for the project path and the sample-sheet URI.
data_set = "July2021_set"
# Top level directory
root = "/home/kstewart"
# Project directory (note: the value starts with a slash)
proj = "/data/" + data_set
# Output directory
out = root + "/runs"
# Sample sheet
sheet = "gs://brca-gray/data/" + data_set + "/sample_sheet.July2021.barcoded.csv"
# Chromium protocol
chrom = "gs://brca-gray/data/Nov12_set/chromium-shared-sample-indexes-plate.csv"

In [None]:
# Set the output directory.
# Built from `data_set` rather than a repeated literal, so the staging
# directory follows the configured data set when that value is changed.
outDir = f"{out}/{data_set}"

In [None]:
# Set the FastQ Directory
# fastQs=[f"{root}{proj}/fastqs"]
# `proj` already begins with "/", so it is appended to `root` directly;
# inserting another "/" between them would put a double slash in the path.
fastQs = f"{root}{proj}/fastqs"

In [None]:
# Genome reference directory, located under the top level directory.
# It is later handed to cellranger through the 'ref' input.
hgRef = root + "/ref/refdata-cellranger-GRCh38-3.0.0"

In [None]:
# Load the sample sheet.
# When the sheet was written together with its index, the first column has an
# empty header and pandas names it "Unnamed: 0". Promote that column to the
# index in place instead of fetching and parsing the remote file a second time.
S = pd.read_csv(sheet)
if "Unnamed: 0" in S.columns:
    # rename_axis(None) removes the placeholder name so the index is unnamed,
    # which is what read_csv(..., index_col=[0]) produces for such a column.
    S = S.set_index("Unnamed: 0").rename_axis(None)
S

In [None]:
# Load the Chromium protocol sample indexes file; it is read without a header row,
# so the columns are numbered 0, 1, 2, ...
B = pd.read_csv(chrom, header=None)
# Column 0 becomes the barcode name.
B = B.rename(columns={0: 'barcode'})
# Wide -> long: one row per (barcode, other column) pair. The original column
# label goes to 'barcode_no' and the cell value to 'barcode_sequence'.
B = B.melt(
    id_vars='barcode',
    var_name='barcode_no',
    value_name='barcode_sequence',
)
B

In [None]:
# Spot-check: show the plate rows whose barcode name contains 'SI-GA-B7'.
B.loc[B['barcode'].str.contains('SI-GA-B7')]

In [None]:
# lst = ['.pdf', '.sh', 'Undet']
# lst = '\t'.join(lst)
# sample_names = list()
# fastq_file_names = list()
# paths = list()
# for sample in os.listdir('data/July2021_set/fastqs'):
#     if any(x in sample for x in lst):
#         continue
#     sample_names.append(sample)
#     folder = os.path.join('data/July2021_set/fastqs', sample)
#     for file in os.listdir(folder):
#         if '.txt' in file:
#             continue
#         elif '.gz' in file:
#             path = os.path.join(f'data/July2021_set/fastqs/{sample}', file)
#             paths.append(path)
#             fastq_file_names.append(file)

# for idx, f in enumerate(fastq_file_names):
#     if idx < 5:
#         print(f)

In [None]:
# Build a DataFrame of sample IDs and barcode names parsed from the FASTQ file names.
# Step 1: collect every gzipped FASTQ path in the FastQ directory.
L = pd.Series(glob.glob(fastQs + '/*.fastq.gz'))
# Two alternative shapes for the leading part of a file name. These must be raw
# strings: "\d" and "\w" are regex escapes, not Python string escapes, and
# non-raw literals containing them raise invalid-escape-sequence warnings.
groups = [r'(x\d+\w_)', r'(x\w+_\d_)']
subs = list()
for group in groups:
    # Three capture groups: the leading part, the part starting at "CKD", and a
    # trailing "_" followed by characters from the set H, G, K, J, C, X, 2.
    extract = f'{group}(CKD.*)(_[HGKJKCCX2]+)'
    sub = L.str.extract(extract, expand=True)
    # Drop repeated captures and the all-NaN rows of paths that did not match.
    sub = sub.drop_duplicates().dropna()
    subs.append(sub)
L = pd.concat(subs)
L[0] = L[0] + L[1] + L[2] # Create sample ID from substrings
L[1] = L[1].str.extract('(SI.*)')  # keep only the text from "SI" onwards
L = L.drop(2, axis=1)
L.columns = ['sampleID', 'barcode']
# Barcode names in the file names use "_"; switch to "-" (the later merge with
# the index plate is done on this column).
L['barcode'] = L['barcode'].str.replace('_', '-')
L

In [None]:
# Attach the index-plate rows to each sample by barcode name. The left join
# keeps every row of L even when B has no entry for its barcode.
L = L.merge(B, how='left', on='barcode')
L

In [None]:
# Add an 'Index' column: the barcode name with the "SI-GA-" text removed.
L["Index"] = L['barcode'].str.replace("SI-GA-", '')
L

In [None]:
# Discard the 'barcode_no' column.
L = L.drop(columns='barcode_no')
L

In [None]:
# Collapse to one row per 'Index': within each group, the values of every
# remaining column are concatenated into a single space-separated string.
# NOTE(review): values repeated across the rows of a group (e.g. by the
# earlier merge with B) are repeated in the joined string as well; confirm
# that this is intended.
L = L.groupby('Index').agg(" ".join)
# Turn the group key back into a regular 'Index' column.
L = L.reset_index()

In [None]:
# Combine the sample sheet with the per-Index barcode table.
# Right join: every row of L is kept, and the sample-sheet columns are filled
# in wherever the 'Index' values match.
G = S.merge(L, how='right', on='Index')
G

In [None]:
# Shell command template for a `cellranger count` run. The ${...} placeholders
# are left unexpanded here; the pieces are joined with single spaces.
cellranger_cmd = ' '.join([
    'cellranger count',
    '--id=${id}',
    '--transcriptome=${ref}',
    '--fastqs=${fastQs}',
    '--sample=${sample}',
    '--expect-cells=${expect}',
    '--nosecondary',
    '--localcores=${nthread}',
])

In [None]:
# Assemble the Canine configuration as a plain dict.
# The keys under 'inputs' carry the same names as the ${...} placeholders used
# in `cellranger_cmd`. Presumably canine runs one job per element of the
# list-valued inputs -- TODO confirm against the canine documentation.
canine_conf = {
    'retry': 0,
    'name': 'cellranger',
    'inputs': {
        'fastQs': fastQs,
        'ref': hgRef,
        'id': G['patient'].tolist(),
        # Space-separated sample IDs become a comma-separated list.
        'sample': G['sampleID'].str.replace(' ', ',').tolist(),
        'expect': G['Target cell recovery'].tolist(),
        'nthread': 16,
    },
    # Remove any leftover "_lock" files one directory level down, then run cellranger.
    'script': ['rm -f */_lock', cellranger_cmd],
    'localization': {
        'strategy': 'NFS',
        'staging_dir': outDir,
    },
    'resources': {'mem': '100G', 'cpus-per-task': 16, 'nodes': 1},
}

In [None]:
# Display the assembled configuration for a final visual check.
canine_conf