In [1]:
from glob import glob
from os.path import basename, join
import os

import isabl_cli as ii
import pandas as pd
import pysam

In [2]:
# Project to export and the folder that will hold its ANY2LCC VCFs
project = 267
vcfs_dir = join(f'P{project}', 'any2lcc')
os.makedirs(vcfs_dir, exist_ok=True)

# Query the succeeded ANNOT_INDELS (pk=32) analyses from isabl
analysis_filters = {
    'targets__projects': project,
    'application': 32,
    'status': 'SUCCEEDED',
}
analyses = ii.get_analyses(**analysis_filters)

Retrieving 102 from analyses API endpoint...


In [3]:
# Load a VCF from one analysis so its header can be reused as the template
# for every VCF written later in the notebook.
if not analyses:
    raise ValueError('No analyses were retrieved, cannot pick a template VCF')

template_vcf_pattern = join(analyses[0].storage_url, 'merged', '*.indels.pass.vcf.gz')
# glob returns matches in arbitrary order; sort so the same file is always picked.
template_vcf_matches = sorted(glob(template_vcf_pattern))
if not template_vcf_matches:
    raise FileNotFoundError(f'No template VCF matches {template_vcf_pattern}')

template_vcf_path = template_vcf_matches[0]
# Kept open on purpose: later cells read `template_vcf.header`.
template_vcf = pysam.VariantFile(template_vcf_path, mode="r")

In [4]:
# For every analysis: keep the indels flagged as ANY2_LCC in its annotated TSV
# and write them to a minimal VCF (CHROM/POS/REF/ALT only) under `vcfs_dir`.
for a in analyses:

    # Read the TSV with the ANY2LCC column
    pass_tsv = a.results['pass_tsv']
    print(f'📖Reading tsv {pass_tsv}')

    # Stream the table in chunks so only the flagged rows are held in memory.
    chunks = pd.read_csv(
        filepath_or_buffer=pass_tsv,
        compression="gzip",
        chunksize=20000,
        na_values=["NA"],
        sep="\t",
        comment="#",
        low_memory=False,
        dtype={'CHR': str},
    )
    indels = []
    for chunk in chunks:
        flag = chunk["ANY2_LCC"]
        # "NA" is parsed as NaN and NaN casts to True with astype(bool), so
        # missing flags must be masked out or unannotated indels would be kept.
        indels.append(chunk[flag.notna() & flag.astype(bool)])
    any2lcc_indels = pd.concat(indels)
    print(f'🦖{len(any2lcc_indels)} ANY2LCC Indels were found')

    # Write a new VCF
    tn = f'{a.targets[0].system_id}_vs_{a.references[0].system_id}'
    any2lcc_vcf = join(vcfs_dir, f'{tn}_any2lcc.vcf')
    print(f'📝Creating vcf {any2lcc_vcf}\n')

    # Only these four columns are written; iterating plain tuples over them
    # avoids building a full-width Series per row as iterrows() does.
    variant_columns = any2lcc_indels[["CHR", "START", "REF", "ALT"]]
    with pysam.VariantFile(any2lcc_vcf, mode="w", header=template_vcf.header) as any2lcc:
        for chrom, start, ref, alt in variant_columns.itertuples(index=False, name=None):
            rec = any2lcc.new_record()
            rec.chrom = str(chrom)
            rec.pos = int(start)  # plain Python int for pysam
            rec.ref = ref
            rec.alts = [alt]
            any2lcc.write(rec)

📖Reading tsv /work/isabl/data/analyses/43/24/164324/pass/I-H-135076-T1-1-D1-1_vs_I-H-135076-N1-1-D1-1.indels.output.annot.tsv.gz
🦖524 ANY2LCC Indels were found
📝Creating vcf P267/any2lcc/I-H-135076-T1-1-D1-1_vs_I-H-135076-N1-1-D1-1_any2lcc.vcf

📖Reading tsv /work/isabl/data/analyses/43/27/164327/pass/I-H-133673-T2-1-D1-1_vs_I-H-133673-N1-1-D1-1.indels.output.annot.tsv.gz
🦖1148 ANY2LCC Indels were found
📝Creating vcf P267/any2lcc/I-H-133673-T2-1-D1-1_vs_I-H-133673-N1-1-D1-1_any2lcc.vcf

📖Reading tsv /work/isabl/data/analyses/43/35/164335/pass/I-H-134202-T1-2-D1-1_vs_I-H-134202-N1-1-D1-1.indels.output.annot.tsv.gz
🦖2291 ANY2LCC Indels were found
📝Creating vcf P267/any2lcc/I-H-134202-T1-2-D1-1_vs_I-H-134202-N1-1-D1-1_any2lcc.vcf

📖Reading tsv /work/isabl/data/analyses/43/36/164336/pass/I-H-134202-T2-1-D1-1_vs_I-H-134202-N1-1-D1-1.indels.output.annot.tsv.gz
🦖1110 ANY2LCC Indels were found
📝Creating vcf P267/any2lcc/I-H-134202-T2-1-D1-1_vs_I-H-134202-N1-1-D1-1_any2lcc.vcf

📖Reading tsv /wor

In [5]:
# Generate the indel signature matrices for the whole project.
# See the README to install the package.
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

# Keyword options passed through unchanged to the matrix generator.
matgen_options = dict(
    plot=True,
    exome=False,
    bed_file=None,
    chrom_based=False,
    tsb_stat=False,
    seqInfo=False,
    cushion=100,
)

# Positional arguments: project label, genome build, directory with the VCFs.
matrices = matGen.SigProfilerMatrixGeneratorFunc(
    f"P{project}",
    "GRCh37",
    vcfs_dir,
    **matgen_options,
)

Starting matrix generation for INDELs...Completed! Elapsed time: 194.36 seconds.
Matrices generated for 0 samples with 0 errors. Total of 0 SNVs, 0 DINUCs, and 184694 INDELs were successfully analyzed.
