# Clean Up Space

I am running into space issues on Biowulf. I am trying to finish up the new samples, but I am sitting at ~39 TB of 40 TB. Both the pre-alignment and the alignment workflows do some clean-up, but there are probably some files that are still around. Here I go through each workflow's outputs systematically and clean up whatever is no longer needed.

In [1]:
# %load ../config/defaults.py
import os
import sys
from pathlib import Path
from itertools import chain

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from dask.delayed import delayed
from dask.distributed import Client

from lcdblib.snakemake.helpers import fill_patterns
from lcdblib.utils.utils import flatten

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.snakemake import get_patterns

# Setup notebook
# (project helper; presumably emits the "last updated" / git-hash banner shown
# in this cell's output -- confirm in ncbi_remap.notebook)
nbconfig = Nb.setup_notebook()

# Connect to data store
# Opened read-only (mode='r'): the clean-up cells only read the sample tables
# from the store and never write back to it.
store = pd.HDFStore('../sra.h5', mode='r')

# Start dask cluster
# 12 workers with a single thread each; the @delayed checks further down are
# submitted through this client.
client = Client(n_workers=12, threads_per_worker=1)

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-03-09 
Git hash: 481186bc23484439db5d199bf22411933a1bca80


## Pre-Alignment Workflow

In [2]:
# Load the output path templates ({srx}/{srr} placeholders) declared by the
# pre-alignment workflow.
patterns = get_patterns('../prealn-wf/patterns.yaml')

In [3]:
# Display the loaded patterns (a nested dict of path templates).
patterns

{'abi_solid': 'output/samples/{srx}/{srr}/ABI_SOLID',
 'alignment_bad': 'output/samples/{srx}/{srr}/ALIGNMENT_BAD',
 'atropos': {'r1': 'output/samples/{srx}/{srr}/{srr}_1.trim.clean.fastq.gz',
  'r2': 'output/samples/{srx}/{srr}/{srr}_2.trim.clean.fastq.gz'},
 'bai': 'output/samples/{srx}/{srr}/{srr}.hisat2.bam.bai',
 'bamtools_stats': 'output/samples/{srx}/{srr}/{srr}.hisat2.bam.bamtools.stats',
 'download_bad': 'output/samples/{srx}/{srr}/DOWNLOAD_BAD',
 'fastq': {'r1': 'output/samples/{srx}/{srr}/{srr}_1.fastq.gz',
  'r2': 'output/samples/{srx}/{srr}/{srr}_2.fastq.gz',
  'summary': 'output/samples/{srx}/{srr}/{srr}.fastq.tsv'},
 'fastq_screen': 'output/samples/{srx}/{srr}/{srr}_1.fastq_screen.txt',
 'feature_counts': {'counts': 'output/samples/{srx}/{srr}/{srr}.hisat2.bam.feature_counts.counts',
  'jcounts': 'output/samples/{srx}/{srr}/{srr}.hisat2.bam.feature_counts.counts.jcounts',
  'summary': 'output/samples/{srx}/{srr}/{srr}.hisat2.bam.feature_counts.counts.summary'},
 'hisat2'

### Download Bad

In [4]:
# Remove the patterns for files that must be kept: every pattern still left in
# `patterns` is filled in and deleted by the loop in the next cell.
# NOTE: these `del`s mutate the shared dict, so the removals accumulate across
# the following sections and the cells have to be run in order.
del patterns['download_bad']
del patterns['fastq']['summary']
del patterns['hisat2']['splice_sites']

In [46]:
# Delete every file that matches one of the remaining patterns for the runs
# listed under 'prealn/download_bad', printing each path as it is removed.
dln = store['prealn/download_bad']
for _idx, (srx, srr) in dln.iterrows():
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr))
    candidates = (Path('../prealn-wf', name) for name in flatten(filled))
    # The filter is lazy, so each path is checked right before it is removed.
    for path in filter(Path.exists, candidates):
        print(path)
        path.unlink()

In [129]:
# Remove the leftover '*.log' files of the download-bad runs; logs whose name
# ends in 'fastq.gz.log' are kept.
dln = store['prealn/download_bad']
for _idx, (srx, srr) in dln.iterrows():
    sample_dir = Path(f'../output/prealn-wf/samples/{srx}/{srr}')
    for log_file in sample_dir.glob('*.log'):
        if not log_file.name.endswith('fastq.gz.log'):
            print(log_file)
            log_file.unlink()

### Abi Solid

In [5]:
# Keep the files behind the `abi_solid` and `layout` patterns: dropping the
# patterns here means the delete loop in the next cell never touches them.
del patterns['abi_solid']
del patterns['layout']

In [56]:
# Delete every file that matches one of the remaining patterns for the runs
# listed under 'prealn/abi_solid', printing each path as it is removed.
dln = store['prealn/abi_solid']
assert len(dln) == 548    # sanity check: must match the 548 reported by `prealn-store.py queue --print`
for _idx, (srx, srr) in dln.iterrows():
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr))
    candidates = (Path('../prealn-wf', name) for name in flatten(filled))
    # The filter is lazy, so each path is checked right before it is removed.
    for path in filter(Path.exists, candidates):
        print(path)
        path.unlink()

In [132]:
# Remove the leftover '*.log' files of the ABI SOLiD runs; logs whose name
# ends in 'fastq.gz.log' are kept.
dln = store['prealn/abi_solid']
for _idx, (srx, srr) in dln.iterrows():
    sample_dir = Path(f'../output/prealn-wf/samples/{srx}/{srr}')
    for log_file in sample_dir.glob('*.log'):
        if not log_file.name.endswith('fastq.gz.log'):
            print(log_file)
            log_file.unlink()

### Quality scores bad

In [6]:
# Keep the file behind the `quality_scores_bad` pattern: once the pattern is
# dropped, the delete loop in the next cell no longer targets it.
del patterns['quality_scores_bad']

In [63]:
# Delete every file that matches one of the remaining patterns for the runs
# listed under 'prealn/quality_scores_bad', printing each path as it is removed.
dln = store['prealn/quality_scores_bad']
assert len(dln) == 4    # sanity check: must match the 4 reported by `prealn-store.py queue --print`
for _idx, (srx, srr) in dln.iterrows():
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr))
    candidates = (Path('../prealn-wf', name) for name in flatten(filled))
    # The filter is lazy, so each path is checked right before it is removed.
    for path in filter(Path.exists, candidates):
        print(path)
        path.unlink()

In [135]:
# Remove the leftover '*.log' files of the quality-scores-bad runs; logs whose
# name ends in 'fastq.gz.log' are kept.
dln = store['prealn/quality_scores_bad']
for _idx, (srx, srr) in dln.iterrows():
    sample_dir = Path(f'../output/prealn-wf/samples/{srx}/{srr}')
    for log_file in sample_dir.glob('*.log'):
        if not log_file.name.endswith('fastq.gz.log'):
            print(log_file)
            log_file.unlink()

### Alignment Bad

In [7]:
# Keep the files behind the `alignment_bad`, hisat2 `summary` and
# `fastq_screen` patterns for alignment-bad runs: their patterns are removed
# so that the delete step in the next cell skips them.
del patterns['alignment_bad']
del patterns['hisat2']['summary']
del patterns['fastq_screen']

In [100]:
@delayed
def check_aln_bad(srx, srr):
    """Delete the remaining pre-alignment outputs of one run.

    Every pattern still present in the global ``patterns`` dict is filled in
    with ``srx``/``srr`` and resolved relative to ``../prealn-wf``; each
    resulting file that exists is removed.

    Returns
    -------
    list of str
        POSIX-style paths of the files that were deleted.
    """
    removed = []
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr))
    for name in flatten(filled):
        path = Path('../prealn-wf', name)
        if not path.exists():
            continue
        path.unlink()
        removed.append(path.as_posix())
    return removed

# Build one delayed task per alignment-bad run, execute them on the dask
# cluster and flatten the per-run path lists into `res`.
dln = store['prealn/alignment_bad']
assert len(dln) == 2337    # sanity check: must match the 2337 reported by `prealn-store.py queue --print`
lazy = [check_aln_bad(srx, srr) for _idx, (srx, srr) in dln.iterrows()]

futures = client.compute(lazy)
res = [path for paths in client.gather(futures) for path in paths]

In [None]:
# Number of file paths returned by the tasks in the cell above.
len(res)

In [16]:
@delayed
def check_aln_log(odir):
    """Delete the unneeded ``*.log`` files directly inside ``odir``.

    Logs whose name ends in ``fastq.gz.log``, ``hisat2.bam.log`` or
    ``fastq_screen.txt.log`` are kept; every other ``*.log`` file is removed.

    Parameters
    ----------
    odir : pathlib.Path
        Sample directory to scan (non-recursive ``*.log`` glob).

    Returns
    -------
    list of pathlib.Path
        The log files that were deleted.
    """
    # str.endswith accepts a tuple of suffixes; this replaces the original
    # chain of bitwise ``|`` on booleans with one idiomatic call.
    keep_suffixes = ('fastq.gz.log', 'hisat2.bam.log', 'fastq_screen.txt.log')
    outs = []
    for log in odir.glob('*.log'):
        if log.name.endswith(keep_suffixes):
            continue
        outs.append(log)
        log.unlink()
    return outs

# Queue one log clean-up task per alignment-bad sample directory, run the
# tasks on the dask cluster and flatten the returned log lists into `res`.
dln = store['prealn/alignment_bad']
lazy = [
    check_aln_log(Path(f'../output/prealn-wf/samples/{srx}/{srr}'))
    for _idx, (srx, srr) in dln.iterrows()
]

futures = client.compute(lazy)
res = [log for logs in client.gather(futures) for log in logs]

In [17]:
# Number of log files returned by the tasks in the cell above.
len(res)

0

### Complete

In [109]:
@delayed
def check_complete(srx, srr):
    """List selected pre-alignment outputs that still exist for one run.

    Only the hisat2 ``bam``, the ``bai`` and the atropos ``r1``/``r2``
    patterns are checked, resolved relative to ``../prealn-wf``. Nothing is
    deleted here; the function only reports what is present.

    Returns
    -------
    list of str
        POSIX-style paths of the files that exist.
    """
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr))
    candidates = chain(
        filled['hisat2']['bam'],
        filled['bai'],
        filled['atropos']['r1'],
        filled['atropos']['r2'],
    )
    paths = (Path('../prealn-wf', name) for name in candidates)
    return [path.as_posix() for path in paths if path.exists()]

# Removing these files is very dangerous (it could accidentally delete
# everything), so this cell only collects the paths instead of deleting them.
dln = store['prealn/complete']
lazy = [check_complete(srx, srr) for _idx, (srx, srr) in dln.iterrows()]

futures = client.compute(lazy)
res = [path for paths in client.gather(futures) for path in paths]

## Alignment workflow

In [3]:
# Load the output path templates declared by the alignment workflow; this
# rebinds `patterns`, replacing the pre-alignment dict used above.
patterns = get_patterns('../aln-wf/patterns.yaml')

In [4]:
# Display the loaded patterns (a nested dict of path templates).
patterns

{'alignment_bad': 'output/samples/{srx}/{srr}/ALIGNMENT_BAD',
 'atropos': {'r1': 'output/samples/{srx}/{srr}/{srr}_1.trim.clean.fastq.gz',
  'r2': 'output/samples/{srx}/{srr}/{srr}_2.trim.clean.fastq.gz'},
 'chromSizes_fb': '../output/dmel_r6-11.flybase.chromsizes',
 'fastq': {'r1': '../output/prealn-wf/samples/{srx}/{srr}/{srr}_1.fastq.gz',
  'r2': '../output/prealn-wf/samples/{srx}/{srr}/{srr}_2.fastq.gz',
  'summary': '../output/prealn-wf/samples/{srx}/{srr}/{srr}.fastq.tsv'},
 'hisat2': {'bai': 'output/samples/{srx}/{srr}/{srr}.fq.bam.bai',
  'bam': 'output/samples/{srx}/{srr}/{srr}.fq.bam',
  'splice_sites': '../output/prealn-wf/known_splice_sites_r6-11.txt',
  'summary': 'output/samples/{srx}/{srr}/{srr}.fq.bam.tsv'},
 'intergenic': {'bed': '../output/dmel_r6-11.intergenic.bed',
  'gtf': '../output/dmel_r6-11.intergenic.gtf'},
 'layout': '../output/prealn-wf/samples/{srx}/{srr}/LAYOUT',
 'srxMerge': {'bai': 'output/samples/{srx}/{srx}.bam.bai',
  'bam': 'output/samples/{srx}/{srx

### Alignment Bad

In [5]:
# Remove the patterns whose files must not be touched; whatever is left in
# `patterns` is what the check functions in the following cells look for.
# NOTE: this mutates the shared dict, so it has to run before those cells.
del patterns['strand']
del patterns['layout']
del patterns['chromSizes_fb']
del patterns['intergenic']
del patterns['hisat2']['splice_sites']
del patterns['alignment_bad']

In [5]:
@delayed
def check_aln_bad(srx, srr, strand, *, delete=False):
    """Find, and optionally remove, the alignment outputs left for one run.

    Every pattern still present in the global ``patterns`` dict is filled in
    with ``srx``/``srr``/``strand`` and resolved relative to ``../aln-wf``.

    Parameters
    ----------
    srx, srr : str
        Experiment and run accessions used to fill the patterns.
    strand : str
        Strand label used to fill the patterns.
    delete : bool, keyword-only
        When True, each existing file is unlinked. The default (False) only
        reports the files, which is what the previously commented-out
        ``unlink`` call amounted to.

    Returns
    -------
    list of str
        POSIX-style paths of the files that exist (and were removed when
        ``delete`` is True).
    """
    targets = flatten(fill_patterns(patterns, dict(srx=srx, srr=srr, strand=strand)))
    outs = []
    for target in targets:
        _t = Path('../aln-wf', target)
        if _t.exists():
            if delete:
                _t.unlink()
            outs.append(_t.as_posix())
    return outs

# Build one delayed task per (run, strand) pair for the alignment-bad runs,
# execute them on the dask cluster and flatten the results into `res`.
dln = store['aln/alignment_bad']
assert len(dln) == 102    # sanity check: must match the 102 reported by `aln-store.py queue --print`
lazy = [
    check_aln_bad(srx, srr, strand)
    for _idx, (srx, srr) in dln.iterrows()
    for strand in ['first', 'second']
]

futures = client.compute(lazy)
res = [path for paths in client.gather(futures) for path in paths]

In [6]:
# Number of file paths reported by the tasks in the cell above.
len(res)

0

In [7]:
@delayed
def check_aln_log(odir, *, delete=False):
    """Find, and optionally remove, the unneeded ``*.log`` files in ``odir``.

    Logs whose name ends in ``hisat2.bam.log`` are always skipped.

    Parameters
    ----------
    odir : pathlib.Path
        Sample directory to scan (non-recursive ``*.log`` glob).
    delete : bool, keyword-only
        When True, each reported log is unlinked. The default (False) only
        reports the logs, which is what the previously commented-out
        ``unlink`` call amounted to.

    Returns
    -------
    list of pathlib.Path
        The log files found (and removed when ``delete`` is True).
    """
    outs = []
    for log in odir.glob('*.log'):
        if log.name.endswith('hisat2.bam.log'):
            continue
        outs.append(log)
        if delete:
            log.unlink()
    return outs

# Queue one log check per alignment-bad sample directory, run the tasks on
# the dask cluster and flatten the reported logs into `res`.
dln = store['aln/alignment_bad']
lazy = [
    check_aln_log(Path(f'../output/aln-wf/samples/{srx}/{srr}'))
    for _idx, (srx, srr) in dln.iterrows()
]

futures = client.compute(lazy)
res = [log for logs in client.gather(futures) for log in logs]

In [8]:
# Number of log files reported by the tasks in the cell above.
len(res)

0

### Complete

In [6]:
@delayed
def check_complete(srx, srr, strand, *, delete=False):
    """Find, and optionally remove, selected files of one completed run.

    The checked patterns are the fastq ``r1``/``r2``, the hisat2
    ``bam``/``bai``, the atropos ``r1``/``r2`` and the srxMerge
    ``bamCoverage``/``bamCoverageFlyBase`` entries of the global ``patterns``
    dict, filled in with ``srx``/``srr``/``strand``.

    Parameters
    ----------
    srx, srr : str
        Experiment and run accessions used to fill the patterns.
    strand : str
        Strand label used to fill the patterns.
    delete : bool, keyword-only
        When True, each existing file is unlinked. The default (False) only
        reports the files, which is what the previously commented-out
        ``unlink`` call amounted to. Enabling it is dangerous: it removes the
        listed outputs of every run it is called for.

    Returns
    -------
    list of str
        POSIX-style paths of the files that exist (and were removed when
        ``delete`` is True).
    """
    filled = fill_patterns(patterns, dict(srx=srx, srr=srr, strand=strand))
    targets = [
        *filled['fastq']['r1'],
        *filled['fastq']['r2'],
        *filled['hisat2']['bam'],
        *filled['hisat2']['bai'],
        *filled['atropos']['r1'],
        *filled['atropos']['r2'],
        *filled['srxMerge']['bamCoverage'],
        *filled['srxMerge']['bamCoverageFlyBase'],
    ]
    outs = []
    for target in targets:
        # NOTE(review): the fastq patterns printed for this workflow begin
        # with '../output/prealn-wf', not '../prealn', so this prefix test may
        # never match them -- confirm which prefix is intended.
        if target.startswith('../prealn'):
            _t = Path(target)
        else:
            # Remaining patterns are resolved relative to the workflow dir.
            _t = Path('../aln-wf', target)
        if _t.exists():
            outs.append(_t.as_posix())
            if delete:
                _t.unlink()
    return outs

# This is very dangerous: it could accidentally delete everything.
# One delayed task is built per (run, strand) pair of the completed runs; the
# tasks run on the dask cluster and their results are flattened into `res`.
dln = store['aln/complete']
lazy = [
    check_complete(srx, srr, strand)
    for _idx, (srx, srr) in dln.iterrows()
    for strand in ['first', 'second']
]

futures = client.compute(lazy)
res = [path for paths in client.gather(futures) for path in paths]

In [8]:
# Number of file paths reported by the tasks in the cell above.
len(res)

0