# Explore cleanup

I am approaching my quota again. After some CLI magic I see that I need to clean up the BAM files. However, I need to be really careful not to remove BAM files for RNA-Seq because they are still needed. This notebook is for thinking about the problem and estimating how useful the cleanup will be. Currently I have ~29 TB in the aln-wf. If I remove all BAM files that would be ~4 TB.

In [5]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [6]:
# Setup notebook
# Project helper from ncbi_remap.notebook; the returned object is kept as
# `nbconfig`. NOTE(review): judging by the saved output it also prints a
# reminder about output/fbgn2chrom.tsv plus the date and git hash — confirm
# against the helper's source.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-11-06 
Git hash: 2ab02467572949f50686ea6dc152955ca37f5633


In [7]:
# Connect to data store
# Opened read-only (mode='r'); the path is relative to the notebook's
# working directory. The handle is not closed anywhere in this notebook.
store = pd.HDFStore('../output/sra.h5', mode='r')

In [16]:
# Let long value_counts() tables print up to 100 rows before pandas truncates them.
pd.set_option('display.max_rows', 100)

In [26]:
# SRX accessions listed in the store's 'aln/complete' table.
complete = store['aln/complete']['srx'].values

# Library-strategy table, re-indexed to exactly those accessions (in that
# order). Accessions absent from the parquet file come back as all-NaN rows.
metadata = (
    pd.read_parquet('../output/metadata-wf/select_library_strategy.parquet')
    .reindex(complete)
)

In [27]:
# Samples per library strategy, rendered with thousands separators.
(
    metadata['Fear_et_al_library_strategy']
    .value_counts()
    .map(lambda n: f"{n:,}")
)

RNA-Seq                                18,459
EST                                     4,699
ChIP-Seq                                3,284
4C-Seq                                  2,674
WGS                                     2,071
RNA-Seq|OTHER                           1,064
ChIP-Seq|WGS                              208
ChIP-Seq|OTHER                            185
miRNA-Seq                                 115
DNA-Seq                                   111
Targeted-Capture                           93
EST|OTHER                                  86
AMPLICON                                   81
WGS|OTHER                                  65
STARR-Seq                                  56
RAD-Seq                                    55
RNA-Seq|RIP-Seq                            55
ncRNA-Seq                                  54
HiC-Seq                                    50
3C-Seq|4C-Seq|ChIP-Seq|OTHER               50
MNase-Seq|ChIP-Seq                         45
PRO-Seq                           

In [36]:
# Samples whose BAMs must be kept: any strategy string mentioning RNA-Seq
# (this also matches miRNA-Seq, ncRNA-Seq and combined labels such as
# 'RNA-Seq|OTHER') or EST.
#
# na=True: a row with a missing strategy would otherwise come out of
# str.contains as NaN rather than True/False and could land on the cleanup
# side once the mask is negated. Unknown strategy is treated as "keep" so an
# unannotated RNA-Seq sample can never be selected for deletion.
mask = metadata.Fear_et_al_library_strategy.str.contains('RNA-Seq', na=True) | \
    metadata.Fear_et_al_library_strategy.str.contains('EST', na=True)

In [82]:
# Everything not covered by the keep-mask is a cleanup candidate; copy so the
# subset is independent of `metadata`.
to_clean = metadata.loc[~mask].copy()

In [84]:
# Sanity check: strategies present among the cleanup candidates
# (value_counts does not list rows with a missing strategy).
(
    to_clean['Fear_et_al_library_strategy']
    .value_counts()
    .map(lambda n: f'{n:,}')
)

ChIP-Seq                               3,284
4C-Seq                                 2,674
WGS                                    2,071
ChIP-Seq|WGS                             208
ChIP-Seq|OTHER                           185
DNA-Seq                                  111
Targeted-Capture                          93
AMPLICON                                  81
WGS|OTHER                                 65
STARR-Seq                                 56
RAD-Seq                                   55
3C-Seq|4C-Seq|ChIP-Seq|OTHER              50
HiC-Seq                                   50
MNase-Seq|ChIP-Seq                        45
PRO-Seq                                   44
WGS|ChIP-Seq                              42
MNase-Seq                                 29
GRO-Seq                                   29
ChIP-Seq|MNase-Seq                        26
FAIRE-Seq|ChIP-Seq|FAIRE-seq              21
STAP-Seq                                  18
CLIP-Seq                                  15
DIP-Seq   

In [89]:
# Collect the on-disk size (bytes) of each candidate's BAM and BAM index.
# Files that do not exist are simply not counted.
# The loop variable stays named `srx`: a later cell reads it after the loop.
fsizes = []
for srx in to_clean.index.tolist():
    sample_dir = f'../output/aln-wf/samples/{srx}'
    for suffix in ('.bam', '.bam.bai'):
        try:
            fsizes.append(os.path.getsize(f'{sample_dir}/{srx}{suffix}'))
        except FileNotFoundError:
            continue

In [90]:
# Sum of all collected file sizes, converted from bytes to decimal terabytes.
total = np.sum(fsizes) / 1e12
print(f'{total:,.2f} TB')

17.53 TB


In [91]:
# Example BAM path for poking at pathlib. `srx` is whatever value the
# size-collection loop left behind (presumably its last accession).
pth = Path('../output/aln-wf/samples', srx, f'{srx}.bam')

In [115]:
# Size of the example BAM in decimal gigabytes (bytes / 1e9), via pathlib.
pth.stat().st_size / 1e9

18.723946704

In [94]:
# Same file measured with os.path.getsize (raw bytes), for comparison with
# the pathlib result.
os.path.getsize(f'../output/aln-wf/samples/{srx}/{srx}.bam')

18723946704

In [95]:
# IPython introspection: show the signature/docstring of Path.unlink.
# NOTE(review): exploratory leftover; not valid plain Python.
pth.unlink?

[0;31mSignature:[0m [0mpth[0m[0;34m.[0m[0munlink[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Remove this file or link.
If the path is a directory, use rmdir() instead.
[0;31mFile:[0m      ~/miniconda3/envs/ncbi_remap/lib/python3.6/pathlib.py
[0;31mType:[0m      method


In [96]:
from ncbi_remap.logging import logger

In [105]:
# Scratch check: prints the placeholder 'bob' only when the example BAM is
# missing on disk.
if not pth.exists():
    print('bob')

In [110]:
# Number of samples NOT covered by the RNA-Seq/EST mask, i.e. cleanup candidates.
(~mask).sum()

18965

In [113]:
# Spot-check the library strategy recorded for a single accession.
metadata.loc['SRX681773']

Fear_et_al_library_strategy    ChIP-Seq
Name: SRX681773, dtype: object

In [116]:
# (rows, columns) of the re-indexed metadata table.
metadata.shape

(43658, 1)