# Bulk GEO Prep

In [56]:
import os
import sys
from pathlib import Path
import hashlib
import shutil

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from larval_gonad.notebook import Nb
from larval_gonad.plotting import make_figs
from larval_gonad.config import memory

In [27]:
# Set up the notebook through the project's `Nb` helper and keep whatever it
# returns in `nbconfig`. The recorded output of this cell shows a
# "last updated" date and the current git hash.
# NOTE(review): `nbconfig` is presumably a notebook configuration object, and
# it is not referenced again in the cells visible here -- confirm it is needed.
nbconfig = Nb.setup_notebook()

last updated: 2018-06-05 
Git hash: 72db1802289852410ca6cc8eca9b1d269d296daa


In [28]:
# Sample identifiers to process. Each identifier is used below both as the
# per-sample directory name and as the file-name prefix under
# ../output/bulk-rnaseq-wf/rnaseq_samples/. The list order determines the row
# order of every checksum table and the order in which files are copied.
# NOTE(review): the suffixes (FT, TF, TCP, TDT, TDP) look like group labels
# with four samples each -- confirm their meaning against the sample sheet.
samples = [
    'A10_FT',
    'A11_FT',
    'A12_FT',
    'A9_FT',
    'A1_TF',
    'A2_TF',
    'A3_TF',
    'A4_TF',
    'B5_TCP',
    'B6_TCP',
    'B7_TCP',
    'B8_TCP',
    'C1_TDT',
    'C2_TDT',
    'C3_TDT',
    'C4_TDT',
    'F10_TDP',
    'F11_TDP',
    'F12_TDP',
    'F9_TDP',
]

## Calculate md5sum

In [29]:
@memory.cache
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and fed to the hasher in chunks of
    ``blocksize`` bytes. The function is wrapped by ``memory.cache``
    (``memory`` comes from ``larval_gonad.config``).

    Parameters
    ----------
    filename : str or path-like
        File to hash; passed directly to ``open``.
    blocksize : int, optional
        Number of bytes requested per read (default 65536).

    Returns
    -------
    str
        The MD5 digest as a hexadecimal string.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as handle:
        while True:
            chunk = handle.read(blocksize)
            if not chunk:
                # An empty bytes object means end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()

### Read 1 FASTQ

In [34]:
# Pair each sample's read-1 FASTQ file name with its MD5 checksum,
# visiting the samples in list order.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}_R1.fastq.gz')
        for sample in samples
    )
]

# Tabulate the checksums and display them indexed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A10_FT_R1.fastq.gz,51f2b6a58ec00a93c31ee6b466816d1e
A11_FT_R1.fastq.gz,34aab4447d1189e05877726e78d9752b
A12_FT_R1.fastq.gz,75cb6e5a31c5f62d30047c97ed9254f3
A9_FT_R1.fastq.gz,60827125b25265a7debd1f3fac012b92
A1_TF_R1.fastq.gz,77f76ddc6855066f349a64b7e6481342
A2_TF_R1.fastq.gz,655e70d809c3f7d7f42c577e356abae7
A3_TF_R1.fastq.gz,6ce0cab4458f800a4ba9df65cf52d530
A4_TF_R1.fastq.gz,4832aaeba67890ead0b845eacab5a766
B5_TCP_R1.fastq.gz,d1c9dc0d2e5085ac2431489cfd458e2b
B6_TCP_R1.fastq.gz,ef769e8d1ce3689890e44d25bf0f960c


### First Strand BigWig

In [35]:
# Pair each sample's `.pos.bigwig` track file name with its MD5 checksum,
# visiting the samples in list order.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig')
        for sample in samples
    )
]

# Tabulate the checksums and display them indexed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A10_FT.cutadapt.bam.pos.bigwig,e6dd0032380445af08233b1a3d9a709d
A11_FT.cutadapt.bam.pos.bigwig,9fc45c0be2caadcd1f39ab282236c4d8
A12_FT.cutadapt.bam.pos.bigwig,fe99e17c25b693161bd541204f3a107c
A9_FT.cutadapt.bam.pos.bigwig,7fbab43e41be262fdfc172c36d21fda7
A1_TF.cutadapt.bam.pos.bigwig,87900569f8e809fd0a3afe7d72b58274
A2_TF.cutadapt.bam.pos.bigwig,2945dd19da8e471c05a3ca55d91c0767
A3_TF.cutadapt.bam.pos.bigwig,70810b2be99b34841696241017620fd0
A4_TF.cutadapt.bam.pos.bigwig,e2140ef6f7e2a9fa1f022eae99961e78
B5_TCP.cutadapt.bam.pos.bigwig,8697145b61f247404a2802ebdaa00a99
B6_TCP.cutadapt.bam.pos.bigwig,aab553009fc5d6346b14c16529855cda


### Second Strand BigWig

In [36]:
# Pair each sample's `.neg.bigwig` track file name with its MD5 checksum,
# visiting the samples in list order.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig')
        for sample in samples
    )
]

# Tabulate the checksums and display them indexed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A10_FT.cutadapt.bam.neg.bigwig,1eae14b927f6d722b666dbeb7ad9f587
A11_FT.cutadapt.bam.neg.bigwig,508aac6e8b48fcba7ff97d9d0dd01228
A12_FT.cutadapt.bam.neg.bigwig,57d7b04bc00bf758076989521687c28a
A9_FT.cutadapt.bam.neg.bigwig,2527757cd38009b09608676c39c9fffb
A1_TF.cutadapt.bam.neg.bigwig,c6a92a206b649f90617cf55f3999cb61
A2_TF.cutadapt.bam.neg.bigwig,52c8daf1588a19bb289f4a2fd5266ab6
A3_TF.cutadapt.bam.neg.bigwig,2876b575e6685e418582d78eb38b6e86
A4_TF.cutadapt.bam.neg.bigwig,6c80614ad94d621af0a3714f7ae3a920
B5_TCP.cutadapt.bam.neg.bigwig,90ca203b3a1f48246448d97cf848376b
B6_TCP.cutadapt.bam.neg.bigwig,7175e927a939ed9295481cea82c91114


### Genic and ERCC Feature Counts

In [37]:
# Pair each sample's featureCounts table file name with its MD5 checksum,
# visiting the samples in list order.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.txt')
        for sample in samples
    )
]

# Tabulate the checksums and display them indexed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A10_FT.cutadapt.bam.featurecounts.txt,962c9114fe148a59f64349dd5b3a59ed
A11_FT.cutadapt.bam.featurecounts.txt,2bbe8b107ea4c4cfd5a04c3f5716d7e6
A12_FT.cutadapt.bam.featurecounts.txt,a0eb61842d60e646130f1b3515ab4464
A9_FT.cutadapt.bam.featurecounts.txt,c06856cd87fa686ad7de1f5bb85d3aef
A1_TF.cutadapt.bam.featurecounts.txt,43056bf9e45867e5dd3f92954c6eaf20
A2_TF.cutadapt.bam.featurecounts.txt,ab4e2284049f3197ca7cb08585450a1c
A3_TF.cutadapt.bam.featurecounts.txt,f3aa8c66951e921b942332d53cc62d4d
A4_TF.cutadapt.bam.featurecounts.txt,6d2e56952cc99ce145af4891fb324c65
B5_TCP.cutadapt.bam.featurecounts.txt,885e3e3f20dde9c0109eb862c6915b53
B6_TCP.cutadapt.bam.featurecounts.txt,8ba34797fb275e5996ad52c9f6cec96e


### Intergenic Feature Counts

In [38]:
# Pair each sample's intergenic featureCounts table file name with its MD5
# checksum, visiting the samples in list order.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.txt')
        for sample in samples
    )
]

# Tabulate the checksums and display them indexed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A10_FT.cutadapt.bam.featurecounts.intergenic.txt,6014a6db4a03b0f63f620ba42d946114
A11_FT.cutadapt.bam.featurecounts.intergenic.txt,91fbb05833f73e07b6e6bb5b2bfd1315
A12_FT.cutadapt.bam.featurecounts.intergenic.txt,b680b685f60837b292d36ef17c1e5c73
A9_FT.cutadapt.bam.featurecounts.intergenic.txt,f2cde88e00e0602794cbc34ab1fbd5e4
A1_TF.cutadapt.bam.featurecounts.intergenic.txt,3888852bd1601fc5b0974aa51a400e8b
A2_TF.cutadapt.bam.featurecounts.intergenic.txt,57947c689fe37b389a4ab4e40e88965e
A3_TF.cutadapt.bam.featurecounts.intergenic.txt,877ce5ac750bc35e321db84a485e8e90
A4_TF.cutadapt.bam.featurecounts.intergenic.txt,45abc449bb480e21b75b79b1e075ea52
B5_TCP.cutadapt.bam.featurecounts.intergenic.txt,da2cd9f8a41e46d046db0e14dac71d3d
B6_TCP.cutadapt.bam.featurecounts.intergenic.txt,348902606527086370797f785a73b75d


## Copy Files to Single Folder

In [47]:
# Flat list of every file to copy: for each sample, in list order, its read-1
# FASTQ, both strand bigWig tracks, and both featureCounts tables.
paths = [
    path
    for sample in samples
    for path in (
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}_R1.fastq.gz'),
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig'),
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig'),
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.txt'),
        Path(f'../output/bulk-rnaseq-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.txt'),
    )
]

In [59]:
def copy_file(path, dest_dir='/media/psf/Promise_Pegasus/fearjm/larval_gonad/db_submission'):
    """Copy a file into ``dest_dir`` and verify the copy by MD5 checksum.

    The file keeps its base name in the destination directory. After copying,
    the MD5 digests of the source and the copy are compared; on a mismatch the
    copy is deleted and an error message is printed.

    Parameters
    ----------
    path : str or pathlib.Path
        Source file to copy.
    dest_dir : str or pathlib.Path, optional
        Directory that receives the copy. Defaults to the submission folder
        this notebook was originally written for.

    Returns
    -------
    None
    """
    path = Path(path)
    new = Path(dest_dir, path.name)
    shutil.copy(path, new)
    oldmd5 = md5sum(path)

    # `md5sum` is wrapped by `memory.cache`, so a hash cached for this
    # destination path by an earlier (possibly failed) copy could be returned
    # instead of the hash of the bytes just written. Hash the new file with
    # the undecorated function when the wrapper exposes one as `.func`.
    # NOTE(review): assumes `memory` is a joblib.Memory, whose wrappers expose
    # the original function as `.func` -- confirm. Falls back to `md5sum`.
    uncached_md5sum = getattr(md5sum, 'func', md5sum)
    newmd5 = uncached_md5sum(new)

    # Explicit comparison rather than `assert`: assert statements are removed
    # when Python runs with -O, which would silently disable this check.
    if oldmd5 != newmd5:
        new.unlink()
        print(f'Error with copying {path.name}')

In [60]:
# Copy every collected file. `copy_file` compares source and destination MD5
# checksums and, on a mismatch, removes the copy and prints an error message.
for path in paths:
    copy_file(path)