# TaDa RNA-Seq GEO Prep

In [1]:
import os
import sys
from pathlib import Path
import hashlib
import shutil

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample identifiers, one per line. Each name is used below both as the
# per-sample directory and as the file-name prefix of that sample's outputs.
samples = """
    A1_whole_male_tub_dam_rep1
    A10_whole_female_tub_polII_rep1
    A11_whole_female_tub_polII_rep2
    A12_whole_female_tub_polII_rep3
    A2_whole_male_tub_dam_rep2
    A3_whole_male_tub_dam_rep3
    A4_whole_male_tub_polII_rep1
    A5_whole_male_tub_polII_rep2
    A6_whole_male_tub_polII_rep3
    A7_whole_female_tub_dam_rep1
    A8_whole_female_tub_dam_rep2
    A9_whole_female_tub_dam_rep3
    B1_ovary_tub_dam_rep1
    B10_testis_tub_polII_rep1
    B11_testis_tub_polII_rep2
    B12_testis_tub_polII_rep3
    B2_ovary_tub_dam_rep2
    B3_ovary_tub_dam_rep3
    B4_ovary_tub_polII_rep1
    B5_ovary_tub_polII_rep2
    B6_ovary_tub_polII_rep3
    B7_testis_tub_dam_rep1
    B8_testis_tub_dam_rep2
    B9_testis_tub_dam_rep3
""".split()

## Calculate md5sum

In [3]:
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    Parameters
    ----------
    filename : str or path-like
        File to hash; opened in binary mode.
    blocksize : int, optional
        Number of bytes requested per read call (default 65536).

    Returns
    -------
    str
        Hex-encoded MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as handle:
        while True:
            chunk = handle.read(blocksize)
            if chunk == b"":
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()

### Read 1 FASTQ

In [4]:
# (file name, md5) for every sample's read-1 FASTQ.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz')
        for sample in samples
    )
]

# Tabulate and display with the file name as the index.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A1_whole_male_tub_dam_rep1_R1.fastq.gz,0ca48ee3b25afb64df8baf03e5247a73
A10_whole_female_tub_polII_rep1_R1.fastq.gz,ff0a7f3a1d04fcf0ba6c12b23d9cbb53
A11_whole_female_tub_polII_rep2_R1.fastq.gz,4cdb7c891e05c060823f750b478c74a5
A12_whole_female_tub_polII_rep3_R1.fastq.gz,6fbe7918e0930e57cd52ff3542a78003
A2_whole_male_tub_dam_rep2_R1.fastq.gz,775225d506b7c4242fdad746ff43c605
A3_whole_male_tub_dam_rep3_R1.fastq.gz,09a850ab691157b71df29d1939e261de
A4_whole_male_tub_polII_rep1_R1.fastq.gz,f87321f0bbd5d62ad0b18b6e38ce9d96
A5_whole_male_tub_polII_rep2_R1.fastq.gz,e191c1a26b18ff6ed9407d1c9df5c369
A6_whole_male_tub_polII_rep3_R1.fastq.gz,db7a5b3908262c9ced45ca9ebcd8f6b5
A7_whole_female_tub_dam_rep1_R1.fastq.gz,764b4ecaaf3939dabe321f694565a2d0


### First Strand BigWig

In [5]:
# (file name, md5) for every sample's ".pos" (first strand) bigwig.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig')
        for sample in samples
    )
]

# Tabulate and display with the file name as the index.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A1_whole_male_tub_dam_rep1.cutadapt.bam.pos.bigwig,3b4e68bc44dda8443f06752bdb970a33
A10_whole_female_tub_polII_rep1.cutadapt.bam.pos.bigwig,c0096755241f823b7544dbe3e298e61b
A11_whole_female_tub_polII_rep2.cutadapt.bam.pos.bigwig,d81ac9aa433dc2813edcd1865fdb5596
A12_whole_female_tub_polII_rep3.cutadapt.bam.pos.bigwig,be88d4c60adcbe9a0bbb78d2f5f38566
A2_whole_male_tub_dam_rep2.cutadapt.bam.pos.bigwig,1f6f29032a33b43c7ab2ee6d1366deee
A3_whole_male_tub_dam_rep3.cutadapt.bam.pos.bigwig,0c3ccda635a0811f23c943ff990864f0
A4_whole_male_tub_polII_rep1.cutadapt.bam.pos.bigwig,d77b90911fef47eed06d756cf21999d2
A5_whole_male_tub_polII_rep2.cutadapt.bam.pos.bigwig,b52d973fee6561d770c133f82eaf3911
A6_whole_male_tub_polII_rep3.cutadapt.bam.pos.bigwig,6d0573add48e81db79eab6f7c5f37515
A7_whole_female_tub_dam_rep1.cutadapt.bam.pos.bigwig,0a41d05a1708b52b9263daf8fb7904d6


### Second Strand BigWig

In [6]:
# (file name, md5) for every sample's ".neg" (second strand) bigwig.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig')
        for sample in samples
    )
]

# Tabulate and display with the file name as the index.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A1_whole_male_tub_dam_rep1.cutadapt.bam.neg.bigwig,0ca5a68519756c8b4150444158273137
A10_whole_female_tub_polII_rep1.cutadapt.bam.neg.bigwig,b5f276befd15226e13f6fa1d41c1865c
A11_whole_female_tub_polII_rep2.cutadapt.bam.neg.bigwig,a0be16ea906e0fb963b75c6840c76c55
A12_whole_female_tub_polII_rep3.cutadapt.bam.neg.bigwig,7ec04ac63f7b83c74a9682cbe2c57a04
A2_whole_male_tub_dam_rep2.cutadapt.bam.neg.bigwig,134ba6d4903ee1f73b3115b019b2c0ac
A3_whole_male_tub_dam_rep3.cutadapt.bam.neg.bigwig,54fc226f6a13a1116490beb36932f431
A4_whole_male_tub_polII_rep1.cutadapt.bam.neg.bigwig,4f3929500ae031108863c6b562fe6373
A5_whole_male_tub_polII_rep2.cutadapt.bam.neg.bigwig,34566a0fef2e4190964dd00c4a631f60
A6_whole_male_tub_polII_rep3.cutadapt.bam.neg.bigwig,95176d78bacee1796b89461bdd5dff7f
A7_whole_female_tub_dam_rep1.cutadapt.bam.neg.bigwig,ac8b96a023397f8aa9ccfc3dce83c482


### Genic and ERCC Feature Counts

In [7]:
# (file name, md5) for every sample's ".featurecounts.s2.txt" table.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt')
        for sample in samples
    )
]

# Tabulate and display with the file name as the index.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A1_whole_male_tub_dam_rep1.cutadapt.bam.featurecounts.s2.txt,b378f32c793c3c122aaa046915d9b6bd
A10_whole_female_tub_polII_rep1.cutadapt.bam.featurecounts.s2.txt,6c27ca348fb9241cef3c14cd4c4ad2a3
A11_whole_female_tub_polII_rep2.cutadapt.bam.featurecounts.s2.txt,43e628bb612fee2c22853da3a33cf42e
A12_whole_female_tub_polII_rep3.cutadapt.bam.featurecounts.s2.txt,ba5aec305a9b69750bc4c9ac5d69d756
A2_whole_male_tub_dam_rep2.cutadapt.bam.featurecounts.s2.txt,ce48aa0a691cd747822127511ee13a05
A3_whole_male_tub_dam_rep3.cutadapt.bam.featurecounts.s2.txt,185c4801ae463319e50e957d6f75f730
A4_whole_male_tub_polII_rep1.cutadapt.bam.featurecounts.s2.txt,1c37a510265068bc211c6dfd6d981154
A5_whole_male_tub_polII_rep2.cutadapt.bam.featurecounts.s2.txt,7a2179a924c88f0ba26ad585cda402f9
A6_whole_male_tub_polII_rep3.cutadapt.bam.featurecounts.s2.txt,41285b8edc12f8f357a1ff70757a3227
A7_whole_female_tub_dam_rep1.cutadapt.bam.featurecounts.s2.txt,ce407950a2891a7c19e64fb20051bdaa


### Intergenic Feature Counts

In [8]:
# (file name, md5) for every sample's ".featurecounts.intergenic.s0.txt" table.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt')
        for sample in samples
    )
]

# Tabulate and display with the file name as the index.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
A1_whole_male_tub_dam_rep1.cutadapt.bam.featurecounts.intergenic.s0.txt,29bdf3d8b50430f8a9554d6173c2256b
A10_whole_female_tub_polII_rep1.cutadapt.bam.featurecounts.intergenic.s0.txt,899cef9b6da3c066b0a71fb08e5c3465
A11_whole_female_tub_polII_rep2.cutadapt.bam.featurecounts.intergenic.s0.txt,f605234714d1246a5f7afd00e388536e
A12_whole_female_tub_polII_rep3.cutadapt.bam.featurecounts.intergenic.s0.txt,501b6c26b7c8cc4fbdfb9069aa53c85f
A2_whole_male_tub_dam_rep2.cutadapt.bam.featurecounts.intergenic.s0.txt,35cc46b3d5c5e486c3f6bc5746903e57
A3_whole_male_tub_dam_rep3.cutadapt.bam.featurecounts.intergenic.s0.txt,9e1c87527729458b3afff021a68acb79
A4_whole_male_tub_polII_rep1.cutadapt.bam.featurecounts.intergenic.s0.txt,c2d5134f54405e97b252d2fcffeb7287
A5_whole_male_tub_polII_rep2.cutadapt.bam.featurecounts.intergenic.s0.txt,82b1851fff3cbe25e4de438b20a6ab16
A6_whole_male_tub_polII_rep3.cutadapt.bam.featurecounts.intergenic.s0.txt,28b5710324bf0a4d19c9d4f2778af710
A7_whole_female_tub_dam_rep1.cutadapt.bam.featurecounts.intergenic.s0.txt,de2c9663a6b6158637c88d3fcf73fd40


## Copy Files to Single Folder

In [9]:
# Flat list of every file to copy: five files per sample, grouped by sample
# in the order the samples are listed.
paths = [
    path
    for sample in samples
    for path in (
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz'),
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig'),
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig'),
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt'),
        Path(f'../rnaseq-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt'),
    )
]

In [10]:
def copy_file(path, dest_dir='/media/psf/Promise_Pegasus/fearjm/tada_rnaseq/justin.fear@nih.gov'):
    """Copy ``path`` into ``dest_dir`` and verify the copy by MD5 checksum.

    The copy keeps the source file name. After copying, the MD5 digests of
    the source and the copy are compared; on a mismatch the copy is deleted
    and an error line is printed (no exception is raised, so a batch loop
    keeps going).

    Parameters
    ----------
    path : pathlib.Path
        Source file to copy.
    dest_dir : str or pathlib.Path, optional
        Folder that receives the copy. Defaults to the absolute,
        machine-specific folder this notebook was originally written
        against; pass another folder to run elsewhere.

    Raises
    ------
    OSError
        Propagated from ``shutil.copy`` / file reads, e.g. when the source
        or the destination folder does not exist.
    """
    new = Path(dest_dir, path.name)
    shutil.copy(path, new)
    oldmd5 = md5sum(path)
    newmd5 = md5sum(new)
    # Explicit comparison rather than ``assert``: assert statements are
    # stripped when Python runs with -O, which would silently skip the
    # verification and keep a corrupt copy.
    if oldmd5 != newmd5:
        new.unlink()
        print(f'Error with copying {path.name}')

In [11]:
# Copy each collected file via copy_file, which also checksum-verifies it.
for src in paths:
    copy_file(src)