# Bulk GEO Prep

In [4]:
import os
import sys
from pathlib import Path
import hashlib
import shutil

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Sample identifiers for this submission. The cells below use each name twice
# per path: once as the per-sample directory and once as the file-name prefix.
# NOTE(review): the saved outputs of the feature-count cells further down list
# different names (e.g. B1_FO, A5_OF), so those outputs look stale relative to
# this list -- re-run the cells to confirm.
samples = [
    'A1_whole_male_tub_dam_rep1',
    'A10_whole_female_tub_polII_rep1',
    'A11_whole_female_tub_polII_rep2',
    'A12_whole_female_tub_polII_rep3',
    'A2_whole_male_tub_dam_rep2',
    'A3_whole_male_tub_dam_rep3',
    'A4_whole_male_tub_polII_rep1',
    'A5_whole_male_tub_polII_rep2',
    'A6_whole_male_tub_polII_rep3',
    'A7_whole_female_tub_dam_rep1',
    'A8_whole_female_tub_dam_rep2',
    'A9_whole_female_tub_dam_rep3',
    'B1_ovary_tub_dam_rep1',
    'B10_testis_tub_polII_rep1',
    'B11_testis_tub_polII_rep2',
    'B12_testis_tub_polII_rep3',
    'B2_ovary_tub_dam_rep2',
    'B3_ovary_tub_dam_rep3',
    'B4_ovary_tub_polII_rep1',
    'B5_ovary_tub_polII_rep2',
    'B6_ovary_tub_polII_rep3',
    'B7_testis_tub_dam_rep1',
    'B8_testis_tub_dam_rep2',
    'B9_testis_tub_dam_rep3',
]

## Calculate md5sum

In [6]:
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and fed to the hash in pieces of
    ``blocksize`` bytes per read.

    Parameters
    ----------
    filename : str or path-like
        File to hash; passed straight to ``open``.
    blocksize : int, optional
        Number of bytes requested per read. Must not be zero.

    Returns
    -------
    str
        The digest as returned by ``hashlib.md5().hexdigest()``.

    Raises
    ------
    ValueError
        If ``blocksize`` is zero.
    OSError
        If the file cannot be opened or read.
    """
    if blocksize == 0:
        # read(0) returns b"" immediately, which is also the loop sentinel:
        # the file would never be consumed and the digest of empty input
        # would be returned as if it were the file's checksum.
        raise ValueError("blocksize must be non-zero")

    # Named `digest` rather than `hash` to avoid shadowing the builtin.
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        # iter(callable, sentinel) keeps reading until read() yields b"" (EOF).
        for block in iter(lambda: f.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()

### Read 1 FQ

In [None]:
# Checksum the read-1 FASTQ of every sample. The files are read from the same
# bulk-wf tree that the copy step uses, so each checksum describes the file
# that is actually copied.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz')
        for sample in samples
    )
]

# One row per file, indexed by file name; the indexed frame is the cell output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

### First Strand BigWig

In [None]:
# Checksum the positive-strand bigWig of every sample. The files are read from
# the same bulk-wf tree that the copy step uses, so each checksum describes the
# file that is actually copied.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig')
        for sample in samples
    )
]

# One row per file, indexed by file name; the indexed frame is the cell output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

### Second Strand BigWig

In [None]:
# Checksum the negative-strand bigWig of every sample. The files are read from
# the same bulk-wf tree that the copy step uses, so each checksum describes the
# file that is actually copied.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig')
        for sample in samples
    )
]

# One row per file, indexed by file name; the indexed frame is the cell output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

### Genic and ERCC Feature Counts

In [10]:
# Pair each sample's s2 feature-count table with its MD5 digest.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt')
        for sample in samples
    )
]

# Tabulate the (file name, digest) pairs and show them keyed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.featurecounts.s2.txt,50834ac98b438205a2a37ff48cbc13ba
B2_FO.cutadapt.bam.featurecounts.s2.txt,32a5af0514d9c14409e14dda82bb8c84
B3_FO.cutadapt.bam.featurecounts.s2.txt,69ba20a136d0576697c7435af246e6ce
B4_FO.cutadapt.bam.featurecounts.s2.txt,a59b50eb63aaaa37b9fe42704bd1a622
A5_OF.cutadapt.bam.featurecounts.s2.txt,d792755196617442118174ae4a2ec020
A6_OF.cutadapt.bam.featurecounts.s2.txt,e1b65991465c74d868367198f7e701f1
A7_OF.cutadapt.bam.featurecounts.s2.txt,225fff2e5567e03582e94921364ef805
A8_OF.cutadapt.bam.featurecounts.s2.txt,918a1ffa1add4fa546a098f8c8e08645
B9_OCP.cutadapt.bam.featurecounts.s2.txt,ee3d8f9462908c25a1e22f9c7f95f5ac
B10_OCP.cutadapt.bam.featurecounts.s2.txt,23e8bf8cf9db454e63d3ae450d94c30a


### Intergenic Feature Counts

In [12]:
# Pair each sample's intergenic s0 feature-count table with its MD5 digest.
res = [
    (fname.name, md5sum(fname))
    for fname in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt')
        for sample in samples
    )
]

# Tabulate the (file name, digest) pairs and show them keyed by file name.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,d7caf634f6c7ebc36c9ed5200248be29
B2_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,241b1d38d12e90a96cd0ee585643409d
B3_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,1a0d3896eb079d108a24a0904dc83aa2
B4_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,5f49480f4718bdbe7e04b9cff2c1f8cc
A5_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,97f6b149d354187f7a5cd659b13eef70
A6_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,5c7b0afb8233d569d78bdd2ce0189f39
A7_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,8144a6afa61a71e0437e02b053b76bdc
A8_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,2740d6f160bb09a1a234fdc3358655ee
B9_OCP.cutadapt.bam.featurecounts.intergenic.s0.txt,0d2a90e0493b08d43dd4bf1711676a4e
B10_OCP.cutadapt.bam.featurecounts.intergenic.s0.txt,48661a5cb4341e072d9b54377c8f9e16


## Copy Files to Single Folder

In [13]:
# Flat list of every file to copy: for each sample, in order, its read-1
# FASTQ, both strand bigWigs, and the two feature-count tables.
paths = [
    target
    for sample in samples
    for target in (
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz'),
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig'),
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig'),
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt'),
        Path(f'../bulk-wf/data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt'),
    )
]

In [14]:
def copy_file(
    path,
    dest_dir='/media/psf/Promise_Pegasus/fearjm/larval_gonad_ovary/justin.fear@nih.gov',
):
    """Copy ``path`` into ``dest_dir`` and verify the copy by MD5 checksum.

    The copy keeps the source's file name. After copying, the source and the
    copy are both hashed with ``md5sum``; if the digests differ, the copy is
    deleted and an error line is printed. No exception is raised for a
    checksum mismatch.

    Parameters
    ----------
    path : pathlib.Path
        Source file. Must expose ``.name`` (the file name used for the copy).
    dest_dir : str or path-like, optional
        Directory that receives the copy. Defaults to the location this
        notebook was originally written for.

    Returns
    -------
    None
    """
    new = Path(dest_dir, path.name)
    shutil.copy(path, new)

    # Explicit comparison instead of `assert`: assertions are removed when
    # Python runs with -O, which would leave a corrupt copy in place unnoticed.
    if md5sum(path) != md5sum(new):
        new.unlink()
        print(f'Error with copying {path.name}')

In [15]:
# Copy each collected file in turn; copy_file checks every copy itself.
for source_path in paths:
    copy_file(source_path)