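# Bulk GEO Prep

Compute md5 checksums for the per-sample bulk RNA-seq files (Read 1 FASTQ, strand-specific bigWigs, and feature counts), then copy those files into a single folder.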

In [1]:
import os
import sys
from pathlib import Path
import hashlib
import shutil

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample identifiers processed by every cell below, laid out one row per
# name suffix (FO, OF, OCP, ODP).
samples = [
    'B1_FO', 'B2_FO', 'B3_FO', 'B4_FO',
    'A5_OF', 'A6_OF', 'A7_OF', 'A8_OF',
    'B9_OCP', 'B10_OCP', 'B11_OCP', 'B12_OCP',
    'F5_ODP', 'F6_ODP', 'F7_ODP', 'F8_ODP',
]

## Calculate md5sum

In [3]:
def md5sum(filename, blocksize=65536):
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()

### Read 1 FQ

In [8]:
# Checksum the Read 1 FASTQ of every sample as (file name, md5) pairs.
res = [
    (fastq.name, md5sum(fastq))
    for fastq in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}_R1.fastq.gz')
        for sample in samples
    )
]

# Tabulate the checksums; the indexed frame is the cell's displayed output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO_R1.fastq.gz,bd4e204b62cdb329f2460b2e09d9de0b
B2_FO_R1.fastq.gz,cf630803a5bba4d343729464ef933e9e
B3_FO_R1.fastq.gz,0ddb1d43dd7977b006af711a8ea0b5d4
B4_FO_R1.fastq.gz,0b7ffe84d3919afd02a3a2941ec5df8e
A5_OF_R1.fastq.gz,0d481213766daa4dac786280a1e3b139
A6_OF_R1.fastq.gz,5aeb3a810e286686924b1f4856712b56
A7_OF_R1.fastq.gz,abe14e43c4998af097fe4b6401378d95
A8_OF_R1.fastq.gz,35454a80f024ddce31ca113864dd7a70
B9_OCP_R1.fastq.gz,989380c25e7f4510ff1c6a99f52292b8
B10_OCP_R1.fastq.gz,5a4bc063a754207b8150778eda48cf0b


### First Strand BigWig

In [4]:
# Checksum the `.pos.bigwig` track of every sample as (file name, md5) pairs.
res = [
    (bigwig.name, md5sum(bigwig))
    for bigwig in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig')
        for sample in samples
    )
]

# Tabulate the checksums; the indexed frame is the cell's displayed output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.pos.bigwig,2c320c58549ab1124e89a53849127aba
B2_FO.cutadapt.bam.pos.bigwig,2d904c9fb54d477a2ba4e562f7c549d1
B3_FO.cutadapt.bam.pos.bigwig,9259ee8f2362c56c677b0219db556723
B4_FO.cutadapt.bam.pos.bigwig,5742eaca22aea0af06b1ace67daf2798
A5_OF.cutadapt.bam.pos.bigwig,1a377036c112afbfa72a36df46c66b14
A6_OF.cutadapt.bam.pos.bigwig,48ece1a1c67618e2cec20165119fff28
A7_OF.cutadapt.bam.pos.bigwig,bfff643d40214997de754d7eb57f14c9
A8_OF.cutadapt.bam.pos.bigwig,8501ce475ff895915c9bad82d33dfcc0
B9_OCP.cutadapt.bam.pos.bigwig,165c37d735e2e26003d227343b5b3232
B10_OCP.cutadapt.bam.pos.bigwig,d88af90d46cbbe1e3e2e501c15877ad3


### Second Strand BigWig

In [9]:
# Checksum the `.neg.bigwig` track of every sample as (file name, md5) pairs.
res = [
    (bigwig.name, md5sum(bigwig))
    for bigwig in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig')
        for sample in samples
    )
]

# Tabulate the checksums; the indexed frame is the cell's displayed output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.neg.bigwig,0d782f65a8681ee2235b4e2fc2686436
B2_FO.cutadapt.bam.neg.bigwig,9316771b4b24c374db35a71bf34f480f
B3_FO.cutadapt.bam.neg.bigwig,3e788fd83f695fd689da68b1f9935920
B4_FO.cutadapt.bam.neg.bigwig,de645a4b94d858412818d9f5d1bde68a
A5_OF.cutadapt.bam.neg.bigwig,97febfecb6369c9d7244d4932860b370
A6_OF.cutadapt.bam.neg.bigwig,a81479044ba292ddd23e2c222b82628f
A7_OF.cutadapt.bam.neg.bigwig,0bff6ade237690cf6dbc36754a63a00d
A8_OF.cutadapt.bam.neg.bigwig,b0956534527c6f10639ee329e987ab29
B9_OCP.cutadapt.bam.neg.bigwig,5db1f68a1db1f6202589d520a68d18dc
B10_OCP.cutadapt.bam.neg.bigwig,34c3173ddaa8a012047ba1ea77fb8b7c


### Genic and ERCC Feature Counts

In [10]:
# Checksum the `featurecounts.s2.txt` table of every sample as
# (file name, md5) pairs.
res = [
    (counts.name, md5sum(counts))
    for counts in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt')
        for sample in samples
    )
]

# Tabulate the checksums; the indexed frame is the cell's displayed output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.featurecounts.s2.txt,50834ac98b438205a2a37ff48cbc13ba
B2_FO.cutadapt.bam.featurecounts.s2.txt,32a5af0514d9c14409e14dda82bb8c84
B3_FO.cutadapt.bam.featurecounts.s2.txt,69ba20a136d0576697c7435af246e6ce
B4_FO.cutadapt.bam.featurecounts.s2.txt,a59b50eb63aaaa37b9fe42704bd1a622
A5_OF.cutadapt.bam.featurecounts.s2.txt,d792755196617442118174ae4a2ec020
A6_OF.cutadapt.bam.featurecounts.s2.txt,e1b65991465c74d868367198f7e701f1
A7_OF.cutadapt.bam.featurecounts.s2.txt,225fff2e5567e03582e94921364ef805
A8_OF.cutadapt.bam.featurecounts.s2.txt,918a1ffa1add4fa546a098f8c8e08645
B9_OCP.cutadapt.bam.featurecounts.s2.txt,ee3d8f9462908c25a1e22f9c7f95f5ac
B10_OCP.cutadapt.bam.featurecounts.s2.txt,23e8bf8cf9db454e63d3ae450d94c30a


### Intergenic Feature Counts

In [12]:
# Checksum the `featurecounts.intergenic.s0.txt` table of every sample as
# (file name, md5) pairs.
res = [
    (counts.name, md5sum(counts))
    for counts in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt')
        for sample in samples
    )
]

# Tabulate the checksums; the indexed frame is the cell's displayed output.
df = pd.DataFrame(res, columns=['fname', 'md5sum'])
df.set_index('fname')

Unnamed: 0_level_0,md5sum
fname,Unnamed: 1_level_1
B1_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,d7caf634f6c7ebc36c9ed5200248be29
B2_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,241b1d38d12e90a96cd0ee585643409d
B3_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,1a0d3896eb079d108a24a0904dc83aa2
B4_FO.cutadapt.bam.featurecounts.intergenic.s0.txt,5f49480f4718bdbe7e04b9cff2c1f8cc
A5_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,97f6b149d354187f7a5cd659b13eef70
A6_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,5c7b0afb8233d569d78bdd2ce0189f39
A7_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,8144a6afa61a71e0437e02b053b76bdc
A8_OF.cutadapt.bam.featurecounts.intergenic.s0.txt,2740d6f160bb09a1a234fdc3358655ee
B9_OCP.cutadapt.bam.featurecounts.intergenic.s0.txt,0d2a90e0493b08d43dd4bf1711676a4e
B10_OCP.cutadapt.bam.featurecounts.intergenic.s0.txt,48661a5cb4341e072d9b54377c8f9e16


## Copy Files to Single Folder

In [13]:
# Flat list of every file to copy: five files per sample, in sample order.
paths = [
    source
    for sample in samples
    for source in (
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}_R1.fastq.gz'),
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig'),
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig'),
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.s2.txt'),
        Path(f'../output/bulk-wf/rnaseq_samples/{sample}/{sample}.cutadapt.bam.featurecounts.intergenic.s0.txt'),
    )
]

In [14]:
def copy_file(path, dest_dir='/media/psf/Promise_Pegasus/fearjm/larval_gonad_ovary/justin.fear@nih.gov'):
    """Copy ``path`` into ``dest_dir`` and verify the copy by md5 checksum.

    Parameters
    ----------
    path : pathlib.Path
        Source file. The copy keeps the same file name (``path.name``).
    dest_dir : str or pathlib.Path, optional
        Directory that receives the copy. Defaults to the folder this
        notebook was originally written against.

    Notes
    -----
    If the checksums of source and copy differ, the copy is deleted and an
    error message is printed; no exception is raised for a mismatch.
    Errors from ``shutil.copy`` itself (for example a missing destination
    directory) propagate to the caller.
    """
    new = Path(dest_dir, path.name)
    shutil.copy(path, new)
    oldmd5 = md5sum(path)
    newmd5 = md5sum(new)
    # Compare explicitly rather than with `assert`: assert statements are
    # stripped when Python runs with -O, which would silently skip the
    # integrity check and keep a corrupt copy.
    if oldmd5 != newmd5:
        new.unlink()
        print(f'Error with copying {path.name}')

In [15]:
# Copy each collected file in turn; copy_file checks every copy by md5.
for source_path in paths:
    copy_file(source_path)