In [1]:
import urllib
import urllib.error
import urllib.request

import pandas as pd

In [3]:
# Select DNase samples from metadata downloaded from:
# https://egg2.wustl.edu/roadmap/web_portal/meta.html
# https://docs.google.com/spreadsheets/d/1yikGx4MsO9Ei36b64yOy9Vb6oPC5IBGlFbYEt-N6gOM/edit#gid=15
# The sheet is read with a three-row header; three columns are picked by position and renamed.
df_ = pd.read_csv('Roadmap.metadata.qc.jul2013 - Consolidated_EpigenomeIDs_summary_Table.tsv', sep='\t', header=[0,1,2]).iloc[:, [1, 15, 35]]
df_.columns = ['EID', 'Epigenome_name', 'DNase_file']
# Keep only epigenomes that list at least one DNase file (rows with a missing value are dropped)
df_ = df_.dropna(subset=['DNase_file']).sort_values(['EID']).reset_index(drop=True)

# Drop ENCODE samples, confirm that we're left with "39 Roadmap reference epigenomes with DNase data":
# https://egg2.wustl.edu/roadmap/web_portal/DNase_reg.html#delieation
is_encode_ = df_['DNase_file'].str.startswith('wgEncode')
df_ = df_[~is_encode_]
assert len(df_) == 39, f'Expected to see 39 epigenomes, found {len(df_)} instead'
print('Expected 39 epigenomes with DNase left after removing ENCODE samples')

# Transform from one row per epigenome to one row per replicate; guess pval file name
df_ = (
    df_
    .assign(DNase_file=df_['DNase_file'].str.split(';'))
    .explode('DNase_file')
    .reset_index(drop=True)
)
# Remove the trailing `.filt` / `.filt.<anything>` suffix with an anchored regex.
# `str.rstrip('.filt.*')` would strip any trailing run of the characters ".filt*" instead of
# the suffix, and would eat into a stem that happens to end in one of those characters.
# NOTE(review): assumes the raw names end in `.filt` plus an optional extension -- confirm against the sheet.
df_['DNase_file'] = df_['DNase_file'].str.replace(r'\.filt(?:\..*)?$', '', regex=True) + '.pval.signal.bigwig'
print(f'Reference epigenomes split into {len(df_)} individual replicates')
df_.head()

Expected 39 epigenomes with DNase left after removing ENCODE samples
Reference epigenomes split into 104 individual replicates


Unnamed: 0,EID,Epigenome_name,DNase_file
0,E003,H1_Cell_Line,UW.H1.ChromatinAccessibility.DS19100.pval.sign...
1,E003,H1_Cell_Line,UW.H1.ChromatinAccessibility.DS18873.pval.sign...
2,E004,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,UW.H1_BMP4_Derived_Mesendoderm_Cultured_Cells....
3,E004,H1_BMP4_Derived_Mesendoderm_Cultured_Cells,UW.H1_BMP4_Derived_Mesendoderm_Cultured_Cells....
4,E005,H1_BMP4_Derived_Trophoblast_Cultured_Cells,UW.H1_BMP4_Derived_Trophoblast_Cultured_Cells....


In [4]:
# The "DSxxxxx" token of each file name seems to identify the sequencing library,
# e.g. https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM665840
# For every file name, take the first dot-separated token that starts with "DS".
df_['library_name'] = [
    next(filter(lambda tok_: tok_.startswith('DS'), fp_.split('.')))
    for fp_ in df_['DNase_file']
]

In [5]:
# Check that pval signal files exist on the server
def check_pval_url_(fp_):
    """Return whether the pval signal file `fp_` can be opened on egg2.wustl.edu.

    Parameters
    ----------
    fp_ : str
        File name, appended to the `unconsolidated/pval` directory URL.

    Returns
    -------
    bool
        True if the URL opens, False if the server answers with an HTTP error status.

    Notes
    -----
    Only `urllib.error.HTTPError` is caught; any other failure propagates to the caller.
    """
    url_ = f'https://egg2.wustl.edu/roadmap/data/byFileType/signal/unconsolidated/pval/{fp_}'
    try:
        # Successfully opening the URL is the whole check; the response body is never read.
        with urllib.request.urlopen(url_):
            return True
    except urllib.error.HTTPError:
        return False

#assert all(df_['DNase_file'].map(check_pval_url_)), 'Some file names in column `DNase_file` do not exist on egg2.wustl.edu'
#print('All file names in column `DNase_file` exist on egg2.wustl.edu')

In [6]:
# Annotate every replicate row with the number of replicates of its epigenome,
# then tabulate how many epigenomes have each replicate count
df_['n_replicates'] = df_.groupby('EID')['DNase_file'].transform('size')
vc_ = (
    df_
    .drop_duplicates(subset=['EID', 'n_replicates'])['n_replicates']
    .value_counts()
    .sort_index()
)
# Every one of the 39 epigenomes must be counted exactly once
assert vc_.sum() == 39
vc_

1     13
2     17
3      2
4      2
5      2
10     1
11     1
12     1
Name: n_replicates, dtype: int64

In [8]:
# Write a sample sheet of epigenomes with two replicates
df_ = df_.query('n_replicates == 2').reset_index(drop=True)[['EID', 'Epigenome_name', 'library_name', 'DNase_file']]\
    .assign(DNase_file=lambda r_: "pval/" + r_.DNase_file)
df_.to_csv('sample_sheet.tsv', sep='\t', index=False, header=True)
!wc -l sample_sheet.tsv

      35 sample_sheet.tsv


In [9]:
!mkdir -p pval
for fp_ in df_['DNase_file'].values:
    !wget -nc -O pval/{fp_} https://egg2.wustl.edu/roadmap/data/byFileType/signal/unconsolidated/pval/{fp_}

pval/pval/UW.H1.ChromatinAccessibility.DS19100.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1.ChromatinAccessibility.DS18873.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_BMP4_Derived_Mesendoderm_Cultured_Cells.ChromatinAccessibility.DS19310.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_BMP4_Derived_Mesendoderm_Cultured_Cells.ChromatinAccessibility.DS18732.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_BMP4_Derived_Trophoblast_Cultured_Cells.ChromatinAccessibility.DS19317.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_BMP4_Derived_Trophoblast_Cultured_Cells.ChromatinAccessibility.DS18736.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_Derived_Mesenchymal_Stem_Cells.ChromatinAccessibility.DS20671.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_Derived_Mesenchymal_Stem_Cells.ChromatinAccessibility.DS21042.pval.signal.bigwig: No such file or directory
pval/pval/UW.H1_Derived_Neuronal_P