In [1]:
import csv, sys
from collections import defaultdict

In [2]:
import pandas
import numpy as np
import requests

In [3]:
def accession_metadata(acc, timeout=60):
    """
    Returns the metadata for ENCODE accession `acc` (e.g., ENCSR000BJN)

    The record is requested from the ENCODE portal's `/experiments/` endpoint
    with `frame=object` and an `accept: application/json` header.

    Parameters
    ----------
    acc : str
        Accession to look up.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to `requests.get`.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    bytes
        The raw response body. The HTTP status code is not checked, so the
        body of an error response is returned unchanged.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within `timeout`.
    """
    HEADERS = {'accept': 'application/json'}
    URL = (
        'https://www.encodeproject.org/experiments/{0}/?frame=object'
        .format(acc)
    )
    response = requests.get(URL, headers=HEADERS, timeout=timeout)
    return response.content

In [4]:
def find_controls(acc):
    """
    List the control accessions recorded for accession `acc`.

    The accession's metadata has a "possible_controls" field holding paths
    such as '/experiments/ENCSR572FFX/'. The third "/"-separated component of
    each path (here 'ENCSR572FFX') is collected. A list is returned because
    the field may name more than one control.
    """
    metadata = pandas.read_json(accession_metadata(acc), typ='series')

    controls = []
    for control_path in metadata['possible_controls']:
        controls.append(control_path.split('/')[2])
    return controls

In [15]:
def find_medatada(acc):
    """
    Returns the complete metadata record for ENCODE accession `acc`.

    The JSON returned by `accession_metadata` is parsed into a pandas Series
    whose index holds the field names of the record (for example "files" or
    "file_format"), so individual fields can be read with `record[name]`.

    The function name is misspelled ("medatada"); it is kept as-is because
    other cells call it under this name.
    """
    return pandas.read_json(accession_metadata(acc), typ='series')

In [23]:
# Print the accession of every FASTQ file attached to experiment ENCSR667PLJ.
# Each file costs one extra metadata request, made through find_medatada.
experiment_files = find_medatada("ENCSR667PLJ")["files"]

for file in experiment_files:
    # Entries look like '/files/<accession>/'; keep only the accession.
    acc = file.split('/files/')[1].strip("/")
    meta_acc = find_medatada(acc)

    # Records without a "file_format" field are skipped.
    is_fastq = "file_format" in meta_acc and meta_acc["file_format"] == "fastq"
    if is_fastq:
        print(acc)

ENCFF162LWE
ENCFF981SWJ
ENCFF087MDC
ENCFF122FYQ


KeyboardInterrupt: 

In [17]:
# Display the full metadata record for file accession ENCFF162LWE.
find_medatada("ENCFF162LWE")

@id                                                    /files/ENCFF162LWE/
@type                                                         [File, Item]
accession                                                      ENCFF162LWE
aliases                  [brenton-graveley:non-target_BGKLV24_1_17594_G...
alternate_accessions                                                    []
assay_term_name                        shRNA knockdown followed by RNA-seq
award                                                 /awards/U54HG007005/
biological_replicates                                                  [1]
biosample_ontology                 /biosample-types/cell_line_EFO_0002067/
cloud_metadata           {'file_size': 1797870798, 'url': 'https://enco...
content_md5sum                            1ccae14731f13691072148734022452b
dataset                                          /experiments/ENCSR667PLJ/
date_created                              2015-08-14T15:49:35.172459+00:00
dbxrefs                  

In [5]:
# Display the control accession(s) listed for experiment ENCSR336DFS.
find_controls("ENCSR336DFS")

['ENCSR667PLJ']

In [6]:
# Show the raw "possible_controls" paths of ENCSR392HSJ, before
# find_controls would reduce them to bare accessions.
meta = find_medatada('ENCSR392HSJ')
meta['possible_controls']

['/experiments/ENCSR572FFX/']

In [13]:
# Parse the metadata table into the lookup tables used to write the
# workflow configuration files.
sample_info = dict()   # file accession -> list of per-file fields (see `info`)
paired_with = dict()   # accession of a "Paired end" == "1" file -> its mate
exp_control = dict()   # experiment accession -> list of control accessions

# Experiment accession -> file accessions listed for it. Created here so the
# cell also runs on a fresh kernel instead of relying on leftover state.
experiment = defaultdict(list)

# Earlier input location: "./metadata.TOTAL.tsv"
# newline="" is what the csv module asks for when it is handed a file object.
with open("./TOTAL/metadata.tsv", newline="") as F:

    reader = csv.DictReader(F, delimiter="\t")

    for row in reader:

        Exp_Acc = row['Experiment accession']
        File_Acc = row['File accession']
        experiment[Exp_Acc].append(File_Acc)

        if row['Paired end'] == "1":
            paired_with[File_Acc] = row['Paired with']

        target = row['Experiment target']
        cell = row['Biosample term name']

        if target == "":
            # No target given (presumably a control experiment): the
            # experiment accession itself serves as the condition label.
            condition = Exp_Acc
        else:
            # Keep only the part of the target before the first "-".
            target = target.split("-")[0]

            # find_controls performs a network request, so ask only once per
            # experiment rather than once per file row.
            if Exp_Acc not in exp_control:
                exp_control[Exp_Acc] = find_controls(Exp_Acc)

            condition = "_".join([cell, target])

        # Field order matters: the cell that writes the output files unpacks
        # this list positionally.
        info = [
            Exp_Acc,
            target,
            condition,
            cell,
            row['Paired end'],
            row['Paired with'],
            row['File download URL'],
            row['Library strand specific'],
        ]

        sample_info[File_Acc] = info

 

In [24]:

# Write the sample tables and the contrast list from `sample_info` and
# `exp_control`:
#   units.tsv       one row per "Paired end" == "1" file with both FASTQ paths
#   samples.tsv     the condition label of each of those files
#   sample_url.tsv  the download URL of every file
#   diffexp.tsv     a "diffexp: / contrasts:" listing of condition pairs
compare = set([])   # (condition, control accession) pairs to contrast

with open("./units.tsv", "w") as unit, \
        open("./samples.tsv", "w") as sample, \
        open("./diffexp.tsv", "w") as diffexp, \
        open("./sample_url.tsv", "w") as url:

    unit.write("\t".join(["sample", "unit", "fq1", "fq2", "strandedness"]) + "\n")
    sample.write("\t".join(["sample", "condition"]) + "\n")

    diffexp.write("diffexp:" + "\n")
    diffexp.write(" " + "contrasts:" + "\n")

    url.write("\t".join(["sample", "url"]) + "\n")

    condition_set = set([])   # conditions that actually received a sample row

    for File_Acc in sample_info:

        (Exp_Acc, target, condition, Biosample_term_name, Paired_end,
         Paired_with, File_download_URL, Library_strand_specific) = sample_info[File_Acc]

        url.write("\t".join([File_Acc, File_download_URL]) + "\n")

        # Only "Paired end" == "1" files get a unit/sample row; the mate is
        # referenced through fq2. Every other file is listed in
        # sample_url.tsv only.
        if Paired_end == "1":

            strandedness = "none"

            if Library_strand_specific == "True":
                strandedness = "yes"

            fq1 = "FASTQ/" + File_Acc + '.fastq.gz'
            fq2 = "FASTQ/" + Paired_with + '.fastq.gz'

            unit.write("\t".join([File_Acc, "1", fq1, fq2, strandedness]) + "\n")
            sample.write("\t".join([File_Acc, condition]) + "\n")

            condition_set.add(condition)

            if condition != Exp_Acc:  #To avoid controls

                # Only the first listed control is used. An experiment with
                # no listed control yields no contrast instead of raising
                # IndexError halfway through writing the files.
                controls = exp_control[Exp_Acc]
                if controls:
                    compare.add((condition, controls[0]))

    # Sets have no stable iteration order for strings across interpreter
    # runs; sorting makes diffexp.tsv reproducible.
    for condition_pair in sorted(compare):

        # Emit a contrast only when both sides have at least one sample row.
        if condition_pair[0] in condition_set and condition_pair[1] in condition_set:

            diffexp.write("  " + "-vs-".join(condition_pair) + ": \n")
            diffexp.write("   - " + condition_pair[0] + "\n")
            diffexp.write("   - " + condition_pair[1] + "\n")

    