Converts the current IDR metadata format into a datapackage-based json file. On loading the file using the jsonschema-pandas backend, both the library and processed data files are automatically converted to DataFrames.

In [1]:
study="idr0001-study.txt"

In [2]:
from fileinput import input
from fileinput import close

In [3]:
sections = [{}]
close()  # just in case
print ">>>> Unused lines:"
for line in input([study]):
    line = line.rstrip("\n")
    if "Screen Number" in line:
        sections.append({})
        section = int(line[13:].strip())
        assert len(sections)-1 == section
        sections[-1]["Screen Number"] = section
    else:
        if not (line.startswith("#") or line.startswith("\"#")):
            parts = line.split("\t", 1)
            sections[-1][parts[0].strip()] = parts[1].strip()
        elif line.strip():
            print "  >>", line[0:80].strip()

>>>> Unused lines:
  >> "# Section with generic information about the study including title, description
  >> # Study
  >> # Study Publication
  >> # Study Contacts
  >> "# Section containing all information relative to each screen in the study inclu
  >> # Screen; this section should be repeated if a study contains multiple screens
  >> "# Library section. The library file should be supplied separately and it should
  >> # Protocols
  >> # Phenotypes
  >> # Raw Data Files
  >> # Feature Level Data Files (give individual file details unless there is one fil
  >> #  Processed Data Files


In [4]:
from os.path import exists
from os.path import join
from jsontableschema import infer

import datapackage as dp
import csv
import io

myDP = dp.DataPackage()

# Required properties
myDP.descriptor['name'] = study

# Dynamic values from study file
for k, v in sections[0].items():  # Top-level
    myDP.descriptor[k] = v

    myDP.descriptor['resources'] = []

for section in sections[1:]:
    num = section["Screen Number"]
    isn = section["Comment[IDR Screen Name]"]
    isp = isn.split("/")[1]
    lib = section["Library File Name"].replace("txt", "tsv")
    pdf = section["Processed Data File Name"].replace("txt", "tsv")
    
    if not (exists(join(isp, lib)) and exists(join(isp, pdf))):
        raise Exception("Could not find in %s: %s and %s" % (isp, lib, pdf))
    for name, path in (("library", lib), ("processed data", pdf)):
        with io.open(join(isp, path)) as stream:
            headers = stream.readline().rstrip('\n').split('\t')
            values = csv.reader(stream, dialect="excel", delimiter="\t")
            schema = infer(headers, values)
            for field in schema['fields']:
                if field['type'] == 'geojson':
                    del field['type']
        myDP.descriptor['resources'].append(
            {
                "name": "%s %s file" % (isp, name),
                "path": join(isp, path),
                "schema": schema,
            }
        )

In [5]:
with open(study.replace("txt", "json"), "w") as f:
    f.write(myDP.to_json())

In [8]:
copyDP = dp.DataPackage(study.replace("txt", "json"))

In [9]:
# Requires: pip install jsontableschema-pandas
storage = dp.push_datapackage(descriptor=study.replace("txt", "json"), backend='pandas')

In [10]:
processed = storage[storage.buckets[0]]
library = storage[storage.buckets[1]]

In [11]:
processed.describe()

Unnamed: 0,Plate,Well Number,Well,Characteristics [Organism],Term Source 1 REF,Term Source 1 Accession,Characteristics [Strain],Gene Identifier,Gene Symbol,Reagent Design Gene Annotation Build,Analysis Gene Annotation Build,Control Type,Control Comments,Channels,Replicate Group,Plate Issues
count,18720,18720,18720,18720,18720,18720,18720,18720.0,18720.0,18720.0,18720.0,18720.0,18720.0,18720,18720,18720.0
unique,195,96,96,2,2,2,3010,3006.0,1881.0,1.0,1.0,5.0,4.0,1,3,2.0
top,JL_130116_J6_6,5,A8,Schizosaccharomyces pombe,NCBITaxon,NCBITaxon_4896,MS1404,,,,,,,GFP:endogenous alpha tubulin 2;Cascade blue:gr...,3,
freq,96,195,195,17550,17550,17550,2473,3830.0,8569.0,18720.0,18720.0,13549.0,16203.0,18720,8832,18432.0


In [12]:
library.describe()

Unnamed: 0,Gene Identifier,Gene Symbol,Reproducibility of Shape Hits,Reproducibility of Microtubule Hits,Reproducibility of Cell Cycle Progression Hits,Visual Shape Hit,Visual Microtubule Hit,Conservation in S. cerevisiae,Conservation in Vertebrates,Conservation in H. sapiens,...,Phenotype 10,Phenotype 11,Phenotype 12,Phenotype 13,Phenotype 14,Phenotype 15,Phenotype 16,Phenotype 17,Phenotype 18,Phenotype 19
count,262,262.0,262.0,262.0,262.0,262.0,262.0,262,262,262,...,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0,262.0
unique,262,199.0,41.0,41.0,12.0,2.0,2.0,2,2,2,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
top,SPCC790.02,,,,,,,yes,yes,yes,...,,,,,,,,,,
freq,1,64.0,119.0,76.0,227.0,227.0,228.0,232,204,131,...,253.0,261.0,261.0,233.0,259.0,261.0,258.0,260.0,236.0,260.0


In [19]:
import pandas as pd
annotations = pd.merge(processed, library, how='outer', on=['Gene Identifier', 'Gene Symbol'])
annotations.describe()

Unnamed: 0,Plate,Well Number,Well,Characteristics [Organism],Term Source 1 REF,Term Source 1 Accession,Characteristics [Strain],Gene Identifier,Gene Symbol,Reagent Design Gene Annotation Build,...,Phenotype 10,Phenotype 11,Phenotype 12,Phenotype 13,Phenotype 14,Phenotype 15,Phenotype 16,Phenotype 17,Phenotype 18,Phenotype 19
count,18720,18720,18720,18720,18720,18720,18720,18731.0,18731.0,18720.0,...,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0
unique,195,96,96,2,2,2,3010,3006.0,1882.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
top,JL_130116_J6_6,5,A8,Schizosaccharomyces pombe,NCBITaxon,NCBITaxon_4896,MS1404,,,,...,,,,,,,,,,
freq,96,195,195,17550,17550,17550,2473,3830.0,8579.0,18720.0,...,3514.0,3602.0,3602.0,3329.0,3205.0,3610.0,3580.0,3215.0,3359.0,3600.0


In [21]:
annotations.to_csv("test.csv", index=False)