In [52]:
import pandas as pd
import yaml
import argparse

Goal: find a quick way to parse an SRA metadata table into a YAML sample listing that can be used as input to my ATAC pipeline.

In [64]:
# Command-line interface: metadata CSV, the per-file ID column, and one or
# more grouping columns.
# Note: nargs=1 stores a one-element list (e.g. ['xxx.csv']), not a bare string.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-m", "--metadata",
    type=str,
    metavar="<csv_file>",
    nargs=1,
    required=True,
    help="CSV file containing metadata",
)
parser.add_argument(
    "-i", "--id_var",
    type=str,
    nargs=1,
    required=True,
    help="Column name for ID variable identifying individual files (i.e. SRR Run)",
)
# nargs='+' collects every grouping column given after -g into a single list.
parser.add_argument(
    "-g", "--group",
    type=str,
    nargs='+',
    required=True,
    help="Column names for grouping variables",
)

_StoreAction(option_strings=['-g', '--group'], dest='group', nargs='+', const=None, default=None, type=<class 'str'>, choices=None, help='Column names for grouping variables', metavar=None)

In [67]:
# Smoke-test the parser with an explicit argv list (no shell involved).
parser.parse_args(["-m", "xxx.csv", "-i", "Run", "-g", "genotype", "cell_type"])

Namespace(group=['genotype', 'cell_type'], id_var=['Run'], metadata=['xxx.csv'])

In [32]:
# Load the SRA run table and replace every missing value with the literal
# string 'None' in a single chained expression.
data = pd.read_csv("SraRunTable.txt-3.csv").fillna('None')

In [44]:
# Grouping columns, in the order they are concatenated into the sample name.
# Intended to be supplied by the user on the command line.
meta_cols = [
    "Cell_type",
    "time_point",
    "treatment",
    "genotype/variation",
]

In [45]:
# Build one sample label per row by joining the grouping columns with "-".
# Cast to str first: str.join raises TypeError on any non-string cell, which
# happens as soon as one grouping column holds numeric values.
data['condition'] = data[meta_cols].astype(str).apply("-".join, axis=1)

In [46]:
# Clean up the labels: replace spaces with "_" and drop the plus character.
# regex=False forces literal matching; "+" is a regex quantifier and the
# default value of `regex` differs between pandas versions.
data["condition"] = (
    data["condition"]
    .str.replace(" ", "_", regex=False)
    .str.replace("+", "", regex=False)
)

In [47]:
# Order rows by condition, then take the (condition, Run) pairs as a
# MultiIndex so each run can be collected under its condition.
sorted_by_condition = data.sort_values("condition")
data_multiindex = sorted_by_condition.set_index(["condition", "Run"]).index

In [48]:
# Map each condition to the list of run IDs that belong to it, preserving
# the iteration order of the (condition, run) pairs.
d = {}
for cond_name, run_id in data_multiindex:
    d.setdefault(cond_name, []).append(run_id)

In [51]:
# Append the sample listing to config.yml as block-style YAML, preceded by a
# newline so it starts on its own line after any existing content.
config = {'samples': d}
with open('config.yml', 'a') as f:
    f.write('\n')
    yaml.dump(config, f, default_flow_style=False)