In [62]:
import pandas as pd
import os
import re
import json

In [112]:
base_dir = 'data/mgnify/studies'

def gen_study_dir_contents():
    """Iterate over every study directory and yield all file paths within each one."""
    for name in os.listdir(base_dir):
        study_dir = os.path.join(base_dir, name)
        file_paths = []
        ana_path = os.path.join(study_dir, 'analyses.json')
        with open(ana_path) as fd:
            ana_json = json.load(fd)
        exp_types = []
        for analysis in ana_json['data']:
            exp_type = analysis['attributes']['experiment-type']
            sample_id = analysis['relationships']['sample']['data']['id']
            exp_types.append((exp_type, sample_id))
        for name2 in os.listdir(study_dir):
            file_paths.append(os.path.join(study_dir, name2))
        yield (study_dir, exp_types, file_paths)

        
def gen_study_files_match(pattern):
    """Generate all files within study directories matching a substring pattern"""
    for (study_dir, exp_types, file_paths) in gen_study_dir_contents():
        for file_path in file_paths:
            if re.search(pattern, file_path):
                yield file_path, exp_types


def count_features(key, tsv_paths):
    """Count all unique features"""
    features = dict()
    for (path, exp_types) in tsv_paths:
        df = pd.read_csv(path, sep='\t')
        if key not in df:
            print(f'"{key}" column not found in this table: {path}')
            continue
        if exp_type not in features:
            features[exp_type] = set()
        features[exp_type].update(set(df[key]))
    counts = {key: len(val) for key, val in features.items()}
    print(json.dumps(counts, indent=2))
    # print(features)


def count_examples(tsv_paths):
    """Count total number of examples (eg samples or runs) grouped by experiment type"""
    examples = dict()
    # Regex pattern for column names that represent mgnify ids
    pattern = r'^[A-Z]+\d+$'
    for (path, exp_type) in tsv_paths:
        df = pd.read_csv(path, sep='\t')
        if exp_type not in features:
            examples[exp_type] = 0
        example_count = len([key for key in df.keys() if re.match(pattern, key)])
        examples[exp_type] += example_count
    return examples

In [96]:
phylum_tsv_paths = list(gen_study_files_match('.+phylum_taxonomy.+\.tsv$'))
print('Phylum file count:', len(phylum_tsv_paths))
print('Example:', phylum_tsv_paths[-1])

Phylum file count: 2716
Example: ('data/mgnify/studies/MGYS00002035/ERP104197_phylum_taxonomy_abundances_SSU_v5.0.tsv', 'assembly')


In [109]:
full_tax_tsv_paths = list(gen_study_files_match(r'.+\d_taxonomy_abundances_.+\.tsv$'))
print('Full file count:', len(full_tax_tsv_paths))
print('Example:', full_tax_tsv_paths[0])

Full file count: 2715
Example: ('data/mgnify/studies/MGYS00002076/SRP076308_taxonomy_abundances_SSU_v4.0.tsv', 'unknown')


In [110]:
go_tsv_paths = list(gen_study_files_match(r'.+GO_abundances.+\.tsv$'))
print('Gene Ontology file count:', len(go_tsv_paths))
print('Example:', go_tsv_paths[0])

Gene Ontology file count: 1990
Example: ('data/mgnify/studies/MGYS00002076/SRP076308_GO_abundances_v4.0.tsv', 'unknown')


In [106]:
print('Phylum-only feature stats:')
count_features('phylum', phylum_tsv_paths)

Phylum-only feature stats:
"phylum" column not found in this table: data/mgnify/studies/MGYS00002264/SRP049210_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00002088/ERP105559_phylum_taxonomy_abundances_SSU_v4.0.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00003194/SRP114822_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00002361/SRP126531_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00002481/SRP091049_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00005137/SRP186503_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00001312/ERP015409_phylum_taxonomy_abundances_SSU_v4.1.tsv
"phylum" column not found in this table: data/mgnify/studies/MGYS00005142/ERP013105_phylum_taxonomy_abund

In [111]:
print('Full taxonomy feature stats:')
count_features('#SampleID', full_tax_tsv_paths)

Full taxonomy feature stats:
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00002264/SRP049210_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00002088/ERP105559_taxonomy_abundances_SSU_v4.0.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00003194/SRP114822_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00002361/SRP126531_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00002481/SRP091049_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00001312/ERP015409_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00005142/ERP013105_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID" column not found in this table: data/mgnify/studies/MGYS00005105/SRP173959_taxonomy_abundances_SSU_v4.1.tsv
"#SampleID"

In [113]:
print('Full Gene Ontology feature stats:')
count_features('GO', go_tsv_paths)

Full Gene Ontology feature stats:
"GO" column not found in this table: data/mgnify/studies/MGYS00002264/SRP049210_GO_abundances_v4.1.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00002088/ERP105559_GO_abundances_v4.0.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00003194/SRP114822_GO_abundances_v4.1.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00000296/SRP003580_GO_abundances_v1.0.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00000296/SRP003580_GO_abundances_v2.0.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00001312/ERP015409_GO_abundances_v4.1.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00001482/ERP006155_GO_abundances_v4.1.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00001635/ERP003497_GO_abundances_v3.0.tsv
"GO" column not found in this table: data/mgnify/studies/MGYS00001869/ERP016733_GO_abundances_v4.1.tsv
"GO" column not found in this table: da

In [114]:
print("Phylum abundance examples (total counts of run or samples)")
print(json.dumps(count_examples(phylum_tsv_paths), indent=2))

Phylum abundance examples (total counts of run or samples)
{
  "unknown": 7,
  "amplicon": 39,
  "assembly": 0,
  "metagenomic": 2,
  "metatranscriptomic": 96,
  "metatranscriptomic,amplicon": 0,
  "metagenomic,amplicon": 18,
  "metagenomic,unknown": 12,
  "metagenomic,assembly": 16,
  "metabarcoding": 1,
  "metatranscriptomic,metagenomic": 2,
  "assembly,metagenomic,amplicon": 46,
  "unknown,amplicon": 4,
  "metagenomic,metatranscriptomic,assembly": 1,
  "metabarcoding,amplicon": 0
}
