In [1]:
import os
import pandas as pd
from meta.scripts.Utilities import Utilities
from ashestopalov.nutrition.obesity_metagenomes.ProjectDescriber import ProjectDescriber

In [2]:
# Scan the "qiime2" and "picrust2" directories under the project root and
# join both listings; later cells select their inputs from this combined
# listing by file name and parent directory name.
# NOTE(review): assumes Utilities.scan_whole_dir returns a list of file paths - confirm.
qiime2_files = Utilities.scan_whole_dir(os.path.join(ProjectDescriber.ROOT_DIR, "qiime2"))
picrust2_files = Utilities.scan_whole_dir(os.path.join(ProjectDescriber.ROOT_DIR, "picrust2"))
pipeline_files = qiime2_files + picrust2_files

In [3]:
# Read every "OTUs.tsv" table found by the scan. Column names come from the
# second line of each file (header=1) and rows are keyed by "#OTU ID".
otu_tables = [
    pd.read_csv(table_path, header=1, sep="\t", index_col="#OTU ID")
    for table_path in pipeline_files
    if os.path.basename(table_path) == "OTUs.tsv"
]
# Outer join: an OTU missing from a table is NaN in that table's columns
# (replaced by 0 in the final step).
otu_counts_df = pd.concat(otu_tables, axis=1, sort=False, join="outer")

# Reference table that supplies the "taxonomy" column, keyed by "#OTU ID".
silva_taxonomy_df = Utilities.load_tsv(
    "/data/reference/SILVA/SILVA_v138/SILVA_138_Taxonomy_headed.tsv").set_index("#OTU ID")
# Inner join: only OTU IDs present in both the merged tables and the
# reference table are kept.
otu_annotated_df = pd.concat([otu_counts_df, silva_taxonomy_df],
                             axis=1, sort=False, join="inner")

# Re-key the rows by (taxonomy, OTU ID), zero-fill the gaps and sort by index.
otu_id_df = (
    otu_annotated_df
    .rename_axis(index="#OTU ID")
    .reset_index()
    .set_index(["taxonomy", "#OTU ID"])
    .fillna(0)
    .sort_index()
)

In [4]:
def _merge_described_tables(files, table_name, parent_dir_name, id_column):
    """Merge all matching described tables into one zero-filled frame.

    A file is selected when its base name equals ``table_name`` and the name
    of its containing directory equals ``parent_dir_name``. Each selected
    file is loaded with ``Utilities.load_tsv``, indexed by
    ``("description", id_column)`` and concatenated column-wise with the
    others. Rows missing from some tables are filled with 0, and the result
    is sorted by index.

    :param files: iterable of file paths to select from
    :param table_name: exact file name to match
    :param parent_dir_name: exact name of the directory containing the file
    :param id_column: name of the ID column paired with "description" in the index
    :return: merged pandas DataFrame
    :raises ValueError: if no file matches (pandas would raise the same
        exception type on an empty list, but with a less specific message)
    """
    tables = [
        Utilities.load_tsv(table_path).set_index(["description", id_column])
        for table_path in files
        if os.path.basename(table_path) == table_name
        and os.path.basename(os.path.dirname(table_path)) == parent_dir_name
    ]
    if len(tables) == 0:
        raise ValueError("No '{}' files found in '{}' directories".format(
            table_name, parent_dir_name))
    return pd.concat(tables, axis=1, sort=False).fillna(0).sort_index()


ec_id_df = _merge_described_tables(
    pipeline_files, "pred_metagenome_unstrat_described.tsv", "EC_metagenome_out", "function")
ko_id_df = _merge_described_tables(
    pipeline_files, "pred_metagenome_unstrat_described.tsv", "KO_metagenome_out", "function")

pathway_id_df = _merge_described_tables(
    pipeline_files, "path_abun_unstrat_described.tsv", "pathways_out", "pathway")

In [5]:
# Write each merged table to <ROOT_DIR>/data/merged_data/<name>_IDs.tsv,
# creating the output directory first if it does not exist yet.
merged_data_dir = os.path.join(ProjectDescriber.ROOT_DIR, "data", "merged_data")
os.makedirs(merged_data_dir, exist_ok=True)
for table_name, table_df in (("OTU", otu_id_df), ("EC", ec_id_df),
                             ("KO", ko_id_df), ("pathway", pathway_id_df)):
    out_file = os.path.join(merged_data_dir, "{}_IDs.tsv".format(table_name))
    Utilities.dump_tsv(table_df, out_file, reset_index=True)

In [6]:
# Stack the "blood" and "stool" sample data CSVs row-wise with a fresh
# 0..n-1 index, then collect the distinct values of the "sample-id" column.
sample_data_files = [
    os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "qiime2_sample_data_{}.csv".format(source))
    for source in ("blood", "stool")
]
qiime2_source_info_df = pd.concat(
    [pd.read_csv(sample_data_file) for sample_data_file in sample_data_files],
    axis=0, ignore_index=True, sort=False)
sample_names = qiime2_source_info_df["sample-id"].unique()

In [7]:
# Sanity check: report every merged table that has at least one column
# which is not a known sample ID from the sample data sheets.
for name, df in (("OTU", otu_id_df), ("EC", ec_id_df),
                 ("KO", ko_id_df), ("pathway", pathway_id_df)):
    if any(column not in sample_names for column in df.columns):
        print("Failed check for results of: {}".format(name))