In [1]:
import os
import json
import pandas as pd
from shutil import copy2
from meta.scripts.Utilities import Utilities

# Parse the run settings from config.json, then collect every gzipped FASTQ
# path that Utilities.scan_whole_dir yields for the configured input directory.
config_text = Utilities.load_string("config.json")
config = json.loads(config_text)

raw_reads = []
for found_path in Utilities.scan_whole_dir(config["input_dir"]):
    if found_path.endswith(".fastq.gz"):
        raw_reads.append(found_path)

In [2]:
# Create the output directory if needed, then copy every read file into it
# under its base name (the source directory layout is flattened).
# copy2 returns the destination path, so copied_reads lists the copies.
os.makedirs(config["output_dir"], exist_ok=True)

copied_reads = []
for read_path in raw_reads:
    destination = os.path.join(config["output_dir"], os.path.basename(read_path))
    copied_reads.append(copy2(read_path, destination))

In [3]:
similarities = Utilities.count_index_based_similarity(copied_reads)

# For every key of `similarities`, take the first element of its first entry.
# Presumably that is the most similar file name, i.e. the mate of the read
# file — TODO confirm against Utilities.count_index_based_similarity.
# The pairs are built eagerly, before any "_R2_" filtering takes place.
read_pairs = [(key, similarities[key][0][0]) for key in similarities]

sampledata_dicts = []
for first_file, second_file in read_pairs:
    # Paths containing "_R2_" are skipped so only the remaining files start a row.
    if "_R2_" not in first_file:
        first_name = os.path.basename(first_file)
        sampledata_dicts.append({
            "filename": first_name,
            "filename2": os.path.basename(second_file),
            # The pattern captures everything before the first "." or "_"
            # of the file name; that capture is used as the sample name.
            "sample_name": Utilities.safe_findall("^([^._]*)[._]", first_name),
        })

# One row per sample, indexed and ordered by sample name.
sampledata_df = (
    pd.DataFrame(sampledata_dicts)
    .set_index("sample_name")
    .sort_index()
)

In [4]:
# Load the two tables of the earlier submission (BioSample attributes and SRA
# metadata) and order each by its index. sort_index() returns a new frame, so
# nothing is mutated in place and no throw-away list is built for side effects.
submitted_microbe_df = Utilities.load_tsv(
    "../data/tables/ncbi/biosample_attributes_microbe.tsv").sort_index()
submitted_sra_df = Utilities.load_tsv("../data/tables/sra.tsv").sort_index()

# The column assignment below aligns on index labels, so it is only meaningful
# when both tables carry exactly the same index.
assert submitted_microbe_df.index.equals(submitted_sra_df.index), (
    "BioSample and SRA tables have different indexes: "
    "{} vs {} rows".format(len(submitted_microbe_df.index), len(submitted_sra_df.index)))

# Carry the BioProject accession over from the SRA table to the BioSample table.
submitted_microbe_df["bioproject_accession"] = submitted_sra_df["bioproject_accession"]

In [5]:
# Start from the previously submitted BioSample rows whose sample name
# contains "_Nextera", and re-key them by the name with that marker removed.
is_nextera_sample = submitted_microbe_df["*sample_name"].str.contains('_Nextera')
biosample_df = submitted_microbe_df.loc[is_nextera_sample, :]
biosample_df.index = biosample_df["*sample_name"].str.replace("_Nextera", "", regex=False)

# Column-wise concat with an empty frame that only carries the sample index:
# this adds an (all-NaN) row for every sample in sampledata_df that had no
# matching "_Nextera" row.
sample_index_only = pd.DataFrame(index=sampledata_df.index)
biosample_df = pd.concat([biosample_df, sample_index_only], axis=1, sort=False)

# Keep only the samples present in sampledata_df, with the column layout of
# the earlier submission.
in_sampledata = biosample_df.index.isin(sampledata_df.index)
biosample_df = biosample_df.loc[in_sampledata, submitted_microbe_df.columns]

# NOTE(review): "6sq" is hard-coded; presumably it has no "_Nextera"
# counterpart in the earlier submission and needs its name filled in — confirm.
biosample_df.at["6sq", "*sample_name"] = "6sq"

In [6]:
# Take the previously submitted SRA rows whose library ID contains "_Nextera",
# leaving out the old file-name columns (they are re-supplied by sampledata_df).
# NOTE(review): the original comparison stripped the literal names rather than
# the column name; for these two literals that is a plain membership test.
# If the intent was to tolerate padded column headers, this needs a real fix.
is_nextera_library = submitted_sra_df["library_ID"].str.contains("_Nextera")
columns_without_files = [column for column in submitted_sra_df.columns
                         if column not in ("filename", "filename2")]
new_sra_df = submitted_sra_df.loc[is_nextera_library, columns_without_files]

# Re-key by the library ID with the "_Nextera" marker removed.
new_sra_df.index = new_sra_df["library_ID"].str.replace("_Nextera", "", regex=False)

# Attach the new file names column-wise; samples without an earlier row are
# added with NaN in the inherited columns.
frames_to_join = [new_sra_df, sampledata_df, pd.DataFrame(index=sampledata_df.index)]
new_sra_df = pd.concat(frames_to_join, axis=1, sort=False)

# Rewrite the metadata that differs between the earlier DNA run and this RNA run.
new_sra_df["title"] = new_sra_df["title"].str.replace(
    "DNA Miseq-PE-WGS of ", "RNA Miseq-PE-WTS of ", regex=False)
new_sra_df["library_strategy"] = "RNA-Seq"
new_sra_df["library_source"] = "TRANSCRIPTOMIC"

# Keep only the samples present in sampledata_df, with the column layout of
# the earlier SRA table.
in_sampledata = new_sra_df.index.isin(sampledata_df.index)
new_sra_df = new_sra_df.loc[in_sampledata, submitted_sra_df.columns]

# NOTE(review): "6sq" is hard-coded; presumably it has no "_Nextera"
# counterpart in the earlier submission and needs its library ID filled in — confirm.
new_sra_df.at["6sq", "library_ID"] = "6sq"

In [7]:
# Write both tables as <cwd>/data/tables/ncbi/<name>.tsv.
# NOTE(review): the input tables are read from "../data/tables/..." while the
# output goes below the current working directory — confirm this is intended.
for table_file, df in (("biosample", biosample_df), ("sra", new_sra_df)):
    target_path = os.path.join(
        os.getcwd(), "data", "tables", "ncbi", "{}.tsv".format(table_file))
    Utilities.dump_tsv(df, target_path)