In [1]:
import os
import re
import xlrd
import pandas as pd
from io import StringIO
from meta.scripts.Utilities import Utilities
from yutyurin.staphylococcus_aureus.ProjectDescriber import ProjectDescriber

In [2]:
# Tab-separated table with a dashed rule under its header line
submission_email_str = """
SUBID     	BioProject	BioSample	Accession	Organism
---------------------------------------------------
SUB9351353	PRJNA717172	SAMN18489519	JAGGIM000000000	Staphylococcus aureus KZ_188
"""
# Blank out every run of two or more dashes, then parse the remaining text as TSV
dash_run_pattern = re.compile("[-]{2,}")
submission_email_buffer = StringIO(dash_run_pattern.sub("", submission_email_str))
submission_email_df = Utilities.load_tsv(submission_email_buffer)

# All submission-related files are looked up under <ROOT_DIR>/ncbi
submission_dir = os.path.join(ProjectDescriber.ROOT_DIR, "ncbi")

# Read the "SRA_data" sheet of the metadata workbook, using its first row as the header
sra_template_path = os.path.join(submission_dir, "SRA_metadata_acc.xlsx")
sra_metadata_template_df = pd.read_excel(
    sra_template_path,
    sheet_name="SRA_data",
    header=0,
)
pathogen_table_path = os.path.join(submission_dir, "pathogen.tsv")
pathogen_df = Utilities.load_tsv(pathogen_table_path)

In [3]:
# Load the sample data file as a 2D array
sample_data_2d_array = Utilities.load_2d_array(ProjectDescriber.SAMPLE_DATA_FILE)
# Pair the rows element-wise into one flat key -> value record
sample_record = dict(zip(*sample_data_2d_array))
# "raw_reads" holds several entries joined with ";" -- turn it into a list
sample_record["raw_reads"] = sample_record["raw_reads"].split(";")
# Final JSON-like layout: {sample name: record}
sample_data_dict = {sample_record["sample_name"]: sample_record}

In [4]:
# Index of the submission row / sample that this SRA record is built for
row_number = 0

# Resolve the sample record once instead of rebuilding the values list for every field
selected_sample = list(sample_data_dict.values())[row_number]
selected_reads = selected_sample["raw_reads"]

# Pieces of the human-readable title.
# NOTE(review): the pathogen table is always read at row 0, not at `row_number` -- confirm this is intended
title_organism = submission_email_df.loc[row_number, "Organism"]
title_host = pathogen_df.loc[0, "*host"]
title_isolation_source = pathogen_df.loc[0, "*isolation_source"]

# One row of SRA metadata; key order defines the column order of the resulting row
sra_dict = {
    "biosample_accession": submission_email_df.loc[row_number, "BioSample"],
    "library_ID": selected_sample["sample_name"],
    "title": "DNA Miseq-PE-WGS of {}: {}, {}".format(
        title_organism, title_host, title_isolation_source
    ),
    "library_strategy": "WGS",
    "library_source": "GENOMIC",
    "library_selection": "RANDOM",
    "library_layout": "paired",
    "platform": "ILLUMINA",
    "instrument_model": "Illumina MiSeq",
    "design_description": "The libraries were prepared using the Nextera XT kit and run on a 500-cycle protocol into 250-bp lanes without deviations or shearing of DNA",
    "filetype": "fastq",
    # Only the file names (not the full paths) of the first two read files are recorded
    "filename": os.path.basename(selected_reads[0]),
    "filename2": os.path.basename(selected_reads[1]),
}

# Turn the record into a single-row frame and append it below the template rows
sra_row_df = pd.DataFrame(Utilities.dict2pd_series(sra_dict)).T
sra_df = pd.concat([sra_metadata_template_df, sra_row_df], axis=0, sort=False)

sra_table_path = os.path.join(submission_dir, "sra.tsv")
Utilities.dump_tsv(sra_df, sra_table_path)
