In [1]:
import os
import time
import json
import xlrd
import pandas as pd
from meta.scripts.Utilities import Utilities
from yutyurin.staphylococcus_aureus.ProjectDescriber import ProjectDescriber

In [2]:
# Load the sample sheet and pair the items of its first element with those of the second
# (presumably a header row and a single value row - TODO confirm against SAMPLE_DATA_FILE).
sample_data_2d_array = Utilities.load_2d_array(ProjectDescriber.SAMPLE_DATA_FILE)
sample_record = dict(zip(*sample_data_2d_array))
# "raw_reads" is stored as one ";"-separated string; turn it into a list of file paths
sample_record["raw_reads"] = sample_record["raw_reads"].split(";")
# JSON-like layout: {sample name: sample record}
sample_data_dict = {sample_record["sample_name"]: sample_record}

assembly_file = "/data1/bio/projects/yutyurin/staphylococcus_aureus/pga-pe/06_plasmid_merger/188staph/188staph_genome.fna"
state_dict = json.loads(
    Utilities.load_string("/data1/bio/projects/yutyurin/staphylococcus_aureus/pga-pe/state.json")
)

# Per-assembly statistics and one statistics record per raw reads file
assembly_statistics = Utilities.count_assembly_statistics(assembly_file)
raw_reads_statistics = [
    Utilities.count_raw_reads_statistics(reads_file, "fastq_gz")
    for reads_file in sample_record["raw_reads"]
]

# Average the raw reads statistics over the reads files, then append the assembly statistics
raw_reads_mean = pd.DataFrame(raw_reads_statistics).set_index("file").mean()
assembly_series = Utilities.dict2pd_series(assembly_statistics)
combined_statistics = pd.concat([raw_reads_mean, assembly_series], axis=0, sort=False)
# Keep only the file name, not the full path
combined_statistics["file"] = os.path.basename(combined_statistics["file"])

In [3]:
# Take the first entry of the BLAST results JSON and parse the GenBank file
# named after its "geninfo_id" field; the first parsed record is used as the reference.
blast_first_result_dict = list(json.loads(Utilities.load_string("/data1/bio/projects/yutyurin/staphylococcus_aureus/pga-pe/06_plasmid_merger/188staph/blast/188staph_genome_blast_results.json")).values())[0]
blast_first_result_genbank = Utilities.parse_sequences("/data1/bio/projects/yutyurin/staphylococcus_aureus/pga-pe/06_plasmid_merger/188staph/genbank/{}.gbk".format(blast_first_result_dict["geninfo_id"]), "genbank")[0]

combined_statistics["reference_title"] = blast_first_result_dict["title"]
# len() of the parsed record - presumably the reference sequence length in bp (TODO confirm)
combined_statistics["reference_total_bp"] = len(blast_first_result_genbank)

coverage_statistics = Utilities.dict2pd_series(
    Utilities.count_assembly_coverages(combined_statistics["total_reads_bp"],
                                       combined_statistics["total_contigs_bp"],
                                       combined_statistics["reference_total_bp"]))
# This cell extends `combined_statistics` in place. Drop coverage entries left over from
# a previous run first; otherwise re-running the cell appends the same index labels again
# and label lookups such as combined_statistics["expected_assembly_coverage"] return
# a Series instead of a single value.
combined_statistics = pd.concat(
    [combined_statistics.drop(labels=coverage_statistics.index, errors="ignore"), coverage_statistics],
    axis=0, sort=False)

# Write the statistics as a single-row table next to the assembly file
Utilities.dump_tsv(pd.DataFrame(combined_statistics).transpose(), os.path.join(os.path.dirname(assembly_file), "coverage.tsv"))

In [4]:
# Modification time of the assembly file (UTC), formatted as month-day-year
assembly_date = time.strftime("%m-%d-%Y", time.gmtime(os.path.getmtime(assembly_file)))

# Pipeline log lines mentioning "spades:"; single-quoted tokens are extracted from each of them
# and the first entry kept by `remove_empty_values` is taken as the program version
# (presumably the first non-empty match - TODO confirm helper semantics).
spades_log_lines = [
    log_line for log_line in Utilities.load_list(
        "/data1/bio/projects/yutyurin/staphylococcus_aureus/pga-pe/log/2021-03-15-18-46-36/main.log")
    if "spades:" in log_line
]
spades_quoted_tokens = [
    Utilities.safe_findall("\\\'([^ ]+)\\\'", log_line, report=False) for log_line in spades_log_lines
]
spades_version = Utilities.remove_empty_values(spades_quoted_tokens)[0]

assembly_name = list(sample_data_dict.values())[0]["sample_name"]
genome_coverage = combined_statistics["expected_assembly_coverage"]

public_description = """The Staphylococcus aureus isolate was collected from the skin surface  of a child suffering from atopic eczema (atopic dermatitis).
The microorganism samples are available from the dedicated collection of the Federal Budgetary Science Institute KNIIEM (Kazan, Russia) and the State Collection of Pathogenic Microorganisms and Cell Cultures Obolensk (Serpukhov, Russia), accession number KZ-188.   
The draft genome sequencing of this isolate was performed in 2017 as part of a dissertation research work at the KNIIEM Institute. 
The isolate was reported as resistant to cephalosporins, macrolides and penicillins."""

# Values for the submission form; key order defines the column order of the dumped table
submission_dict = {
    "Assembly date": assembly_date,
    "Assembly method": "SPAdes",
    "Version or date program was run": spades_version,
    "Assembly name": assembly_name,
    "Genome coverage": genome_coverage,
    "Sequencing technology": "Illumina MiSeq",
    "Submission title": "Staphylococcus aureus isolated from the skin of a child with atopic eczema (atopic dermatitis)",
    "Public description": public_description,
    "Relevance": "Medical",
    "(Optional) Filter packages by organism name": "Staphylococcus aureus",
    "NCBI packages": "Pathogen: clinical or host-associated",
    "Bacteria and/or source DNA is available from": "The State Collection of Pathogenic Microorganisms and Cell Cultures Obolensk (Territory \"Kvartal A\", d. 24, r.p. Obolensk, g. Serpukhov, Moscow region, 142279), accession number KZ-188",
}

# Single-row table with the form values, stored in the "ncbi" directory of the project
submission_dir = os.path.join(ProjectDescriber.ROOT_DIR, "ncbi")
submission_df = pd.DataFrame([submission_dict])
submission_form_file = os.path.join(submission_dir, "submission_form_values.tsv")
Utilities.dump_tsv(submission_df, submission_form_file)

In [5]:
# Convert the filled-in Excel templates from the submission directory into TSV tables.
# `header` is zero-based: header=17 takes the 18th spreadsheet row as the column names.
antibiogram_xlsx = os.path.join(submission_dir, "Antibiogram_Staph188.xlsx")
antibiogram_df = pd.read_excel(antibiogram_xlsx, sheet_name="antibiogram", header=17)
antibiogram_tsv = os.path.join(submission_dir, "antibiogram.tsv")
Utilities.dump_tsv(antibiogram_df, antibiogram_tsv)

# Same for the pathogen sheet: header=12 takes the 13th spreadsheet row as the column names.
pathogen_xlsx = os.path.join(submission_dir, "Pathogen_Staph188.xlsx")
pathogen_df = pd.read_excel(pathogen_xlsx, sheet_name="Pathogen.cl.1.0", header=12)
pathogen_tsv = os.path.join(submission_dir, "pathogen.tsv")
Utilities.dump_tsv(pathogen_df, pathogen_tsv)