In [1]:
import os
import shutil
import datetime
import numpy as np
import pandas as pd
from meta.scripts.Utilities import Utilities
from meta.scripts.sample_data import SampleDataArray
from ashestopalov.nutrition.obesity_metagenomes.ProjectDescriber import ProjectDescriber

# Operator notes on starting the JupyterLab server inside the project Docker image.
# The string is bound to a throwaway name and is not read by the code in this notebook.
_ = """


To access a JupyterLabserver on http://ip_address:61156/?token=TOKEN:
export IMG=ivasilyev/curated_projects:latest && \
docker pull ${IMG} && \
docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 --net=host -it ${IMG} bash

git pull && jupyter lab --ip=0.0.0.0 --port=61156 --no-browser --NotebookApp.token=TOKEN
"""

# Base names of the sequencing run directories to process, one per line.
# Kept as a single newline-terminated string because it is consumed via `Utilities.split_lines()`.
DIRS = (
    "191023_M01969_0092_000000000-CMKKY\n"
    "191111_M05780_0056_000000000-CMJRN\n"
    "191122_M04046_0124_000000000-CMKPP\n"
    "191203_M04046_0129_000000000-CMJR5\n"
    "191223_M04046_0130_000000000-CMMM7\n"
    "200211_M04046_0132_000000000-CMKFR\n"
)

# Substrings looked up in the project directory names to tell the sample sources apart
SAMPLE_SOURCES = ("blood", "stool")
# Number of samples per chopped sample data chunk
MAX_SAMPLE_BATCH_SIZE = 10  # Save RAM

In [2]:
# Collect the raw reads files of every sequencing run, grouped by sample source.
# Expected layout: <run dir>/<*conversion* dir>/<*rogachev* project dir>/...reads files
source_raw_reads_files = {source: [] for source in SAMPLE_SOURCES}
for run_dir_base_name in Utilities.split_lines(DIRS):
    run_dir = os.path.join("/data1/bio/", run_dir_base_name)
    for run_dir_entry in os.listdir(run_dir):
        # Only the subdirectories named like "conversion" (case-insensitive) are of interest
        if "conversion" not in run_dir_entry.lower():
            continue
        conversion_dir = os.path.join(run_dir, run_dir_entry)
        if not os.path.isdir(conversion_dir):
            continue
        for conversion_dir_entry in os.listdir(conversion_dir):
            # Only the project directories named like "rogachev" (case-insensitive) are of interest
            if "rogachev" not in conversion_dir_entry.lower():
                continue
            project_dir = os.path.join(conversion_dir, conversion_dir_entry)
            if not os.path.isdir(project_dir):
                continue
            project_dir_label = os.path.basename(project_dir).lower()
            for sample_source in SAMPLE_SOURCES:
                # The sample source is recognized by its mention in the project directory name
                if sample_source not in project_dir_label:
                    continue
                for scanned_path in Utilities.scan_whole_dir(project_dir):
                    # Keep regular gzipped FASTQ files only, symlinks are skipped
                    if (os.path.isfile(scanned_path) and not os.path.islink(scanned_path)
                            and scanned_path.endswith((".fq.gz", ".fastq.gz"))):
                        source_raw_reads_files[sample_source].append(scanned_path)

In [3]:
# Copy the discovered raw reads files into the project raw reads directory
# (one subdirectory per sample source) and describe every source file in a table.
target_raw_reads_files = {source: [] for source in SAMPLE_SOURCES}
existing_raw_reads_files = []
source_raw_reads_data = []
for sample_source in SAMPLE_SOURCES:
    target_raw_reads_dir = os.path.join(ProjectDescriber.RAW_READS_DIR, sample_source)
    collected_target_files = target_raw_reads_files[sample_source]
    for source_file in source_raw_reads_files[sample_source]:
        source_base_name = os.path.basename(source_file)
        sample_name = Utilities.safe_findall("(.+)_S[0-9]+", source_base_name)
        # The launch date is the first "_"-separated token of the directory
        # located three levels above the reads file
        run_dir = source_file
        for _level in range(3):
            run_dir = os.path.dirname(run_dir)
        launch_date = os.path.basename(run_dir).split("_")[0]
        # Prefix the file name with the launch date
        target_base_name = "{}__{}".format(launch_date, source_base_name)
        target_file = os.path.join(target_raw_reads_dir, target_base_name)
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        # Files which are already present in the target directory are not copied again
        if not os.path.isfile(target_file):
            shutil.copy2(source_file, target_file)
        else:
            existing_raw_reads_files.append(target_file)
        if target_file not in collected_target_files:
            collected_target_files.append(target_file)
        source_file_record = dict(
            file_path=source_file,
            sample_name=sample_name,
            file_size=os.path.getsize(source_file),
            file_change_date=datetime.datetime.fromtimestamp(os.stat(source_file).st_mtime),
            target_name=target_base_name,
            target_alias=Utilities.safe_findall("(.+)_S[0-9]+_", target_base_name),
        )
        source_raw_reads_data.append(source_file_record)

source_raw_reads_data_file = os.path.join(
    ProjectDescriber.SAMPLE_DATA_DIR, "source_raw_reads_data.tsv")
Utilities.dump_tsv(pd.DataFrame(source_raw_reads_data), source_raw_reads_data_file)

In [4]:
# Build one sample data array per sample source.
# The sorted target files are split into consecutive chunks of 2,
# presumably the forward and reverse reads of the same sample -- TODO confirm
sampledata_arrays = dict()
for sample_source in SAMPLE_SOURCES:
    sorted_target_files = sorted(target_raw_reads_files[sample_source])
    reads_file_pairs = Utilities.split_list_by_chunk_length(sorted_target_files, 2)
    sampledata_arrays[sample_source] = SampleDataArray.generate(
        reads_file_pairs, regex="(.+)_S[0-9]+_")

Duplicate sample data line key, the regex check is considered: '191223__166b'
Duplicate sample data line key, the regex check is considered: '191223__057'
Duplicate sample data line key, the regex check is considered: '191223__058'
Duplicate sample data line key, the regex check is considered: '191223__061'


In [5]:
# Validate every sample data array, then tag its lines with the sample source.
# An explicit loop is used because both calls are made for their side effects only:
# there is no result worth collecting, and binding a throwaway list to `_`
# would also shadow the IPython last-output variable.
for sample_source in SAMPLE_SOURCES:
    sampledata_arrays[sample_source].validate()
    sampledata_arrays[sample_source].update_lines_state(dict(sample_source=sample_source))

In [6]:
# Check if all the sample lines were parsed successfully:
# every sample line has to account for exactly 2 reads files
reads_files_count = sum(len(i) for i in target_raw_reads_files.values())
sample_lines_count = sum(len(i) for i in sampledata_arrays.values())
assert reads_files_count == 2 * sample_lines_count, (
    "The number of reads files ({}) is not twice the number of sample lines ({})".format(
        reads_files_count, sample_lines_count))

# Check if all the sample lines have 2 reads files
unpaired_sample_names = [j.name for i in sampledata_arrays.values() for j in i.lines.values()
                         if len(j.reads) != 2]
assert len(unpaired_sample_names) == 0, (
    "Sample lines without exactly 2 reads files: {}".format(unpaired_sample_names))

# Check if all the keys are unique across the sample sources (even if it's redundant).
# The keys are compared one by one: passing `dict.keys()` views to `np.intersect1d()`
# makes NumPy treat each view as a single opaque object, so the intersection comes out
# empty whenever the key sets merely differ, and the check could never fail on a
# partial overlap. The loop also works for any number of sample sources.
exported_key_owners = dict()
for sample_source in sampledata_arrays:
    for exported_key in sampledata_arrays[sample_source].export().keys():
        assert exported_key not in exported_key_owners, (
            "The sample data key '{}' is shared by the sample sources '{}' and '{}'".format(
                exported_key, exported_key_owners[exported_key], sample_source))
        exported_key_owners[exported_key] = sample_source

In [7]:
# Export the sample data of every sample source:
#   * the whole sample data array as JSON;
#   * a CSV table of reads files (sample ID, absolute file path, reads direction);
#   * a TSV metadata table with a leading "#q2:types" row;
#   * the same CSV table chopped into chunks of at most MAX_SAMPLE_BATCH_SIZE * 2 rows.
chopped_sampledata_dir = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "chopped")
os.makedirs(chopped_sampledata_dir, exist_ok=True)
for sample_source in SAMPLE_SOURCES:
    sample_source_array = sampledata_arrays[sample_source]
    # Process the sample lines in the order of their sorted keys
    sample_source_lines = [sample_source_array.lines[key]
                           for key in sorted(sample_source_array.lines.keys())]

    # Reads files table: all the forward reads rows first, then all the reverse ones
    source_info_dicts = []
    for reads_idx, reads_direction in enumerate(["forward", "reverse"]):
        for sample_line in sample_source_lines:
            source_info_dicts.append({"sample-id": sample_line.name,
                                      "absolute-filepath": sample_line.reads[reads_idx],
                                      "direction": reads_direction})
    source_info_df = pd.DataFrame(source_info_dicts)
    source_info_df = source_info_df.sort_values("absolute-filepath")
    source_info_df = source_info_df.reset_index(drop=True)

    # Metadata table: the first row declares the type of every column
    column_type = "categorical"
    metadata_dicts = [{"#SampleID": "#q2:types", "BarcodeSequence": column_type,
                       "LinkerPrimerSequence": column_type, "Description": column_type,
                       "sample_source": column_type}]
    for sample_line in sample_source_lines:
        metadata_dicts.append({"#SampleID": sample_line.name, "BarcodeSequence": "",
                               "LinkerPrimerSequence": "", "Description": sample_line.name,
                               "sample_source": sample_source})
    metadata_df = pd.DataFrame(metadata_dicts)

    # Dump the complete tables
    main_sample_data_file = os.path.join(
        ProjectDescriber.SAMPLE_DATA_DIR, "main_sample_data_{}.json".format(sample_source))
    sample_source_array.dump(main_sample_data_file)
    qiime2_sample_data_file = os.path.join(
        ProjectDescriber.SAMPLE_DATA_DIR, "qiime2_sample_data_{}.csv".format(sample_source))
    source_info_df.to_csv(qiime2_sample_data_file, header=True, index=False)
    qiime2_meta_data_file = os.path.join(
        ProjectDescriber.SAMPLE_DATA_DIR, "qiime2_meta_data_{}.tsv".format(sample_source))
    Utilities.dump_tsv(metadata_df, qiime2_meta_data_file)

    # Dump the reads files table in chunks; the table is sorted by the file path,
    # so a chunk of MAX_SAMPLE_BATCH_SIZE * 2 rows presumably holds both reads files
    # of MAX_SAMPLE_BATCH_SIZE samples -- TODO confirm
    row_numbers = list(range(source_info_df.shape[0]))
    row_number_chunks = Utilities.split_list_by_chunk_length(
        row_numbers, MAX_SAMPLE_BATCH_SIZE * 2)
    for chunk_number, row_number_chunk in enumerate(row_number_chunks):
        chunk_file = os.path.join(
            chopped_sampledata_dir,
            "qiime2_sample_data_{}_chunk_{}.csv".format(sample_source, chunk_number))
        source_info_df.loc[row_number_chunk, :].to_csv(chunk_file, header=True, index=False)
    