# Create the Data for the EGA Submission.

This is mostly parsing the Metadata table.

In [1]:
import os
import sys
from socket import gethostname

# ffmpeg lives in the bioinformatics conda environment, so its bin directory
# is appended to the end of PATH.
os.environ['PATH'] = os.environ['PATH'] + os.pathsep + os.path.expanduser('~/miniconda3/envs/bioinformatics/bin')

hostname = gethostname()

# Flag whether we are running on a host whose name starts with 'hpc-node'.
# On those hosts the parent directory is added to the import path
# (presumably so cf_analysis_lib can be found there -- confirm).
IN_DEEPTHOUGHT = hostname.startswith('hpc-node')
if IN_DEEPTHOUGHT:
    sys.path.append('..')

# These imports must stay below the sys.path adjustment above.
from cf_analysis_lib.load_libraries import *
import cf_analysis_lib


# Read the MGI sequence data and associated metadata

We may use the MGI data later!

In [2]:
# Parameters for loading the data; change them here and the call below follows.
sequence_type = "MGI"
datadir = '..'
sslevel = 'subsystems_norm_ss.tsv.gz'
taxa = "family"

# Pass the variables defined above instead of repeating their literal values,
# so the settings and the call cannot drift apart.
df, metadata = cf_analysis_lib.read_the_data(sequence_type, datadir, sslevel=sslevel, taxa=taxa)

Using ../MGI/FunctionalAnalysis/subsystems/MGI_subsystems_norm_ss.tsv.gz for the subsystems


# Display the names of all metadata columns.
list(metadata.columns)

## Samples

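The samples spreadsheet has the columns `alias`, `title`, `description`, `biological_sex`, `subject_id`, `phenotype`, `biosample_id`, `case_control`, `organism_part`, and `cell_line`.

Each column is filled from the metadata as follows (`NaN` means the column is left empty):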
- alias: the metadata index (the same sample name that is used in the run files)
- title: {Sample_Type} from {pwCF_ID} collected on {Sample date}
- description: {Sample_Type} from {pwCF_ID} collected on {Sample date}
- biological_sex: Gender
- subject_id: pwCF_ID
- phenotype: "Cystic Fibrosis"
- biosample_id: NaN
- case_control: NaN
- organism_part: NaN
- cell_line: NaN

In [3]:
os.makedirs('EGA', exist_ok=True)

# Translate the codes used in the metadata into the wording for the submission.
# BAL is bronchoalveolar lavage (previously misspelled "Broncheolar lavage",
# which ended up in every BAL title and description).
sample_types = {'S': 'Sputum', 'BAL': 'Bronchoalveolar lavage'}
biosex = {'M': 'male', 'F': 'female'}

# Columns of the samples sheet, in the order they are written:
# alias,title,description,biological_sex,subject_id,phenotype,biosample_id,case_control,organism_part,cell_line
sample_columns = [
    'alias', 'title', 'description', 'biological_sex', 'subject_id',
    'phenotype', 'biosample_id', 'case_control', 'organism_part', 'cell_line',
]

# Start from an all-empty frame so that columns we do not fill
# (biosample_id, case_control, organism_part, cell_line) are written as blanks.
samplesdf = pd.DataFrame(columns=sample_columns, index=metadata.index)
samplesdf['alias'] = metadata.index

# The dict lookups below raise KeyError for a code that is not in the mapping,
# so an unexpected Sample_Type or Gender stops the cell before anything is written.
samplesdf['title'] = metadata.apply(
    lambda row: f"{sample_types[row['Sample_Type']]} from {row['pwCF_ID']} collected on {row['Sample date']}",
    axis=1
)
samplesdf['description'] = samplesdf['title']
samplesdf['biological_sex'] = metadata.apply(
    lambda row: biosex[row['Gender']],
    axis=1
)
samplesdf['subject_id'] = metadata['pwCF_ID']
samplesdf['phenotype'] = "Cystic Fibrosis"

samplesdf.to_csv(os.path.join('EGA', 'samples.csv'), index=False)

## Runs

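The MGI run files carry no platform suffix in their names (`{sample}_R1` / `{sample}_R2`), whereas the MinION and PromethION run files have `_minion` or `_promethion` appended to the sample name.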

In [7]:
os.makedirs('EGA', exist_ok=True)

# MGI is paired-end: one row per entry in df.index, with an R1 and an R2 file.
with open(os.path.join('EGA', 'MGI_runs.csv'), 'w') as mgi_sheet:
    mgi_sheet.write("sample,file1,file2\n")
    for sample in df.index:
        mgi_sheet.write(f"{sample},/{sample}_R1.fastq.gz.c4gh,/{sample}_R2.fastq.gz.c4gh\n")

# The two Nanopore platforms each get a single-file sheet that lists only the
# samples whose metadata entry for that platform is not missing.
nanopore_sheets = (
    ('MinION_runs.csv', 'MinION', 'minion'),
    ('PromethION_runs.csv', 'PromethION', 'promethion'),
)
for sheet_name, platform_column, file_tag in nanopore_sheets:
    with open(os.path.join('EGA', sheet_name), 'w') as sheet:
        sheet.write("sample,file1\n")
        sequenced = metadata[metadata[platform_column].notna()]
        for sample in sequenced.index:
            sheet.write(f"{sample},/{sample}_{file_tag}.fastq.gz.c4gh\n")