In [39]:
import pandas
import time

from schema import BPSampleType, AttributeType, CodedAttributeType, RefObjectType, ImageType, ImageFileType, SAMPLE_ATTRIBUTESType, SAMPLE_CODED_ATTRIBUTESType, SAMPLE_TYPEType, FILESType, BPSampleSetType, SAMPLE_NAMEType, TAXONType, ImageSetType, IMAGE_TYPEType

In [2]:
import os

# Path of the Excel workbook that holds the *_TEMPLATE sheets read below.
# The default is a machine-specific absolute path; set the BP_INPUT_XLSX
# environment variable to point the notebook at another copy of the workbook
# without editing this cell.
input_filepath = os.environ.get(
    'BP_INPUT_XLSX',
    r'C:\Users\er-gac\Nextcloud\BIGPICTURE Pilot data\UMarburg\[UMR] pilot study WSI.xlsx',
)

In [3]:
# Read all template sheets in one pass: passing a list of sheet names makes
# pandas return a dict of DataFrames keyed by sheet name, so the workbook is
# opened and parsed once instead of once per sheet.
_template_sheets = pandas.read_excel(
    input_filepath,
    sheet_name=[
        'IMAGE_TEMPLATE',
        'SLIDE_TEMPLATE',
        'SAMPLE_TEMPLATE',
        'SPECIMEN_TEMPLATE',
        'BIOLOGICAL_BEING_TEMPLATE',
    ],
)

# One DataFrame per entity level; each is iterated row by row further down.
image_df = _template_sheets['IMAGE_TEMPLATE']
slide_df = _template_sheets['SLIDE_TEMPLATE']
sample_df = _template_sheets['SAMPLE_TEMPLATE']
specimen_df = _template_sheets['SPECIMEN_TEMPLATE']
biological_being_df = _template_sheets['BIOLOGICAL_BEING_TEMPLATE']

In [4]:
# Prefixes used to build the `alias` of each entity and the `accession` of the
# references that point at it (the final string is '<prefix>_<id>').
# NOTE: `biologcal_being_base` is misspelled, but the name is kept as-is
# because the loops that build the samples refer to it under this spelling.
(
    biologcal_being_base,
    specimen_base,
    sample_base,
    slide_base,
    image_base,
) = (
    'BiologicalBeing',
    'Specimen',
    'Sample',
    'Slide',
    'Image',
)

In [5]:
# Output files for the two exported XML documents. The paths are relative, so
# they are resolved against the current working directory when opened.
samples_xml_path, images_xml_path = 'samples.xml', 'images.xml'

In [42]:
# Wall-clock start; the elapsed time is printed at the end of this cell.
start = time.time()

# Collects every BPSampleType (beings, specimens, samples, slides) that ends
# up in the exported sample set.
samples = []

# One BIOLOGICAL_BEING entry per row of the biological-being sheet.
for _, being_row in biological_being_df.iterrows():
    being_id = being_row['Biological Being ID']
    recorded_sex = being_row['Sex_2']

    # Only the exact values 'M' and 'F' are passed through; anything else
    # (including an empty cell) is exported as an empty string.
    sex = recorded_sex if recorded_sex in ('M', 'F') else ""

    samples.append(
        BPSampleType(
            alias=f'{biologcal_being_base}_{being_id}',
            # TAXON_ID 9606 is hard-coded for every entry
            # (presumably the taxonomy id for human - TODO confirm).
            SAMPLE_NAME=SAMPLE_NAMEType(TAXON_ID=9606),
            SAMPLE_TYPE=SAMPLE_TYPEType('BIOLOGICAL_BEING'),
            SAMPLE_ATTRIBUTES=SAMPLE_ATTRIBUTESType(
                [AttributeType(TAG='sex', VALUE=sex)]
            ),
        )
    )

# Lookup tables, hoisted out of the loop.
# Fixation codes that are exported (scheme 'SRT') and the meaning emitted with them.
_fixation_meanings = {
    'C-2141C': 'Neutral Buffered Formalin',
}
# Anatomical-site names as written in the sheet -> code emitted under scheme 'SCT'.
_anatomical_site_codes = {
    'Colon': '71854001',
    'Liver': '10200004',
    'Pancreas': '15776009',
    'Lymph node': '59441001',
}

# One SPECIMEN entry per row of the specimen sheet, linked to its biological being.
for index, specimen_row in specimen_df.iterrows():
    id_value = specimen_row['Specimen-ID']
    biological_being_id = specimen_row['Biological Being ID (related to)']
    biological_being_reference = RefObjectType(accession=f'{biologcal_being_base}_{biological_being_id}')

    # Rebuilt for every row and filled only with attributes that were actually
    # recognised. This keeps `None` out of the exported list and prevents an
    # attribute from a previous row leaking into the current specimen (or a
    # NameError when the very first row has an unmapped anatomical site).
    coded_attributes = []

    fixation_type_code = str(specimen_row['Fixation Type_1']).strip()
    if fixation_type_code in _fixation_meanings:
        coded_attributes.append(
            CodedAttributeType(
                TAG='Fixation',
                VALUE=fixation_type_code,
                SCHEME='SRT',
                MEANING=_fixation_meanings[fixation_type_code]
            )
        )

    # str() mirrors the fixation handling above: an empty cell is not read as
    # a string, so calling .strip() on it directly would raise.
    anatomical_site_code_meaning = str(specimen_row['Anatomical Site_1']).strip()
    anatomical_site_code = _anatomical_site_codes.get(anatomical_site_code_meaning)
    if anatomical_site_code is not None:
        coded_attributes.append(
            CodedAttributeType(
                TAG='Anatomical Site',
                VALUE=anatomical_site_code,
                SCHEME='SCT',
                MEANING=anatomical_site_code_meaning
            )
        )

    specimen = BPSampleType(
        alias=f'{specimen_base}_{id_value}',
        SAMPLE_NAME=SAMPLE_NAMEType(TAXON_ID=9606),
        SAMPLE_TYPE=SAMPLE_TYPEType('SPECIMEN'),
        SAMPLE_REF=[biological_being_reference],
        # May be an empty list when neither attribute was recognised.
        SAMPLE_CODED_ATTRIBUTES=SAMPLE_CODED_ATTRIBUTESType(coded_attributes)
    )
    samples.append(specimen)

# One SAMPLE entry per row of the sample sheet, each referencing the specimen
# it was taken from via the '<specimen prefix>_<specimen id>' accession.
for _, row in sample_df.iterrows():
    own_id = row['Sample-ID']
    parent_specimen_id = row['Specimen-ID (related to)']
    samples.append(
        BPSampleType(
            alias=f'{sample_base}_{own_id}',
            SAMPLE_NAME=SAMPLE_NAMEType(TAXON_ID=9606),
            SAMPLE_TYPE=SAMPLE_TYPEType('SAMPLE'),
            SAMPLE_REF=[
                RefObjectType(accession=f'{specimen_base}_{parent_specimen_id}')
            ],
        )
    )

# Staining codes that are exported (scheme 'SRT') and the meaning emitted with
# each. Insertion order is the order in which attributes are appended.
_staining_meanings = {
    'C-22968': 'hematoxylin stain',
    'C-22921': 'blue shade eosin stain',
}

# One SLIDE entry per row of the slide sheet, linked to its parent sample.
for index, slide_row in slide_df.iterrows():
    id_value = slide_row['Slide-ID']
    sample_id = slide_row['Sample-ID (related to)']
    # The parent of a slide is a sample, and samples are aliased
    # '<sample_base>_<id>', so the reference must use the sample prefix.
    # (Using the slide prefix here produced an accession matching no sample.)
    sample_id_reference = RefObjectType(accession=f'{sample_base}_{sample_id}')

    # The cell may list several codes; each known code is detected by
    # substring match on the stringified cell.
    staining_type_codes = str(slide_row['Staining_1'])
    staining_attributes = [
        CodedAttributeType(
            TAG='Staining',
            VALUE=staining_code,
            SCHEME='SRT',
            MEANING=staining_meaning
        )
        for staining_code, staining_meaning in _staining_meanings.items()
        if staining_code in staining_type_codes
    ]

    slide = BPSampleType(
        alias=f'{slide_base}_{id_value}',
        SAMPLE_NAME=SAMPLE_NAMEType(TAXON_ID=9606),
        SAMPLE_TYPE=SAMPLE_TYPEType('SLIDE'),
        SAMPLE_REF=[sample_id_reference],
        SAMPLE_CODED_ATTRIBUTES=SAMPLE_CODED_ATTRIBUTESType(staining_attributes)
    )
    samples.append(slide)

# Wrap everything collected in `samples` into one set and write it out as XML
# under the root element name 'BP_SAMPLE_SET'.
sample_set = BPSampleSetType(BP_SAMPLE=samples)
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, so non-ASCII values would be written with
# different bytes (or fail to encode) depending on the machine.
with open(samples_xml_path, 'w', encoding='utf-8') as samples_file:
    # Second argument is passed as 0 (presumably the starting indentation level - TODO confirm).
    sample_set.export(samples_file, 0, name_='BP_SAMPLE_SET')


        
# Collects one ImageType per row of the image sheet for the exported image set.
images = []

for _, row in image_df.iterrows():
    image_id = row['Image ID']
    parent_slide_id = row['Slide-ID (related to)']
    file_name = row['File name']

    # NOTE(review): the three values below are read/assigned but not written
    # to the output anywhere in this loop.
    scanner_name = row['Scanner Name']
    scanner_manufacturer = 'Leica'
    magnification = row['Magnification']

    # NOTE(review): both checksum fields are filled with literal placeholder
    # strings, not with digests computed from the file.
    file_entry = ImageFileType(
        filetype='dcm',
        filename=file_name,
        checksum_method='MD5',
        checksum='checksum',
        unencrypted_checksum='unencrypted_checksum',
    )

    images.append(
        ImageType(
            alias=f'{image_base}_{image_id}',
            # Links the image to the slide it was scanned from.
            SAMPLE_REF=[RefObjectType(accession=f'{slide_base}_{parent_slide_id}')],
            IMAGE_TYPE=IMAGE_TYPEType('WSI'),
            FILES=FILESType([file_entry]),
        )
    )
    
# Build the set before opening the output so that a failure while constructing
# it does not leave behind a freshly truncated, empty file.
image_set = ImageSetType(IMAGE=images)
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, so non-ASCII values would be written with
# different bytes (or fail to encode) depending on the machine.
with open(images_xml_path, 'w', encoding='utf-8') as images_file:
    image_set.export(images_file, 0, name_='IMAGE_SET')

# Elapsed wall-clock seconds since `start` was taken at the top of this cell.
end = time.time()
print(end - start)



0.09226012229919434
