In [1]:
import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config
import numpy as np
import re

In [21]:
# Set up a --no-sign-request client to S3
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# Paginator allows for more than 1000 results per list_objects_v2 to be returned
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket='cellpainting-gallery', Prefix='cpg0002-jump-scope/source_4/images/')

objects = []
for page in pages:
    # Each page represents 1000 results. Iterate through these and extract the contents
    objects.append(page["Contents"])
# Convert list of lists into a humble list
# Each element in the list is a dictionary that contains information about a single file
objects_flat = [item for sublist in objects for item in sublist]

print(f"---- {len(objects_flat)} files found ----")

In [100]:
# Save to a CSV. Each element of objects_flat becomes a row
pd.DataFrame(objects_flat).to_csv("output/jump-scope-image-file-info.csv", index=False)

In [131]:
# Open the file info csv
file_df = pd.read_csv("output/jump-scope-image-file-info.csv")
file_df.head()

In [132]:
# We know the file extension for the images taken by the microscope. Drop any other files
# Also, drop columns that are not required and add a size in MB
used_img_formats = [".nd2", ".tiff", ".tif", ".TIF"]
file_df = file_df[file_df["Key"].str.contains("|".join(used_img_formats))]
file_df["Size_MB"] = file_df["Size"] * 1e-6
file_df.drop(["LastModified", "ETag", "StorageClass"], axis=1, inplace=True)
file_df.head()

In [134]:
# Save image file-specific info
file_df.to_csv("output/jump-scope-image-info-only.csv", index=False)

In [3]:
file_df = pd.read_csv("output/jump-scope-image-info-only.csv")
metadata_df = pd.read_csv("output/all-profile-metadata.csv")
file_df.head()

Unnamed: 0,Key,Size,Size_MB
0,cpg0002-jump-scope/source_4/images/2020_10_27_...,1722374,1.722374
1,cpg0002-jump-scope/source_4/images/2020_10_27_...,7259176,7.259176
2,cpg0002-jump-scope/source_4/images/2020_10_27_...,6734426,6.734426
3,cpg0002-jump-scope/source_4/images/2020_10_27_...,7111214,7.111214
4,cpg0002-jump-scope/source_4/images/2020_10_27_...,7910442,7.910442


In [258]:
# metadata_subset = metadata_df[~metadata_df["Assay_Plate_Barcode"].str.contains("1Plane|BRO0117033_20xb|BRO0117056_20x|BRO0117056_20xb|BRO0117033_20x")]
metadata_subset = metadata_df[~metadata_df["Assay_Plate_Barcode"].str.contains("BRO0117033_20xb|BRO0117056_20xb")]
metadata_subset[metadata_subset["Assay_Plate_Barcode"] == "BRO0117033_20x"]


Unnamed: 0.1,Vendor,Batch,Plate_Map_Name,Assay_Plate_Barcode,Modality,Images_per_well,Sites-SubSampled,Binning,Magnification,Number_of_channels,z_plane,BF_Zplanes,Anomaly,Unnamed: 0,spinning-disc,aperture,dry-immersion,vs-brightfield,simultaneous-excitation,sites
180,Yokogawa_US,Scope1_Yokogawa_US_20X_6Ch_BRO0117033,JUMP-MOA_compound_platemap,BRO0117033_20x,Confocal,9,,1,20,6,12,11,none,52.0,,1.0,water,yes,2.0,9


In [29]:
def get_plate_file_sizes(metadata_df, filesize_df):
    """
    Some filenames contain "-" whereas their metadata equivalent contains "_" or " ".
    Homogenise everything and remove these characters to allow for better comparison.
    
    This function is imperfect as it returns more of the files found in file_df. 
    Oh last check, it found an extra ~17,300 images that were found in the input filesize_df
    The cause for this could be: 
        1. Missed plate name that was unused in JUMP-scope analysis 
        2. Unaccounted for images (ie. files that end with .tiff/.nd2 etc) that are not part of 
            an image set
                This seems likely, since the following calibration subdirectory exists
                "2020_10_27_Scope1_YokogawaJapan/images/20201020T134356_/Calibration/"
        3.  Assay_Plate_Barcode to platename mismatch. ie. there is a substring match somewhere

    Despite this, the function does accurately get the average file sizes for PE microscopes, which
    is the desired outcome for comparison of binning settings on computation cost.
    """
    meta = metadata_df.copy()
    # Another out copy so the output does not have Assay_Plate_Barcode modified
    out_meta = metadata_df.copy()
    files = filesize_df.copy()
    # Remove inconsistent characters
    meta["Assay_Plate_Barcode"] = meta["Assay_Plate_Barcode"].str.replace("-|_", "", regex=True)
    files["Key"] = files["Key"].str.replace("-|_| ", "", regex=True)
    # Some platenames have had suffixes added. Remove.
    problem_platenames = {
        "BRO011701410x": "BRO0117014",
        # *20xb plates are ignored since they are removed from metadata_df in the input. 
        # This is because they are duplicates - the "b" refers to analysis included brightfield
        "BRO011703320xb": "BRO0117033",
        "BRO011705620xb": "BRO0117056",
        "BRO011705620x": "BRO0117056",
        "BRO011703320x": "BRO0117033"
        }
    # Can use count to report mismatches with input files and files found to match barcode
    count = 0
    # It isn't necessary to remove siteSub rows, tbh
    # for i, row in meta[~meta["Batch"].str.contains("siteSub")].iterrows():

    for i, row in meta.iterrows():
        barcode = row["Assay_Plate_Barcode"]
        if barcode in problem_platenames.keys():
            barcode = problem_platenames[barcode]
        # print(barcode)
        subset_df = files[files["Key"].str.contains(barcode)]
        out_meta.loc[i, "Size_MB"] = subset_df["Size_MB"].mean()
        out_meta.loc[i, "Size_MB_std"] = subset_df["Size_MB"].std()
        # Remove rows that have been matched
        # files = files[~files["Key"].str.contains(barcode)]
        # print(files.shape)
        # print(subset_df.shape[0])
        count += subset_df.shape[0]
    print(f"---- Input files: {filesize_df.shape[0]} -- Found files: {count} -- Unaccounted for files {filesize_df.shape[0] - count} ----")
    return out_meta #, files

metadata = get_plate_file_sizes(metadata_df, file_df)
# metadata = get_plate_file_sizes(metadata_subset, file_df)
metadata[(metadata["Vendor"].str.contains("PE")) & ~metadata["Batch"].str.contains("siteSub")][["Size_MB", "Size_MB_std", "Assay_Plate_Barcode", "Binning"]]


---- Input files: 1521532 -- Found files: 6198976 -- Unaccounted for files -4677444 ----


Unnamed: 0,Size_MB,Size_MB_std,Assay_Plate_Barcode,Binning
142,9.526377,2.056519,CP_Broad_Phenix_C_BIN1_1Plane_P1,1
143,9.484644,2.025929,CP_Broad_Phenix_C_BIN1_1Plane_P2,1
144,9.352902,1.995189,CP_Broad_Phenix_C_BIN1_1Plane_P3,1
145,9.587195,1.989622,CP_Broad_Phenix_C_BIN1_1Plane_P4,1
146,9.427739,1.770552,CP_Broad_Phenix_C_BIN1_P1,1
147,9.40003,1.744418,CP_Broad_Phenix_C_BIN1_P2,1
148,9.294915,1.715099,CP_Broad_Phenix_C_BIN1_P3,1
149,9.473822,1.716538,CP_Broad_Phenix_C_BIN1_P4,1
150,9.361273,1.410804,CP_Broad_Phenix_NC_BIN1_1Plane_P1,1
151,9.244346,1.354504,CP_Broad_Phenix_NC_BIN1_1Plane_P2,1


In [31]:
metadata.to_csv("output/all-profile-metadata-INCLUDING-FILESIZES.csv", index=False)

In [25]:
# Example for checking a string outside 
file_df[file_df["Key"].str.replace("-|_| ", "", regex=True).str.contains("Plate2PCO6ch4site10XPA")].reset_index().loc[0, "Key"]

'cpg0002-jump-scope/source_4/images/2020_11_06_Scope1_MolDev/images/Plate2-PCO-6ch-1site-10XPA_Plate20092/Plate2-PCO-6ch-4site-10XPA_A01_s1_w1.TIF'

In [4]:
# When were images acquired?

image_df = pd.read_csv("output/jump-scope-image-file-info.csv")

In [5]:
image_df.columns

Index(['Key', 'LastModified', 'ETag', 'Size', 'StorageClass'], dtype='object')