In [119]:
import glob
import os
import re

import boto3
import numpy as np
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

In [21]:
# Unsigned (anonymous) client: requests are sent without AWS credentials.
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# The listing is paginated, so walk every page of results for the prefix.
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket='cellpainting-gallery', Prefix='cpg0002-jump-scope/source_4/images/')

# One list of object records per page; the lists are flattened in a later cell.
objects = []
for page in pages:
    # A page that holds no keys has no "Contents" entry; default to an empty
    # list instead of raising KeyError.
    objects.append(page.get("Contents", []))

In [28]:
# Collapse the per-page lists into a single flat list of object records.
objects_flat = []
for page_objects in objects:
    objects_flat.extend(page_objects)

In [31]:
# Total number of object records collected from the listing.
len(objects_flat)

1526520

In [100]:
# Cache the full listing as CSV (one row per object, row index not written)
# so that later cells can reload it from disk.
pd.DataFrame(objects_flat).to_csv("output/jump-scope-image-file-info.csv", index=False)

In [131]:
# Reload the cached object listing from disk.
file_df = pd.read_csv("output/jump-scope-image-file-info.csv")

In [105]:
# Preview the first rows of the reloaded listing.
file_df.head()

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,cpg0002-jump-scope/source_4/images/2020_10_27_...,2022-06-23 10:54:02+00:00,"""cd21ddba116b5339ea414bdf40d256c9-2""",16000128,INTELLIGENT_TIERING
1,cpg0002-jump-scope/source_4/images/2020_10_27_...,2022-06-23 10:54:02+00:00,"""e6ce120be1049839c8344ed3bafd2d95-2""",16000128,INTELLIGENT_TIERING
2,cpg0002-jump-scope/source_4/images/2020_10_27_...,2022-06-23 10:54:02+00:00,"""0d46efd864ea4a9f02f635ce5bece2d0-2""",16000128,INTELLIGENT_TIERING
3,cpg0002-jump-scope/source_4/images/2020_10_27_...,2022-06-23 10:54:02+00:00,"""1ab1461c8191302b046474c527f228d8-2""",16000128,INTELLIGENT_TIERING
4,cpg0002-jump-scope/source_4/images/2020_10_27_...,2022-06-23 10:54:02+00:00,"""89dcb1df5164436806c77608c8df52bd-2""",16000128,INTELLIGENT_TIERING


In [132]:
# Keep only image files, identified by the extension at the end of the key.
used_img_formats = [".nd2", ".tiff", ".tif", ".TIF"]
# Escape each extension and anchor the alternation at the end of the key, so
# "." is a literal dot and text in the middle of a key cannot match.
img_suffix_pattern = "(?:" + "|".join(re.escape(ext) for ext in used_img_formats) + ")$"
is_image = file_df["Key"].str.contains(img_suffix_pattern, regex=True, na=False)
# Build the result in one chain on a new frame rather than assigning into and
# dropping from a filtered slice in place (avoids SettingWithCopyWarning).
file_df = (
    file_df.loc[is_image]
    .assign(Size_MB=lambda df: df["Size"] * 1e-6)  # scale Size by 1e-6 (bytes -> MB, assuming Size is in bytes)
    .drop(columns=["LastModified", "ETag", "StorageClass"])
)

In [133]:
# Preview the filtered table (image files only, with the Size_MB column).
file_df.head()

Unnamed: 0,Key,Size,Size_MB
786,cpg0002-jump-scope/source_4/images/2020_10_27_...,1722374,1.722374
788,cpg0002-jump-scope/source_4/images/2020_10_27_...,7259176,7.259176
789,cpg0002-jump-scope/source_4/images/2020_10_27_...,6734426,6.734426
790,cpg0002-jump-scope/source_4/images/2020_10_27_...,7111214,7.111214
791,cpg0002-jump-scope/source_4/images/2020_10_27_...,7910442,7.910442


In [134]:
# Save the image-only table; index=False leaves the row index out of the CSV.
file_df.to_csv("output/jump-scope-image-info-only.csv", index=False)

In [240]:
# Reload the image-only file table and load the profile metadata table.
file_df = pd.read_csv("output/jump-scope-image-info-only.csv")
metadata_df = pd.read_csv("output/all-profile-metadata.csv")

In [113]:
# Preview the image-only table after reloading it from CSV.
file_df.head()

Unnamed: 0,Key,Size,Size_MB
0,cpg0002-jump-scope/source_4/images/2020_10_27_...,1722374,1.722374
1,cpg0002-jump-scope/source_4/images/2020_10_27_...,7259176,7.259176
2,cpg0002-jump-scope/source_4/images/2020_10_27_...,6734426,6.734426
3,cpg0002-jump-scope/source_4/images/2020_10_27_...,7111214,7.111214
4,cpg0002-jump-scope/source_4/images/2020_10_27_...,7910442,7.910442


In [258]:
# Drop the two "_20xb" plates from the metadata, then display the row for the
# BRO0117033_20x plate as a spot check.
# (A broader exclusion tried earlier: "1Plane|BRO0117033_20xb|BRO0117056_20x|BRO0117056_20xb|BRO0117033_20x".)
excluded_barcode_pattern = "BRO0117033_20xb|BRO0117056_20xb"
is_excluded = metadata_df["Assay_Plate_Barcode"].str.contains(excluded_barcode_pattern)
metadata_subset = metadata_df[~is_excluded]
is_spot_check_plate = metadata_subset["Assay_Plate_Barcode"] == "BRO0117033_20x"
metadata_subset[is_spot_check_plate]


Unnamed: 0.1,Vendor,Batch,Plate_Map_Name,Assay_Plate_Barcode,Modality,Images_per_well,Sites-SubSampled,Binning,Magnification,Number_of_channels,z_plane,BF_Zplanes,Anomaly,Unnamed: 0,spinning-disc,aperture,dry-immersion,vs-brightfield,simultaneous-excitation,sites
180,Yokogawa_US,Scope1_Yokogawa_US_20X_6Ch_BRO0117033,JUMP-MOA_compound_platemap,BRO0117033_20x,Confocal,9,,1,20,6,12,11,none,52.0,,1.0,water,yes,2.0,9


In [280]:
def get_plate_file_sizes(metadata_df, filesize_df, verbose=True):
    """
    Attach the mean image file size of each plate to the plate metadata.

    Plates are matched to files by checking whether the plate barcode occurs
    as a literal substring of the file key, after "-" and "_" have been
    removed from the barcodes and "-", "_" and " " from the keys.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Needs the columns "Batch" and "Assay_Plate_Barcode". Not modified.
    filesize_df : pandas.DataFrame
        Needs the columns "Key" and "Size_MB". Not modified.
    verbose : bool, default True
        When True, print each matched barcode followed by its number of
        matching files.

    Returns
    -------
    tuple
        ``((n_files, n_matched), meta)``. ``n_files`` is the number of rows in
        ``filesize_df``; ``n_matched`` is the sum of per-plate match counts;
        ``meta`` is a copy of ``metadata_df`` with separator-stripped barcodes
        and a "Size_MB" column holding the mean size of the matching files.
        Rows whose "Batch" contains "siteSub" are skipped and keep NaN, as do
        plates with no matching file.

    Notes
    -----
    Matching is by substring, so a barcode that is a prefix of another one
    (e.g. "...Crest" and "...Crestz") also matches the other plate's files.
    Such files are counted for both plates, which is why ``n_matched`` can
    exceed ``n_files``.
    NOTE(review): decide which plate should own those files before relying on
    the per-plate means of overlapping barcodes.
    """
    meta = metadata_df.copy()
    files = filesize_df.copy()
    # Some filenames contain "-" whereas their metadata equivalent contains "_",
    # so strip the separators from both sides before matching.
    meta["Assay_Plate_Barcode"] = meta["Assay_Plate_Barcode"].str.replace("-|_", "", regex=True)
    files["Key"] = files["Key"].str.replace("-|_| ", "", regex=True)
    # Normalised barcodes that are looked up by their bare plate id instead
    # (presumably the keys do not carry the magnification suffix directly
    # after the id - confirm against the bucket layout).
    problem_platenames = {
        "BRO011701410x": "BRO0117014",
        "BRO011705620x": "BRO0117056",
        "BRO011703320x": "BRO0117033",
    }
    count = 0
    full_plates = meta[~meta["Batch"].str.contains("siteSub")]
    for i, row in full_plates.iterrows():
        barcode = problem_platenames.get(row["Assay_Plate_Barcode"], row["Assay_Plate_Barcode"])
        # regex=False: the barcode is data, not a pattern, so characters such
        # as "." or "(" must be matched literally.
        subset_df = files[files["Key"].str.contains(barcode, regex=False)]
        n_matching = subset_df.shape[0]
        # The mean of an empty selection is NaN, which marks unmatched plates.
        meta.loc[i, "Size_MB"] = subset_df["Size_MB"].mean()
        if verbose:
            print(barcode)
            print(n_matching)
        count += n_matching
    return (filesize_df.shape[0], count), meta



# get_plate_file_sizes(metadata_df, file_df)
# Run on the metadata subset; `t` is the tuple
# ((n_files, n_matched), metadata_with_sizes) returned by the function.
t = get_plate_file_sizes(metadata_subset, file_df)
t


Plate2PCO6ch4site10XPA
11519
Plate3PCO6ch4site10XPACrest
23022
Plate3PCO6ch4site10XPACrestz
11502
Plate3PCO6ch4site20XPACrestz
9216
Plate2PCO6ch9site20XPA
20736
Plate3PCO6ch9site20XPACrest
20736
Plate3PCO6chAdaptive20XPA
4914
BR00117060a10x
1920
BR00117061a10x
1920
BR00117062a10x
1911
BR00117063b10x
1920
BR00117061a
16128
BR00117062a
16119
BR00117063b
16128
CPBroadPhenixCBIN11PlaneP1
5760
CPBroadPhenixCBIN11PlaneP2
5760
CPBroadPhenixCBIN11PlaneP3
5760
CPBroadPhenixCBIN11PlaneP4
5760
CPBroadPhenixCBIN1P1
23040
CPBroadPhenixCBIN1P2
23040
CPBroadPhenixCBIN1P3
23040
CPBroadPhenixCBIN1P4
23040
CPBroadPhenixNCBIN11PlaneP1
5760
CPBroadPhenixNCBIN11PlaneP2
5760
CPBroadPhenixNCBIN11PlaneP3
5760
CPBroadPhenixNCBIN11PlaneP4
5760
CPBroadPhenixNCBIN1P1
23040
CPBroadPhenixNCBIN1P2
23040
CPBroadPhenixNCBIN1P3
23035
CPBroadPhenixNCBIN1P4
23040
CPBroadPhenixC1PlaneP1
11520
CPBroadPhenixC1PlaneP2
11520
CPBroadPhenixC1PlaneP3
11520
CPBroadPhenixC1PlaneP4
11520
CPBroadPhenixCP1
46080
CPBroadPhenixCP2
4608

((1521532, 1538785),
           Vendor                                    Batch  \
 0         MolDev         1siteSubSample_Scope1_MolDev_10X   
 1         MolDev         1siteSubSample_Scope1_MolDev_10X   
 2         MolDev  1siteSubSample_Scope1_MolDev_10X_4siteZ   
 3         MolDev   1siteSubSample_Scope1_MolDev_20X_4site   
 4         MolDev   1siteSubSample_Scope1_MolDev_20X_9site   
 ..           ...                                      ...   
 178  Yokogawa_US               Scope1_Yokogawa_US_20X_5Ch   
 180  Yokogawa_US    Scope1_Yokogawa_US_20X_6Ch_BRO0117033   
 181  Yokogawa_US    Scope1_Yokogawa_US_20X_6Ch_BRO0117059   
 182  Yokogawa_US   Scope1_Yokogawa_US_20X_6Ch_BRO01177034   
 183  Yokogawa_US        Scope1_Yokogawa_US_40X_BRO0117059   
 
                  Plate_Map_Name           Assay_Plate_Barcode  Modality  \
 0    JUMP-MOA_compound_platemap        Plate2PCO6ch4site10XPA  Confocal   
 1    JUMP-MOA_compound_platemap   Plate3PCO6ch4site10XPACrest  Confocal   
 2   

In [287]:
# get_plate_file_sizes returns ((n_files, n_matched), metadata); take the
# metadata frame under its own name so this cell works on a fresh kernel and
# can be re-run (rebinding `t = t[1]` only works once).
plate_sizes_df = t[1]
# Mean file size per plate for vendors containing "PE", leaving out the
# site-subsampled batches.
is_pe_full_plate = (
    plate_sizes_df["Vendor"].str.contains("PE")
    & ~plate_sizes_df["Batch"].str.contains("siteSub")
)
plate_sizes_df.loc[is_pe_full_plate, ["Size_MB", "Assay_Plate_Barcode", "Binning"]]

Unnamed: 0,Size_MB,Assay_Plate_Barcode,Binning
142,9.526377,CPBroadPhenixCBIN11PlaneP1,1
143,9.484644,CPBroadPhenixCBIN11PlaneP2,1
144,9.352902,CPBroadPhenixCBIN11PlaneP3,1
145,9.587195,CPBroadPhenixCBIN11PlaneP4,1
146,9.427739,CPBroadPhenixCBIN1P1,1
147,9.40003,CPBroadPhenixCBIN1P2,1
148,9.294915,CPBroadPhenixCBIN1P3,1
149,9.473822,CPBroadPhenixCBIN1P4,1
150,9.361273,CPBroadPhenixNCBIN11PlaneP1,1
151,9.244346,CPBroadPhenixNCBIN11PlaneP2,1


In [260]:
# NOTE(review): this needs `t` to be a DataFrame with a "Key" column, whereas
# the get_plate_file_sizes call in this notebook binds `t` to a tuple -
# presumably left over from an earlier version that also returned the
# normalised file table; confirm or remove.
t.reset_index().loc[0, "Key"]

'cpg0002jumpscope/source4/images/20201116Scope1PE/images/CBIN11Plane/CP Broad Phenix C BIN1 1Plane P120201111T092612Measurement 1/Images/r01c01f01p01ch1sk1fk1fl1.tiff'

In [187]:
# Plate2_PCO_6ch_4site_10XPA
# Plate2-PCO-6ch-4site-10XPA
# file_df[file_df["Key"].str.contains("CPBroadPhenixCBIN1")].reset_index().loc[0:2,"Key"].values

# BRO011701410x
# BRO011703320xb

# Show the first three keys that mention plate BRO0117033, to see how the
# plate id is embedded in the path.
mentions_bro0117033 = file_df["Key"].str.contains("BRO0117033")
bro0117033_files = file_df[mentions_bro0117033].reset_index()
bro0117033_files.loc[0:2, "Key"].values

array(['cpg0002jumpscope/source4/images/20201116Scope1YokogawaUS/BRO01170333D2x2x1dpc20x20201023151718/AssayPlatePerkinElmerCellCarrier384/AssayPlatePerkinElmerCellCarrier384A01T0001F001L01A01Z01C02.tif',
       'cpg0002jumpscope/source4/images/20201116Scope1YokogawaUS/BRO01170333D2x2x1dpc20x20201023151718/AssayPlatePerkinElmerCellCarrier384/AssayPlatePerkinElmerCellCarrier384A01T0001F001L01A01Z01C03.tif',
       'cpg0002jumpscope/source4/images/20201116Scope1YokogawaUS/BRO01170333D2x2x1dpc20x20201023151718/AssayPlatePerkinElmerCellCarrier384/AssayPlatePerkinElmerCellCarrier384A01T0001F001L01A02Z01C04.tif'],
      dtype=object)

In [279]:
# Mean of the Size_MB column over every row of the file table.
file_df["Size_MB"].mean()

8.991779374853115

In [141]:
# Inspect the plate barcodes as stored in the metadata table.
metadata_df["Assay_Plate_Barcode"]

0             Plate2_PCO_6ch_4site_10XPA
1       Plate3_PCO_6ch_4site_10XPA_Crest
2      Plate3_PCO_6ch_4site_10XPA_Crestz
3      Plate3_PCO_6ch_4site_20XPA_Crestz
4             Plate2_PCO_6ch_9site_20XPA
                     ...                
179                      BRO0117056_20xb
180                       BRO0117033_20x
181                       BRO0117059_20X
182                      BRO01177034_20x
183                       BRO0117059_40x
Name: Assay_Plate_Barcode, Length: 184, dtype: object

In [194]:
import re

# Scratch check: strip the "_20x..." suffix from a barcode to recover the bare
# plate id. (Also tried here: "CPBroadPhenixCBIN11PlaneP1" with "1Plane.*".)
suffix_pattern = re.compile("_20x.*|_20xa.*")
s = suffix_pattern.sub("", "BRO01177034_20x")
s

'BRO01177034'

In [198]:
# Scratch check of the barcode override table: print the bare plate id when the
# separator-stripped barcode has an override, otherwise print nothing.
problem_platenames = {
    "BRO011701410x": "BRO0117014",
    "BRO011703320xb": "BRO0117033",
    "BRO011705620x": "BRO0117056",
    "BRO011705620xb": "BRO0117056",
    "BRO011703320x": "BRO0117033",
}
pr = "BRO011705620x"
override = problem_platenames.get(pr)
if override is not None:
    print(override)

BRO0117056


In [None]:
import math
# Within a profile, assess how many sites there are per well.
# Is there an anomaly? I.e. some plates with wells containing two sites, others with three.

def check_sites(metadata_df, profile_path, feature):
    """
    Report the plates whose profile holds more than one distinct ``feature`` value.

    For every row of ``metadata_df`` the profile matching
    ``<profile_path>/<Batch>/<Assay_Plate_Barcode>/*_normalized_feature_select_negcon_batch.csv.gz``
    is loaded and the distinct values of its ``feature`` column are collected.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Needs the columns "Batch" and "Assay_Plate_Barcode".
    profile_path : str
        Root directory that contains the per-batch profile folders.
    feature : str
        Name of the profile column to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per inconsistent plate, with the columns "Batch",
        "Assay_Plate_Barcode" and "found_sites" (the set of distinct values).
        Empty, with the same columns, when every plate is consistent.

    Raises
    ------
    FileNotFoundError
        If no profile file matches the pattern for a plate.
    """
    columns = ["Batch", "Assay_Plate_Barcode", "found_sites"]
    # Collect plain records and build the frame once at the end; this also
    # covers the "nothing inconsistent" case, where pd.concat on an empty list
    # would raise ValueError.
    records = []
    for _, row in metadata_df.iterrows():
        pattern = os.path.join(
            profile_path,
            row["Batch"],
            row["Assay_Plate_Barcode"],
            "*_normalized_feature_select_negcon_batch.csv.gz",
        )
        matches = glob.glob(pattern)
        if not matches:
            # Name the missing profile instead of failing with a bare IndexError.
            raise FileNotFoundError(f"No profile file matches {pattern!r}")
        # Only the first match is read, in whatever order glob returns it.
        load_df = pd.read_csv(matches[0])
        found_sites = set(load_df[feature])
        if len(found_sites) > 1:
            records.append({
                "Batch": row["Batch"],
                "Assay_Plate_Barcode": row["Assay_Plate_Barcode"],
                "found_sites": found_sites,
            })
    return pd.DataFrame(records, columns=columns)
# Run the check over every profile and save the plates that report more than
# one distinct "Metadata_Site_Count" value.
# NOTE(review): `experiment_df` is not defined in the cells shown here -
# confirm where it comes from (possibly metadata_df was intended).
site_check = check_sites(experiment_df, "../jump-scope/profiles/", "Metadata_Site_Count")
site_check.to_csv("checkpoints/metadata_site_check.csv", index=False)
