In [1]:
from pathlib import Path
import vaex
import numpy as np
import re
import pandas as pd


### BBBC001

In [36]:
# BBBC001: open the combined Nyxus feature table, attach constant metadata
# columns to every row, and export the annotated table as a feather/arrow file.
dataset = "BBBC001"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# Locate the feature file; fail with a clear message instead of a bare
# IndexError when nothing is found. The loop variable is deliberately not
# called `path`, which holds the image directory string above.
arrow_files = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrow_files:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")
arrowpath = arrow_files[0]
x = vaex.open(arrowpath)

# Every metadata column is constant for this dataset.
n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.full(n_rows, None)
x['wellnumber'] = np.full(n_rows, None)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.full(n_rows, 'Hoechst-33342')
x['organism'] = np.full(n_rows, 'human')
x['cell'] = np.full(n_rows, 'HT29')
x['perturbation'] = np.full(n_rows, 'Rock1_1885_k27')
x['control_type'] = np.full(n_rows, 'siRNA')
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, "RNA interference")

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))





### BBBC002

In [37]:
# BBBC002: open the combined Nyxus feature table, derive the perturbation from
# each image name, attach metadata columns, and export as feather/arrow.
dataset = "BBBC002"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# Locate the feature file; fail clearly instead of raising a bare IndexError.
arrow_files = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrow_files:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")
arrowpath = arrow_files[0]
x = vaex.open(arrowpath)

images = x['intensity_image'].tolist()

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
pattern = re.compile(r".*_(?P<perturbation>\w+)_40x")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
pert = [found.group("perturbation") for found in match]

# Perturbation token -> control type label.
pert_dict = {
    'nodsRNA': 'wild-type',
    '340': 'siRNA',
    'Anillin': 'siRNA',
    'mad2': 'siRNA',
    '48': 'siRNA',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.full(n_rows, None)
x['wellnumber'] = np.full(n_rows, None)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.full(n_rows, 'Hoechst-33342')
x['organism'] = np.full(n_rows, 'drosophila melanogaster')
x['cell'] = np.full(n_rows, 'Kc167')
x['perturbation'] = np.array(pert)
x['control_type'] = x['perturbation'].map(pert_dict)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, "RNA interference")

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))


### BBBC003

In [38]:
# BBBC003: open the combined Nyxus feature table, attach constant metadata
# columns to every row, and export the annotated table as feather/arrow.
dataset = "BBBC003"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# Locate the feature file; fail clearly instead of raising a bare IndexError.
arrow_files = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrow_files:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")
arrowpath = arrow_files[0]
x = vaex.open(arrowpath)

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.full(n_rows, None)
x['wellnumber'] = np.full(n_rows, None)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.full(n_rows, 'DIC')
x['organism'] = np.full(n_rows, 'mouse')
x['cell'] = np.full(n_rows, 'embryos')
# Use a real null rather than the string "None", matching how the other
# dataset cells in this notebook encode "not applicable".
x['perturbation'] = np.full(n_rows, None)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, "embryo viability assay")

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))

### BBBC004

In [39]:
# BBBC004: concatenate all Nyxus feature tables, derive the perturbation from
# the plate name, attach metadata columns, and export as feather/arrow.
dataset = "BBBC004"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])

# The perturbation code is parsed from the plate name, not the image name.
plates = x['plate'].tolist()
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern = re.compile(r".*_(?P<perturbation>\d+)_images")
match = [pattern.match(name) for name in plates]
unmatched = [name for name, found in zip(plates, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} plate names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
pert = [found.group("perturbation") for found in match]

# Three-digit plate code -> decimal string stored in the perturbation column.
pert_dict = {
    '000': '0',
    '015': '0.15',
    '030': '0.3',
    '045': '0.45',
    '060': '0.6',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.full(n_rows, None)
x['wellnumber'] = np.full(n_rows, None)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.full(n_rows, 'synthetic-nuclei')

x['organism'] = np.full(n_rows, None)
x['cell'] = np.full(n_rows, 'synthetic-cell')
x['perturbation'] = np.array(pert)
x['perturbation'] = x['perturbation'].map(pert_dict)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))



### BBBC005

In [40]:
# BBBC005: concatenate all Nyxus feature tables, parse well / focus / site /
# channel from each image name, attach metadata columns, and export as
# feather/arrow.
dataset = "BBBC005"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\w" / "\d" in a normal string literal are invalid escape sequences.
# NOTE(review): the leading `.*` and `\w+` are greedy and `\w` also matches `_`,
# so for a name shaped like `<prefix>_A01_C1_F1_s01_w1.ome.tif` the `well` group
# would capture `C1`, not `A01`. Confirm against the real file names before
# relying on `well` / `wellnumber`; the pattern is kept unchanged here.
pattern = re.compile(r".*_(?P<well>\w+).*_(?P<focus>\w+)_s(?P<site>\d+)_w(?P<channel>\d).ome.tif")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]
# The focus token is stored as the perturbation for this dataset.
pert = [found.group("focus") for found in match]
site = [found.group("site") for found in match]
channel = [found.group("channel") for found in match]

# Channel digit from the file name -> stain label.
channel_dict = {
    '1': 'cell body stain',
    '2': 'nuclei stain',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.array(site)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, None)
x['cell'] = np.full(n_rows, 'synthetic-cell')
x['perturbation'] = np.array(pert)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))



### BBBC006

In [45]:
# BBBC006: concatenate all Nyxus feature tables, parse the z position from the
# plate name and well / site / channel from the image name, attach metadata
# columns, and export as feather/arrow.
dataset = "BBBC006"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()
plates = x['plate'].tolist()

# Raw strings: "\w" / "\d" in normal string literals are invalid escape sequences.
pattern_zpos = re.compile(r"BBBC006_v1_images_z_(?P<zposition>\w+)")
match_zpos = [pattern_zpos.match(name) for name in plates]
unmatched_plates = [name for name, found in zip(plates, match_zpos) if found is None]
if unmatched_plates:
    raise ValueError(
        f"{len(unmatched_plates)} plate names do not match {pattern_zpos.pattern!r}, "
        f"e.g. {unmatched_plates[:3]}"
    )
z_pos = [found.group("zposition") for found in match_zpos]

pattern = re.compile(r".*_(?P<well>\w+)_s(?P<site>\d+)_w(?P<channel>\d).*.ome.tif")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]
site = [found.group("site") for found in match]
channel = [found.group("channel") for found in match]

# Channel digit from the file name -> stain label.
channel_dict = {
    '1': 'Hoechst-33342',
    '2': 'Phalloidin',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.array(site)
x['z_position'] = np.array(z_pos)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)
x['organism'] = np.full(n_rows, 'human')
x['cell'] = np.full(n_rows, 'U2OS')
x['perturbation'] = np.full(n_rows, None)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))


### BBBC007

In [173]:
# BBBC007: concatenate all Nyxus feature tables, derive the channel from the
# d/f letter in each image name, attach metadata columns, and export as
# feather/arrow.
dataset = "BBBC007"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])

# Strip the extension first so the "f" in ".ome.tif" is not counted as a
# channel letter.
images = [name.rsplit('.ome.tif')[0] for name in x['intensity_image'].tolist()]
pattern = re.compile(r'[df]', re.IGNORECASE)

# Require exactly one d/f letter per image. Flattening all findall() results
# (as the cell did before) silently shifts channels onto the wrong rows
# whenever a name contains zero or several such letters.
channel = []
for name in images:
    letters = pattern.findall(name)
    if len(letters) != 1:
        raise ValueError(
            f"Expected exactly one 'd'/'f' channel letter in image name {name!r}, found {letters}"
        )
    channel.append(letters[0])

# Channel letter (either case) -> stain label.
channel_dict = {
    'd': 'DNA',
    'D': 'DNA',
    'f': 'actin',
    'F': 'actin',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.full(n_rows, None)
x['wellnumber'] = np.full(n_rows, None)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)
x['organism'] = np.full(n_rows, 'drosophila melanogaster')
# No leading space, so the value matches the 'Kc167' label used for BBBC002.
x['cell'] = np.full(n_rows, 'Kc167')
x['perturbation'] = np.full(n_rows, None)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))

### BBBC008

In [14]:
# BBBC008: concatenate all Nyxus feature tables, parse well / site / channel
# from each image name, attach metadata columns, and export as feather/arrow.
dataset = "BBBC008"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\w" / "\d" in a normal string literal are invalid escape sequences.
pattern = re.compile(r".*050116000001_(?P<well>\w+)f(?P<site>\d+)d.*channel(?P<channel>\d).*.ome.tif")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]
site = [found.group("site") for found in match]
channel = [found.group("channel") for found in match]

# Channel digit from the file name -> stain label.
channel_dict = {
    '1': 'Hoechst-33342',
    '3': 'Phalloidin',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.array(site)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)
x['organism'] = np.full(n_rows, 'human')
x['cell'] = np.full(n_rows, 'HT29')
x['perturbation'] = np.full(n_rows, None)
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, 'screen for mitotic regulators')

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))


### BBBC009

In [17]:
# BBBC009: concatenate every combined Nyxus feature table found under the
# dataset folder, add one constant metadata column per entry below, and write
# the result out as a feather/arrow file.
dataset = "BBBC009"
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)

# Collect the feature files in the order rglob yields them.
arrowpaths = []
for entry in inp_dir.rglob('*'):
    if entry.is_file() and entry.name == "combined_NyxusFeatures.arrow":
        arrowpaths.append(entry)

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])

# Column name -> value repeated on every row; dict order fixes column order.
constant_columns = {
    'path': path,
    'well': None,
    'wellnumber': None,
    'site': None,
    'z_position': None,
    'channel': "DIC",
    'organism': 'human',
    'cell': 'RBC',
    'perturbation': None,
    'control_type': None,
    'modality': "Optical Imaging",
    'experimental_technique': None,
}
for column_name, value in constant_columns.items():
    x[column_name] = np.array([value] * len(x))

# Create the output folder only when it is missing.
outpath = Path(f'{inp_dir}/combined')
if not outpath.exists():
    outpath.mkdir(parents=True, exist_ok=False)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))

### BBBC010

In [25]:
# BBBC010: concatenate all Nyxus feature tables, parse well / channel from each
# image name, derive perturbation and control type from the well's column
# digits, attach metadata columns, and export as feather/arrow.
dataset = "BBBC010"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\w" / "\d" in a normal string literal are invalid escape sequences.
pattern = re.compile(r".*20070424_(?P<well>\w+)_w(?P<channel>\d)")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]
channel = [found.group("channel") for found in match]

# Channel digit from the file name -> channel label.
channel_dict = {
    "1": "brightfield",
    "2": "GFP",
}

# Two-digit well number -> treatment: '01'..'12' ampicillin, '13'..'24' untreated.
pert_dict = {
    f"{column:02d}": ('ampicillin' if column <= 12 else 'untreated')
    for column in range(1, 25)
}
# Treatment -> control type label.
control_dict = {
    'ampicillin': 'positive controls',
    'untreated': 'negative controls',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, 'C.elegans')
x['cell'] = np.full(n_rows, None)
x['perturbation'] = x['wellnumber'].map(pert_dict)
x['control_type'] = x['perturbation'].map(control_dict)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))
x


### BBBC011

In [80]:
# BBBC011: concatenate all Nyxus feature tables (tagging each with its folder
# relative to the dataset directory), parse the well from the image name and
# the perturbation from the folder name, attach metadata columns, and export
# as feather/arrow.
dataset = "BBBC011"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = []
for arrowpath in arrowpaths:
    # Folder of the feature file relative to the dataset directory (text after
    # the last "<dataset>/", minus the file name); stored in the `plate` column.
    foldername = '/'.join(str(arrowpath).split(f"{dataset}/")[-1].lstrip('/').split('/')[:-1])
    x_df = vaex.open(arrowpath)
    x_df['plate'] = np.full(len(x_df), foldername)
    x.append(x_df)

x = vaex.concat(x)
images = x['intensity_image'].tolist()
plates = x['plate'].tolist()

# Raw strings: "\w" in a normal string literal is an invalid escape sequence.
pattern = re.compile(r"(?P<well>\w+)s_c.*.ome.tif")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )

pattern2 = re.compile(r"BBBC011_v1_images/(?P<perturbation>.*)_rep1/")
match2 = [pattern2.match(name) for name in plates]
unmatched_plates = [name for name, found in zip(plates, match2) if found is None]
if unmatched_plates:
    raise ValueError(
        f"{len(unmatched_plates)} folder names do not match {pattern2.pattern!r}, "
        f"e.g. {unmatched_plates[:3]}"
    )
perturbation = [found.group("perturbation") for found in match2]

# Perturbation name -> control type label.
perturbation_dict = {
    'L4440': 'negative controls',
    'daf-2': 'positive controls',
}

well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.full(n_rows, None)

x['organism'] = np.full(n_rows, 'C.elegans')
x['cell'] = np.full(n_rows, None)
x['perturbation'] = np.array(perturbation)
x['control_type'] = x['perturbation'].map(perturbation_dict)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))



### BBBC012

In [96]:
# BBBC012: concatenate all Nyxus feature tables, parse well / channel from each
# image name, derive perturbation and control type from the plate name, attach
# metadata columns, and export as feather/arrow.
dataset = "BBBC012"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\w" / "\d" in a normal string literal are invalid escape sequences.
pattern = re.compile(r"101210OranePlt2_(?P<well>\w+)_w(?P<channel>\d)")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
well = [found.group("well") for found in match]
# Keep only the digits of the captured well token.
wellnumber = [''.join(re.findall(r'\d', token)) for token in well]
channel = [found.group("channel") for found in match]

# Channel digit from the file name -> channel label.
channel_dict = {
    "1": "brightfield",
    "2": "GFP",
    "3": "mCherry",
}

# Plate name -> perturbation label.
perturbation_dict = {
    "wt_plate2": "wt",
    "pmk1_plate2": "pmk-1(km25)",
}

# Perturbation label -> control type label.
control_dict = {
    "wt": 'negative controls',
    "pmk-1(km25)": 'positive controls',
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, 'C.elegans')
x['cell'] = np.full(n_rows, None)
x['perturbation'] = x['plate'].map(perturbation_dict)
x['control_type'] = x['perturbation'].map(control_dict)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, None)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))


### BBBC013

In [5]:
# BBBC013: concatenate all Nyxus feature tables, parse channel / well from each
# image name, attach per-well perturbation, control type and dose plus constant
# metadata columns, and export as feather/arrow.
dataset = "BBBC013"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'
platemap = Path('/projects/PanMicroscopy/data/BBBC/BBBC013/raw/Ground_Truth/BBBC013_v1_platemap_all.txt')

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\d" / "\w" in a normal string literal are invalid escape sequences.
pattern = re.compile(r"Channel(?P<channel>\d).*(?P<well>\w)-(?P<wellnumber>\d+)")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
channel = [found.group("channel") for found in match]
well_row = [found.group("well") for found in match]
wellnumber = [found.group("wellnumber") for found in match]
# Full well id = row letter followed by the column digits, e.g. "A" + "01".
well = [f"{row}{number}" for row, number in zip(well_row, wellnumber)]

# Channel digit from the file name -> label (no stray leading space in the label).
channel_dict = {
    "1": "FKHR-GFP",
    "2": "DRAQ5",
}

# One dose string per line of the platemap file; the first line is skipped.
with open(platemap, 'r') as handle:
    doses = [line.strip() for line in handle][1:]

# 96-well layout (rows A-H, columns 01-12), built row-major so the key order
# is A01, A02, ..., H12:
#   columns 03-11: compound wells (rows A-D 'Wortmannin', rows E-H 'LY294002')
#   column  02   : 'emptywell'
#   columns 01/12: controls; rows A-D have the negative control in column 01
#                  and the positive in column 12, rows E-H the other way round
pert_dict = {}
control_dict = {}
for row in "ABCDEFGH":
    upper_half = row in "ABCD"
    negative_col = 1 if upper_half else 12
    for col in range(1, 13):
        well_id = f"{row}{col:02d}"
        if 3 <= col <= 11:
            pert_dict[well_id] = 'Wortmannin' if upper_half else 'LY294002'
            control_dict[well_id] = ''
        elif col == 2:
            pert_dict[well_id] = ''
            control_dict[well_id] = 'emptywell'
        else:
            pert_dict[well_id] = ''
            control_dict[well_id] = 'negative controls' if col == negative_col else 'positive controls'

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, 'human')
x['cell'] = np.full(n_rows, "U2OS")
x['perturbation'] = x['well'].map(pert_dict)
x['control_type'] = x['well'].map(control_dict)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, 'cytoplasm nucleus translocation')

# Doses are paired with wells purely by position (i-th dose line <-> i-th well
# in A01..H12 order), so a short platemap must fail loudly, not with IndexError.
keys = list(control_dict.keys())
if len(doses) < len(keys):
    raise ValueError(
        f"Platemap {platemap} has {len(doses)} dose lines but {len(keys)} wells are expected"
    )
dose_dict = dict(zip(keys, doses))
x['dose'] = x['well'].map(dose_dict)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))



### BBBC014

In [4]:
# BBBC014: read the per-well cell type and dose from the platemap spreadsheet,
# concatenate all Nyxus feature tables, parse channel / well from each image
# name, attach metadata columns, and export as feather/arrow.
dataset = "BBBC014"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'
platemap = Path('/projects/PanMicroscopy/data/BBBC/BBBC014/raw/Ground_Truth/BBBC014_v1_platemap.xls')

meta = pd.read_excel(platemap, sheet_name='DoseKey')
# Zero-pad the column number so well ids look like "A01".
padded_numbers = [str(num).zfill(2) for num in meta.Col]
meta['well'] = [f"{row}{number}" for row, number in zip(meta['Row'], padded_numbers)]
keys = list(meta.well)
# zip pairs each well with its value in one pass (the previous version rebuilt
# the whole column list for every row).
cell_dict = dict(zip(keys, meta.CellType))
dose_dict = dict(zip(keys, meta.Dose))

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])
images = x['intensity_image'].tolist()

# Raw string: "\d" / "\w" in a normal string literal are invalid escape sequences.
pattern = re.compile(r"Channel (?P<channel>\d)-.*-(?P<well>\w)-(?P<wellnumber>\d+).*")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
channel = [found.group("channel") for found in match]
well_row = [found.group("well") for found in match]
wellnumber = [found.group("wellnumber") for found in match]
# Full well id = row letter followed by the column digits.
well = [f"{row}{number}" for row, number in zip(well_row, wellnumber)]

# Channel digit from the file name -> channel label.
channel_dict = {
    "1": "FITC",
    "2": "DAPI",
}

n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = np.array(well)
x['wellnumber'] = np.array(wellnumber)
x['site'] = np.full(n_rows, None)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, 'human')
x['cell'] = x['well'].map(cell_dict)
x['perturbation'] = np.full(n_rows, 'TNFα')
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, 'cytoplasm nucleus translocation')

x['dose'] = x['well'].map(dose_dict)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))



### BBBC015

In [2]:
# BBBC015: concatenate all Nyxus feature tables, keep only rows whose image
# name carries the `_c0` suffix, parse plate position / site / channel from the
# image name, attach metadata columns (via pandas), and export as feather/arrow.
dataset = "BBBC015"
inp_dir = Path('/projects/PanMicroscopy/NyxusFeatures/BBBC').joinpath(dataset)
path = f'/projects/PanMicroscopy/data/BBBC/{dataset}/omeconverted'

# The loop variable is deliberately not called `path`, which holds the image
# directory string above.
arrowpaths = [
    candidate
    for candidate in inp_dir.rglob('*')
    if candidate.is_file() and candidate.name == "combined_NyxusFeatures.arrow"
]
if not arrowpaths:
    raise FileNotFoundError(f"No combined_NyxusFeatures.arrow found under {inp_dir}")

x = vaex.concat([vaex.open(arrowpath) for arrowpath in arrowpaths])

# Step 1: read the digit after "_c" and keep only the rows where it is '0'.
# Raw strings: "\d" in a normal string literal is an invalid escape sequence.
images = x['intensity_image'].tolist()
pattern = re.compile(r".*_c(?P<channel>\d)")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
channel = [found.group("channel") for found in match]
x['channel'] = np.array(channel)
x = x[x['channel']=='0']

# Step 2: on the remaining rows, parse "<position>_<site>_<channel>" from the
# start of the image name.
images = x['intensity_image'].tolist()
pattern = re.compile(r"(?P<wellnumber>\d+)_(?P<site>\d)_(?P<channel>\d)")
match = [pattern.match(name) for name in images]
unmatched = [name for name, found in zip(images, match) if found is None]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} image names do not match {pattern.pattern!r}, e.g. {unmatched[:3]}"
    )
channel = [found.group("channel") for found in match]
site = [found.group("site") for found in match]
wellnumber = [found.group("wellnumber") for found in match]

# Plate position (as a string) -> well id, for positions 49..84 counted
# row-major over 12 columns: '49' -> 'E01', ..., '84' -> 'G12'.
well_map = {
    str(position): f"{'ABCDEFGH'[(position - 1) // 12]}{(position - 1) % 12 + 1:02d}"
    for position in range(49, 85)
}

# Dose string per plate column 01..12, identical for rows E, F and G.
column_doses = ['1000', '333', '111', '37', '12', '4.1', '1.4',
                '0.457', '0.152', '0.051', '0.017', '0.0056']
dose_dict = {
    f"{row}{col:02d}": dose
    for row in "EFG"
    for col, dose in enumerate(column_doses, start=1)
}

# Channel digit from the file name -> label.
channel_dict = {
    "0": "beta2 (b2AR) adrenergic receptor",
    "1": "arrestin-GFP",
}

# A position missing from well_map would become NaN under Series.map and then
# crash re.findall with a TypeError; fail early with a clear message instead.
unknown_positions = sorted(set(wellnumber) - set(well_map))
if unknown_positions:
    raise ValueError(f"Plate positions missing from well_map: {unknown_positions}")

# Continue in pandas; drop the helper 'channel' column used for the filter so
# it can be re-added in its final position below.
x = x.to_pandas_df().drop(columns=['channel'])
n_rows = len(x)
x['path'] = np.full(n_rows, path)
x['well'] = [well_map[position] for position in wellnumber]
# The exported wellnumber is the digit part of the well id, e.g. 'E01' -> '01'.
x['wellnumber'] = [''.join(re.findall(r'\d', well_id)) for well_id in x['well']]
x['site'] = np.array(site)
x['z_position'] = np.full(n_rows, None)
x['channel'] = np.array(channel)
x['channel'] = x['channel'].map(channel_dict)

x['organism'] = np.full(n_rows, 'human')
x['cell'] = np.full(n_rows, 'U2OS')
x['perturbation'] = np.full(n_rows, 'isoproterenol')
x['control_type'] = np.full(n_rows, None)
x['modality'] = np.full(n_rows, "Optical Imaging")
x['experimental_technique'] = np.full(n_rows, 'ligand stimulation assay')

x['dose'] = x['well'].map(dose_dict)
x = vaex.from_pandas(x)

# exist_ok=True makes re-running the cell safe without a separate exists() check.
outpath = inp_dir / 'combined'
outpath.mkdir(parents=True, exist_ok=True)

x.export_feather(outpath.joinpath(f"{dataset}_combined.arrow"))

