In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import flowkit as fk


# Seed NumPy's global RNG so the np.random.choice shuffle used for the
# train/test split gives the same result on every run.
np.random.seed(777)

In [2]:
# Show the installed FlowKit version so readers can verify they are running
# the same version/API as this notebook.
fk.__version__

'1.0.1'

In [3]:
# Directory holding the raw FCS files, and directory the pickled datasets are
# written to.
# NOTE(review): these are machine-specific absolute paths; they need to be
# adjusted before the notebook can run elsewhere.
rawdatadir = '/Users/ashesh.ashesh/Documents/PhD/mbldata_solvation/data/'
outputdatadir = '/Users/ashesh.ashesh/Documents/PhD/mbldata_solvation/data_with_tiff/'

# Full path of the FCS file for each of the two sample classes.
c6818_fcs, emiliana_fcs = (
    os.path.join(rawdatadir, fcs_name)
    for fcs_name in ("C.6818.fcs", "Emiliana_huxley.fcs")
)

In [4]:
# Parse both FCS files into FlowKit samples.
c6818_sample, emiliana_sample = fk.Sample(c6818_fcs), fk.Sample(emiliana_fcs)

# Echo both samples as the cell output.
c6818_sample, emiliana_sample

(Sample(v3.2, C.6818.fcs, 450 channels, 299976 events),
 Sample(v3.2, Emiliana_huxley.fcs, 450 channels, 126206 events))

In [5]:
# Echo the C.6818 sample summary on its own.
c6818_sample

Sample(v3.2, C.6818.fcs, 450 channels, 299976 events)

In [6]:
# emiliana_sample.subsample_events()
# _df = emiliana_sample.as_dataframe(source="raw", subsample=True)
# _df.to_csv("./emiliana_subsamples.csv", index=False)

# emiliana_sample.as_dataframe(source="raw").to_csv("./emiliana.csv", index=False)

In [7]:
# Export the raw (unprocessed) events of each sample as a DataFrame.
df_c6818, df_emiliana = (
    c6818_sample.as_dataframe(source="raw"),
    emiliana_sample.as_dataframe(source="raw"),
)

In [8]:
# Summarize the C.6818 frame: number of rows and columns, dtypes, memory use.
df_c6818.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299976 entries, 0 to 299975
Columns: 450 entries, ('FSC-A', 'FSC-A') to ('WaveformPresent', '')
dtypes: float64(450)
memory usage: 1.0 GB


In [9]:
import os

def has_tif(datadir):
    """Collect the numeric suffixes of all TIFF files one level below `datadir`.

    Every entry of `datadir` is listed as a directory. Each file inside must
    end with ".tiff"; the text after the last underscore of its stem is
    converted to an int and collected.

    Args:
        datadir: Path of a directory whose entries are sub-directories of
            TIFF files.

    Returns:
        A list of ints, one per TIFF file, in directory-listing order.

    Raises:
        ValueError: If a sub-directory holds a file that does not end with
            ".tiff", or if a file's suffix cannot be parsed as an int.
    """
    suffix = ".tiff"
    tiff_numbers = []
    for subdir_name in os.listdir(datadir):
        subdir_path = os.path.join(datadir, subdir_name)
        for file_name in os.listdir(subdir_path):
            # Anything that is not a TIFF is treated as a corrupted export.
            if not file_name.endswith(suffix):
                raise ValueError("Something other than tiff file found in {}".format(subdir_path))
            stem = file_name[:-len(suffix)]
            tiff_numbers.append(int(stem.rsplit('_', 1)[-1]))
    return tiff_numbers

In [10]:
# Image export folder of each sample. has_tif returns the numeric suffix of
# every TIFF file found one level below the given folder.
c6818_image_dir = '/Volumes/T7 T/10_sorting/data/230907/C.6818/C.6818_images_20230824_1235_19'
emiliana_image_dir = '/Volumes/T7 T/10_sorting/data/230907/Emiliana_huxley/Emiliana_huxley_images_20230824_1235_33/'

hastifentries_c6818 = has_tif(c6818_image_dir)
hastifentries_Emiliana = has_tif(emiliana_image_dir)


In [11]:
# is_all_numeric = df_c6818.applymap(np.isreal).all(1)
# is_all_numeric.sum()

In [12]:
# Summarize the Emiliana frame: number of rows and columns, dtypes, memory use.
df_emiliana.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126206 entries, 0 to 126205
Columns: 450 entries, ('FSC-A', 'FSC-A') to ('WaveformPresent', '')
dtypes: float64(450)
memory usage: 433.3 MB


In [13]:
# is_all_numeric = df_emiliana.applymap(np.isreal).all(1)
# is_all_numeric.sum() == len(df_emiliana)

In [14]:
# Use the channel (PnN) labels as feature names. Both samples must expose the
# same labels, otherwise their columns could not be stacked into one matrix.
feature_names = c6818_sample.pnn_labels
assert emiliana_sample.pnn_labels == c6818_sample.pnn_labels
print(feature_names)

['FSC-A', 'FSC-H', 'FSC-W', 'SSC (Violet)-A', 'SSC (Violet)-H', 'SSC (Violet)-W', 'LightLoss (Imaging)-A', 'LightLoss (Imaging)-H', 'LightLoss (Imaging)-W', 'SSC (Imaging)-A', 'SSC (Imaging)-H', 'SSC (Imaging)-W', 'LightLoss (Violet)-A', 'LightLoss (Violet)-H', 'LightLoss (Violet)-W', 'Size (LightLoss (Imaging))', 'Size (FSC)', 'Size (SSC (Imaging))', 'Size (Green*)', 'Size (Orange*)', 'Size (Red*)', 'Max Intensity (LightLoss (Imaging))', 'Max Intensity (FSC)', 'Max Intensity (SSC (Imaging))', 'Max Intensity (Green*)', 'Max Intensity (Orange*)', 'Max Intensity (Red*)', 'Long Axis Moment (LightLoss (Imaging))', 'Long Axis Moment (FSC)', 'Long Axis Moment (SSC (Imaging))', 'Long Axis Moment (Green*)', 'Long Axis Moment (Orange*)', 'Long Axis Moment (Red*)', 'Short Axis Moment (LightLoss (Imaging))', 'Short Axis Moment (FSC)', 'Short Axis Moment (SSC (Imaging))', 'Short Axis Moment (Green*)', 'Short Axis Moment (Orange*)', 'Short Axis Moment (Red*)', 'Center of Mass (Y) (LightLoss (Imagin

In [15]:
# Time and DropId are distinctive for each sample class, so they are removed
# from both the feature-name list and the two frames.
drop_cols = ["Time", "DropId"]
feature_names = [label for label in feature_names if label not in drop_cols]

# The frames have two-level column labels; the names are matched on level 0.
for frame in (df_c6818, df_emiliana):
    frame.drop(columns=drop_cols, level=0, inplace=True)

len(feature_names)

448

In [16]:
# Re-check the C.6818 frame now that the Time and DropId columns are dropped.
df_c6818.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299976 entries, 0 to 299975
Columns: 448 entries, ('FSC-A', 'FSC-A') to ('WaveformPresent', '')
dtypes: float64(448)
memory usage: 1.0 GB


In [17]:
# Number of events taken per class: the size of the smaller sample, so both
# classes contribute equally.
num_class_samples = min(map(len, (df_emiliana, df_c6818)))
num_class_samples

126206

## Adding the HasTiff flag (whether an event has a TIFF image)

In [18]:
# Add a HasTiff column: 1 for events that have an exported TIFF image, else 0.
#
# The flag must be written with a single .loc[rows, column] call. The chained
# form `df.loc[rows]['HasTiff'] = 1` first materializes a copy of the selected
# rows and then assigns into that temporary copy, so the frame itself would
# keep HasTiff == 0 everywhere.
#
# NOTE(review): this assumes the numbers parsed from the TIFF file names are
# row labels of the frames (0-based event indices) - confirm against the
# instrument's export naming.
df_c6818['HasTiff'] = 0
df_c6818.loc[hastifentries_c6818, 'HasTiff'] = 1
df_emiliana['HasTiff'] = 0
df_emiliana.loc[hastifentries_Emiliana, 'HasTiff'] = 1

In [19]:
# Stack the same number of events from each class into one matrix:
# C.6818 rows first, then Emiliana rows.
#
# np.float64 is used instead of the np.float_ alias, which was removed in
# NumPy 2.0; the resulting dtype is the same. Rows are sliced before the
# conversion so only the events that are kept get copied into the array.
all_data = np.vstack((
    df_c6818.iloc[:num_class_samples].to_numpy(dtype=np.float64),
    df_emiliana.iloc[:num_class_samples].to_numpy(dtype=np.float64)
))

num_data = len(all_data)

all_data.shape

(252412, 449)

In [20]:
# class labels: the first num_class_samples rows are labelled 1 and the
# remaining rows 0 (all_data stacks the C.6818 events first, then Emiliana)
labels = (np.arange(num_data) < num_class_samples).astype(int)

print((labels == 0).sum(), (labels == 1).sum())

126206 126206


In [21]:
# Build the train / test datasets: draw one random ordering of all row
# indices, then give the first 85% to train and the rest to test.
num_train = int(0.85 * num_data)
shuffled_indices = np.random.choice(num_data, size=num_data, replace=False)
train_indices, test_indices = shuffled_indices[:num_train], shuffled_indices[num_train:]

train_data, train_labels = all_data[train_indices], labels[train_indices]
test_data, test_labels = all_data[test_indices], labels[test_indices]

# Sanity checks: split shapes, classes present in test, class balance in train.
print(train_data.shape, test_data.shape)
print(np.unique(test_labels))
print((train_labels == 0).sum(), (train_labels == 1).sum())

(214550, 449) (37862, 449)
[0 1]
107242 107308


In [22]:
# save datasets
#
# The data matrices have one more column than `feature_names`: the last column
# is the HasTiff flag that was appended to the frames before stacking. It is
# also stored on its own under "*_has_tiff".
#
# The original dict had the bare string 'train_has_tiff' directly followed by
# "feature_names" with no comma; Python concatenates adjacent string literals,
# so the feature names were saved under the key 'train_has_tifffeature_names'
# and "feature_names" was missing from the train file.
with open(os.path.join(outputdatadir, "train_ds.bin"), mode="wb") as f:
    pickle.dump({
        "train_data": train_data,
        "train_labels": train_labels,
        "train_has_tiff": train_data[:, -1],
        "feature_names": feature_names
    }, f)

with open(os.path.join(outputdatadir, "test_ds.bin"), mode="wb") as f:
    pickle.dump({
        "test_data": test_data,
        "test_labels": test_labels,
        "test_has_tiff": test_data[:, -1],
        "feature_names": feature_names
    }, f)

In [23]:
# Report the first Emiliana metadata key that the C.6818 sample does not have.
meta_keys = list(c6818_sample.get_metadata().keys())
for meta_key in emiliana_sample.get_metadata():
    if meta_key in meta_keys:
        continue
    print(f"{meta_key} is not in other sample meta data!")
    break

In [24]:
# Group column positions by the "kind" recorded in the sample metadata.
COL_TYPE_SCATTER = "SCATTER"
COL_TYPE_COLOR = "COLOR"
COL_TYPE_TIME = "TIME"

scatter_cols = []
color_cols = []
time_cols = []
other_cols = []

# Upper-cased kind -> list that collects the matching column positions.
cols_by_type = {
    COL_TYPE_SCATTER: scatter_cols,
    COL_TYPE_COLOR: color_cols,
    COL_TYPE_TIME: time_cols,
}

meta_dict = c6818_sample.get_metadata()

# NOTE(review): `col_idx` is a 0-based position in `feature_names` after Time
# and DropId were removed, while the metadata key is built as "p{col_idx}kind".
# If those keys are numbered from 1 over the original channel order, the kind
# looked up here does not belong to column `col_idx` - confirm the numbering.
for col_idx in range(len(feature_names)):
    col_type = meta_dict.get(f"p{col_idx}kind", None)
    if col_type is None:
        # No kind recorded under this key.
        other_cols.append(col_idx)
        continue
    # A kind other than the three above is not collected anywhere.
    target = cols_by_type.get(col_type.upper())
    if target is not None:
        target.append(col_idx)

len_all = [len(cols) for cols in (scatter_cols, color_cols, time_cols, other_cols)]
print(len_all, sum(len_all))


[20, 324, 1, 103] 448


In [25]:
# Positions of the features whose name contains "Imaging".
imaging_cols = [idx for idx, name in enumerate(feature_names) if "Imaging" in name]

len(imaging_cols)

37

In [26]:
# Create a dataset from the imaging columns only, stacking the same number of
# events per class (C.6818 rows first, then Emiliana rows).
#
# np.float64 is used instead of the np.float_ alias, which was removed in
# NumPy 2.0; the resulting dtype is the same. Rows and columns are selected in
# one .iloc call so only the kept block is converted to an array.
all_imaging_data = np.vstack((
    df_c6818.iloc[:num_class_samples, imaging_cols].to_numpy(dtype=np.float64),
    df_emiliana.iloc[:num_class_samples, imaging_cols].to_numpy(dtype=np.float64)
))

all_imaging_data.shape

(252412, 37)

In [27]:
# Reuse the train/test index split from the full dataset, so the imaging-only
# datasets hold the same events in the same order.
train_imaging_data, train_imaging_labels = all_imaging_data[train_indices], labels[train_indices]
test_imaging_data, test_imaging_labels = all_imaging_data[test_indices], labels[test_indices]

print(train_imaging_data.shape, test_imaging_data.shape)

(214550, 37) (37862, 37)


In [28]:
# Save the imaging-only datasets, together with the names of the selected
# imaging features.
import os

imaging_feature_names = np.array(feature_names)[imaging_cols].tolist()

train_imaging_path = os.path.join(outputdatadir, "train_imaging_ds.bin")
with open(train_imaging_path, mode="wb") as f:
    train_imaging_payload = {
        "train_data": train_imaging_data,
        "train_labels": train_imaging_labels,
        "feature_names": imaging_feature_names,
    }
    pickle.dump(train_imaging_payload, f)

test_imaging_path = os.path.join(outputdatadir, "test_imaging_ds.bin")
with open(test_imaging_path, mode="wb") as f:
    test_imaging_payload = {
        "test_data": test_imaging_data,
        "test_labels": test_imaging_labels,
        "feature_names": imaging_feature_names,
    }
    pickle.dump(test_imaging_payload, f)

In [29]:
# List the files that were written to the output directory.
!ls "$outputdatadir"

test_ds.bin          train_ds.bin
test_imaging_ds.bin  train_imaging_ds.bin
