#### Boilerplate

In [None]:
# Environment setup: change into the CellModeller checkout and install it into
# the kernel's environment in editable mode.
# NOTE(review): `%cd` takes a relative path, so this cell assumes the notebook
# starts in the directory that contains `CellModeller-ingallslab`; re-running
# the cell after the directory change will likely fail — confirm before Run All.
%cd CellModeller-ingallslab
%pip install -e . --use-pep517
# Optional CellProfiler setup, currently disabled:
#%cd CellProfilerAnalysis/
#%pip install -e . --use-pep517
#%pip install CellProfiler


### The cell below converts every pickle file in a directory into a NumPy array, then stacks those arrays into one final NumPy array
The final NumPy array has shape `[num_pickle_files, num_features, 2]`, where the feature index runs from 0 to 17: `arr[file_idx, feature_idx, 0]` contains the feature name (e.g. time, radius) and `arr[file_idx, feature_idx, 1]` contains the value of that feature.

In [2]:

import os
import numpy as np
import torch


def make_numpy_array(pickle_to_dict):
    """Flatten one unpickled simulation frame into a 2-column object array.

    Parameters
    ----------
    pickle_to_dict : mapping
        Must provide the keys ``"stepNum"``, ``"lineage"`` and
        ``"cellStates"``. ``"cellStates"`` maps a cell key to an object that
        exposes every attribute named in ``per_cell_fields`` below.

    Returns
    -------
    numpy.ndarray
        Object array with one row per feature (18 rows: ``stepNum``,
        ``lineage`` and the 16 per-cell fields, in that order). Column 0 is
        the feature name, column 1 is the feature's values wrapped in
        ``np.array``; per-cell features hold one entry per cell.
    """
    # Per-cell attributes, in the order they appear in the output rows.
    per_cell_fields = (
        'id', 'label', 'cellType', 'divideFlag', 'cellAge', 'growthRate',
        'startVol', 'targetVol', 'pos', 'time', 'radius', 'length', 'dir',
        'ends', 'strainRate', 'strainRate_rolling',
    )

    # Frame-level entries first, then an empty list per per-cell attribute.
    collected = {
        "stepNum": pickle_to_dict["stepNum"],
        'lineage': pickle_to_dict['lineage'],
    }
    for field in per_cell_fields:
        collected[field] = []

    # Gather each attribute of each cell, preserving the cellStates order.
    for cell_state in pickle_to_dict['cellStates'].values():
        for field in per_cell_fields:
            collected[field].append(getattr(cell_state, field))

    # dtype=object keeps the (name, values) pairs intact even though the
    # value arrays differ in shape and dtype from feature to feature.
    rows = [(name, np.array(values)) for name, values in collected.items()]
    return np.array(rows, dtype=object)


# NOTE(review): machine-specific absolute path — point this at your own
# simulation output directory before running the notebook.
pickle_directory = '/home/stormageddon/MITACS/test/simulation/adh_0.01_1/'

# os.listdir() returns entries in arbitrary order; sorting makes the load
# order (and therefore the row order of combined_array) reproducible.
pickle_files = sorted(
    f for f in os.listdir(pickle_directory) if f.endswith('.pickle')
)

# Fail early with a clear message: an empty list would otherwise produce a
# shape-(0,) array that breaks the 3-D indexing used by the later cells.
if not pickle_files:
    raise FileNotFoundError(
        f"No .pickle files found in {pickle_directory!r}"
    )

# SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
# execute code. Only load simulation files that you produced or trust.
frame_arrays = []
for pickle_file in pickle_files:
    pickle_path = os.path.join(pickle_directory, pickle_file)
    pickle_to_dict = np.load(pickle_path, allow_pickle=True)
    arr = make_numpy_array(pickle_to_dict)
    frame_arrays.append(arr)

# Stack the per-file (feature, 2) object arrays along a new leading axis.
combined_array = np.array(frame_arrays)

print(combined_array.shape)




(20, 18, 2)


### Now, let's sort the frames by `stepNum` to get a uniform, sequential time series

In [3]:
# Feature row 0 of every frame is 'stepNum' and column 1 holds its value,
# so sorting on that slice puts the frames in step order.
step_numbers = combined_array[:, 0, 1]
indices = np.argsort(step_numbers)
sorted_arr = combined_array[indices]

# Summarise the first frame: feature index, feature name, dtype of its values.
print("idx Feature  Dtype")
for j, (feature, values) in enumerate(sorted_arr[0]):
    print(j, feature, values.dtype)


idx Feature  Dtype
0 stepNum int64
1 lineage object
2 id int64
3 label int64
4 cellType int64
5 divideFlag bool
6 cellAge int64
7 growthRate float64
8 startVol float32
9 targetVol float64
10 pos float32
11 time float64
12 radius float32
13 length float32
14 dir float32
15 ends float32
16 strainRate float32
17 strainRate_rolling float64


### The cell below prints the whole array. I have uploaded `inputs_raw.txt`, which contains the full output

In [4]:
# Dump every feature of every frame as "name | values | dtype", followed by
# a blank line after each entry.
for i, frame in enumerate(sorted_arr):
    for j, (feature, values) in enumerate(frame):
        print(f"{feature} | {values} | {values.dtype}", end="\n\n")

stepNum | 0 | int64

lineage | {} | object

id | [1] | int64

label | [1] | int64

cellType | [0] | int64

divideFlag | [False] | bool

cellAge | [1] | int64

growthRate | [1.6] | float64

startVol | [3.5] | float32

targetVol | [7.7] | float64

pos | [[0. 0. 0.]] | float32

time | [0.] | float64

radius | [0.5] | float32

length | [3.5] | float32

dir | [[1. 0. 0.]] | float32

ends | [[[-1.75  0.    0.  ]
  [ 1.75  0.    0.  ]]] | float32

strainRate | [0.] | float32

strainRate_rolling | [nan] | float64

stepNum | 10 | int64

lineage | {} | object

id | [1] | int64

label | [1] | int64

cellType | [0] | int64

divideFlag | [False] | bool

cellAge | [11] | int64

growthRate | [1.6] | float64

startVol | [3.5] | float32

targetVol | [7.7] | float64

pos | [[0. 0. 0.]] | float32

time | [0.25] | float64

radius | [0.5] | float32

length | [5.1808553] | float32

dir | [[1. 0. 0.]] | float32

ends | [[[-2.5904276  0.         0.       ]
  [ 2.5904276  0.         0.       ]]] | float32

str

#### Now, we see that the input data has some NaN values in the `strainRate_rolling` feature, but PyTorch will handle this. We also have Boolean values in the `divideFlag` feature, which will also be handled by PyTorch.
#### The problem is `lineage`, which is a NumPy `object` array; it is the sole feature preventing us from converting our NumPy array to a PyTorch tensor. Let us explore the `lineage` feature in the cell below.

In [5]:
# Feature row 1 of every frame is 'lineage'; column 1 holds its value.
lineages = sorted_arr[:, 1, 1]
# Show the lineage mapping of each timestep, one per line.
for i, lineage in enumerate(lineages):
    print(lineage)




{}
{}
{}
{2: 1, 3: 1}
{2: 1, 3: 1}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 6, 13: 6, 14: 7, 15: 7}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 6, 13: 6, 14: 7, 15: 7}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 6, 13: 6, 14: 7, 15: 7, 16: 8, 17: 8, 18: 9, 19: 9, 20: 10, 21: 10, 22: 12, 23: 12, 24: 11, 25: 11, 26: 13, 27: 13, 28: 14, 29: 14, 30: 15, 31: 15}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 6, 13: 6, 14: 7, 15: 7, 16: 8, 17: 8, 18: 9, 19: 9, 20: 10, 21: 10, 22: 12, 23: 12, 24: 11, 25: 11, 26: 13, 27: 13, 28: 14, 29: 14, 30: 15, 31: 15}
{2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: 3, 8: 4, 9: 4, 10: 5, 11: 5, 12: 6, 13: 6, 14: 7, 15: 7, 16: 8, 17: 8, 18: 9, 19: 9, 20: 10, 21: 10, 22: 12, 23: 12, 24: 11, 25: 11, 26: 13, 27: 13, 28: 14, 29: 14, 30: 15, 31: 15, 32: 16, 33: 16, 34: 17, 35: 17, 36: 18, 37: 18, 38: 22, 39: 22, 

#### This is my progress today (15th May, Monday)
#### The Todo plan for tomorrow will be:
- [ ] Work on numerically representing Lineage to serve as input to the model
- [ ] Finish converting the pickle files to a pytorch format, finalize the shape of the input data to the model
- [ ] Get started coding the model in Pytorch


#### Concerns @Ati:
#### 2 Do we need the Lineage data? The Lineage data seems to be serial numbers for the cells. I observed a pattern in which one cell is the parent of two cells (e.g. [1:2, 1:3], [2:4, 2:5], [3:6, 3:7] and so on). So my question is: do we need this as input to the model, given that it would be meaningless (the model would not be able to learn anything useful from this feature vector)? I have the same doubt about the `label` and `cellType` columns.


#### Save Feature_Names for reference

In [6]:
# Feature row 1 is 'lineage' (see the index/name table printed earlier);
# remove it from every frame.
LINEAGE_FEATURE_IDX = 1
new_arr = np.delete(sorted_arr, LINEAGE_FEATURE_IDX, axis=1)

# Column 0 of any frame lists the remaining feature names; take frame 0's.
feature_names = new_arr[0][:, 0]
feature_names


array(['stepNum', 'id', 'label', 'cellType', 'divideFlag', 'cellAge',
       'growthRate', 'startVol', 'targetVol', 'pos', 'time', 'radius',
       'length', 'dir', 'ends', 'strainRate', 'strainRate_rolling'],
      dtype=object)

#### Delete the feature-name column from the original array (the model can't take strings as input, and does not need them either)

In [7]:
# Along the last axis, index 0 is the feature name and index 1 its values;
# removing index 0 leaves only the values (last axis shrinks to length 1).
final_arr = np.delete(new_arr, obj=0, axis=2)

In [8]:
# Print every remaining feature of every frame as "name | values | dtype".
# Each entry is a length-1 object array that wraps the feature's value array.
for i, frame in enumerate(final_arr):
    for j, wrapped_values in enumerate(frame):
        print(f"{feature_names[j]} | {wrapped_values} | {wrapped_values.dtype}")

stepNum | [array(0)] | object
id | [array([1])] | object
label | [array([1])] | object
cellType | [array([0])] | object
divideFlag | [array([False])] | object
cellAge | [array([1])] | object
growthRate | [array([1.6])] | object
startVol | [array([3.5], dtype=float32)] | object
targetVol | [array([7.7])] | object
pos | [array([[0., 0., 0.]], dtype=float32)] | object
time | [array([0.])] | object
radius | [array([0.5], dtype=float32)] | object
length | [array([3.5], dtype=float32)] | object
dir | [array([[1., 0., 0.]], dtype=float32)] | object
ends | [array([[[-1.75,  0.  ,  0.  ],
         [ 1.75,  0.  ,  0.  ]]], dtype=float32)] | object
strainRate | [array([0.], dtype=float32)] | object
strainRate_rolling | [array([nan])] | object
stepNum | [array(10)] | object
id | [array([1])] | object
label | [array([1])] | object
cellType | [array([0])] | object
divideFlag | [array([False])] | object
cellAge | [array([11])] | object
growthRate | [array([1.6])] | object
startVol | [array([3.5], dty

#### Convert Boolean to float, and Drop NaN values

In [9]:
# Resolve the rows by feature name instead of hard-coded positions (4 and 16),
# so this cell keeps working if the feature order or the dropped columns change.
feature_index = {name: idx for idx, name in enumerate(feature_names)}
divide_flag_idx = feature_index['divideFlag']
strain_rolling_idx = feature_index['strainRate_rolling']

for i in range(final_arr.shape[0]):
    # Boolean to float
    final_arr[i][divide_flag_idx][0] = (
        final_arr[i][divide_flag_idx][0].astype(np.float64)
    )

    # Drop NaN values from the rolling strain rate.
    # NOTE(review): this shortens the array relative to the other per-cell
    # features of the same frame, so its entries no longer line up with them
    # by position — confirm this is intended before feeding the model.
    rolling = final_arr[i][strain_rolling_idx][0]
    final_arr[i][strain_rolling_idx][0] = rolling[~np.isnan(rolling)]
