# 0.1.0 Load Data
* Codex Data obtained from: http://welikesharingdata.blob.core.windows.net/forshare/index.html
* Manuscript: https://www.cell.com/cell/pdf/S0092-8674(18)30904-8.pdf

In [7]:
import pandas as pd
# Container for loaded tables, keyed by a short name (the expression table is stored under 'exp').
df = {}

In [30]:
import bqplot

In [2]:
from glob import glob

## Check available Files

In [33]:
# List the .txt files available under ../data/big_data.
glob('../data/big_data/*.txt')

['../data/big_data/channelNames_BALBcMRLdataset.txt']

In [34]:
# List the .csv files available under ../data/big_data.
glob('../data/big_data/*.csv')

['../data/big_data/CODEX_MRLdataset_neighborhood_graph.csv',
 '../data/big_data/BALBcMRLdataset_exposuretimes.csv',
 '../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv']

### Expression Data
See http://welikesharingdata.blob.core.windows.net/forshare/index.html

* CSV file contains mean marker intensities and X, Y, Z coordinates for each cell relative to the top left corner of its tile.
* Tile IDs correspond to tile images sorted in alphanumeric order
* Size parameter corresponds to the cell object sizes in voxels
* "Imaging phenotype cluster ID" column specifies the phenotypic cluster identity as determined by X-shift
* "niche cluster ID" column specifies the i-niche cluster determined by K-means (K=100)
* Neighborhood graph files describe which pairs of cells are adjacent to one another

In [36]:
%%time
# Read the per-cell expression CSV into the table dict under 'exp', then report (rows, columns).
expression_csv = '../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv'
df['exp'] = pd.read_csv(expression_csv)
print(df['exp'].shape)

(734101, 38)
CPU times: user 8.07 s, sys: 512 ms, total: 8.58 s
Wall time: 8.29 s


In [37]:
# Preview the first rows of the expression table.
df['exp'].head()

Unnamed: 0,Imaging phenotype cluster ID,CD45,Ly6C,TCR,Ly6G,CD19,CD169,CD106,CD3,CD1632,...,CD44,NKp46,X.X,Y.Y,Z.Z,MHCII,blank_Cy3_cyc15,blank_Cy5_cyc15,sample_Xtile_Ytile,niche cluster ID
0,9600,1577.675415,-154.301758,130.692184,-4.168493,560.691345,-504.231476,854.670105,-631.294189,385.935242,...,422.408691,515.130066,10,70,13,6712.812988,1665.967896,398.348389,BALBc-3_X05_Y03,32.0
1,9600,1017.83844,-93.069397,144.076584,40.010998,885.59552,-391.357544,62.764454,-474.201172,-469.634583,...,448.70166,171.88031,1000,294,12,2024.678711,1287.959229,421.991425,BALBc-3_X01_Y02,99.0
2,9600,5978.459961,-330.099365,139.631744,-82.840302,1747.897583,-395.50882,954.326782,-1026.204468,3744.718262,...,2229.804443,512.220764,1003,107,8,8647.193359,2817.173828,709.545105,BALBc-2_X05_Y04,74.0
3,9600,6119.109375,-54.384808,-768.871704,25.625927,1065.31189,-485.535431,538.404175,-611.836426,865.84259,...,665.720459,351.108246,1003,113,13,4838.463379,1646.660278,408.52359,BALBc-2_X03_Y04,98.0
4,9600,6272.474609,-235.512405,74.058075,-101.729919,1186.295044,-782.744995,1261.625366,-915.565552,1361.536011,...,2065.742676,259.003235,1003,148,9,5092.891602,2161.109131,713.416199,BALBc-2_X02_Y01,71.0


In [38]:
# Keep the expression table's column names as a plain list for later iteration, and display them.
cols = df['exp'].columns.tolist()
cols

['Imaging phenotype cluster ID',
 'CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46',
 'X.X',
 'Y.Y',
 'Z.Z',
 'MHCII',
 'blank_Cy3_cyc15',
 'blank_Cy5_cyc15',
 'sample_Xtile_Ytile',
 'niche cluster ID']

In [46]:
# Map every column name to the list of distinct values found in that column.
# Series.unique() keeps NaN as its own entry, so a column with missing values
# reports one more distinct value than it has real categories.
unique_dict = {name: list(df['exp'][name].unique()) for name in cols}

# Report the number of distinct values per column, in column order.
for name in cols:
    print(name, len(unique_dict[name]))

Imaging phenotype cluster ID 58
CD45 726877
Ly6C 731257
TCR 731035
Ly6G 729442
CD19 726981
CD169 730498
CD106 727870
CD3 730526
CD1632 728506
CD8a 728773
CD90 728172
F480 731091
CD11c 725943
Ter119 728879
CD11b 725897
IgD 730293
CD27 727817
CD5 730295
CD79b 727003
CD71 729364
CD31 725039
CD4 730112
IgM 727925
B220 729995
ERTR7 729781
CD35 728492
CD2135 729290
CD44 727003
NKp46 723719
X.X 1342
Y.Y 1006
Z.Z 15
MHCII 729269
blank_Cy3_cyc15 721818
blank_Cy5_cyc15 720065
sample_Xtile_Ytile 565
niche cluster ID 101


### Select Single Image Tile
* BALBc: normal tissue
* MRL/lpr: spleen from animals with systemic autoimmune disease

Start with: 'BALBc-1_X01_Y01'

In [53]:
# Display the distinct values of the 'sample_Xtile_Ytile' column collected above.
unique_dict['sample_Xtile_Ytile']

['BALBc-3_X05_Y03',
 'BALBc-3_X01_Y02',
 'BALBc-2_X05_Y04',
 'BALBc-2_X03_Y04',
 'BALBc-2_X02_Y01',
 'BALBc-3_X06_Y02',
 'BALBc-2_X02_Y04',
 'BALBc-1_X01_Y05',
 'BALBc-2_X05_Y06',
 'BALBc-2_X04_Y09',
 'BALBc-1_X02_Y05',
 'BALBc-3_X02_Y06',
 'BALBc-2_X05_Y01',
 'BALBc-1_X02_Y09',
 'BALBc-1_X01_Y04',
 'BALBc-3_X06_Y05',
 'BALBc-2_X05_Y03',
 'BALBc-1_X03_Y01',
 'BALBc-1_X03_Y09',
 'BALBc-1_X04_Y06',
 'BALBc-2_X02_Y05',
 'BALBc-1_X07_Y06',
 'BALBc-2_X01_Y09',
 'BALBc-1_X03_Y04',
 'BALBc-1_X07_Y03',
 'BALBc-2_X05_Y05',
 'BALBc-1_X04_Y05',
 'BALBc-2_X02_Y08',
 'BALBc-1_X05_Y01',
 'BALBc-2_X01_Y06',
 'BALBc-3_X03_Y09',
 'BALBc-2_X02_Y09',
 'BALBc-3_X06_Y03',
 'BALBc-1_X06_Y06',
 'BALBc-1_X03_Y08',
 'BALBc-1_X01_Y09',
 'BALBc-2_X03_Y06',
 'BALBc-3_X02_Y03',
 'BALBc-2_X02_Y02',
 'BALBc-2_X06_Y08',
 'BALBc-3_X02_Y02',
 'BALBc-2_X03_Y07',
 'BALBc-3_X05_Y07',
 'BALBc-1_X01_Y02',
 'BALBc-3_X04_Y03',
 'BALBc-2_X03_Y01',
 'BALBc-3_X01_Y06',
 'BALBc-3_X04_Y01',
 'BALBc-2_X06_Y04',
 'BALBc-3_X03_Y02',


In [49]:
# Extract the numeric X and Y tile indices from labels shaped like
# '<sample>_X<nn>_Y<nn>' and report the largest index seen on each axis.
x_id_list = []
y_id_list = []
for tile_label in unique_dict['sample_Xtile_Ytile']:
    fields = tile_label.split('_')
    # fields[1] carries the 'X..' part, fields[2] the 'Y..' part.
    x_id_list.append(int(fields[1].replace('X', '')))
    y_id_list.append(int(fields[2].replace('Y', '')))

print('x max', max(x_id_list))
print('y max', max(y_id_list))

x max 7
y max 9


In [None]:
for 