# 0.1.0 Load Data
* Codex Data obtained from: http://welikesharingdata.blob.core.windows.net/forshare/index.html
* Manuscript: https://www.cell.com/cell/pdf/S0092-8674(18)30904-8.pdf

In [1]:
import pandas as pd
# Registry of named DataFrames (despite the name, `df` is a plain dict);
# later cells store frames under keys such as 'exp' and 'tile'.
df = dict()

In [2]:
from clustergrammer2 import net

>> clustergrammer2 backend version 0.4.0


In [42]:
import ipywidgets as widgets
import numpy as np
from bqplot import pyplot as plt
import bqplot

In [4]:
from glob import glob

## Check available Files

In [5]:
# List the plain-text files present in the big_data directory.
glob(pathname='../data/big_data/*.txt')

['../data/big_data/channelNames_BALBcMRLdataset.txt']

In [6]:
# List the CSV files present in the big_data directory.
glob(pathname='../data/big_data/*.csv')

['../data/big_data/CODEX_MRLdataset_neighborhood_graph.csv',
 '../data/big_data/BALBcMRLdataset_exposuretimes.csv',
 '../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv']

### Expression Data
See http://welikesharingdata.blob.core.windows.net/forshare/index.html

* CSV file contains mean marker intensities and X, Y, Z coordinates for each cell relative to the top left corner of its tile.
* Tile IDs correspond to tile images sorted in the alphanumeric order
* Size parameter corresponds to the cell object sizes in voxels
* "imaging phenotype Cluster ID" column specifies the phenotypic cluster identity as determined by X-shift
* "niche Cluster ID" column specifies the i-niche cluster determined by K-means (K=100)
* The neighborhood graph file describes which pairs of cells are adjacent to one another

In [7]:
%%time
df['exp'] = pd.read_csv('../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv')
new_rows = ['C-' + str(x) for x in df['exp'].index.tolist()]
df['exp'].index = new_rows
print(df['exp'].shape)

(734101, 38)
CPU times: user 8.4 s, sys: 431 ms, total: 8.83 s
Wall time: 8.11 s


In [8]:
# Preview the first five rows of the full expression table.
df['exp'].head(n=5)

Unnamed: 0,Imaging phenotype cluster ID,CD45,Ly6C,TCR,Ly6G,CD19,CD169,CD106,CD3,CD1632,...,CD44,NKp46,X.X,Y.Y,Z.Z,MHCII,blank_Cy3_cyc15,blank_Cy5_cyc15,sample_Xtile_Ytile,niche cluster ID
C-0,9600,1577.675415,-154.301758,130.692184,-4.168493,560.691345,-504.231476,854.670105,-631.294189,385.935242,...,422.408691,515.130066,10,70,13,6712.812988,1665.967896,398.348389,BALBc-3_X05_Y03,32.0
C-1,9600,1017.83844,-93.069397,144.076584,40.010998,885.59552,-391.357544,62.764454,-474.201172,-469.634583,...,448.70166,171.88031,1000,294,12,2024.678711,1287.959229,421.991425,BALBc-3_X01_Y02,99.0
C-2,9600,5978.459961,-330.099365,139.631744,-82.840302,1747.897583,-395.50882,954.326782,-1026.204468,3744.718262,...,2229.804443,512.220764,1003,107,8,8647.193359,2817.173828,709.545105,BALBc-2_X05_Y04,74.0
C-3,9600,6119.109375,-54.384808,-768.871704,25.625927,1065.31189,-485.535431,538.404175,-611.836426,865.84259,...,665.720459,351.108246,1003,113,13,4838.463379,1646.660278,408.52359,BALBc-2_X03_Y04,98.0
C-4,9600,6272.474609,-235.512405,74.058075,-101.729919,1186.295044,-782.744995,1261.625366,-915.565552,1361.536011,...,2065.742676,259.003235,1003,148,9,5092.891602,2161.109131,713.416199,BALBc-2_X02_Y01,71.0


In [9]:
# All column names of the expression table, as a plain Python list.
cols = list(df['exp'].columns)
cols

['Imaging phenotype cluster ID',
 'CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46',
 'X.X',
 'Y.Y',
 'Z.Z',
 'MHCII',
 'blank_Cy3_cyc15',
 'blank_Cy5_cyc15',
 'sample_Xtile_Ytile',
 'niche cluster ID']

In [10]:
# Marker-intensity columns: the contiguous run from 'CD45' through 'NKp46'.
# The run is anchored on column names instead of the positional slice 1:30,
# so a missing marker raises ValueError rather than silently selecting the
# wrong columns.
# NOTE(review): 'MHCII' is listed after the X.X / Y.Y / Z.Z columns and is
# therefore not part of this run — confirm whether it should be included.
first_marker_pos = cols.index('CD45')
last_marker_pos = cols.index('NKp46')
exp_cols = cols[first_marker_pos:last_marker_pos + 1]
exp_cols

['CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46']

In [11]:
# Collect the distinct values of every column, keyed by column name.
unique_dict = {
    col_name: list(df['exp'][col_name].unique())
    for col_name in cols
}

# Report how many distinct values each column holds.
for col_name, distinct_values in unique_dict.items():
    print(col_name, len(distinct_values))

Imaging phenotype cluster ID 58
CD45 726877
Ly6C 731257
TCR 731035
Ly6G 729442
CD19 726981
CD169 730498
CD106 727870
CD3 730526
CD1632 728506
CD8a 728773
CD90 728172
F480 731091
CD11c 725943
Ter119 728879
CD11b 725897
IgD 730293
CD27 727817
CD5 730295
CD79b 727003
CD71 729364
CD31 725039
CD4 730112
IgM 727925
B220 729995
ERTR7 729781
CD35 728492
CD2135 729290
CD44 727003
NKp46 723719
X.X 1342
Y.Y 1006
Z.Z 15
MHCII 729269
blank_Cy3_cyc15 721818
blank_Cy5_cyc15 720065
sample_Xtile_Ytile 565
niche cluster ID 101


### Select Single Image Tile
* BALBc: normal tissue
* MRL/lpr: spleen from animals with systemic autoimmune disease

Start with: 'BALBc-1_X01_Y01'

In [12]:
# Tile label of every cell, then only the entries for the starting tile.
ser_tile = df['exp'].loc[:, 'sample_Xtile_Ytile']
ser_found = ser_tile.loc[ser_tile == 'BALBc-1_X01_Y01']
ser_found.shape

(1127,)

In [13]:
# Row labels of the cells that belong to the selected tile.
keep_rows = list(ser_found.index)

In [14]:
# Restrict the expression table to the selected tile's rows (all columns).
df['tile'] = df['exp'].loc[keep_rows, :]
df['tile'].shape

(1127, 38)

In [15]:
# Preview the first five rows of the single-tile table.
df['tile'].head(n=5)

Unnamed: 0,Imaging phenotype cluster ID,CD45,Ly6C,TCR,Ly6G,CD19,CD169,CD106,CD3,CD1632,...,CD44,NKp46,X.X,Y.Y,Z.Z,MHCII,blank_Cy3_cyc15,blank_Cy5_cyc15,sample_Xtile_Ytile,niche cluster ID
C-643,9640,4616.200684,109.183311,56.608055,16.112137,1381.89978,-117.029999,2926.800781,-535.518738,2204.415283,...,1706.060791,557.22937,10,277,9,2071.258301,1494.528198,635.465454,BALBc-1_X01_Y01,84.0
C-1264,9640,1820.992798,-193.428802,-253.548187,-108.620682,1434.961548,-649.734192,1689.791626,-951.155151,1138.589111,...,3177.853271,317.274445,1006,490,9,2678.717529,1627.87854,582.699951,BALBc-1_X01_Y01,99.0
C-1589,9640,2748.407715,1080.912109,155.408829,290.299347,1872.106323,-265.814697,1627.306641,-900.948425,3623.654297,...,3632.079346,439.367004,101,504,9,5202.269043,2970.291016,1610.80542,BALBc-1_X01_Y01,91.0
C-1611,9640,1670.639282,162.77533,479.413391,156.224304,898.696411,312.997375,490.886322,262.397064,3911.160889,...,2143.739746,412.681335,101,759,8,521.5,760.530396,606.023438,BALBc-1_X01_Y01,84.0
C-1873,9640,1783.56897,-83.423607,980.420288,79.349449,1206.731445,469.170441,1557.618408,539.484131,1223.671753,...,1267.455688,306.992889,1013,254,6,307.57309,860.657837,818.229736,BALBc-1_X01_Y01,93.0


### Plot Tile Expression Levels

In [25]:
# Keep only the columns in exp_cols and flip the frame so that each row is
# one of those columns and each column is a cell of the tile.
df['tile-exp'] = df['tile'].loc[:, exp_cols].T
df['tile-exp'].shape

(29, 1127)

### Set Negative Expression Levels to Zero

In [28]:
# Floor expression values at zero. `clip(lower=0)` returns a new frame that
# is rebound under the same key, so no in-place boolean-mask write is made
# into a frame derived from a slice; the resulting values are the same
# (NaN entries, if any, are left untouched in both forms).
df['tile-exp'] = df['tile-exp'].clip(lower=0)

In [29]:
# Summary statistics per row label of 'tile-exp' (transposed so that those
# labels become the described columns).
df['tile-exp'].T.describe()

Unnamed: 0,CD45,Ly6C,TCR,Ly6G,CD19,CD169,CD106,CD3,CD1632,CD8a,...,CD71,CD31,CD4,IgM,B220,ERTR7,CD35,CD2135,CD44,NKp46
count,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,...,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0,1127.0
mean,2227.386933,320.46262,801.855738,237.354865,995.305121,372.464357,2218.644636,393.465683,2289.842578,953.25745,...,345.248557,1612.862087,694.813556,487.676127,1011.677069,1496.511917,485.104105,90.703214,2639.04676,605.289154
std,1744.207747,1804.326714,2026.346472,1401.532611,2246.645501,1614.248883,2642.061696,1281.174377,3948.655321,1918.905962,...,602.629283,1408.478027,1612.171357,1650.832599,1146.848826,2180.44625,781.089923,121.852499,2305.064993,862.13446
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,855.441833,0.0,88.477158,0.0,321.815338,0.0,690.838135,0.0,451.506943,133.870949,...,0.0,761.176971,0.0,80.450752,0.0,225.052689,212.526001,0.0,820.787323,270.897263
50%,1975.485962,76.415504,394.730377,42.606205,726.309387,3.330147,1627.306641,0.0,1352.493652,551.650635,...,112.943901,1348.967896,231.637939,215.14856,672.301147,903.783081,368.268921,62.909431,2262.735596,427.055115
75%,3301.116211,226.595772,756.756622,133.327179,1163.773071,363.036209,2966.002197,411.800278,2564.916748,1082.343811,...,360.455078,2138.005981,655.641144,467.981979,1649.294434,1780.571533,539.913208,124.81559,3894.320557,616.0159
max,13019.573242,36380.929688,28300.804688,27910.125,32945.34375,27001.564453,33125.109375,16590.681641,53697.683594,27838.951172,...,5450.304688,15061.892578,18621.835938,25429.027344,5967.472168,19644.873047,10692.055664,1460.630005,17053.4375,9143.376953


In [30]:
import numpy as np

# arcsinh-transform the tile expression values after dividing them by 5.
scaled_tile_exp = df['tile-exp'] / 5
df['tile-exp-ash'] = np.arcsinh(scaled_tile_exp)

In [31]:
# Preview the first five rows of the arcsinh-transformed matrix.
df['tile-exp-ash'].head(n=5)

Unnamed: 0,C-643,C-1264,C-1589,C-1611,C-1873,C-1877,C-1935,C-2478,C-2866,C-2988,...,C-252970,C-253039,C-253092,C-253107,C-253453,C-253953,C-253959,C-253982,C-253998,C-254183
CD45,7.521037,6.590848,7.002487,6.504673,6.570083,6.926748,6.544562,5.505844,6.756563,5.172671,...,0.0,5.821766,0.0,0.0,5.178123,7.028071,0.0,7.055849,6.356819,1.623972
Ly6C,3.777261,0.0,6.069275,4.176316,0.0,0.0,0.0,4.819843,4.795737,0.0,...,4.226939,0.0,1.866848,0.805144,3.890067,4.816325,3.774785,4.805585,0.0,0.0
TCR,3.121805,0.0,4.130027,5.2563,5.971697,4.533851,3.11086,5.747103,6.651165,0.0,...,3.460937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ly6G,1.886532,0.0,4.754696,4.135258,3.458562,3.647431,0.0,3.991211,4.252762,3.001435,...,3.931575,0.0,0.0,0.0,3.925621,4.481862,3.586068,3.807004,0.0,4.728002
CD19,6.314927,6.352606,6.618531,5.884662,6.179384,5.941539,5.156831,6.023632,7.082634,6.187234,...,0.0,0.0,0.0,0.0,0.0,4.358561,0.0,0.0,0.0,0.0


### Expression Levels

In [34]:
# Hand the tile expression matrix to the shared clustergrammer2 `net` object,
# request row-wise z-score normalisation, clip with bounds -5 and 5, and
# display the resulting widget. The calls act on the same `net` object, so
# their order is kept as is.
# NOTE(review): this loads df['tile-exp'], not the arcsinh-transformed
# df['tile-exp-ash'] computed in an earlier cell — confirm which is intended.
net.load_df(df['tile-exp'])
net.normalize(axis='row', norm_type='zscore')
net.clip(-5,5)
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "CD45", "ini": 29, "clust": 25, "rank": 27, "rankvar": 27, "gro…

In [41]:
# X/Y position columns of every cell in the selected tile.
df['tile-loc'] = df['tile'].loc[:, ['X.X', 'Y.Y']]
df['tile-loc'].shape

(1127, 2)

In [82]:
# Scatter plot of the tile's cell positions (X.X against Y.Y), drawn with the
# bqplot pyplot interface imported as `plt`.
fig = plt.figure(title='Scatter', min_height='10px', min_width='10px')

tile_xy = df['tile-loc']
scatter_1 = plt.scatter(
    tile_xy['X.X'],
    tile_xy['Y.Y'],
    figsize=(20, 10),
    ylim=(0, 1000),
    xlim=(0, 1000),
)

# Minimum on-screen size of the figure: the width is 1.2x the height.
inst_height = 400
fig.layout.min_height = '{}px'.format(inst_height)
fig.layout.min_width = '{}px'.format(inst_height * 1.2)

fig

Figure(axes=[Axis(scale=LinearScale()), Axis(orientation='vertical', scale=LinearScale())], fig_margin={'top':…