In [11]:
import pandas as pd
import numpy as np
from anndata import AnnData
from os.path import join
import altair as alt
from altair_saver import save as alt_save
from sklearn.mixture import GaussianMixture

In [2]:
# Load the per-cell quantification table and give the GFP channel a plain
# identifier-like name. NOTE(review): presumably because "+" and "." in a
# column name are awkward for Altair field references - confirm.
df = pd.read_csv(
    join("data", "lung", "quantification", "unmicst-lung.csv")
).rename(columns={"GFP+KP1.9_cellMask": "GFP_pos_KP19_cellMask"})

In [3]:
# Preview the first rows to sanity-check the load and the renamed GFP column.
df.head()

Unnamed: 0,CellID,DAPI_1_cellMask,GFP_pos_KP19_cellMask,CD206_cellMask,BODIPY630-NP_cellMask,DAPI_2_cellMask,Arg1_cellMask,CD11c_cellMask,CD45_cellMask,DAPI_3_cellMask,...,Y_centroid,column_centroid,row_centroid,Area,MajorAxisLength,MinorAxisLength,Eccentricity,Solidity,Extent,Orientation
0,1,27958.555556,3295.222222,15162.777778,65415.888889,6190.0,2957.0,2229.0,1074.0,5538.0,...,3.333333,61.444444,3.333333,9,4.277663,2.64302,0.786284,0.9,0.75,1.46525
1,2,25515.222222,4099.666667,10348.111111,8096.055556,6190.0,2957.0,2229.0,1074.0,5538.0,...,4.0,95.111111,4.0,18,5.483148,4.614446,0.54015,0.857143,0.72,1.098141
2,3,29898.636364,4334.818182,7998.272727,3248.0,6190.0,2957.0,2229.0,1074.0,5538.0,...,4.5,133.272727,4.5,22,5.75247,4.851514,0.537319,1.0,0.733333,0.0
3,4,28181.194444,3880.583333,8758.777778,10483.722222,6190.0,2957.0,2229.0,1074.0,5538.0,...,5.055556,180.666667,5.055556,36,8.179409,6.159284,0.657994,0.878049,0.642857,-0.847787
4,5,23782.769231,4431.769231,16984.769231,28399.153846,6190.0,2957.0,2229.0,1074.0,5538.0,...,3.769231,187.615385,3.769231,13,5.942609,2.86979,0.875666,0.866667,0.541667,-1.153365


## Classify cells by GFP+ tumor vs. GFP- (host)

In [4]:
# List every column name so the id/value column groups used for reshaping
# can be picked out by hand.
df.columns.tolist()

['CellID',
 'DAPI_1_cellMask',
 'GFP_pos_KP19_cellMask',
 'CD206_cellMask',
 'BODIPY630-NP_cellMask',
 'DAPI_2_cellMask',
 'Arg1_cellMask',
 'CD11c_cellMask',
 'CD45_cellMask',
 'DAPI_3_cellMask',
 'C12-D nanoparticle_cellMask',
 '7C1-F5 nanoparticle_cellMask',
 'G0-P5 nanoparticle_cellMask',
 'X_centroid',
 'Y_centroid',
 'column_centroid',
 'row_centroid',
 'Area',
 'MajorAxisLength',
 'MinorAxisLength',
 'Eccentricity',
 'Solidity',
 'Extent',
 'Orientation']

In [5]:
# Reshape the wide per-cell table into long format: one row per
# (cell, intensity column) pair, with the column name in `variable` and the
# measurement in `value`.

# Columns repeated on every long-format row: cell id, centroid coordinates
# and the shape descriptors.
id_vars = [
    "CellID",
    "X_centroid", "Y_centroid", "column_centroid", "row_centroid",
    "Area", "MajorAxisLength", "MinorAxisLength",
    "Eccentricity", "Solidity", "Extent", "Orientation",
]

# The `*_cellMask` intensity columns that get stacked into `variable`/`value`.
value_vars = [
    "DAPI_1_cellMask", "GFP_pos_KP19_cellMask", "CD206_cellMask", "BODIPY630-NP_cellMask",
    "DAPI_2_cellMask", "Arg1_cellMask", "CD11c_cellMask", "CD45_cellMask",
    "DAPI_3_cellMask", "C12-D nanoparticle_cellMask", "7C1-F5 nanoparticle_cellMask", "G0-P5 nanoparticle_cellMask",
]

molten_df = pd.melt(df, id_vars=id_vars, value_vars=value_vars)
molten_df.head()

Unnamed: 0,CellID,X_centroid,Y_centroid,column_centroid,row_centroid,Area,MajorAxisLength,MinorAxisLength,Eccentricity,Solidity,Extent,Orientation,variable,value
0,1,61.444444,3.333333,61.444444,3.333333,9,4.277663,2.64302,0.786284,0.9,0.75,1.46525,DAPI_1_cellMask,27958.555556
1,2,95.111111,4.0,95.111111,4.0,18,5.483148,4.614446,0.54015,0.857143,0.72,1.098141,DAPI_1_cellMask,25515.222222
2,3,133.272727,4.5,133.272727,4.5,22,5.75247,4.851514,0.537319,1.0,0.733333,0.0,DAPI_1_cellMask,29898.636364
3,4,180.666667,5.055556,180.666667,5.055556,36,8.179409,6.159284,0.657994,0.878049,0.642857,-0.847787,DAPI_1_cellMask,28181.194444
4,5,187.615385,3.769231,187.615385,3.769231,13,5.942609,2.86979,0.875666,0.866667,0.541667,-1.153365,DAPI_1_cellMask,23782.769231


In [38]:
# Scatter of GFP against cycle-1 DAPI intensity, one point per cell, on
# linear axes.
plot = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        alt.X("DAPI_1_cellMask:Q", axis=alt.Axis(title="DAPI (Cycle 1) intensity")),
        alt.Y("GFP_pos_KP19_cellMask:Q", axis=alt.Axis(title="GFP intensity")),
    )
    .properties(title="GFP vs. DAPI (Cycle 1) intensity")
)

plot

In [37]:
# Same GFP-vs-DAPI scatter, but with both axes on a log scale (and not
# forced to include zero).
plot = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        alt.X(
            "DAPI_1_cellMask:Q",
            axis=alt.Axis(title="DAPI (Cycle 1) intensity"),
            scale=alt.Scale(type="log", zero=False),
        ),
        alt.Y(
            "GFP_pos_KP19_cellMask:Q",
            axis=alt.Axis(title="GFP intensity"),
            scale=alt.Scale(type="log", zero=False),
        ),
    )
    .properties(title="GFP vs. DAPI (Cycle 1) intensity")
)

plot

In [42]:
# Scatter of GFP against CD45 intensity, one point per cell, on linear axes
# that are not forced to start at zero.
plot = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        alt.X(
            "CD45_cellMask:Q",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(title="CD45 intensity"),
        ),
        alt.Y(
            "GFP_pos_KP19_cellMask:Q",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(title="GFP intensity"),
        ),
    )
    .properties(title="CD45 vs. GFP intensity")
)

plot

In [40]:
# Log-log version of the CD45-vs-GFP scatter, one point per cell.
# The title now names the channels actually encoded (CD45 on x, GFP on y);
# it previously carried over "DAPI (Cycle 1)" from the DAPI scatter cells.
plot = alt.Chart(df).mark_circle().encode(
    x=alt.X("CD45_cellMask:Q", scale=alt.Scale(zero=False, type='log'), axis=alt.Axis(title="CD45 intensity")),
    y=alt.Y("GFP_pos_KP19_cellMask:Q", scale=alt.Scale(zero=False, type='log'), axis=alt.Axis(title="GFP intensity"))
).properties(
    title="CD45 vs. GFP intensity"
)

plot

In [30]:
# Histogram of GFP intensity: cells counted per bin, with at most 30 bins.
plot = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X(
            "GFP_pos_KP19_cellMask:Q",
            bin=alt.Bin(maxbins=30),
            axis=alt.Axis(title="GFP intensity"),
        ),
        alt.Y("count()", axis=alt.Axis(title="Number of cells")),
    )
    .properties(title="GFP intensity distribution")
)

plot

In [15]:
# Input for the two-component Gaussian mixture that separates the GFP groups.
# Selecting with a list of one column keeps the result two-dimensional
# (one row per cell, a single GFP-intensity column).
X = df[["GFP_pos_KP19_cellMask"]].to_numpy()
X

array([[ 3295.22222222],
       [ 4099.66666667],
       [ 4334.81818182],
       ...,
       [ 4017.        ],
       [ 9266.        ],
       [13923.33333333]])

In [16]:
# Fit a two-component mixture to GFP intensity; the fixed random_state keeps
# the fit repeatable. The component means are displayed to show where the
# two groups sit.
gm = GaussianMixture(n_components=2, random_state=2445)
gm.fit(X)
gm.means_

array([[16209.88644174],
       [ 5941.93329562]])

In [18]:
# Assign every cell to one of the two fitted mixture components (label 0 or 1).
# NOTE(review): component indices are arbitrary. In the gm.means_ output
# recorded in this notebook, index 0 has the higher mean (~16210) and index 1
# the lower (~5942), so label 0 presumably marks GFP+ cells - confirm before
# relying on the numeric label downstream.
y = gm.predict(X)

In [19]:
# Store the mixture-component label of each cell as a new column. This
# mutates df in place, so cells that display or reshape df give different
# results depending on whether they run before or after this one.
df["GFP_pos_KP19_cellMask_state"] = y

In [20]:
# Log-log scatter of CD45 (y) against GFP (x), coloured by the
# mixture-component label so the split between the two GFP groups is visible.
plot = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        alt.X(
            "GFP_pos_KP19_cellMask:Q",
            scale=alt.Scale(type="log", zero=False),
            axis=alt.Axis(title="GFP"),
        ),
        alt.Y(
            "CD45_cellMask:Q",
            scale=alt.Scale(type="log", zero=False),
            axis=alt.Axis(title="CD45"),
        ),
        alt.Color("GFP_pos_KP19_cellMask_state:N"),
    )
    .properties(title="CD45 vs. GFP intensity")
)

plot

In [None]:
# Binned GFP intensity distribution, split by mixture-component label and
# drawn as stepped areas.
# The mark previously received the undefined name `inter`, which raises
# NameError; it is completed here as the `interpolate` mark property.
# NOTE(review): "step" (flat-topped bins) is the assumed intent of the
# truncated argument - confirm.
plot = alt.Chart(df).mark_area(interpolate="step").encode(
    y=alt.Y("count()", axis=alt.Axis(title="Number of cells")),
    x=alt.X("GFP_pos_KP19_cellMask:Q", axis=alt.Axis(title="GFP intensity"), bin=alt.Bin(maxbins=30)),
    color=alt.Color("GFP_pos_KP19_cellMask_state:N")
).properties(
    title="GFP intensity distribution"
)

plot

## Classify host cells as CD45- vs. CD45+

## Classify each cell as one of three classes (simplex?): GFP+ tumor vs. CD45- vs. CD45+