In [143]:
import json
import pandas as pd
import numpy as np
from anndata import AnnData
from os.path import join
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon, shape

In [144]:
df = pd.read_csv(join("data", "segmentation.csv"))

In [145]:
with open(join("data", "poly_per_z.json")) as f:
    poly_per_z = json.load(f)

In [146]:
df.head()

Unnamed: 0,mol_id,x_raw,y_raw,z_raw,gene,area,brightness,total_magnitude,qc_score,x,y,z,molecule_id,confidence,compartment,nuclei_probs,cell,assignment_confidence,is_noise,ncv_color
0,3048145,-2935.386,-1218.58,2.5,Maoa,4,2.021306,420.1126,0.954363,1705.0,1271.0,0.0,1,0.80133,Unknown,1.0,75,0.625,False,#A1750D
1,3048147,-2933.229,-1147.614,2.5,Maoa,4,1.82864,269.5874,0.908246,1725.0,1922.0,0.0,2,1.0,Unknown,1.0,189,0.95,False,#605211
2,3048148,-2930.104,-1154.062,2.5,Maoa,5,2.001268,501.4615,0.977219,1753.0,1863.0,0.0,3,1.0,Unknown,1.0,188,1.0,False,#615210
3,3048149,-2929.339,-1153.784,2.5,Maoa,7,1.960428,639.0364,0.991316,1760.0,1865.0,0.0,4,1.0,Unknown,1.0,188,1.0,False,#605212
4,3048153,-2913.718,-1270.474,2.5,Maoa,6,1.93728,519.3154,0.98321,1904.0,794.0,0.0,5,0.33546,Unknown,1.0,0,0.575,True,#EBE2C7


In [147]:
mol_slice_df = df.loc[df["z"] == 0]
poly_slice = poly_per_z[0]["geometries"]

mol_geometry = gpd.points_from_xy(x=mol_slice_df["x"], y=mol_slice_df["y"])

mol_slice_gdf = gpd.GeoDataFrame(mol_slice_df, geometry=mol_geometry)
mol_slice_gdf

Unnamed: 0,mol_id,x_raw,y_raw,z_raw,gene,area,brightness,total_magnitude,qc_score,x,...,z,molecule_id,confidence,compartment,nuclei_probs,cell,assignment_confidence,is_noise,ncv_color,geometry
0,3048145,-2935.386,-1218.580,2.5,Maoa,4,2.021306,420.1126,0.954363,1705.0,...,0.0,1,0.80133,Unknown,1.000000,75,0.625,False,#A1750D,POINT (1705.000 1271.000)
1,3048147,-2933.229,-1147.614,2.5,Maoa,4,1.828640,269.5874,0.908246,1725.0,...,0.0,2,1.00000,Unknown,1.000000,189,0.950,False,#605211,POINT (1725.000 1922.000)
2,3048148,-2930.104,-1154.062,2.5,Maoa,5,2.001268,501.4615,0.977219,1753.0,...,0.0,3,1.00000,Unknown,1.000000,188,1.000,False,#615210,POINT (1753.000 1863.000)
3,3048149,-2929.339,-1153.784,2.5,Maoa,7,1.960428,639.0364,0.991316,1760.0,...,0.0,4,1.00000,Unknown,1.000000,188,1.000,False,#605212,POINT (1760.000 1865.000)
4,3048153,-2913.718,-1270.474,2.5,Maoa,6,1.937280,519.3154,0.983210,1904.0,...,0.0,5,0.33546,Unknown,1.000000,0,0.575,True,#EBE2C7,POINT (1904.000 794.000)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819627,17214058,-2504.731,-1346.713,2.5,Nlrp6,3,1.815016,195.9465,0.815858,5658.0,...,0.0,819628,0.61235,Unknown,0.704931,0,0.775,True,#00498F,POINT (5658.000 95.000)
819638,17221710,-2506.871,-1348.371,2.5,Sdc1,4,1.819487,263.9651,0.905910,5638.0,...,0.0,819639,0.00000,Unknown,0.678154,0,1.000,True,#003E87,POINT (5638.000 79.000)
819647,17231166,-2499.367,-1355.649,2.5,Ptger4,5,1.935176,430.6716,0.968161,5707.0,...,0.0,819648,0.00032,Cyto,0.655439,0,1.000,True,#004F9A,POINT (5707.000 13.000)
819648,17231167,-2498.492,-1351.923,2.5,Ptger4,4,1.978598,380.7660,0.946520,5715.0,...,0.0,819649,0.84022,Cyto,0.652399,1020,0.800,False,#006B9C,POINT (5715.000 47.000)


In [148]:
poly_slice_gdf = gpd.GeoDataFrame(geometry=[ shape(x) for x in poly_slice ])
poly_slice_gdf;

In [149]:
mol_with_poly_gdf = gpd.sjoin(mol_slice_gdf, poly_slice_gdf, how="left", op="intersects")
mol_with_poly_gdf = mol_with_poly_gdf.rename(columns={"index_right": "poly_index"})

In [150]:
df = mol_with_poly_gdf

In [151]:
df["poly_index"].unique().shape

(3741,)

In [153]:
df = df.loc[pd.notna(df["poly_index"])]

In [154]:
df["poly_index"] = df["poly_index"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [155]:
df["compartment"].unique()

array(['Unknown', 'Nuclei', 'Cyto'], dtype=object)

In [156]:
z_vals = sorted(df["z"].unique())
z_to_z_index = dict(zip(z_vals, range(len(z_vals))))
z_to_z_index

{0.0: 0}

In [157]:
df["z_index"] = df["z"].apply(lambda z: z_to_z_index[z])

In [158]:
df.loc[df["gene"] == "Acsl1"]

Unnamed: 0,mol_id,x_raw,y_raw,z_raw,gene,area,brightness,total_magnitude,qc_score,x,...,confidence,compartment,nuclei_probs,cell,assignment_confidence,is_noise,ncv_color,geometry,poly_index,z_index
606,3049507,-2940.436,-1172.3160,2.5,Acsl1,5,1.877872,377.4349,0.960224,1659.0,...,1.00000,Unknown,1.000000,139,0.850,False,#FBC98E,POINT (1659.000 1695.000),134,0
10511,3070973,-3022.381,-705.4058,2.5,Acsl1,8,2.025499,848.3765,0.995716,906.0,...,1.00000,Unknown,1.000000,1609,0.600,False,#FAD79E,POINT (906.000 5981.000),1251,0
10512,3070974,-3021.441,-704.1991,2.5,Acsl1,4,2.115677,522.0795,0.964912,915.0,...,1.00000,Unknown,1.000000,1659,0.500,False,#F8D191,POINT (915.000 5992.000),1287,0
10513,3070975,-3019.662,-589.1284,2.5,Acsl1,5,2.066038,582.1144,0.981291,931.0,...,0.92752,Unknown,0.912556,2104,0.925,False,#CA9437,POINT (931.000 7048.000),1638,0
10514,3070976,-3015.492,-668.7278,2.5,Acsl1,19,1.924741,1597.6990,0.998356,970.0,...,1.00000,Unknown,0.998662,1844,0.875,False,#EFBE6D,POINT (970.000 6318.000),1438,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774078,17073229,-2600.696,-518.5146,2.5,Acsl1,4,1.932656,342.5438,0.934794,4777.0,...,1.00000,Cyto,0.006836,4012,1.000,False,#0052B9,POINT (4777.000 7696.000),2943,0
774079,17073232,-2585.172,-383.6891,2.5,Acsl1,8,1.773877,475.2993,0.990412,4919.0,...,1.00000,Cyto,0.000774,4159,0.750,False,#0092FF,POINT (4919.000 8934.000),3034,0
774080,17073233,-2583.505,-382.4764,2.5,Acsl1,13,1.838407,896.0872,0.997979,4935.0,...,1.00000,Cyto,0.005756,4159,0.850,False,#0094FF,POINT (4935.000 8945.000),3034,0
774081,17073235,-2583.197,-393.1448,2.5,Acsl1,6,1.904899,482.0040,0.980147,4938.0,...,1.00000,Cyto,0.027423,4147,0.975,False,#006FE6,POINT (4938.000 8847.000),3031,0


## Restrict to the first Z slice

In [159]:
df = df.loc[df["z"] == 0]

## Save molecules data

In [160]:
molecules_json = {}
for gene_id, gene_df in df.groupby("gene"):
    molecules_json[gene_id] = gene_df[["x", "y"]].values.tolist()
    #molecules_json[gene_id] = gene_df[["x", "y", "z_index"]].values.tolist()

In [161]:
with open(join("data", "molecules.json"), "w") as f:
    json.dump(molecules_json, f)

## Unmelt to get per-cell molecule counts for the zeroth z slice

In [162]:
df2 = df[["mol_id", "poly_index", "gene", "total_magnitude"]].pivot_table(index=["poly_index", "mol_id"], columns="gene").groupby("poly_index").count()
df2.columns = df2.columns.droplevel().rename(None)
df2.head()

Unnamed: 0_level_0,Acsl1,Acta2,Ada,Adgrd1,Adgrf5,Adra1a,Adra1b,Adra1d,Adra2a,Adra2b,...,Tm4sf4,Tnfrsf21,Tpsb2,Trdc,Trpm5,Tspan13,Txndc5,Tymp,Vcan,Vim
poly_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,10,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
df.loc[(df["gene"] == "Ada") & (df["poly_index"] == 2219)]

Unnamed: 0,mol_id,x_raw,y_raw,z_raw,gene,area,brightness,total_magnitude,qc_score,x,...,confidence,compartment,nuclei_probs,cell,assignment_confidence,is_noise,ncv_color,geometry,poly_index,z_index
819531,17202017,-2504.342,-944.0928,2.5,Ada,5,2.035116,542.1087,0.979347,5661.0,...,0.99993,Cyto,0.102841,2933,0.975,False,#00A5FF,POINT (5661.000 3790.000),2219,0
819532,17202018,-2502.662,-944.4103,2.5,Ada,3,1.907849,242.6445,0.8604,5677.0,...,0.99999,Cyto,0.060877,2933,1.0,False,#00A4FF,POINT (5677.000 3787.000),2219,0
819533,17202019,-2502.656,-942.6475,2.5,Ada,5,1.81749,328.4431,0.951861,5677.0,...,1.0,Cyto,0.027988,2933,1.0,False,#00A0FF,POINT (5677.000 3803.000),2219,0
819534,17202020,-2501.823,-942.1064,2.5,Ada,9,1.921549,751.2627,0.995721,5684.0,...,1.0,Cyto,0.143175,2933,1.0,False,#009FFF,POINT (5684.000 3808.000),2219,0
819535,17202021,-2501.051,-940.8337,2.5,Ada,2,2.063637,231.5616,0.815988,5692.0,...,1.0,Nuclei,0.926081,2933,1.0,False,#009DFF,POINT (5692.000 3820.000),2219,0


In [164]:
df2.loc[2219]["Ada"]

5

## Use `segmentation_counts.tsv` to get aggregate counts across z slices

In [165]:
#df3 = pd.read_csv("segmentation_counts.tsv", sep='\t', index_col=0).T
#df3.head()

In [166]:
X = df2.values
obs_df = pd.DataFrame(index=df2.index.values.tolist())
var_df = pd.DataFrame(index=df2.columns.values.tolist())

adata = AnnData(X=X, obs=obs_df, var=var_df)




In [167]:
adata.obs

0
1
3
4
5
...
3859
3860
3861
3863
3864


In [168]:
adata.X

array([[ 0.,  2.,  0., ...,  0.,  0.,  2.],
       [ 0., 20.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [169]:
adata.write_zarr(join("data", "segmentation.zarr"))

In [170]:
len(poly_per_z)

9

In [171]:
with open(join("data", "poly_per_z_0.json"), "w") as f:
    json.dump(poly_per_z[0]["geometries"], f)

In [172]:
poly_per_z[0]["geometries"]

[{'coordinates': [[[2053.0, 51.0],
    [2053.0, 54.0],
    [2053.0, 57.0],
    [2056.0, 57.0],
    [2059.0, 57.0],
    [2059.0, 54.0],
    [2059.0, 51.0],
    [2059.0, 48.0],
    [2056.0, 48.0],
    [2056.0, 45.0],
    [2056.0, 42.0],
    [2056.0, 39.0],
    [2056.0, 36.0],
    [2053.0, 33.0],
    [2050.0, 30.0],
    [2047.0, 27.0],
    [2044.0, 27.0],
    [2041.0, 27.0],
    [2038.0, 27.0],
    [2035.0, 30.0],
    [2032.0, 33.0],
    [2029.0, 36.0],
    [2029.0, 39.0],
    [2026.0, 39.0],
    [2023.0, 42.0],
    [2020.0, 45.0],
    [2017.0, 48.0],
    [2014.0, 48.0],
    [2011.0, 45.0],
    [2011.0, 42.0],
    [2008.0, 39.0],
    [2008.0, 36.0],
    [2008.0, 33.0],
    [2008.0, 30.0],
    [2008.0, 27.0],
    [2008.0, 24.0],
    [2008.0, 21.0],
    [2008.0, 18.0],
    [2008.0, 15.0],
    [2008.0, 12.0],
    [2008.0, 9.0],
    [2011.0, 9.0],
    [2014.0, 6.0],
    [2014.0, 3.0],
    [2017.0, 3.0],
    [2020.0, 3.0],
    [2023.0, 3.0],
    [2026.0, 3.0],
    [2029.0, 3.0],
    [2032.0, 3