In [19]:
import json
import pandas as pd
import numpy as np
from anndata import AnnData
from os.path import join
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPoint, MultiPolygon, shape

In [20]:
df = pd.read_csv(join("data", "segmentation.csv"))

In [21]:
with open(join("data", "poly_per_z.json")) as f:
    poly_per_z = json.load(f)

In [22]:
df.head()

Unnamed: 0,mol_id,x_raw,y_raw,z_raw,gene,area,brightness,total_magnitude,qc_score,x,y,z,molecule_id,confidence,compartment,nuclei_probs,cell,assignment_confidence,is_noise,ncv_color
0,3048145,-2935.386,-1218.58,2.5,Maoa,4,2.021306,420.1126,0.954363,1705.0,1271.0,0.0,1,0.80133,Unknown,1.0,75,0.625,False,#A1750D
1,3048147,-2933.229,-1147.614,2.5,Maoa,4,1.82864,269.5874,0.908246,1725.0,1922.0,0.0,2,1.0,Unknown,1.0,189,0.95,False,#605211
2,3048148,-2930.104,-1154.062,2.5,Maoa,5,2.001268,501.4615,0.977219,1753.0,1863.0,0.0,3,1.0,Unknown,1.0,188,1.0,False,#615210
3,3048149,-2929.339,-1153.784,2.5,Maoa,7,1.960428,639.0364,0.991316,1760.0,1865.0,0.0,4,1.0,Unknown,1.0,188,1.0,False,#605212
4,3048153,-2913.718,-1270.474,2.5,Maoa,6,1.93728,519.3154,0.98321,1904.0,794.0,0.0,5,0.33546,Unknown,1.0,0,0.575,True,#EBE2C7


In [23]:
# Take only the zeroth Z slice (for now)
mol_slice_df = df.loc[df["z"] == 0]
poly_slice = poly_per_z[0]["geometries"]

mol_geometry = gpd.points_from_xy(x=mol_slice_df["x"], y=mol_slice_df["y"])

mol_slice_gdf = gpd.GeoDataFrame(mol_slice_df, geometry=mol_geometry)
mol_slice_gdf;

In [24]:
poly_slice_gdf = gpd.GeoDataFrame(geometry=[ shape(x) for x in poly_slice ])
poly_slice_gdf;

In [25]:
# Join the molecule points with their intersecting polygons.
mol_with_poly_gdf = gpd.sjoin(mol_slice_gdf, poly_slice_gdf, how="left", op="intersects")
mol_with_poly_gdf = mol_with_poly_gdf.rename(columns={"index_right": "poly_index"})

In [26]:
df = mol_with_poly_gdf

In [27]:
df["poly_index"].unique().shape

(3741,)

In [28]:
df = df.loc[pd.notna(df["poly_index"])]

In [29]:
df["poly_index"] = df["poly_index"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [30]:
df["compartment"].unique()

array(['Unknown', 'Nuclei', 'Cyto'], dtype=object)

In [31]:
z_vals = sorted(df["z"].unique())
z_to_z_index = dict(zip(z_vals, range(len(z_vals))))
z_to_z_index

{0.0: 0}

In [32]:
df["z_index"] = df["z"].apply(lambda z: z_to_z_index[z])

## Save molecules data

In [35]:
molecules_json = {}
for gene_id, gene_df in df.groupby("gene"):
    molecules_json[gene_id] = gene_df[["x", "y"]].values.tolist()
    #molecules_json[gene_id] = gene_df[["x", "y", "z_index"]].values.tolist()

In [36]:
with open(join("data", "molecules.json"), "w") as f:
    json.dump(molecules_json, f)

## Unmelt to get per-cell molecule counts for the zeroth z slice

In [37]:
df2 = df[["mol_id", "poly_index", "gene", "total_magnitude"]].pivot_table(index=["poly_index", "mol_id"], columns="gene").groupby("poly_index").count()
df2.columns = df2.columns.droplevel().rename(None)
df2.head()

Unnamed: 0_level_0,Acsl1,Acta2,Ada,Adgrd1,Adgrf5,Adra1a,Adra1b,Adra1d,Adra2a,Adra2b,...,Tm4sf4,Tnfrsf21,Tpsb2,Trdc,Trpm5,Tspan13,Txndc5,Tymp,Vcan,Vim
poly_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,10,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
X = df2.values
obs_df = pd.DataFrame(index=df2.index.values.tolist())
var_df = pd.DataFrame(index=df2.columns.values.tolist())

adata = AnnData(X=X, obs=obs_df, var=var_df)



In [43]:
adata.X

array([[ 0.,  2.,  0., ...,  0.,  0.,  2.],
       [ 0., 20.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [44]:
adata.write_zarr(join("data", "segmentation.zarr"))

In [46]:
with open(join("data", "poly_per_z_0.json"), "w") as f:
    json.dump(poly_per_z[0]["geometries"], f)