In [1]:
!pip install pandas -U



In [2]:
!pip install h5py -U



In [3]:
!pip install anndata -U



In [4]:
!pip install scanpy -U



In [10]:
!pip install pathlib -U

Collecting pathlib
  Downloading pathlib-1.0.1-py3-none-any.whl (14 kB)
Installing collected packages: pathlib
Successfully installed pathlib-1.0.1


In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import csv
import seaborn as sns
import anndata as ad
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

In [49]:
counts = pd.read_csv('../data/emd/csv_tables/counts.csv', engine="pyarrow", index_col=0)
exprs = pd.read_csv('../data/emd/csv_tables/exprs.csv', engine="pyarrow", index_col=0)
metadata = pd.read_csv('../data/emd/csv_tables/observations_metadata.csv', engine="pyarrow", index_col=0)
# metadata['patient_n']=metadata['patient_n'].astype('category')
# metadata['sample_id']=metadata['sample_id'].astype('category')
marker_data = pd.read_csv('../data/emd/csv_tables/features_metadata.csv', engine="pyarrow", index_col=0)
# fastMNN= pd.read_csv('../data/emd/csv_tables/fastMNN.csv', engine="pyarrow", index_col=0)
# UMAP_mnnCorrected = pd.read_csv('../data/emd/csv_tables/UMAP_mnnCorrected.csv', engine="pyarrow", index_col=0)

In [50]:
from scipy.sparse import csr_matrix

adata = ad.AnnData(csr_matrix(counts))
adata.obs_names = counts.index
adata.var_names = counts.columns
adata.obs=metadata
adata.var=marker_data
adata.layers['exprs'] = csr_matrix(exprs)
adata

AnnData object with n_obs × n_vars = 108636 × 37
    obs: 'sample_id', 'ObjectNumber', 'area', 'axis_major_length', 'axis_minor_length', 'eccentricity', 'patient_n', 'BM_cohort', 'tissue_type', 'body_part', 'timepoint', 'width_px', 'height_px'
    var: 'channel', 'name', 'keep', 'deepcell', 'use_channel', 'features_oi', 'channel_name'
    layers: 'exprs'

In [51]:
for col in adata.obs.columns:
    if adata.obs[col].dtype == 'category':
        adata.obs[col] = adata.obs[col].astype(str)
    print(f"{col} is {adata.obs[col].dtype}")

sample_id is object
ObjectNumber is int64
area is int64
axis_major_length is float64
axis_minor_length is float64
eccentricity is float64
patient_n is int64
BM_cohort is object
tissue_type is object
body_part is object
timepoint is object
width_px is int64
height_px is int64


In [52]:
for col in adata.var.columns:
    if adata.var[col].dtype == 'category':
        adata.var[col] = adata.var[col].astype(str)
    print(f"{col} is {adata.var[col].dtype}")

channel is object
name is object
keep is int64
deepcell is int64
use_channel is bool
features_oi is bool
channel_name is object


In [55]:
adata.write('./emd_not_annotated.h5ad')

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:
adata_old = adata.copy()
new_vars=['CollagenI', 'Ki-67', 'aSMA', 'CCR4', 'CD14', 'TIM-3', 'CD16', 'CD163',
       'CD11b', 'PDL1', 'CD31', 'CD45', 'BCMA', 'CD11c', 'FoxP3', 'CD4',
       'Granzyme_K', 'CD68', 'CD117', 'CD20', 'CD8', 'CCR6', 'CD56',
       'PD-1', 'CD138', 'Granzyme_B', 'CD127', 'pSTAT1', 'CD3', 'CD27', 'TIGIT',
       'CCR7', 'HLA-DR', 'CD45RO', 'Histone_H3', 'ICSK1', 'ICSK2']

adata.var_names=new_vars

In [None]:
import pickle

with open("../data/non_denoised/models/XGB_classifier.pkl", "rb") as f:
    model = pickle.load(f)
with open("../data/non_denoised/models/XGB_classifier_label-encoder.pkl", "rb") as f:
    le = pickle.load(f)


In [None]:
features=list(model.feature_names_in_)
print([f for f in features if f not in new_vars])

In [None]:
df=sc.get.obs_df(adata, features, layer='exprs')
df.head()

In [None]:
classes=model.predict(df)

In [None]:
cell_labels = pd.DataFrame(le.inverse_transform(classes))
cell_labels.index = df.index
cell_labels.columns = ['major_celltype']

In [None]:
props = pd.DataFrame(cell_labels.value_counts())
props = props.reset_index()

In [None]:
plt.figure()
sns.barplot(data=props, x=props['major_celltype'], y=props['count'])
plt.xticks(rotation=45, ha='right')
plt.title('Size different cell clusters')
plt.show()

In [None]:
sns.clustermap(pd.concat([df,cell_labels],axis=1).groupby(by='major_celltype').mean().T,standard_scale=1)

# Refine Clustering

Similarly to the 2D dta we are going to refine the clustering using GMM

## Unclassified

In [None]:
adata.obs['major_pixel_type'] = cell_labels

In [None]:
def get_df_oi(adata,features_oi,pixel_type,column='major_pixel_type'):
    df=sc.get.obs_df(adata, keys=features_oi+[column],layer='exprs')
    df=df[df[column]==pixel_type]
    return df

In [None]:
from gmm.gmm import GMMCluster

In [None]:
n_components_range = range(2, 10)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD3','CD20','CD68','CD138','CD56','aSMA']
pixel_type='Unclassified'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=True)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Endothelial',
              1:'Unclassified',
              2:'NK',
              3:'Unclassified',
              4:'Endothelial',
              5:'Myeloma'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 10)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD68','CD138','CD56','aSMA']
pixel_type='Unclassified'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Unclassified',
              1:'Unclassified'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## Other myeloid

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD68', 'CD11b']
pixel_type='Other myeloid'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'MDSCs',
              1:'MDSCs',
              2:'Mono-macro',
              3:'MDSCs',
              4:'MDSCs',
              5:'Mono-macro',
              6:'Mono-macro',
              7:'Mono-macro'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD68','CD163','CD14','CD16']
pixel_type='Mono-macro'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'M1-like M$\\phi$',
              1:'CD16 Monocytes',
              2:'CD14 Monocytes',
              3:'M1-like M$\\phi$',
              4:'CD16 Monocytes',
              5:'CD16 Monocytes',
              6:'M1-like M$\\phi$',
              7:'CD14 Monocytes',
              8:'M2-like M$\\phi$'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD14','CD16']
pixel_type='Monocytes'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Unclassified',
              1:'CD16 Monocytes',
              2:'MDSCs'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD11b','CD11c','HLA-DR']
pixel_type='MDSCs'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'MDSCs',
              1:'MDSCs',
              2:'MDSCs',
              3:'MoDC',
              4:'MoDC',
              5:'MoDC'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## Plasma

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD20','CD27','CD138']
pixel_type='Plasma'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'B',
              1:'Myeloma',
              2:'Myeloma',
              3:'B',
              4:'B',
              5:'Myeloma'
            
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## Unclassified

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD20','CD3','CD68','aSMA']
pixel_type='Unclassified'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Unclassified',
              1:'Unclassified',
              2:'Unclassified',
              3:'Unclassified',
              4:'Unclassified',
              5:'Unclassified'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

## T

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD4','CD8','FoxP3','Granzyme_B','PD-1','TIM-3']
pixel_type='T'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'CD8 Tmem',
              1:'CD4-8',
              2:'CD4 Tmem',
              3:'Treg',
              4:'CD4 Tmem',
              5:'CD4-8',
              6:'CD4 Tmem',
              7:'CD8 GZMB+ Tmem'
             }


means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD4','CD8']
pixel_type='CD4-8'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Other T',
              1:'CD8 Tmem',
              2:'CD4 Tmem'
             }


means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['PD-1','TIM-3']
pixel_type='CD4 Tmem'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['PD-1','TIM-3']
pixel_type='CD8 Tmem'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## NK

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD56','aSMA']
pixel_type='NK'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'NK',
              1:'Unclassified',
              2:'Unclassified',
              3:'NK'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

## Myeloma

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CD138','CD20','CD3','CD68','aSMA','CD45']
pixel_type='Myeloma'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Myeloma',
              1:'Myeloma',
              2:'Myeloma',
              3:'Myeloma',
              4:'Myeloma',
              5:'Unclassified',
              6:'Unclassified',
              7:'Myeloma',
              8:'Myeloma',
              9:'Unclassified'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## Megakaryocytes

not in this tissue, changing with keratinocytes and others

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['aSMA','CD31','CD45','Ki-67','CD68']
pixel_type='Megakaryocytes'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Unclassified',
              1:'Unclassified',
              2:'Unclassified',
              3:'Unclassified',
              4:'Unclassified'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

## Unclassified again

In [None]:
n_components_range = range(2, 11)
covariance_types = ['full', 'tied', 'diag', 'spherical']
features_oi=['CollagenI','aSMA','CD20','CD68','CD138','CD56','CD3']
pixel_type='Unclassified'
df=get_df_oi(adata,features_oi,pixel_type)[features_oi]
mat=df.values

In [None]:
gmm=GMMCluster.get_best_gmm(mat=mat,
                            n_components_range=n_components_range,
                            covariance_types=covariance_types,
                            plot=False)

In [None]:
clusters = gmm.predict(mat)
means=pd.DataFrame(gmm.means_,columns=features_oi)

plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
map_clusters={0:'Endothelial',
              1:'CAF?',
              2:'CAF?',
              3:'Myeloma',
              4:'Unclassified',
              5:'CAF?',
              6:'Myeloma',
              7:'Unclassified',
              8:'CAF?',
              9:'Myeloma'
             }

means = means.reset_index()
means.index = means['index'].replace(map_clusters)
means = means.drop('index',axis=1)
plt.figure()
sns.clustermap(means.T, annot=True,col_cluster=True,figsize=(10, 6))
plt.show()

In [None]:
clusters_df=pd.DataFrame(clusters,index=df.index)[0].replace(map_clusters)
adata.obs.loc[clusters_df.index, 'major_pixel_type'] = clusters_df

In [None]:
adata.obs['major_pixel_type'].value_counts().plot.bar()

In [None]:
df=sc.get.obs_df(adata, list(adata.var_names)+['major_pixel_type'],layer='exprs')

In [None]:
sns.clustermap(df.groupby('major_pixel_type').mean().T,standard_scale=1)

In [None]:
adata_old.obs = adata.obs

In [None]:
adata_old.write('../data/emd/emd_annotated.h5ad')