In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from annoy import AnnoyIndex
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Settings shared by the cells below.
metric = 'euclidean'  # distance metric handed to AnnoyIndex
n_tree = 500          # number of trees passed to AnnoyIndex.build
n_pc = 26             # leading PC columns kept from the PCA table; also the AnnoyIndex dimension

In [3]:
# PCA coordinates, trimmed to the first n_pc columns. The .copy() detaches
# the slice from the full table that was read from disk.
pca_df = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/ATAC/Metadata/ATAC.pc50.msg').iloc[:, :n_pc].copy()

# Cell metadata table; its 'SubType' column drives the grouping further down.
# NOTE(review): later cells assume pca_df and cell_meta share the same index — confirm.
cell_meta = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/ATAC/Metadata/ATAC.cell_tidy_data.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


## Build ANN

In [4]:
# Build and save one Annoy index per SubType. Items are keyed by the row's
# position inside its SubType group (0, 1, 2, ...), not by the original cell id.
for cluster, sub_df in pca_df.groupby(cell_meta['SubType']):
    print(cluster, sub_df.shape[0])
    ann_index = AnnoyIndex(n_pc, metric)
    for position, vector in enumerate(sub_df.values):
        ann_index.add_item(position, vector)
    ann_index.build(n_tree)
    ann_index.save(f'{cluster}.ann')

GABA.CRc.CRc 814
GABA.Cge.Cge3 2937
GABA.Cge.Cge5 958
GABA.Cge.Cge6 599
GABA.Cge.CgeHip 1224
GABA.Cge.Lamp5 6135
GABA.Cge.Vip 5739
GABA.Hip.Hip 5006
GABA.Mge.Mge1 11476
GABA.Mge.Mge10 823
GABA.Mge.Mge11 806
GABA.Mge.Mge12 799
GABA.Mge.Mge2 6204
GABA.Mge.Mge3 2362
GABA.Mge.Mge4 2137
GABA.Mge.Mge5 1883
GABA.Mge.Mge6 1586
GABA.Mge.Mge7 1494
GABA.Mge.Mge8 1349
GABA.Mge.Mge9 953
GABA.Msn.D1 22710
GABA.Msn.D2 29201
GABA.Msn.Foxp2 11169
GABA.Msn.MSNOLF 2600
GABA.Mxd1.Mxd1 5708
GABA.Olf.OBDOP 624
GABA.Olf.OBGC 11789
GABA.Olf.OBGLO 1450
GABA.Olf.OBIGC 1345
GABA.Olf.OBNBL 4215
GABA.Sept.LSX 14881
GABA.Sept.MA 11437
GABA.Sept.MS 7344
GABA.Sept.Sept1 793
GABA.Unk.Unk 559
Glutamate.CLA.CLA 4259
Glutamate.CT.CT1 41381
Glutamate.CT.CT2 1366
Glutamate.CT.L6b 3764
Glutamate.GC.GC 62462
Glutamate.HIP.CA1 18101
Glutamate.HIP.CA1p 1184
Glutamate.HIP.CA2 1426
Glutamate.HIP.CA3 9006
Glutamate.HIP.CA4 1578
Glutamate.HIP.CRc 467
Glutamate.HIP.Mossy 955
Glutamate.HIP.NBL 2021
Glutamate.HIP.unk1 656
Glutamate.H

In [25]:
# For every SubType, map each cell id to its position within the group.
# This uses the same groupby and row order as the ANN build loop, so the
# positions are the integer item ids stored in the matching '{cluster}.ann'.
total_dict = {
    cluster: {cell_id: position for position, cell_id in enumerate(sub_df.index)}
    for cluster, sub_df in pca_df.groupby(cell_meta['SubType'])
}

# Persist the mapping so later sections can translate between cell ids and
# ANN item ids without regrouping the PCA table.
with open('cell_id_to_int.json', 'w') as f:
    json.dump(total_dict, f)

## Choose anchor

In [4]:
# Reload the {SubType: {cell_id: position}} mapping from disk so this section
# can run without re-executing the cell that builds it.
with open('cell_id_to_int.json') as f:
    cell_id_dict = json.load(f)

In [5]:
def kmeans(cluster, n_merge):
    """Partition the cells of one cluster into groups of roughly ``n_merge`` cells.

    Looks up the cells listed under ``cluster`` in the module-level
    ``cell_id_dict``, takes their rows from the module-level ``pca_df`` and
    runs MiniBatchKMeans on them with ``n_cells // n_merge`` clusters
    (at least one, so clusters smaller than ``n_merge`` do not fail).

    Parameters
    ----------
    cluster : str
        Key into ``cell_id_dict``; also used as the prefix of the group labels.
    n_merge : int
        Target number of cells per k-means group. Must be >= 1.

    Returns
    -------
    pd.Series
        Indexed like the selected rows of ``pca_df``. A cell whose k-means
        group holds more than ``n_merge * 0.5`` cells gets the string
        ``'{cluster}+{group}'``; every other cell gets the integer ``-1``.

    Raises
    ------
    ValueError
        If ``n_merge`` is smaller than 1.
    KeyError
        If ``cluster`` is not a key of ``cell_id_dict``.
    """
    if n_merge < 1:
        raise ValueError(f'n_merge must be >= 1, got {n_merge}')

    cell_id_map = pd.Series(cell_id_dict[cluster])
    this_pca_df = pca_df.loc[cell_id_map.index]
    n_cells = this_pca_df.shape[0]
    if n_cells == 0:
        # Nothing to cluster; MiniBatchKMeans would reject an empty matrix.
        return pd.Series([], index=this_pca_df.index, dtype=object)

    # Floor division yields 0 for clusters with fewer than n_merge cells,
    # which MiniBatchKMeans rejects; fall back to a single group instead.
    n_cluster = max(1, n_cells // n_merge)

    mbk = MiniBatchKMeans(n_clusters=n_cluster,
                          init='k-means++',
                          max_iter=100,
                          batch_size=100,
                          verbose=0,
                          compute_labels=True,
                          random_state=0,  # fixed seed for repeatable runs
                          tol=0.0,
                          max_no_improvement=10,
                          init_size=3 * n_cluster,
                          n_init=5,
                          reassignment_ratio=0.1)
    mbk.fit(this_pca_df.values)

    # Named `labels` rather than `kmeans` so the function's own name is not shadowed.
    labels = pd.Series(mbk.labels_, index=this_pca_df.index)
    group_sizes = labels.value_counts()
    # Keep only groups with more than half the target size.
    kept_groups = group_sizes.index[group_sizes > (n_merge * 0.5)]
    label_names = {group: f'{cluster}+{group}' for group in kept_groups}
    # Cells in undersized groups are marked with -1.
    return labels.map(lambda group: label_names.get(group, -1))

In [6]:
# Target number of cells per k-means group.
n_merge = 20

# Collect one label Series per SubType; each name is printed once its
# k-means run has finished, as a progress log.
records = []
for cluster in cell_meta['SubType'].unique():
    records.append(kmeans(cluster, n_merge))
    print(cluster)

Glutamate.IT.L23
NonN.Asc.AscT
Glutamate.CT.CT1
Glutamate.IT.L5
NonN.Vc.Per
Glutamate.IT.L4
GABA.Mge.Mge1
Glutamate.NP.NP1
NonN.Mgc.Mgc
NonN.Ogc.Mol
Glutamate.PT.Pt
GABA.Mge.Mge2
Glutamate.IT.L6
Glutamate.CT.CT2
GABA.Cge.Lamp5
NonN.Opc.Opc
NonN.Ogc.Mfol
GABA.Mge.Mge9
NonN.Vc.Vec4
GABA.Cge.Vip
GABA.Mge.Mge3
NonN.Asc.Myoc
NonN.Vc.Vec3
NonN.Vc.Vec2
GABA.Mge.Mge8
NonN.Vc.Vec1
Glutamate.CLA.CLA
GABA.Cge.Cge3
NonN.Ogc.Nfol
GABA.Mge.Mge4
GABA.Unk.Unk
NonN.Ogc.Cop
GABA.Mge.Mge5
GABA.Mge.Mge11
Glutamate.CT.L6b
NonN.Vc.Vec5
GABA.Sept.MS
GABA.Mge.Mge6
GABA.Olf.OBGC
NonN.Vc.Vlmc
Glutamate.NP.NP2
NonN.Vc.Vpia
GABA.Cge.Cge6
GABA.Cge.Cge5
GABA.Cge.CgeHip
GABA.Msn.Foxp2
GABA.Olf.OBGLO
GABA.Mge.Mge12
GABA.Mge.Mge7
GABA.Olf.OBNBL
GABA.Mxd1.Mxd1
GABA.Olf.OBDOP
GABA.Msn.D1
GABA.Msn.D2
GABA.Sept.MA
GABA.Sept.Sept1
NonN.Asc.AscNt
Glutamate.PIR.Pir4
NonN.Asc.NIPC
Glutamate.PIR.Pir1
NonN.Asc.RGSZ
GABA.Msn.MSNOLF
GABA.Sept.LSX
GABA.Mge.Mge10
Glutamate.PIR.Pir2
Glutamate.PIR.OLFdeep
Glutamate.PIR.Pir3
Glutamate

In [7]:
# Stack the per-SubType label Series into a single Series and turn the -1
# placeholder (cells in undersized k-means groups) into missing values.
final_kmean = pd.concat(records, axis=0).replace(to_replace=-1, value=np.nan)

In [8]:
# Attach the group labels to the metadata table as a new 'PseudoCell' column.
# Series assignment aligns on the index, so rows of cell_meta that have no
# entry in final_kmean end up as NaN.
# NOTE(review): assumes final_kmean's index is unique — confirm.
cell_meta['PseudoCell'] = final_kmean

In [9]:
# Write the metadata table, now including 'PseudoCell', to the working directory.
# NOTE(review): the captured warning below recommends pyarrow instead of
# msgpack; to_msgpack is deprecated in pandas 0.25 and absent from pandas >= 1.0,
# so switching formats needs coordinating with whatever reads this file.
cell_meta.to_msgpack('CellMeta.with_pseudo_cell.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.
