# Reduced Data Exploration

We used UMAP to reduce the dimensionality of the data. Now, we can reasonably cluster and create a classifier for the data

In [6]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
import umap
import hdbscan

In [7]:
primary = pd.read_csv('../data/processed/primary_reduction_neighbors_50_components_50.csv', index_col='Unnamed: 0')
primary_viz = pd.read_csv('../data/meta/primary_tsne.tsv', sep='\t', header=None)
primary.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1.414866,4.22338,4.655418,4.221768,6.209269,4.57131,9.064432,5.080242,1.77397,8.466394,...,4.245566,2.992109,3.50527,7.737705,4.153919,0.793145,8.768568,5.901265,4.734169,3.67339
1,1.43274,4.290275,4.690469,4.239872,6.214148,4.591742,9.011972,5.062182,1.84257,8.362122,...,4.269808,3.010898,3.566487,7.66901,4.158532,0.808761,8.81378,5.890166,4.741519,3.707816
2,1.422657,4.247045,4.658894,4.192462,6.261431,4.614476,9.030548,5.077318,1.794996,8.401634,...,4.2445,3.029674,3.505005,7.669782,4.139117,0.82759,8.814221,5.87421,4.758564,3.679777
3,1.362225,4.184494,4.679659,4.191065,6.250918,4.578222,9.093062,5.107469,1.639314,8.451753,...,4.200933,3.048608,3.460865,7.66585,4.127635,0.849991,8.794816,5.855516,4.77593,3.651313
4,1.393179,4.238274,4.686842,4.218291,6.23816,4.574005,9.066672,5.084459,1.744257,8.402151,...,4.239795,3.023004,3.516864,7.681612,4.154156,0.822106,8.802408,5.879678,4.744999,3.669563


In [8]:
prim_umap = pd.read_csv('../data/processed/primary_reduction_neighbors_50_components_2.csv', index_col='Unnamed: 0')
prim_umap.shape

(189409, 2)

In [9]:
primary.shape

(189409, 50)

Now let's run clustering on the cells

In [10]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
clusters = clusterer.fit(primary)
set(clusters.labels_)

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
primary['label'] = clusters.labels_

In [None]:
prim_umap['label'] = clusters.labels_

In [None]:
prim_umap.columns

In [None]:
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 10))

sns.scatterplot(
    x='0', 
    y='1',
    data=prim_umap,
    hue='label',
    legend='full',
    ax=ax,
    s=1,
    palette='bright'
)

plt.title(f'UMAP Projection of Primary Data, Colored by Cluster (computed on N=50 components)')
plt.savefig('umap_cluster.png', dpi=300)
plt.show()

In [128]:
np.unique(clusters.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24])

In [129]:
clusters.labels_

array([-1, -1, -1, ..., 20, 20, 20])

In [132]:
from collections import Counter

dict(zip(Counter(clusters.labels_).keys(), Counter(clusters.labels_).values()))

{-1: 31416,
 1: 238,
 23: 65,
 24: 11439,
 18: 61973,
 6: 64,
 22: 110,
 2: 45,
 21: 69,
 12: 546,
 20: 4260,
 3: 82,
 10: 54,
 15: 64,
 16: 53,
 17: 59,
 7: 561,
 19: 70286,
 11: 286,
 5: 22,
 13: 1795,
 9: 28,
 14: 28,
 8: 20,
 0: 5825,
 4: 21}