In [1]:
import pymc3
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import time
import os
from scipy.stats import zscore
import threading
import pickle
%matplotlib inline

In [2]:
# Output directory for the pickled embeddings and their timing files.
savedir = "../results/umap_out/2"

## Load in z-scored data

In [3]:
# Load the z-scored feature table (one row per cell).
# A context manager closes the file handle as soon as the load finishes.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open("../data/processed/fc_z.p", "rb") as f:
    df = pickle.load(f)
df.head()

Unnamed: 0,Cell Size,Cell Circularity,Cell Aspect Ratio,Cell Tracker Intensity,PI Intensity,AnexinV Intensity
0_2G3,62.4662,-9.764209,2.257279,-0.613713,-0.912228,-1.053166
1_2G3,-0.327282,-9.343066,84.07164,-0.941838,-0.915353,-1.213429
2_2G3,-0.590511,-7.18016,15.145722,-0.891318,-0.914065,-1.198231
3_2G3,-0.531419,-8.459477,30.618888,-0.818131,-0.911537,-1.19951
4_2G3,-0.644231,0.652443,1.079209,-0.827969,-0.913367,-1.194802


## Choose
a) n_neighbors - UMAP parameter similar to perplexity in t-SNE   
b) min_distance - UMAP `min_dist` parameter controlling how closely points may be packed together in the embedding  
c) n_subsample - number of samples to use from each time-stimulus-genotype combination  
d) n_iter - number of times to run the UMAP subsample  

In [4]:
# UMAP n_neighbors values to sweep.
n_neighbors_l = [3, 5, 10, 15, 100]
# UMAP min_dist values to sweep.
min_distance_l = [0.1, 0]
# Rows sampled from each (Stimuli, Genotype, Timepoint) group for every run.
n_subsample = 100
# Number of repeated sweeps over the parameter grid.
n_iter = 3

In [5]:
# Per-cell metadata, read from a tab-separated file with the first column used as the index.
meta_df = pd.read_csv("../data/meta.tsv",sep="\t",index_col=0)

In [6]:
# Display the metadata table (the notebook shows a truncated view of long frames).
meta_df

Unnamed: 0,Timepoint,Stimuli,Sample,Genotype,Stimuli Names
0_2G3,3,G,2,WT,Nec1s
1_2G3,3,G,2,WT,Nec1s
2_2G3,3,G,2,WT,Nec1s
3_2G3,3,G,2,WT,Nec1s
4_2G3,3,G,2,WT,Nec1s
...,...,...,...,...,...
4592_4B46,46,B,4,WT,zVD
4593_4B46,46,B,4,WT,zVD
4594_4B46,46,B,4,WT,zVD
4595_4B46,46,B,4,WT,zVD


In [7]:
# Attach the metadata columns to the feature table, aligning rows on the index.
# Any metadata columns already present are dropped first so that re-running
# this cell does not append a second copy of them.
df = pd.concat((df.drop(columns=meta_df.columns, errors="ignore"), meta_df), axis=1)
df.head()

Unnamed: 0,Cell Size,Cell Circularity,Cell Aspect Ratio,Cell Tracker Intensity,PI Intensity,AnexinV Intensity,Timepoint,Stimuli,Sample,Genotype,Stimuli Names
0_2G3,62.4662,-9.764209,2.257279,-0.613713,-0.912228,-1.053166,3,G,2,WT,Nec1s
1_2G3,-0.327282,-9.343066,84.07164,-0.941838,-0.915353,-1.213429,3,G,2,WT,Nec1s
2_2G3,-0.590511,-7.18016,15.145722,-0.891318,-0.914065,-1.198231,3,G,2,WT,Nec1s
3_2G3,-0.531419,-8.459477,30.618888,-0.818131,-0.911537,-1.19951,3,G,2,WT,Nec1s
4_2G3,-0.644231,0.652443,1.079209,-0.827969,-0.913367,-1.194802,3,G,2,WT,Nec1s


## Run umap

## Loop through and run umap
### Use 2 cpus?

In [8]:
# Feature columns passed to UMAP: only the three intensity channels are used.
attrs = ["Cell Tracker Intensity", "PI Intensity", "AnexinV Intensity"]

In [9]:
def run_umap(df, f_save, n_neighbors, min_distance, subsample, attrs):
    """Fit a UMAP embedding on selected columns and save it to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to embed; must contain every column listed in ``attrs``.
    f_save : str
        Output path. A trailing ``.p`` is optional: the embedding is always
        written to ``<base>.p`` and the timing to ``<base>_time_minutes.txt``,
        where ``<base>`` is ``f_save`` without that trailing ``.p``.
    n_neighbors : int
        Forwarded to ``umap.UMAP(n_neighbors=...)``.
    min_distance : float
        Forwarded to ``umap.UMAP(min_dist=...)``.
    subsample
        Unused; kept so existing calls keep working.
    attrs : list of str
        Columns of ``df`` used as the UMAP input features.

    Returns
    -------
    None
        The results are written to disk: a pickle holding
        ``[embedding, df.index]`` and a text file holding the fit time in
        minutes. The fit time is also printed.
    """
    # Strip only a trailing ".p". str.replace(".p", "") would also delete any
    # ".p" that appears inside a directory name or the file stem.
    suffix = ".p"
    base = f_save[:-len(suffix)] if f_save.endswith(suffix) else f_save

    # Create the output folder before the slow fit, so that a missing
    # directory cannot throw away a finished embedding.
    out_dir = os.path.dirname(base)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    t1 = time.time()
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_distance)
    embedding = reducer.fit_transform(df[attrs])
    t2 = time.time()
    elapsed_minutes = (t2-t1)/60
    print(f"Time in minutes:{elapsed_minutes}")

    # Context managers make sure both output files are flushed and closed.
    with open(base + ".p", "wb") as f:
        pickle.dump([embedding, df.index], f)
    with open(base + "_time_minutes.txt", "w") as f:
        f.write(str(elapsed_minutes))
    return

In [15]:
# Sweep every (iteration, n_neighbors, min_dist) combination. Embeddings that
# already exist on disk are skipped, so the cell can be re-run cheaply.

# Create the output folder up front. Otherwise a missing directory is only
# noticed when the first result is written, after a full UMAP fit.
os.makedirs(savedir, exist_ok=True)

for i in range(n_iter):
    print('i',i)
    for neigh in n_neighbors_l:
        print('number of neighbors', neigh)
        for dist in min_distance_l:
            print('minimum distance', dist)
            # File name
            curr_f_save = f"{savedir}/embedding_{i}_{neigh}_{dist}.p" #umap_results
            if not os.path.exists(curr_f_save):
                print("Running")
                # Collect samples: n_subsample rows from each
                # (Stimuli, Genotype, Timepoint) group. A fresh random draw is
                # made for every combination, and no random_state is set, so
                # the sampled rows differ between runs.
                samples = df.groupby(["Stimuli","Genotype","Timepoint"]).apply(lambda x: x.sample(n=n_subsample).reset_index())
                # Restore the original row labels as the index.
                samples = samples.set_index("index")

                run_umap(samples, curr_f_save, neigh, dist, n_subsample, attrs)
            else:
                print(f"Already ran {curr_f_save}")

i 0
number of neighbors 3
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_0_3_0.1.p
minimum distance 0
Already ran ../results/umap_out/2/embedding_0_3_0.p
number of neighbors 5
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_0_5_0.1.p
minimum distance 0
Already ran ../results/umap_out/2/embedding_0_5_0.p
number of neighbors 10
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_0_10_0.1.p
minimum distance 0
Already ran ../results/umap_out/2/embedding_0_10_0.p
number of neighbors 15
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_0_15_0.1.p
minimum distance 0
Already ran ../results/umap_out/2/embedding_0_15_0.p
number of neighbors 100
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_0_100_0.1.p
minimum distance 0
Already ran ../results/umap_out/2/embedding_0_100_0.p
i 1
number of neighbors 3
minimum distance 0.1
Already ran ../results/umap_out/2/embedding_1_3_0.1.p
minimum distance 0
Already ran ../results/u

## Add more neighbors

In [16]:
# Rebind the neighbor list to larger values for a second sweep; the loop below reads this new list.
n_neighbors_l = [500,1000]

In [17]:
# Sweep every (iteration, n_neighbors, min_dist) combination using the current
# n_neighbors_l. Embeddings that already exist on disk are skipped.

# Create the output folder up front. Otherwise a missing directory is only
# noticed when the first result is written, after a full UMAP fit.
os.makedirs(savedir, exist_ok=True)

for i in range(n_iter):
    print('i',i)
    for neigh in n_neighbors_l:
        print('number of neighbors', neigh)
        for dist in min_distance_l:
            print('minimum distance', dist)
            # File name
            curr_f_save = f"{savedir}/embedding_{i}_{neigh}_{dist}.p" #umap_results
            if not os.path.exists(curr_f_save):
                print("Running")
                # Collect samples: n_subsample rows from each
                # (Stimuli, Genotype, Timepoint) group. A fresh random draw is
                # made for every combination, and no random_state is set, so
                # the sampled rows differ between runs.
                samples = df.groupby(["Stimuli","Genotype","Timepoint"]).apply(lambda x: x.sample(n=n_subsample).reset_index())
                # Restore the original row labels as the index.
                samples = samples.set_index("index")

                run_umap(samples, curr_f_save, neigh, dist, n_subsample, attrs)
            else:
                print(f"Already ran {curr_f_save}")

i 0
number of neighbors 500
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  self.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  

Time in minutes:9.910081140200298
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:13.141490697860718
number of neighbors 1000
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:23.025675376256306
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:23.507224849859874
i 1
number of neighbors 500
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:11.91708132425944
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:12.141723696390788
number of neighbors 1000
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:25.504487788677217
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:21.92332330942154
i 2
number of neighbors 500
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:12.545184576511383
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:10.950611464182536
number of neighbors 1000
minimum distance 0.1
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:22.942644886175792
minimum distance 0
Running


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/isshamie/software/anaconda2/envs/umap/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Time in minutes:19.000822758674623
