In [None]:
# import warnings
# warnings.filterwarnings('ignore')

import urllib.request
import random
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
from scvi.dataset import AnnDatasetFromAnnData, RetinaDataset, LoomDataset
from scvi.models import VAE
from scvi.inference import UnsupervisedTrainer
import torch
import matplotlib.pyplot as plt
import tensorflow as tf
import sys
import umap
sys.path.append("../")
from utils import entropy_batch_mixing , clustering_scores

# Restrict the process to a single GPU. Done first so the restriction is
# already in the environment by the time any library initialises CUDA.
gpus = ["2"]
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)

# Seed every random number generator the notebook relies on.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only affects child processes, not the already-running kernel.
seed = 2345
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)
# PyTorch has its own generator. The VAE is constructed before the trainer
# is handed `seed`, so without this call its initial weights would come from
# an unseeded generator.
torch.manual_seed(seed)

# Load Data

In [None]:
# Directory handed to RetinaDataset as its save_path. The original location
# remains the default; set the SCVI_DATA_DIR environment variable to point
# the notebook at a different directory without editing this cell.
save_path = os.environ.get("SCVI_DATA_DIR", "/home/mcb/users/mbahra5/project/data/")
dataset = RetinaDataset(save_path=save_path)

In [None]:
# dataset.filter_genes_by_count()

In [None]:
# Wrap the dataset's expression matrix in an AnnData container.
adata = anndata.AnnData(X=dataset.X)

# `labels` and `batch_indices` are read as [cell][0], i.e. the first column
# of a per-cell table; take that column for the first n_obs rows in one go.
label_codes = np.asarray(dataset.labels)[:adata.n_obs, 0]
batch_codes = np.asarray(dataset.batch_indices)[:adata.n_obs, 0]

# Translate each label code into its cell-type name; keep batch codes as-is.
adata.obs['cell_type'] = np.asarray(dataset.cell_types)[label_codes]
adata.obs['batch'] = np.array(batch_codes)

# Preprocess

In [None]:
# sc.pp.log1p(adata)

# Latent Inference

In [None]:
# Training configuration consumed by the model/trainer cell below.
n_epochs = 50  # passed to trainer.train(n_epochs=...)
lr = 0.001  # passed to trainer.train(lr=...)
use_batches = True  # multiplied into n_batch: False makes the VAE's n_batch 0
use_cuda = True  # forwarded to UnsupervisedTrainer(use_cuda=...)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Number of batch categories given to the model: every batch in the dataset
# when `use_batches` is on, otherwise zero.
n_batch = dataset.n_batches if use_batches else 0
vae = VAE(dataset.nb_genes, n_batch=n_batch)

# Trainer settings: 90% train / 5% test split, metrics recorded every
# 5 epochs, and the notebook-wide seed.
trainer_kwargs = dict(
    train_size=0.9,
    test_size=0.05,
    use_cuda=use_cuda,
    frequency=5,
    seed=seed,
)
trainer = UnsupervisedTrainer(vae, dataset, **trainer_kwargs)

# Fit the model with the configured epoch count and learning rate.
trainer.train(n_epochs=n_epochs, lr=lr)

In [None]:
# ELBO values the trainer recorded for the train and test splits.
elbo_train = trainer.history["elbo_train_set"]
elbo_test = trainer.history["elbo_test_set"]

# Spread the recorded points over the epochs that were actually trained.
# Using n_epochs instead of a literal 50 keeps the x-axis correct when the
# training length is changed in the configuration cell.
x = np.linspace(0, n_epochs, len(elbo_train))
plt.plot(x, elbo_train, label="train")
plt.plot(x, elbo_test, label="test")
plt.xlabel("epoch")
plt.ylabel("ELBO")
plt.legend()
# Lower bound follows the data; the 3500 upper bound is a hand-picked cap.
plt.ylim(min(elbo_train)-50, 3500)

In [None]:
# Build a posterior over every cell in the dataset (indices 0..len-1) using
# the trained model, then read out its latent output in sequential order.
all_cell_indices = np.arange(len(dataset))
posterior = trainer.create_posterior(
    trainer.model,
    dataset,
    indices=all_cell_indices,
)
latent, batches, labels = posterior.sequential().get_latent()

In [None]:
# Attach the latent output of the full-dataset posterior to the AnnData
# object under "X_scVI"; the t-SNE and neighbours cells select it through
# use_rep="X_scVI".
adata.obsm["X_scVI"] = latent

# Scores

In [None]:
def calc_scores(input_posterior):
    """Print batch-mixing and clustering scores for one posterior.

    The posterior's latent output is fetched in sequential order, then two
    figures are printed, each immediately after it is computed:
    the result of ``entropy_batch_mixing`` on the latent values and batch
    assignments, and the result of ``clustering_scores`` (reported as ARI)
    on the labels and latent values.

    Reads the notebook-level ``dataset`` for ``n_labels``.

    Args:
        input_posterior: object exposing ``sequential().get_latent()`` that
            yields ``(latent, batches, labels)``.

    Returns:
        None; the scores are only printed.
    """
    z, batch_ids, true_labels = input_posterior.sequential().get_latent()

    mixing_entropy = entropy_batch_mixing(z, batch_ids)
    print("Entropy of batch mixing :", mixing_entropy)

    ari = clustering_scores(dataset.n_labels, true_labels, z)
    print("Clustering ARI = {}".format(ari))

In [None]:
# Print the batch-mixing entropy and clustering score for the trainer's
# training split.
print('Train Set:')
calc_scores(trainer.train_set)

In [None]:
# Print the batch-mixing entropy and clustering score for the trainer's
# test split.
print('Test Set:')
calc_scores(trainer.test_set)

In [None]:
# Print the batch-mixing entropy and clustering score for the trainer's
# validation split.
print('Validation Set:')
calc_scores(trainer.validation_set)

In [None]:
# posterior.clustering_scores()

# t-SNE

In [None]:
# Compute a t-SNE embedding from the representation stored in
# adata.obsm["X_scVI"]. `n_pcs` is intentionally not passed: together with
# `use_rep` it is either ignored or, in scanpy releases that apply it, it
# slices the representation down to its first `n_pcs` columns, so n_pcs=2
# would feed t-SNE only two of the latent dimensions.
sc.tl.tsne(adata, use_rep='X_scVI')

In [None]:
# Draw the t-SNE embedding twice, each on its own 8x7 figure: first coloured
# by cell type, then by batch.
show_plot = True
for obs_key in ("cell_type", "batch"):
    fig, ax = plt.subplots(figsize=(8, 7))
    sc.pl.tsne(adata, color=[obs_key], ax=ax, show=show_plot)

# UMAP

In [None]:
import warnings

# Build the neighbour graph on the "X_scVI" representation and compute the
# UMAP layout from it. Warnings are silenced only for these two calls; the
# filter is restored on leaving the block, so warnings raised by later cells
# are still shown.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.neighbors(adata, use_rep="X_scVI", n_neighbors=15)
    sc.tl.umap(adata, min_dist=0.1)

In [None]:
# Draw the UMAP layout twice, each on its own 7x6 figure: first coloured by
# cell type, then by batch.
show_plot = True
umap_colour_keys = ["cell_type", "batch"]
for colour_key in umap_colour_keys:
    fig, ax = plt.subplots(figsize=(7, 6))
    sc.pl.umap(adata, color=[colour_key], ax=ax, show=show_plot)


# Classification Accuracy Measure

In [None]:
# Latent output and labels for the cells in the trainer's training split,
# read in sequential order.
# Note: this rebinds the notebook-level `batches` that was first assigned
# from the full-dataset posterior.
latent_x, batches, labels_x = trainer.train_set.sequential().get_latent()

In [None]:
# Score the training-split latent values against their labels.
# NOTE(review): `classification_acc_measure` is not defined or imported in
# any cell visible in this notebook (only entropy_batch_mixing and
# clustering_scores are imported from utils) — confirm where it comes from,
# otherwise this cell raises NameError on a fresh kernel.
classification_acc_measure(latent_x, labels_x)