In [None]:
import GEOparse
from tqdm import tqdm
import urllib.request
import random
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
from sklearn.metrics import adjusted_rand_score as ARI

from utils_helper import VAE, Discriminator, Regressor, GANTrainer, entropy_batch_mixing, clustering_scores, GeneDataset

import torch
import matplotlib.pyplot as plt
import tensorflow as tf
import sys
import umap

# Seed every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same results.
seed = 345
# Note: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes spawned from this kernel, not the kernel itself.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
# TensorFlow 1.x exposes `tf.set_random_seed`; TensorFlow 2.x removed it in
# favour of `tf.random.set_seed`. Use whichever the installed version provides.
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(seed)
else:
    tf.random.set_seed(seed)

# Expose only the listed physical GPU(s) to this process; the single visible
# device is then addressed as 'cuda:0'.
gpus = ["6"]
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)
device = 'cuda:0'

# The models below are moved to the GPU through PyTorch, so PyTorch's own
# generator must be seeded as well (done after CUDA_VISIBLE_DEVICES is set).
torch.manual_seed(seed)

# Load Data

In [None]:
# Directory used as the download destination for GEO files.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
data_path = '/home/mcb/users/mbahra5/project/data/GEO/'

# Fetch the GSE84133 series record into `data_path`.
gse = GEOparse.get_GEO(
    destdir=data_path,
    geo='GSE84133',
)

In [None]:
# Download the series' supplementary files next to the series record,
# skipping SRA downloads. The result is iterated below as a mapping of mappings.
supp = gse.download_supplementary_files(
    directory=data_path,
    download_sra=False,
)

In [None]:
# Load every downloaded supplementary file whose path contains 'mouse' into
# its own DataFrame, preserving the iteration order of `supp`.
data = [
    pd.read_csv(file_path)
    for sample_key in tqdm(supp.keys())
    for file_path in supp[sample_key].values()
    if 'mouse' in file_path
]

In [None]:
# Everything from the fourth column onward of the first table is treated as a
# gene column.
genes = data[0].columns[3:].values

# Stack all tables row-wise (the per-file row indices are kept as-is).
df = pd.concat(data)

# Batch id: integer category code of the first six characters of each
# 'Unnamed: 0' entry. `.values` makes the assignment positional rather than
# index-aligned.
batch_prefix = df['Unnamed: 0'].map(lambda cell_name: cell_name[:6])
df['batch'] = batch_prefix.astype('category').cat.codes.astype('long').values

In [None]:
# Wrap the gene columns in an AnnData object and attach per-cell annotations:
#   cell_type - the raw 'assigned_cluster' value
#   labels    - integer category code of 'assigned_cluster'
#   batch     - the batch code computed earlier
adata = anndata.AnnData(X=df[genes].values)
cluster_col = df['assigned_cluster']
for obs_key, obs_values in (
    ('cell_type', cluster_col.values),
    ('labels', cluster_col.astype('category').cat.codes.values),
    ('batch', df['batch'].values),
):
    adata.obs[obs_key] = obs_values

In [None]:
# Shuffle the dataset: "subsample" the full fraction of cells with a fixed
# random state. `adata` is modified in place (no copy is requested).
sc.pp.subsample(
    adata,
    random_state=seed,
    fraction=1,
)

In [None]:
# Build the training dataset from the expression matrix and the label / batch
# annotations.
# NOTE(review): the batch column is passed as both the third and the fourth
# argument — confirm against GeneDataset's signature that this is intended.
dataset = GeneDataset(
    adata.X,
    adata.obs['labels'],
    adata.obs['batch'],
    adata.obs['batch'],
)

# Latent Inference

In [None]:
# --- Optimisation settings ---
lr = 0.001        # base learning rate; the train() calls scale it per component
eps = 1e-8        # forwarded to trainer.train(eps=...)
n_epochs = 50
batch_size = 128  # handed to GANTrainer

# --- Model settings ---
n_latent = 10       # latent size shared by the VAE and the discriminator input
use_batches = True  # when False the VAE is built with n_batch=0
use_cuda = True
%matplotlib inline

In [None]:
# Number of batches given to the VAE: the dataset's batch count when batch
# information is enabled, otherwise 0.
n_batch_vae = dataset.n_batches if use_batches else 0

# Two-layer VAE with 64 hidden units per layer, moved to the selected GPU.
vae = VAE(
    dataset.nb_genes,
    n_batch=n_batch_vae,
    n_latent=n_latent,
    n_layers=2,
    n_hidden=64,
)
vae = vae.cuda(device)

In [None]:
# Discriminator over the latent space: receives n_latent, a single-entry list
# [2 * n_latent] and the number of batches, then is moved to the selected GPU.
# NOTE(review): the list presumably holds hidden-layer widths — confirm in utils_helper.
disc = Discriminator(n_latent, [2 * n_latent], dataset.n_batches)
disc = disc.cuda(device)

In [None]:
# Tie the VAE, the discriminator and the dataset together in one trainer.
# NOTE(review): the meaning of the 'discrete' mode is defined in utils_helper — not visible here.
trainer = GANTrainer(
    'discrete',
    vae,
    disc,
    dataset,
    device,
    batch_size,
)

In [None]:
# Pretraining: 50 epochs at the base learning rate for both `lr` and `disc_lr`,
# with `enc_lr` set to zero for this stage.
# NOTE(review): enc_lr presumably scales the encoder's adversarial update — confirm in GANTrainer.
history = trainer.train(
    n_epochs=50,
    lr=lr,
    eps=eps,
    disc_lr=lr,
    enc_lr=0.0,
)

In [None]:
# Plot the first series returned by the pretraining run.
elbo_train = history[0]

# One integer x position per recorded value. (np.linspace(0, n, n) would space
# the points by n / (n - 1), so they would not line up with the indices.)
x = np.arange(len(elbo_train))

fig, ax = plt.subplots()
ax.plot(x, elbo_train)
ax.set_xlabel("Index in training history")
ax.set_ylabel("elbo_train")
ax.set_title("Pretraining history")
plt.show()

In [None]:
# Training with adversarial loss: 30 more epochs, now with a non-zero `enc_lr`
# (5% of the base learning rate). Note that `history` is overwritten here.
history = trainer.train(
    n_epochs=30,
    lr=lr,
    eps=eps,
    disc_lr=lr,
    enc_lr=lr * 0.05,
)

In [None]:
# Pull the three tensors returned by the trainer off the graph and the GPU and
# convert each one to a NumPy array.
latent, labels, batches = (
    tensor.detach().cpu().numpy() for tensor in trainer.get_latent()
)

In [None]:
# Store the latent matrix on the AnnData object under the key that the later
# t-SNE / neighbors calls read via use_rep.
# NOTE(review): assumes trainer.get_latent() returns rows in the same order as
# adata's observations — confirm, otherwise the embeddings are mislabelled.
adata.obsm["X_scGAN"] = latent

# t-SNE

In [None]:
# Embed the full latent representation with t-SNE.
# `n_pcs` is deliberately not passed: in current scanpy, combining it with
# `use_rep` truncates the representation to its first `n_pcs` columns, which
# would discard all but two of the latent dimensions.
sc.tl.tsne(adata, use_rep='X_scGAN')

In [None]:
# Draw the t-SNE embedding twice, each on its own 9x8 figure: once coloured by
# cell type and once by batch.
show_plot = True
for color_key in ("cell_type", "batch"):
    fig, ax = plt.subplots(figsize=(9, 8))
    sc.pl.tsne(adata, color=[color_key], ax=ax, show=show_plot)

# Scores

In [None]:
# Batch-mixing entropy of the latent space, computed from the latent matrix
# and the batch assignments returned by the trainer.
batch_mixing_entropy = entropy_batch_mixing(latent, batches)
print("Entropy of batch mixing :", batch_mixing_entropy)

## K-means Clustering Score

In [None]:
# Clustering score of the latent space against the known labels, using the
# dataset's label count.
latent_clustering_score = clustering_scores(dataset.n_labels, labels, latent)
print("Clustering ARI = {}".format(latent_clustering_score))

## Louvain Clustering Score

In [None]:
# Build a 30-nearest-neighbour graph on the latent representation ...
sc.pp.neighbors(
    adata,
    n_neighbors=30,
    use_rep="X_scGAN",
)
# ... and run Louvain community detection on it; the result is read back
# from adata.obs['louvain'] in the following cells.
sc.tl.louvain(
    adata,
    resolution=0.15,
)

In [None]:
# Show the Louvain assignment on top of the t-SNE embedding (9x8 figure).
show_plot = True
fig, ax = plt.subplots(figsize=(9, 8))
sc.pl.tsne(
    adata,
    color=['louvain'],
    ax=ax,
    show=show_plot,
)

In [None]:
# Adjusted Rand index between the labels returned by the trainer and the
# Louvain clusters. `ARI` (sklearn's adjusted_rand_score) is imported together
# with the other dependencies in the notebook's import cell.
# NOTE(review): assumes `labels` is ordered like adata's observations — confirm.
ari_score = ARI(labels, adata.obs['louvain'])
print("Louvain Clustering ARI = {}".format(ari_score))

# UMAP

In [None]:
# Rebuild the neighbour graph with 15 neighbours for UMAP. This replaces the
# 30-neighbour graph computed for the Louvain step.
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    use_rep="X_scGAN",
)
sc.tl.umap(
    adata,
    min_dist=0.1,
)

In [None]:
# Draw the UMAP embedding twice, each on its own 10x9 figure: once coloured by
# cell type and once by batch.
show_plot = True
for color_key in ("cell_type", "batch"):
    fig, ax = plt.subplots(figsize=(10, 9))
    sc.pl.umap(adata, color=[color_key], ax=ax, show=show_plot)
