In [None]:
import argparse
import copy
import os
import sys
import time

import numpy as np
import pandas as pd
from pandas.core.arrays import boolean
import torch
from scipy import stats
from sklearn import preprocessing
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import models
import utils as ut
from models import AEBase, Predictor, PretrainedPredictor

import scanpypip.preprocessing as pp
import scanpypip.utils as scut 

import scanpy as sc

In [None]:
class Arguments:
    """Notebook stand-in for a command-line argument namespace.

    Every option is stored as an instance attribute. ``Arguments()`` yields the
    defaults below; any of them can be overridden by keyword, e.g.
    ``Arguments(epochs=50, batch_size=64)``.

    Raises:
        TypeError: if a keyword does not name one of the known options
            (guards against silently ignored typos).
    """

    def __init__(self, **overrides):
        self.epochs = 500
        self.bottleneck = 512          # used as the autoencoder latent_dim
        self.missing_value = np.nan
        self.data_path = "data/GSE108394/GSM2897334/"
        self.test_size = 0.2
        self.valid_size = 0.2          # fraction held out by train_test_split
        self.model_store_path = "saved/models/"
        self.logging_file = "saved/logs/"
        self.batch_size = 200
        self.ft_h_dims = "512,256"     # comma-separated hidden layer widths
        self.var_genes_disp = 0        # min_disp for highly_variable_genes
        self.pretrain_path = "saved/models/"
        self.min_n_genes = 0           # l_n_genes passed to pp.receipe_my
        self.max_n_genes = 20000       # r_n_genes passed to pp.receipe_my
        self.min_g = 200               # filter_mingenes passed to pp.receipe_my
        self.min_c = 3                 # filter_mincells passed to pp.receipe_my

        # Apply caller overrides, rejecting names that are not real options.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError("Unknown argument(s): " + ", ".join(unknown))
        vars(self).update(overrides)


args = Arguments()

In [None]:
# Unpack the run configuration from `args` into module-level names.

# -- Network / training ------------------------------------------------------
epochs = args.epochs
batch_size = args.batch_size
dim_au_out = args.bottleneck  # candidate sizes: 8, 16, 32, 64, 128, 256, 512
dim_dnn_in = dim_au_out       # DNN input width mirrors the bottleneck width
dim_dnn_out = 1
# "512,256" -> [512, 256]
encoder_hdims = [int(width) for width in args.ft_h_dims.split(",")]

# -- Data handling -----------------------------------------------------------
na = args.missing_value
data_path = args.data_path
test_size = args.test_size
valid_size = args.valid_size

# -- Gene / cell filtering ---------------------------------------------------
g_disperson = args.var_genes_disp
min_n_genes = args.min_n_genes
max_n_genes = args.max_n_genes

# -- Output locations --------------------------------------------------------
model_path = args.model_store_path
pretrain_path = args.pretrain_path
log_path = args.logging_file


In [None]:
# Timestamp shared by every artefact written during this run.
now = time.strftime("%Y-%m-%d-%H-%M-%S")

# Tag used in output file names: the configured data path with "/" removed.
export_name = data_path.replace("/", "")

# Derive both paths from the configured prefixes in `args` instead of from the
# previously bound `log_path` (or a hard-coded literal). With the default
# settings the resulting strings are unchanged, but re-running this cell no
# longer keeps appending timestamps to `log_path`, and `args.pretrain_path`
# is actually honoured.
log_path = args.logging_file + now + ".txt"
pretrain_path = args.pretrain_path + "ae_" + export_name + now + ".pkl"

# Make sure the target directories exist before anything is written there.
for _out_path in (log_path, pretrain_path):
    _out_dir = os.path.dirname(_out_path)
    if _out_dir:
        os.makedirs(_out_dir, exist_ok=True)

In [None]:
#scv = pd.read_csv('data/GSE117872/GSE117872_good_Data_TPM.txt',sep="\t")

In [None]:
#scv.shape

In [None]:
# Load the expression table through the project's scanpypip reader.
# NOTE(review): this file belongs to GSE117872, whereas args.data_path (and
# hence export_name, which tags the saved figures/model) points at GSE108394.
# Confirm which dataset is intended.
adata = pp.read_sc_file(
    "data/GSE117872/GSE117872_good_Data_TPM.txt"
)

In [None]:
# Display the freshly loaded object as the cell output.
adata

In [None]:
# adata = sc.read_10x_mtx(
#  'data/GSE108394/GSM2897334/',  # the directory with the `.mtx` file 
#  var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
#  cache=True)                              # write a cache file for faster subsequent reading


In [None]:
# Display `adata` again. The 10x loading code just above is commented out, so
# this shows the same object that was loaded from the TPM text file.
adata

In [None]:
# Disabled scanpy filtering, kept for reference.
# NOTE(review): per scanpy's documented default (inplace=True) these functions
# modify `adata` in place and return None, so re-enabling the lines as written
# would rebind `adata` to None. Drop the `adata =` if they are restored.
#adata = sc.pp.filter_cells(adata, min_genes=200)
#adata = sc.pp.filter_genes(adata, min_cells=3)

# Project helper; presumably adds the per-cell QC columns that the following
# violin plot reads (n_counts, percent_mito, percent_rps, percent_rpl).
# TODO confirm against scanpypip.preprocessing.
adata =pp.cal_ncount_ngenes(adata)

In [None]:
# Violin plots of the per-cell QC metrics, one panel per metric.
# `save` receives export_name so the written figure is tagged with it.
sc.pl.violin(
    adata,
    keys=["n_counts", "percent_mito", "percent_rps", "percent_rpl"],
    multi_panel=True,
    jitter=0.4,
    save=export_name,
)

In [None]:
# Run the project's preprocessing recipe with the configured gene-count window
# and minimum cell/gene thresholds, with normalisation and log both enabled.
# (Exact semantics live in scanpypip.preprocessing.receipe_my.)
adata = pp.receipe_my(
    adata,
    l_n_genes=min_n_genes,
    r_n_genes=max_n_genes,
    filter_mincells=args.min_c,
    filter_mingenes=args.min_g,
    normalize=True,
    log=True,
)

In [None]:
# Flag highly variable genes: dispersion must be at least `g_disperson`, with
# no upper bound.
sc.pp.highly_variable_genes(
    adata,
    max_disp=np.inf,
    min_disp=g_disperson,
)

# Plot the selection and save the figure tagged with export_name.
sc.pl.highly_variable_genes(adata, save=export_name)

In [None]:
# Keep the full (pre-selection) object reachable through `adata.raw`.
adata.raw = adata

# Restrict to the genes flagged as highly variable. `.copy()` turns the sliced
# view into an independent AnnData, so later writes (PCA results,
# obsm["X_AE"], neighbours, ...) land on a real object instead of forcing an
# implicit copy of a view.
adata = adata[:, adata.var.highly_variable].copy()

# Expression matrix fed to the scaler and the autoencoder.
data = adata.X

# Both MinMaxScaler and torch.FloatTensor need a dense array; densify if the
# loader produced a scipy sparse matrix (e.g. the 10x reader path).
if hasattr(data, "toarray"):
    data = data.toarray()

In [None]:
# Inspect the selected expression matrix (cell output).
data

In [None]:
# Per-feature min-max scaler; (0, 1) is the library default range, spelled out
# here for readability.
mmscaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the full matrix, then rescale that same matrix.
# NOTE(review): the scaler sees the validation rows as well, because the
# train/validation split happens after this step.
data = mmscaler.fit(data).transform(data)

In [None]:
# Hold out `valid_size` of the cells for validation; fixed seed for a
# repeatable split.
X_train, X_valid = train_test_split(
    data,
    test_size=valid_size,
    random_state=42,
)

In [None]:
# Sanity check: shapes of the training and validation matrices.
print(f"{X_train.shape} {X_valid.shape}")

In [None]:
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# torch.cuda.set_device only accepts CUDA devices and raises for a CPU device,
# so it must be skipped on machines without a GPU.
if device.type == "cuda":
    torch.cuda.set_device(device)

# Construct datasets and data loaders. Tensors are placed on `device` up front,
# so batches drawn from the loaders are already on the right device.
X_trainTensor = torch.FloatTensor(X_train).to(device)
X_validTensor = torch.FloatTensor(X_valid).to(device)
X_allTensor = torch.FloatTensor(data).to(device)

# Autoencoder setting: the reconstruction target is the input itself.
train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
valid_dataset = TensorDataset(X_validTensor, X_validTensor)
all_dataset = TensorDataset(X_allTensor, X_allTensor)


X_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
X_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

# Phase name -> loader mapping handed to ut.train_extractor_model.
dataloaders_pretrain = {'train':X_trainDataLoader,'val':X_validDataLoader}


In [None]:
# Number of batches the training loader yields per epoch.
len(X_trainDataLoader)

In [None]:
# Autoencoder whose input width is the number of selected genes and whose
# bottleneck / hidden widths come from the run configuration.
encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
#model = VAE(dim_au_in=data_r.shape[1],dim_au_out=128)

# One device placement is enough: `device` is already "cuda:0" when CUDA is
# available and "cpu" otherwise, so a separate `.cuda()` call is redundant.
encoder.to(device)
print(encoder)

# Reconstruction objective (MSE) optimised with Adam; the learning rate is
# reduced by the plateau scheduler.
optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
loss_function_e = nn.MSELoss()
exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

# Project training loop; returns the trained network and a loss report, and is
# given `pretrain_path` as the place to store the model.
encoder, loss_report_en = ut.train_extractor_model(
    net=encoder,
    data_loaders=dataloaders_pretrain,
    optimizer=optimizer_e,
    loss_function=loss_function_e,
    n_epochs=epochs,
    scheduler=exp_lr_scheduler_e,
    save_path=pretrain_path,
)

print("Pretrained finished")

In [None]:
# Encode every cell with the trained encoder.
# eval() puts any train-only layers (dropout, batch norm) into inference
# behaviour, and no_grad() avoids building an autograd graph for the whole
# dataset, which would only cost memory here.
encoder.eval()
with torch.no_grad():
    embeddings = encoder.encode(X_allTensor).detach().cpu().numpy()

In [None]:
# Compute a PCA representation with the ARPACK solver.
# NOTE(review): the neighbour graph and tSNE cells that follow are run with
# use_rep="X_AE", so this PCA is not consumed by the visible downstream cells.
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Attach the autoencoder embeddings to the AnnData object under the key "X_AE"
# so scanpy tools can select them via use_rep="X_AE".
adata.obsm["X_AE"] = embeddings

In [None]:
# Build the neighbourhood graph on the autoencoder embedding (10 neighbours).
sc.pp.neighbors(
    adata,
    use_rep="X_AE",
    n_neighbors=10,
)
#sc.tl.umap(adata)

In [None]:
# tSNE layout computed from the autoencoder embedding rather than from PCA.
sc.tl.tsne(adata,use_rep="X_AE")

In [None]:
# Leiden clustering with default settings; the later plotting and
# marker-gene cells read the resulting grouping under the name "leiden".
sc.tl.leiden(adata)

In [None]:
# tSNE scatter coloured by Leiden cluster; figure saved tagged with export_name.
sc.pl.tsne(
    adata,
    color=["leiden"],
    save=export_name,
)

In [None]:
# Rank marker genes per Leiden cluster using the Wilcoxon method.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden",
    method="wilcoxon",
)

# Show the top 25 genes per cluster with independent y-axes; save the figure.
sc.pl.rank_genes_groups(
    adata,
    n_genes=25,
    sharey=False,
    save=export_name,
)

In [None]:
# Persist the annotated object as an .h5ad file.
# NOTE(review): there is no "/" between "saved/results" and export_name, so the
# file is written as saved/results<export_name>.h5ad rather than into a
# saved/results/ directory. Confirm that this is intended.
adata.write(f"saved/results{export_name}.h5ad")