In [1]:
import h5py
import numpy
import pandas
import captum
import tqdm
import scipy
import scanpy

Learn - Model.py

In [2]:
import torch
import torch.nn as nn
global mu
global var

class LinBnDrop(nn.Sequential):
    """Module grouping `BatchNorm1d`, `Dropout` and `Linear` layers"""
    def __init__(self, n_in, n_out, bn=True, p=0., act=None, lin_first=True):
        layers = [nn.BatchNorm1d(n_out if lin_first else n_in)] if bn else []
        if p != 0: layers.append(nn.Dropout(p))
        lin = [nn.Linear(n_in, n_out, bias=not bn)]
        if act is not None: lin.append(act)
        layers = lin+layers if lin_first else layers+lin
        super().__init__(*layers)

        
class Encoder(nn.Module):
    """Encoder for CITE-seq data"""
    def __init__(self, nfeatures_modality1=10703, nfeatures_modality2=192, hidden_modality1=185,  hidden_modality2=15, z_dim=128):
        super().__init__()
        self.nfeatures_modality1 = nfeatures_modality1
        self.nfeatures_modality2 = nfeatures_modality2
        self.encoder_modality1 = LinBnDrop(nfeatures_modality1, hidden_modality1, p=0.2, act=nn.ReLU())
        self.encoder_modality2 = LinBnDrop(nfeatures_modality2, hidden_modality2, p=0.2, act=nn.ReLU())
        self.encoder = LinBnDrop(hidden_modality1 + hidden_modality2, z_dim,  p=0.2, act=nn.ReLU())
        self.weights_modality1 = nn.Parameter(torch.rand((1,nfeatures_modality1)) * 0.001, requires_grad=True)
        self.weights_modality2 = nn.Parameter(torch.rand((1,nfeatures_modality2)) * 0.001, requires_grad=True)
        self.fc_mu = LinBnDrop(z_dim,z_dim, p=0.2)
        self.fc_var = LinBnDrop(z_dim,z_dim, p=0.2)
        
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu
    
    def forward(self, x):
        global mu
        global var
        x_modality1 = self.encoder_modality1(x[:, :self.nfeatures_modality1]*self.weights_modality1)
        x_modality2 = self.encoder_modality2(x[:, self.nfeatures_modality1:]*self.weights_modality2)
        x = torch.cat([x_modality1, x_modality2], 1)
        x = self.encoder(x)
        mu = self.fc_mu(x)
        var = self.fc_var(x)
        x = self.reparameterize(mu, var)
        return x
    

class Decoder(nn.Module):
    """Decoder for for 2 modalities data (citeseq data and shareseq data) """
    def __init__(self, nfeatures_modality1=10703, nfeatures_modality2=192,  hidden_modality1=185,  hidden_modality2=15, z_dim=128):
        super().__init__()
        self.nfeatures_modality1 = nfeatures_modality1
        self.nfeatures_modality2 = nfeatures_modality2
        self.decoder1 = LinBnDrop(z_dim, nfeatures_modality1, act=nn.ReLU())
        self.decoder2 = LinBnDrop(z_dim, nfeatures_modality2,  act=nn.ReLU())

    def forward(self, x):
        x_rna = self.decoder1(x)
        x_adt = self.decoder2(x)
        x = torch.cat((x_rna,x_adt),1)
        return x

    
class CiteAutoencoder_CITEseq(nn.Module):
    def __init__(self, nfeatures_rna=0, nfeatures_adt=0,  hidden_rna=185,  hidden_adt=15, z_dim=20,classify_dim=17):
        """ Autoencoder for 2 modalities data (citeseq data and shareseq data) """
        super().__init__()
        self.encoder = Encoder(nfeatures_rna, nfeatures_adt, hidden_rna,  hidden_adt, z_dim)
        self.classify = nn.Linear(z_dim, classify_dim)
        self.decoder = Decoder(nfeatures_rna, nfeatures_adt, hidden_rna,  hidden_adt, z_dim)
        
    def forward(self, x):
        global mu
        global var
        x = self.encoder(x)
        x_cty = self.classify(x)
        x = self.decoder(x)
        return x, x_cty,mu,var


Learn - Train.py

In [3]:
import torch
import torch.nn as nn
from collections import defaultdict
from tqdm import tqdm
from torch.autograd import Variable
import os
import sys
import shutil
from util import AverageMeter,accuracy,save_checkpoint,CrossEntropyLabelSmooth,KL_loss

def train_model(model, train_dl, test_dl, lr, epochs, classify_dim=17, best_top1_acc=0, save_path = "", feature_num=10000):
    #####set optimizer and criterin#####
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr) ##
    criterion = nn.MSELoss().to(device)
    criterion_smooth_cty = CrossEntropyLabelSmooth().to(device)
    
    best_top1_acc=0
    best_each_celltype_top1 = []
    best_each_celltype_num=[]
    train_each_celltype_num = []

    for i in range(classify_dim):
        best_each_celltype_top1.append(0)
        best_each_celltype_num.append(0)
        train_each_celltype_num.append(0)
                
    ######loop training process, each epoch contains train and test two part#########
    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        nsamples_train = 0
        train_top1 = AverageMeter('Acc@1', ':6.2f')
        model = model.train()
        nsamples_test = 0
        test_top1 = AverageMeter('Acc@1', ':6.2f')
        each_celltype_top1 = []
        each_celltype_num=[]
        for i in range(classify_dim):
            each_celltype_top1.append(AverageMeter('Acc@1', ':6.2f'))
            each_celltype_num.append(0)
            

            
        for i, batch_sample in enumerate(train_dl):
            optimizer.zero_grad()
            ###load data
            x = batch_sample['data']
            x = Variable(x)
            x = torch.reshape(x,(x.size(0),-1))
            train_label = batch_sample['label']
            train_label = Variable(train_label)
            # Forward pass
            x_prime, x_cty,mu, var = model(x.to(device))
            # loss function
            loss1 = criterion(x_prime, x.to(device)) + 1/feature_num*(KL_loss(mu,var))#simulation loss
            loss2 = criterion_smooth_cty(x_cty, train_label.to(device))  #classification loss
            loss = 0.9*loss1 + 0.1*loss2 ##sum up the loss together
            # Backward pass
            loss.backward()
            optimizer.step()
            # log losses
            batch_size = x.shape[0]
            nsamples_train += batch_size
            train_pred1,  = accuracy(x_cty, train_label, topk=(1, ))
            train_top1.update(train_pred1[0], 1)
            if epoch == 1:
                for j in range(classify_dim):
                    if len(train_label[train_label==j])!=0:
                        train_each_celltype_num[j]=train_each_celltype_num[j] + len(train_label[train_label==j])                        


        model = model.eval()
        if test_dl!="NULL":
            with torch.no_grad():
                for i, batch_sample in enumerate(test_dl):
                    ###load data
                    x = batch_sample['data']
                    x = Variable(x)
                    x = torch.reshape(x,(x.size(0),-1))
                    test_label = batch_sample['label']    
                    test_label = Variable(test_label)
                    ###forward process
                    x_prime, x_cty,mu, var = model(x.to(device))

                    batch_size = x.shape[0]
                    nsamples_test += batch_size
                    test_pred1,  = accuracy(x_cty, test_label, topk=(1, ))
                    test_top1.update(test_pred1[0], 1)
                
                    ###record accuracy for each celltype
                    for j in range(classify_dim):
                        if len(test_label[test_label==j])!=0:
                            pred1,  = accuracy(x_cty[test_label==j,:], test_label[test_label==j], topk=(1, ))
                            each_celltype_top1[j].update(pred1[0],1)
                            each_celltype_num[j]=each_celltype_num[j] + len(test_label[test_label==j])
        
            ####save the best model
        #if test_top1.avg > best_top1_acc:
        #    best_top1_acc = test_top1.avg
        if epoch==epochs:
            #for j in range(classify_dim):
            #    best_each_celltype_top1[j] = each_celltype_top1[j].avg
            #    best_each_celltype_num[j] = each_celltype_num[j]
            save_checkpoint({'epoch': epoch,
                'state_dict': model.state_dict(),
                #'best_top1_acc': best_top1_acc,
                #'best_top1_celltype_acc': best_each_celltype_top1,
                #'best_top1_celltype_num': best_each_celltype_num,
                'optimizer' : optimizer.state_dict(),
                }, save_path)

        
        #if epoch==epochs:
        #    print('Epoch : ',epoch, '\t')
        #    for j in range(classify_dim):
        #        print('cell type : ',j, '\t', '\t', 'prec :', best_each_celltype_top1[j], 'number:', best_each_celltype_num[j], 'train_cty_num:',train_each_celltype_num[j])
    
    return model,best_each_celltype_top1,best_each_celltype_num,train_each_celltype_num



Learn - Predict.py

In [4]:
import torch
import torch.nn as nn
from collections import defaultdict
from tqdm import tqdm
from torch.autograd import Variable
import os
import sys
import shutil
from util import AverageMeter,accuracy

def test_model(model, dl, real_label, classify_dim=17, save_path = ""):
    #####set optimizer and criterin#####
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nsamples_test = 0
    test_top1 = AverageMeter('Acc@1', ':6.2f')
    each_celltype_top1 = []
    each_celltype_num=[]
    best_each_celltype_top1 = []
    for i in range(classify_dim):
        each_celltype_top1.append(AverageMeter('Acc@1', ':6.2f'))
        each_celltype_num.append(0)
        best_each_celltype_top1.append(0)

    model = model.eval()
    classified_label = []
    groundtruth_label = []
    prob = []
    with torch.no_grad():
        for i, batch_sample in enumerate(dl):
            ###load data
            x = batch_sample['data']
            x = Variable(x)
            x = torch.reshape(x,(x.size(0),-1))
            test_label = batch_sample['label']    
            test_label = Variable(test_label)
            ###forward process
            x_prime, x_cty,mu, var = model(x.to(device))
            a = torch.max(nn.Softmax()(x_cty),1)
            
            for j in range(x_prime.size(0)):
                classified_label.append(real_label[a.indices[j]])
                groundtruth_label.append(real_label[test_label[j]])
                prob.append(a.values[j])
               

            batch_size = x.shape[0]
            nsamples_test += batch_size
            test_pred1,  = accuracy(x_cty, test_label, topk=(1, ))
            test_top1.update(test_pred1[0], 1)

            ###record accuracy for each celltype
            for j in range(classify_dim):
                if len(test_label[test_label==j])!=0:
                    pred1,  = accuracy(x_cty[test_label==j,:], test_label[test_label==j], topk=(1, ))
                    each_celltype_top1[j].update(pred1[0],1)
                    each_celltype_num[j]=each_celltype_num[j] + len(test_label[test_label==j])

    for j in range(classify_dim):
        best_each_celltype_top1[j] = each_celltype_top1[j].avg
        print('cell type ID: ',j, '\t', '\t', 'cell type:', real_label[j], '\t', '\t', 'prec :', each_celltype_top1[j].avg, 'number:', each_celltype_num[j], file = save_path)
        
    return model,best_each_celltype_top1,each_celltype_num, classified_label, groundtruth_label,prob



Main_matilda_train.py


In [5]:
import os
import parser
import argparse

import pandas as pd
import numpy as np
from captum.attr import *
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torch.autograd import Variable

#from learn.train import train_model
from util import setup_seed, MyDataset,ToTensor, read_h5_data, read_fs_label, get_vae_simulated_data_from_sampling, get_encodings, compute_zscore, compute_log2,save_checkpoint


def main_matilda_train():
    
    parser = argparse.ArgumentParser("Matilda")
    parser.add_argument('--seed', type=int, default=1, help='seed')
    parser.add_argument('--augmentation', type=bool, default= True, help='if augmentation or not')

    ############# for data build ##############
    parser.add_argument('--rna', metavar='DIR', default='NULL', help='path to train rna data')
    parser.add_argument('--adt', metavar='DIR', default='NULL', help='path to train adt data')
    parser.add_argument('--atac', metavar='DIR', default='NULL', help='path to train atac data')
    parser.add_argument('--cty', metavar='DIR', default='NULL', help='path to train cell type label')

    ##############  for training #################
    parser.add_argument('--batch_size', type=int, default=64, help='batch size')
    parser.add_argument('--epochs', type=int, default=30, help='num of training epochs')
    parser.add_argument('--lr', type=float, default=0.02, help='init learning rate')

    ############# for model build ##############
    parser.add_argument('--z_dim', type=int, default=100, help='the number of neurons in latent space')
    parser.add_argument('--hidden_rna', type=int, default=185, help='the number of neurons for RNA layer')
    parser.add_argument('--hidden_adt', type=int, default=30, help='the number of neurons for ADT layer')
    parser.add_argument('--hidden_atac', type=int, default=185, help='the number of neurons for ATAC layer')

    args = parser.parse_args()
    setup_seed(args.seed) ### set random seed in order to reproduce the result
    cuda = True if torch.cuda.is_available() else False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if cuda else torch.LongTensor

    if args.adt != "NULL" and args.atac == "NULL":
        mode = "CITEseq"
        train_rna_data_path = args.rna
        train_adt_data_path = args.adt
        train_label_path = args.cty
        train_rna_data = read_h5_data(train_rna_data_path)
        train_adt_data = read_h5_data(train_adt_data_path)
        train_label = read_fs_label(train_label_path)
        classify_dim = (max(train_label)+1).cpu().numpy()
        nfeatures_rna = train_rna_data.shape[1]
        nfeatures_adt = train_adt_data.shape[1]
        feature_num = nfeatures_rna + nfeatures_adt
        train_rna_data = compute_log2(train_rna_data)
        train_adt_data = compute_log2(train_adt_data)
        train_rna_data = compute_zscore(train_rna_data)
        train_adt_data = compute_zscore(train_adt_data)
        train_data = torch.cat((train_rna_data,train_adt_data),1)
        train_transformed_dataset = MyDataset(train_data, train_label)
        train_dl = DataLoader(train_transformed_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=False)
        
        
    test_dl = "NULL"

            
    print("The dataset is", mode)    
    output_v = []
    model_save_path = "../trained_model/{}/".format(mode)   
    model_save_path_1stage = "../trained_model/{}/simulation_".format(mode)    
    save_fs_eachcell = "../output/marker/{}/".format(mode)   

    #######build model#########
    if mode == "CITEseq":
        model = CiteAutoencoder_CITEseq(nfeatures_rna, nfeatures_adt, args.hidden_rna, args.hidden_adt, args.z_dim, classify_dim)
    else:
        print("error")

    #model = nn.DataParallel(model).to(device) #multi gpu
    model = model.to(device) #one gpu
    ########train model#########
    model, acc1, num1, train_num = train_model(model, train_dl, test_dl, lr=args.lr, epochs=args.epochs, classify_dim = classify_dim, best_top1_acc=0, save_path=model_save_path,feature_num=feature_num)
    ##################prepare to do augmentation##################            
    if args.augmentation == True:
        stage1_list = []
        for i in np.arange(0, classify_dim):
            stage1_list.append([i, train_num[i]])
            stage1_df = pd.DataFrame(stage1_list)
        if classify_dim%2==0:
            train_median = np.sort(train_num)[int(classify_dim/2)-1]
        else: 
            train_median = np.median(train_num)
        median_anchor = stage1_df[stage1_df[1] == train_median][0]
        train_major = stage1_df[stage1_df[1] > train_median]
        train_minor = stage1_df[stage1_df[1] < train_median]
        anchor_fold = np.array((train_median)/(train_minor[:][1]))
        minor_anchor_cts = train_minor[0].to_numpy()
        major_anchor_cts = train_major[0].to_numpy()

        index = (train_label == int(np.array(median_anchor))).nonzero(as_tuple=True)[0]
        anchor_data = train_data[index.tolist(),:]
        anchor_label = train_label[index.tolist()]
        new_data = anchor_data 
        new_label = anchor_label

        ##############random downsample major cell types##############
        j=0
        for anchor in major_anchor_cts:     
            anchor_num = np.array(train_major[1])[j]
            N = range(anchor_num)
            ds_index = random.sample(N,int(train_median))
            index = (train_label == anchor).nonzero(as_tuple=True)[0]
            anchor_data = train_data[index.tolist(),:]
            anchor_label = train_label[index.tolist()]
            anchor_data = anchor_data[ds_index,:]
            anchor_label = anchor_label[ds_index]
            new_data = torch.cat((new_data,anchor_data),0)
            new_label = torch.cat((new_label,anchor_label.to(device)),0)
            j = j+1

        ###############augment for minor cell types##################
        j = 0
        for anchor in minor_anchor_cts:
            aug_fold = int((anchor_fold[j]))    
            remaining_cell = int(train_median - (int(anchor_fold[j]))*np.array(train_minor[1])[j])
            index = (train_label == anchor).nonzero(as_tuple=True)[0]
            anchor_data = train_data[index.tolist(),:]
            anchor_label = train_label[index.tolist()]
            anchor_transfomr_dataset = MyDataset(anchor_data, anchor_label)
            anchor_dl = DataLoader(anchor_transfomr_dataset, batch_size=args.batch_size,shuffle=True, num_workers=0,drop_last=False)
            reconstructed_data, reconstructed_label, real_data = get_vae_simulated_data_from_sampling(model, anchor_dl)
            reconstructed_data[reconstructed_data>torch.max(real_data)]=torch.max(real_data)
            reconstructed_data[reconstructed_data<torch.min(real_data)]=torch.min(real_data)
            reconstructed_data[torch.isnan(reconstructed_data)]=torch.max(real_data)

            new_data = torch.cat((new_data,reconstructed_data),0)
            new_label = torch.cat((new_label, reconstructed_label),0)
            for i in range(aug_fold-1):
                reconstructed_data, reconstructed_label,real_data = get_vae_simulated_data_from_sampling(model, anchor_dl)
                reconstructed_data[reconstructed_data>torch.max(real_data)]=torch.max(real_data)
                reconstructed_data[reconstructed_data<torch.min(real_data)]=torch.min(real_data)
                reconstructed_data[torch.isnan(reconstructed_data)]=torch.max(real_data)
                new_data = torch.cat((new_data,reconstructed_data),0)
                new_label = torch.cat((new_label,reconstructed_label.to(device)),0)

            reconstructed_data, reconstructed_label,real_data = get_vae_simulated_data_from_sampling(model, anchor_dl)
            reconstructed_data[reconstructed_data>torch.max(real_data)]=torch.max(real_data)
            reconstructed_data[reconstructed_data<torch.min(real_data)]=torch.min(real_data)
            reconstructed_data[torch.isnan(reconstructed_data)]=torch.max(real_data)

            #add remaining cell
            N = range(np.array(train_minor[1])[j])
            ds_index = random.sample(N, remaining_cell)
            reconstructed_data = reconstructed_data[ds_index,:]
            reconstructed_label = reconstructed_label[ds_index]
            new_data = torch.cat((new_data,reconstructed_data),0)
            new_label = torch.cat((new_label,reconstructed_label.to(device)),0)
            j = j+1               

    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
        
    #######load the model trained before augmentation#########
    checkpoint_tar = os.path.join(model_save_path, 'model_best.pth.tar')
    if os.path.exists(checkpoint_tar):
        checkpoint = torch.load(checkpoint_tar)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        print("load successfully")

    ############process new data after augmentation###########
    train_transformed_dataset = MyDataset(new_data, new_label)
    train_dl = DataLoader(train_transformed_dataset, batch_size=args.batch_size,shuffle=True, num_workers=0,drop_last=False)

    ############## train model ###########
    model,acc2,num1,train_num = train_model(model, train_dl, test_dl, lr=args.lr, epochs=int(args.epochs/2),classify_dim=classify_dim,best_top1_acc=0, save_path=model_save_path,feature_num=feature_num)
    checkpoint_tar = os.path.join(model_save_path, 'model_best.pth.tar')
    if os.path.exists(checkpoint_tar):
        checkpoint = torch.load(checkpoint_tar)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        print("load successfully")
    model,acc2,num1,train_num = train_model(model, train_dl, test_dl, lr=args.lr/10, epochs=int(args.epochs/2),classify_dim=classify_dim,best_top1_acc=0, save_path=model_save_path,feature_num=feature_num)



    
            



  import parser


In [37]:
df = pd.read_csv(r'C:\Users\KARAN\Desktop\MultiOmics-Research\STAGATE\Landau\SPOTS Landau paper dataset\protein\GSE198353_mmtv_pymt_ADT_t.csv',index_col=0)
df.columns

Index(['CD4', 'CD8a', 'CD366', 'CD279', 'CD117', 'Ly-6C', 'Ly-6G', 'CD19',
       'CD45', 'CD25', 'CD11c', 'F4/80', 'I-A/I-E', 'NK-1.1', 'Ly-6A/E',
       'CD274', 'CD86', 'CD192 (CCR2)', 'CD326', 'CD38', 'IgD', 'CD140a',
       'CD11a', 'P2X7R', 'CD1d', 'Notch 4', 'CD31', 'Podoplanin', 'CD45R/B220',
       'CD27', 'CD11b', 'CD202b'],
      dtype='object')

In [38]:
import numpy as np
import pandas as pd

df = pd.read_csv(r'C:\Users\KARAN\Desktop\MultiOmics-Research\STAGATE\Landau\SPOTS Landau paper dataset\protein\GSE198353_mmtv_pymt_ADT_t.csv',index_col=0)

df = pd.DataFrame(np.arange(63296).reshape((1978,32)), columns=['CD4', 'CD8a', 'CD366', 'CD279', 'CD117', 'Ly-6C', 'Ly-6G', 'CD19',
       'CD45', 'CD25', 'CD11c', 'F4/80', 'I-A/I-E', 'NK-1.1', 'Ly-6A/E',
       'CD274', 'CD86', 'CD192 (CCR2)', 'CD326', 'CD38', 'IgD', 'CD140a',
       'CD11a', 'P2X7R', 'CD1d', 'Notch 4', 'CD31', 'Podoplanin', 'CD45R/B220',
       'CD27', 'CD11b', 'CD202b'])

print(df)

        CD4   CD8a  CD366  CD279  CD117  Ly-6C  Ly-6G   CD19   CD45   CD25  \
0         0      1      2      3      4      5      6      7      8      9   
1        32     33     34     35     36     37     38     39     40     41   
2        64     65     66     67     68     69     70     71     72     73   
3        96     97     98     99    100    101    102    103    104    105   
4       128    129    130    131    132    133    134    135    136    137   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
1973  63136  63137  63138  63139  63140  63141  63142  63143  63144  63145   
1974  63168  63169  63170  63171  63172  63173  63174  63175  63176  63177   
1975  63200  63201  63202  63203  63204  63205  63206  63207  63208  63209   
1976  63232  63233  63234  63235  63236  63237  63238  63239  63240  63241   
1977  63264  63265  63266  63267  63268  63269  63270  63271  63272  63273   

      ...  CD11a  P2X7R   CD1d  Notch 4   CD31  Podoplanin  CD4

In [39]:
# Save to HDF5
filename = 'adt.h5'

df.to_hdf(filename, 'matrix/data', mode='w', format='table')
del df    # allow df to be garbage collected

print(pd.read_hdf(filename, 'matrix/data'))

        CD4   CD8a  CD366  CD279  CD117  Ly-6C  Ly-6G   CD19   CD45   CD25  \
0         0      1      2      3      4      5      6      7      8      9   
1        32     33     34     35     36     37     38     39     40     41   
2        64     65     66     67     68     69     70     71     72     73   
3        96     97     98     99    100    101    102    103    104    105   
4       128    129    130    131    132    133    134    135    136    137   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
1973  63136  63137  63138  63139  63140  63141  63142  63143  63144  63145   
1974  63168  63169  63170  63171  63172  63173  63174  63175  63176  63177   
1975  63200  63201  63202  63203  63204  63205  63206  63207  63208  63209   
1976  63232  63233  63234  63235  63236  63237  63238  63239  63240  63241   
1977  63264  63265  63266  63267  63268  63269  63270  63271  63272  63273   

      ...  CD11a  P2X7R   CD1d  Notch 4   CD31  Podoplanin  CD4

In [29]:
cuda = True if torch.cuda.is_available() else False
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

In [60]:
import scanpy as sc

In [63]:
adata=sc.read_10x_h5(r'C:\Users\KARAN\Desktop\Matilda\data\landau\protein\GSE198353_mmtv_pymt_GEX_filtered_feature_bc_matrix.h5')

  utils.warn_names_duplicates("var")


In [72]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome
Xkr4,ENSMUSG00000051951,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
Gm1992,ENSMUSG00000089699,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
Gm19938,ENSMUSG00000102331,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
Gm37381,ENSMUSG00000102343,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
Rp1,ENSMUSG00000025900,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
...,...,...,...
AC133095.2,ENSMUSG00000095475,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
AC133095.1,ENSMUSG00000094855,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
AC234645.1,ENSMUSG00000095019,Gene Expression,MMTV_PyMT_gex-mm10-2020-A
AC149090.1,ENSMUSG00000095041,Gene Expression,MMTV_PyMT_gex-mm10-2020-A


In [66]:
1978 * 32286

63861708

In [81]:
data = h5py.File(r'C:\Users\KARAN\Desktop\Matilda\data\landau\protein\GSE198353_mmtv_pymt_GEX_filtered_feature_bc_matrix.h5',"r")
h5_data = data['matrix/data']
h5_data

<HDF5 dataset "data": shape (7378559,), type "<i4">

In [78]:
sparse_data = scipy.sparse.csr_matrix(np.array(h5_data).transpose())


In [79]:
sparse_data

<1x7378559 sparse matrix of type '<class 'numpy.intc'>'
	with 7378559 stored elements in Compressed Sparse Row format>

In [80]:
data_fs = torch.from_numpy(np.array(sparse_data.todense()))
data_fs = Variable(data_fs.type(FloatTensor))

In [68]:
a.shape

torch.Size([1, 7378559])

In [73]:
df = pd.read_csv(r'C:\Users\KARAN\Desktop\Matilda\data\landau\protein\GSE198353_mmtv_pymt_ADT_t.csv')

In [74]:
import numpy as np

In [75]:
y=np.array(df)

In [76]:
y

array([['AAACAAGTATCTCCCA-1', 478, 583, ..., 1067, 1193, 587],
       ['AAACACCAATAACTGC-1', 1504, 1217, ..., 2040, 3056, 1000],
       ['AAACAGGGTCTATATT-1', 1526, 1231, ..., 2193, 3863, 985],
       ...,
       ['TTGTTTCATTAGTCTA-1', 661, 528, ..., 948, 801, 404],
       ['TTGTTTCCATACAACT-1', 1031, 857, ..., 1582, 3864, 652],
       ['TTGTTTGTATTACACG-1', 861, 720, ..., 1124, 1002, 492]],
      dtype=object)

In [56]:
df

Unnamed: 0,FIELD1,CD4,CD8a,CD366,CD279,CD117,Ly-6C,Ly-6G,CD19,CD45,...,CD11a,P2X7R,CD1d,Notch 4,CD31,Podoplanin,CD45R/B220,CD27,CD11b,CD202b
0,AAACAAGTATCTCCCA-1,478,583,877,446,57,0,481,434,3157,...,638,1222,1253,1273,1354,4858,656,1067,1193,587
1,AAACACCAATAACTGC-1,1504,1217,1731,943,64,2,1027,933,6580,...,1808,2240,1932,2253,3095,10214,1266,2040,3056,1000
2,AAACAGGGTCTATATT-1,1526,1231,1433,849,23,1,1322,1515,5964,...,1778,2120,1971,2216,2927,2700,1437,2193,3863,985
3,AAACAGTGTTCCTGGG-1,847,787,1028,517,67,0,610,567,3476,...,939,1266,1242,1268,1742,4985,881,1230,1046,634
4,AAACATGGTGAGAGGA-1,2317,1770,2347,1475,58,1,1802,2371,7370,...,2573,3359,2859,3335,4107,6983,2777,3766,4316,1723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973,TTGTTGGCAATGACTG-1,1669,1378,1766,932,60,0,1070,957,6174,...,1595,2025,1963,2319,2821,6929,1472,2130,1904,1053
1974,TTGTTTCACATCCAGG-1,903,693,919,523,61,2,613,623,3531,...,917,1395,1224,1319,1623,9180,870,1226,1119,569
1975,TTGTTTCATTAGTCTA-1,661,528,729,477,23,1,512,404,2571,...,714,984,952,1057,1242,4286,699,948,801,404
1976,TTGTTTCCATACAACT-1,1031,857,1250,666,40,0,951,938,5599,...,1425,1454,1123,1547,2710,2784,1106,1582,3864,652


In [52]:
import pandas as pd
import scipy.sparse
import numpy as np
import torch

def read_csv_data(data_path):
    # Read CSV file using pandas
    df = pd.read_csv(data_path)

    # Extract the data column from the CSV dataframe
    csv_data = df['data']

    # Convert the data column to a sparse CSR matrix
    sparse_data = scipy.sparse.csr_matrix(csv_data.values.reshape(-1, 1))

    # Convert the sparse matrix to a dense numpy array
    dense_data = sparse_data.toarray()

    # Convert the dense array to a PyTorch tensor
    data_fs = torch.from_numpy(dense_data)

    return data_fs


In [53]:
b = read_csv_data(r'C:\Users\KARAN\Desktop\Matilda\data\landau\protein\GSE198353_mmtv_pymt_ADT_t.csv')

KeyError: 'data'

In [33]:
x.shape


torch.Size([1, 7378559])

In [28]:
!python main_matilda_train.py --rna ../data/landau/protein/GSE198353_mmtv_pymt_GEX_filtered_feature_bc_matrix.h5 --adt ../data/landau/protein/GSE198353_mmtv_pymt_ADT_t.csv  


  import parser
Traceback (most recent call last):
  File "c:\Users\KARAN\Desktop\Matilda\main\main_matilda_train.py", line 54, in <module>
    train_adt_data = read_h5_data(train_adt_data_path)
  File "c:\Users\KARAN\Desktop\Matilda\main\util.py", line 130, in read_h5_data
    data = h5py.File(data_path,"r")
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\h5py\_hl\files.py", line 567, in __init__
    fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\h5py\_hl\files.py", line 231, in make_fid
    fid = h5f.open(name, flags, fapl=fapl)
  File "h5py\_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py\_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py\h5f.pyx", line 106, in h5py.h5f.open
OSError: Unable to open file (file signature not found)


In [91]:
!python main_matilda_train.py --rna ../data/TEAseq/train_rna.h5 --adt ../data/TEAseq/train_adt.h5  --cty ../data/TEAseq/train_cty.csv 

  import parser
Traceback (most recent call last):
  File "c:\Users\KARAN\Desktop\Matilda\main\main_matilda_train.py", line 55, in <module>
    train_label = read_fs_label(train_label_path)
  File "c:\Users\KARAN\Desktop\Matilda\main\util.py", line 141, in read_fs_label
    label_fs = pd.read_csv(label_path,header=None,index_col=False)  #
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\util\_decorators.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\util\_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\io\parsers\readers.py", line 950, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\io\parsers\readers.py", line 605, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "c:\Users\KARAN\

In [84]:
!python main_matilda_task.py --rna ../data/TEAseq/test_rna.h5 --adt ../data/TEAseq/test_adt.h5 --cty ../data/TEAseq/test_cty.csv --classification True --query True

The dataset is CITEseq


  import parser
  a = torch.max(nn.Softmax()(x_cty),1)


In [86]:
!python main_matilda_task.py --rna ../data/TEAseq/train_rna.h5 --adt ../data/TEAseq/train_adt.h5 --cty ../data/TEAseq/train_cty.csv --simulation True --simulation_ct 1 --simulation_num 200


The dataset is CITEseq
simulate celltype index: 1 	 cell type name: B.Naive
finish simulation


  import parser


In [87]:
!python main_matilda_task.py --rna ../data/TEAseq/train_rna.h5 --adt ../data/TEAseq/train_adt.h5 --cty ../data/TEAseq/train_cty.csv --dim_reduce True 


The dataset is CITEseq
finish dimension reduction


  import parser


In [90]:
!python main_matilda_task.py --rna ../data/TEAseq/train_rna.h5 --adt ../data/TEAseq/train_adt.h5   --fs True 


  import parser
Traceback (most recent call last):
  File "c:\Users\KARAN\Desktop\Matilda\main\main_matilda_task.py", line 68, in <module>
    label = read_fs_label(label_path)
  File "c:\Users\KARAN\Desktop\Matilda\main\util.py", line 141, in read_fs_label
    label_fs = pd.read_csv(label_path,header=None,index_col=False)  #
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\util\_decorators.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\util\_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\io\parsers\readers.py", line 950, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "c:\Users\KARAN\anaconda3\envs\STAGATE_PT\lib\site-packages\pandas\io\parsers\readers.py", line 605, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "c:\Users\KARAN\anaconda3\env