In [1]:
# load packages:
# package requirements: tensorflow-gpu 1.15.2, keras 2.3.1, munkres
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as pathces
import sys

from mmDUFS import JointModel
from mmDUFS import DiffModel
from mmDUFS import DataSet

from baselines import fs_eval_diff

from sklearn.metrics import f1_score
from scipy import stats

Using TensorFlow backend.


In [2]:
# Shell out to nvidia-smi to list the GPUs on this machine together with
# their current memory usage and utilization.
!nvidia-smi

Thu Jun  1 19:43:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 36%   53C    P8    20W / 250W |    817MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 29%   29C    P8    12W / 250W |   1844MiB / 11264MiB |      0%      Default |
|       

In [3]:
import os
# Restrict CUDA-based libraries in this process to GPU index 0.
# NOTE(review): TensorFlow/Keras is imported in the first cell; this setting
# presumably only takes effect if no GPU context has been created yet — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [4]:
# Load the CBMC CITE-seq differential-feature dataset from its .npz archive.
# CAUTION: allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code — only load archives from a trusted source.
cbmc_diff_data = np.load("datasets/cbmc_citeseq_diff.npz", allow_pickle=True)

# Two data matrices, one per modality (presumably samples x features — TODO confirm).
X_z, Y_z = cbmc_diff_data["X"], cbmc_diff_data["Y"]

# Per-sample class codes: 'CD34+': 0, 'Eryth': 1, 'Mouse': 2
y = cbmc_diff_data["y"]

# Ground-truth labels for the features of X (passed as label_true_X / feature_label below).
feature_label = cbmc_diff_data["feature_label"]

In [104]:
# Run the baseline methods ("concat", "sum", "prod") to obtain the
# differential features each of them selects.
baseline_options = dict(
    label_true_X=feature_label,
    label_true_Y=np.ones(Y_z.shape[1]),  # random token (placeholder labels for Y)
    baselines=["concat", "sum", "prod"],
    n_total_x=150,
    n_total_y=10,
    nx=60,
    ny=50,
    knn=2,
    fac=5,
    laplacian="normalized",
)
# Only the second return value is kept; it is indexed later as diff_list[baseline]['X'].
_, diff_list = fs_eval_diff(X_z, Y_z, **baseline_options)

In [6]:
# run mmDUFS

In [7]:
# Keyword arguments forwarded to DiffModel(...).
diff_mmdufs_params = dict(
    lam1=3,                   # lambda x
    batch_size=X_z.shape[0],  # full batch: one batch holds every sample
    const=2,                  # NOTE(review): meaning not visible here — see DiffModel
    const2=1,                 # NOTE(review): meaning not visible here — see DiffModel
    laplacian="unnormalized",
)

# Settings forwarded to DiffModel.train(...).
diff_mmdfus_learning_rate = 2
display_step = 1000  # the training log below reports every 1000 epochs
epochs = 5000

In [8]:
# Record the number of columns (features) of each data matrix in the model config.
diff_mmdufs_params.update(
    input_dim1=X_z.shape[1],
    input_dim2=Y_z.shape[1],
)

In [17]:
# Bundle the two data matrices into the dataset object consumed by the model.
dataset = DataSet(_data1=X_z, _data2=Y_z)

# Build the differential model from the hyper-parameters configured above.
diff_model1 = DiffModel(**diff_mmdufs_params)

# Train the model. feature_label is presumably what the f1 value in the
# training log is computed against — TODO confirm in DiffModel.train.
diff_result1 = diff_model1.train(
    dataset,
    learning_rate=diff_mmdfus_learning_rate,
    feature_label=feature_label,
    display_step=display_step,
    num_epoch=epochs,
)

num_samples : 832
Epoch: 1000 loss= -0.930166185 score1= -1.466650128 reg1= 0.178827986 f1= 0.9362
Epoch: 2000 loss= -0.944792390 score1= -1.475106359 reg1= 0.176771313 f1= 0.9362
Epoch: 3000 loss= -0.946312129 score1= -1.475676417 reg1= 0.176454768 f1= 0.9362
Epoch: 4000 loss= -0.946783721 score1= -1.475733757 reg1= 0.176316679 f1= 0.9362
Epoch: 5000 loss= -0.947310388 score1= -1.476033926 reg1= 0.176241174 f1= 0.9362
Optimization Finished!


In [105]:
# A feature counts as selected by mmDUFS only when its value returned by
# get_prob_alpha1() is exactly 1 (presumably a fully open gate — TODO confirm).
alpha1_probs = diff_model1.get_prob_alpha1()
mmDUFS_selected_feats = (alpha1_probs == 1)

In [300]:
# Number of features selected by mmDUFS (count of True entries in the mask).
mmDUFS_selected_feats.sum()

88

In [301]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score

In [303]:
# Keep only the samples whose class code is 1 or 2.
c2keep = (y == 1) | (y == 2)
X_subset = X_z[c2keep, :]
y_subset = y[c2keep]

# Re-encode the two remaining classes as binary targets: 2 -> 0, 1 -> 1.
ylabel_new_mapper = {2: 0, 1: 1}
ylabels = np.array([ylabel_new_mapper[code] for code in y_subset])


In [336]:
# One (initially empty) list of balanced-accuracy scores per feature-selection method.
acc_dict = {method_name: [] for method_name in ("concat", "sum", "prod", "mmDUFS")}

In [337]:
# Compare the feature sets chosen by each method by how well a linear SVM,
# trained on those features only, separates the two retained classes.
#
# For each of 10 seeds: split the data (5% train / 95% test), standardise using
# statistics from the training split only, then fit one LinearSVC per method on
# that method's selected columns and record the balanced accuracy on the test
# split in acc_dict[method].
#
# NOTE: scores are appended to acc_dict, so re-running this cell without
# re-creating acc_dict first accumulates duplicate entries.

# Boolean column masks do not depend on the split, so build them once.
# Order matters: classifiers are fitted as concat, sum, prod, mmDUFS for every seed.
feature_masks = [(base, diff_list[base]['X'].astype('bool'))
                 for base in ["concat", "sum", "prod"]]
feature_masks.append(("mmDUFS", mmDUFS_selected_feats))

# max_iter must be an integer: scikit-learn documents it as int, and releases
# that validate constructor parameters reject a float such as 1e6.
svc_max_iter = 1000000

for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X_subset, ylabels, train_size=0.05, random_state=seed)

    # Fit the scaler on the training split only to avoid leaking test statistics.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    for method, feats2keep in feature_masks:
        X_train_sub, X_test_sub = X_train[:, feats2keep], X_test[:, feats2keep]
        clf = LinearSVC(max_iter=svc_max_iter)
        clf.fit(X_train_sub, y_train)
        y_pred = clf.predict(X_test_sub)
        acc = balanced_accuracy_score(y_test, y_pred)
        acc_dict[method].append(acc)
    

In [338]:
# Report the mean balanced accuracy over all recorded runs, one line per method.
report_lines = [
    "{}: mean acc: {:.4f}".format(name, np.mean(acc_dict[name]))
    for name in ("concat", "sum", "prod", "mmDUFS")
]
print("\n".join(report_lines))

concat: mean acc: 0.9697
sum: mean acc: 0.9380
prod: mean acc: 0.9380
mmDUFS: mean acc: 0.9752
