In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
data_folder = "/home/groups/ConradLab/daniel/sharp_sims/splat_0.5_de_rq/"

In [6]:
os.path.exists(data_folder + "preds.csv")

True

In [42]:
# get labels
data_path = data_folder + "query_counts.csv"
#tools = ["sctype","scsorter","scina","singler", "scpred"]
tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
if os.path.exists(data_folder + "preds.csv"):
    all_labels = pd.read_csv(data_folder + "preds.csv", index_col=0)
    if all_labels.shape[1] != len(tools): 
        all_labels = all_labels[tools]
        #raise Exception("wrong amount of tools in file")
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

In [43]:
all_labels.shape[1]

3

In [44]:
all_labels

Unnamed: 0,scsorter,scina,singler
Cell1001,Group4,Group2,Group1
Cell1002,Group4,Group2,Group2
Cell1003,Group4,Group4,Group4
Cell1004,Group3,Group3,Group3
Cell1005,Group4,Group4,Group4
...,...,...,...
Cell1996,Group4,Group4,Group4
Cell1997,Group3,Group3,Group3
Cell1998,Group1,Group2,Group1
Cell1999,Group2,Group2,Group2


In [10]:
_,marker_names = utilities.read_marker_file(marker_path)
marker_names

['Group1', 'Group2', 'Group3', 'Group4']

In [14]:
all_labels['scina'].unique()

array(['Group2', 'Group4', 'Group3', nan, 'Group1'], dtype=object)

In [44]:
all_labels['sctype']

Cell1001    Group1
Cell1002    Group2
Cell1003    Group2
Cell1004    Group3
Cell1005    Group2
             ...  
Cell1996    Group2
Cell1997    Group1
Cell1998    Group3
Cell1999    Group1
Cell2000    Group2
Name: sctype, Length: 1000, dtype: object

In [45]:
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
encoded_labels

array([[1., 1., 0., 1.],
       [0., 2., 0., 1.],
       [0., 0., 0., 3.],
       ...,
       [2., 1., 0., 0.],
       [0., 3., 0., 0.],
       [2., 1., 0., 0.]])

In [12]:
# read in dataset
X = pd.read_csv(data_path, index_col=0)
X = utilities.preprocess(np.array(X), scale=False)
X.shape

(1000, 500)

In [13]:
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y.shape

(1000,)

In [47]:
print(utilities.pred_accuracy(all_labels_factored['scina'], real_y))
#print(utilities.pred_accuracy(all_labels_factored['sctype'], real_y))
print(utilities.pred_accuracy(all_labels_factored['scsorter'], real_y))
print(utilities.pred_accuracy(all_labels_factored['singler'], real_y))
#print(utilities.pred_accuracy(all_labels_factored['scpred'], real_y))


0.656000018119812
0.8069999814033508
0.8730000257492065


In [48]:
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
utilities.pred_accuracy(max_pred, real_y)

  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


0.8870000243186951

In [50]:
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 2)
confident_labels

array([-1.,  1.,  3.,  2.,  3.,  0.,  2.,  1.,  1.,  0.,  1.,  3.,  1.,
        1.,  1.,  1.,  0.,  0.,  1.,  3.,  0.,  2.,  1.,  0.,  1.,  3.,
        2., -1.,  0.,  0.,  3., -1.,  1., -1.,  3.,  1.,  1.,  3.,  1.,
        3.,  0.,  1.,  3.,  3.,  3.,  1., -1.,  2.,  3.,  1.,  3.,  1.,
        2.,  2.,  3.,  3.,  3.,  2.,  1., -1.,  3.,  0.,  0., -1.,  0.,
        2.,  1.,  2.,  0.,  0.,  1.,  1.,  3.,  2.,  0.,  2.,  2.,  2.,
        0.,  1.,  3.,  1.,  3.,  3.,  2., -1.,  3.,  2.,  3.,  3.,  1.,
        0.,  0.,  1.,  1.,  3.,  0.,  0.,  1.,  2.,  1.,  2., -1.,  2.,
        1.,  0.,  3.,  3.,  2., -1.,  2.,  1.,  2.,  3.,  1.,  0.,  1.,
        0.,  2.,  3.,  1.,  2.,  0.,  0.,  3.,  1.,  1.,  0.,  2.,  3.,
       -1.,  1.,  3.,  2.,  0.,  3.,  1., -1.,  0.,  1.,  1.,  3.,  1.,
        3.,  1.,  1.,  3.,  0.,  2.,  1.,  1.,  0.,  2.,  0.,  3.,  3.,
       -1.,  2.,  1.,  2.,  3.,  2.,  1.,  0.,  3.,  3.,  0.,  3.,  1.,
        3.,  2.,  1.,  2., -1.,  0.,  3.,  1.,  0.,  2.,  0., -1

In [51]:
train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]
print(np.unique(confident_labels))
print(np.unique(confident_labels[train_nodes]))
print(np.unique(confident_labels[test_nodes]))

[-1.  0.  1.  2.  3.]
[0. 1. 2. 3.]
[-1.]


In [52]:
print(utilities.pred_accuracy(confident_labels[train_nodes], real_y[train_nodes]))

0.9302071928977966


In [53]:
len(test_nodes)

83

In [55]:
# tool accuracy on test
print(utilities.pred_accuracy(np.array(all_labels_factored['scina'][test_nodes]), real_y[test_nodes]))
#print(utilities.pred_accuracy(np.array(all_labels_factored['sctype'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['scsorter'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['singler'][test_nodes]), real_y[test_nodes]))
#print(utilities.pred_accuracy(np.array(all_labels_factored['scpred'][test_nodes]), real_y[test_nodes]))
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
print(utilities.pred_accuracy(max_pred[test_nodes], real_y[test_nodes]))

0.10843373835086823
0.15662650763988495
0.6024096608161926
0.40963855385780334


In [36]:
real_y[test_nodes]

array([2, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 1, 1, 3, 1, 1, 0, 2, 2, 1,
       2, 3, 2, 2, 0, 1, 1, 0, 1, 2, 1, 1, 1, 2, 0, 2, 0, 1, 0, 0, 1, 1,
       1, 0, 3, 1, 2, 1, 3, 0, 1, 2, 0, 2, 1, 1, 2, 2, 3, 3, 1, 1, 0, 1,
       1, 1, 1, 1, 2, 1, 1, 0, 1, 2, 1, 3, 0, 3, 2, 1, 0, 0, 1, 3, 2, 1,
       3, 1, 0, 0, 0, 1, 2, 1, 3, 0, 0, 1, 3, 3, 2, 1, 1, 0, 1, 2, 1, 0,
       2, 2, 3, 2, 1, 2, 0, 0, 3, 3, 2, 1, 1, 2, 2, 3, 1, 3, 1, 2, 2, 1,
       1, 1, 0, 2, 0, 1, 3, 2, 2, 3, 2, 0, 1, 0, 2, 1, 2, 2, 1, 3, 2, 2,
       2, 0, 1, 2, 2, 1, 0, 0, 0, 2, 3, 0, 2, 0, 1, 0, 2, 1, 2, 2, 0, 0,
       2, 0, 1, 2, 3, 1, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 1, 2, 3, 2, 1, 1,
       3, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 1, 2, 3, 3, 1, 1, 2, 0, 0, 2, 3,
       1, 0, 1, 1, 1, 1, 1, 3, 2, 2, 0, 0, 2, 1, 3, 1, 0, 0, 0, 2, 0, 0,
       2, 3, 1, 0, 2, 0, 1, 0, 1, 0, 0, 1, 3, 3, 2, 0, 1, 3, 0, 3, 2, 2,
       3, 2, 2, 0, 3, 0, 2, 1, 1, 3, 2, 0, 1, 2, 2, 1, 3, 3, 2, 1, 3, 0,
       1, 1, 3, 0, 2, 0, 0, 1, 1, 2, 1, 2, 0, 1, 2,

In [35]:
all_labels_factored['sctype'][test_nodes]

0     -1
1     -1
5     -1
7     -1
8      3
      ..
988   -1
989   -1
990   -1
997    3
999   -1
Name: sctype, Length: 447, dtype: int64

In [56]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=35, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=35, shuffle=False)

In [57]:
m = GCNModel("configs/2_8.txt", 2, dropout=0.0)

In [58]:
m.train(dataloader, 100)

Loss in epoch 0 = 38.343639
Loss in epoch 10 = 0.269517
Loss in epoch 20 = 0.057539
Loss in epoch 30 = 0.021863
Loss in epoch 40 = 0.013117
Loss in epoch 50 = 0.010559
Loss in epoch 60 = 0.005622
Loss in epoch 70 = 0.003653
Loss in epoch 80 = 0.002842
Loss in epoch 90 = 0.002318


In [59]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.8989999890327454,
 array([[227,   6,   2,  10],
        [  8, 222,  19,   7],
        [  8,  29, 201,   8],
        [  3,   0,   1, 249]]),
 0.9302071928977966,
 array([[215,   5,   1,   7],
        [  6, 204,  12,   5],
        [  5,  17, 185,   3],
        [  2,   0,   1, 249]]),
 0.5542168617248535,
 array([[12,  1,  1,  3],
        [ 2, 18,  7,  2],
        [ 3, 12, 16,  5],
        [ 1,  0,  0,  0]]))

In [30]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9750000238418579,
 array([[244,   0,   0,   1],
        [  1, 254,   1,   0],
        [  4,   2, 239,   1],
        [  3,   5,   7, 238]]),
 0.9841938614845276,
 array([[241,   0,   0,   0],
        [  0, 252,   1,   0],
        [  2,   0, 224,   1],
        [  2,   4,   5, 217]]),
 0.8039215803146362,
 array([[ 3,  0,  0,  1],
        [ 1,  2,  0,  0],
        [ 2,  2, 15,  0],
        [ 1,  1,  2, 21]]))

In [32]:
data_folders = ["/home/groups/ConradLab/daniel/sharp_sims/splat_0.5_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.6_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.7_de_rq/"]
tools = ["sctype","scsorter","scina","singler", "scpred"]
votes_necessary = 3
model_file = "configs/2_8.txt"
neighbors = 2
batch_size=35
training_epochs=200
random_inits = 3

In [33]:
results = test_model(data_folders, tools, votes_necessary, model_file, neighbors, batch_size, training_epochs, random_inits)

[0.8100000023841858, 0.7979999780654907, 0.8220000267028809]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9169999957084656, 0.9120000004768372, 0.9129999876022339]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9729999899864197, 0.9750000238418579, 0.9729999899864197]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


In [34]:
results

Unnamed: 0,data_name,method,total_accuracy,train_accuracy,test_accuracy,total_sd,train_sd,test_sd
0,splat_0.5_de_rq,GCN,0.81,0.976492,0.604027,0.012,0.0,0.026846
1,splat_0.5_de_rq,Max Col.,0.884,0.976492,0.769575,0.0,0.0,0.0
2,splat_0.5_de_rq,Confident Labels,,0.976492,,0.0,0.0,0.0
3,splat_0.5_de_rq,sctype,0.074,0.133816,0.0,0.0,0.0,0.0
4,splat_0.5_de_rq,scsorter,0.807,0.976492,0.597315,0.0,0.0,0.0
5,splat_0.5_de_rq,scina,0.656,0.929476,0.317673,0.0,0.0,0.0
6,splat_0.5_de_rq,singler,0.873,0.985533,0.733781,0.0,0.0,0.0
7,splat_0.5_de_rq,scpred,0.046,0.083183,0.0,0.0,0.0,0.0
0,splat_0.6_de_rq,GCN,0.914,0.967822,0.6875,0.002646,0.0,0.01378
1,splat_0.6_de_rq,Max Col.,0.922,0.967822,0.729167,0.0,0.0,0.0
