In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os

In [2]:
#data_folder = "/home/groups/ConradLab/daniel/sharp_sims/splat_0.5_de_rq/"
data_folder = "simulations/splat_0.7_de_rq/"

In [3]:
os.path.exists(data_folder + "preds.csv")

True

In [3]:
# get labels
data_path = data_folder + "query_counts.csv"
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
if os.path.exists(data_folder + "preds.csv"):
    all_labels = pd.read_csv(data_folder + "preds.csv", index_col=0)
    if all_labels.shape[1] != len(tools): 
        all_labels = all_labels[tools]
        #raise Exception("wrong amount of tools in file")
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

In [43]:
all_labels.shape[1]

3

In [4]:
all_labels

Unnamed: 0,scina,scsorter,sctype,singler,scpred
Cell1001,Group2,Group2,Group1,Group2,Group2
Cell1002,Group4,Group2,Group1,Group4,Group2
Cell1003,Group1,Group4,Group1,Group2,
Cell1004,Group4,Group4,Group4,Group4,Group1
Cell1005,Group1,Group1,Group1,Group1,Group1
...,...,...,...,...,...
Cell1996,Group2,Group2,Group4,Group2,Group1
Cell1997,Group1,Group1,Group1,Group1,Group1
Cell1998,Group3,Group3,Group4,Group3,Group3
Cell1999,Group4,Group4,Group1,Group4,


In [5]:
# read in dataset
X = pd.read_csv(data_path, index_col=0)
X, keep_cells = utilities.preprocess(np.array(X), scale=False)
X.shape

  view_to_actual(adata)
  view_to_actual(adata)


(999, 500)

In [6]:
all_labels = all_labels.loc[keep_cells,:]

In [7]:
_,marker_names = utilities.read_marker_file(marker_path)
marker_names

['Group1', 'Group2', 'Group3', 'Group4']

In [58]:
all_labels['scsorter']

Cell1001    Group3
Cell1002    Group2
Cell1003    Group4
Cell1004    Group3
Cell1005    Group4
             ...  
Cell1996    Group4
Cell1997    Group3
Cell1998    Group4
Cell1999    Group2
Cell2000    Group1
Name: scsorter, Length: 999, dtype: object

In [8]:
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
encoded_labels

array([[1., 4., 0., 0.],
       [1., 2., 0., 2.],
       [2., 1., 0., 1.],
       ...,
       [0., 0., 4., 1.],
       [1., 0., 0., 3.],
       [1., 3., 0., 1.]])

In [9]:
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y = real_y[keep_cells]
real_y.shape

(999,)

In [61]:
len(real_y[real_y==0])

245

In [10]:
print(utilities.pred_accuracy(all_labels_factored['scina'], real_y))
print(utilities.pred_accuracy(all_labels_factored['sctype'], real_y))
print(utilities.pred_accuracy(all_labels_factored['scsorter'], real_y))
print(utilities.pred_accuracy(all_labels_factored['singler'], real_y))
print(utilities.pred_accuracy(all_labels_factored['scpred'], real_y))


0.792792797088623
0.4114114046096802
0.826826810836792
0.8408408164978027
0.6026026010513306


In [11]:
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
utilities.pred_accuracy(max_pred, real_y)

  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


0.935935914516449

In [12]:
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)
confident_labels

array([ 1., -1., -1.,  3.,  0.,  3.,  3., -1.,  1.,  0.,  2.,  3.,  3.,
        0.,  3.,  3.,  3.,  0.,  2.,  2.,  2.,  1.,  2.,  0.,  0.,  1.,
        0.,  0.,  3.,  3.,  3.,  2.,  3., -1.,  0.,  0.,  0.,  3.,  0.,
        1.,  0., -1.,  0.,  3.,  3.,  2.,  3.,  1.,  0.,  0.,  1.,  3.,
        1.,  0.,  0., -1.,  1.,  3.,  0., -1.,  0.,  2., -1.,  0.,  2.,
        2.,  1.,  3., -1.,  1.,  2.,  1.,  3.,  2., -1.,  3.,  3.,  2.,
        1.,  0., -1.,  3.,  3.,  1., -1.,  3.,  1.,  3.,  2.,  2., -1.,
        0.,  1.,  1.,  3.,  3.,  1.,  0.,  1., -1.,  3.,  3.,  1.,  3.,
        1.,  3., -1.,  1.,  3.,  3.,  2., -1.,  3.,  3.,  0.,  3.,  1.,
        2.,  2., -1.,  2.,  2.,  3.,  1.,  1.,  2.,  0.,  0., -1.,  1.,
        2.,  0.,  1.,  0.,  3., -1.,  0.,  2., -1.,  1., -1.,  0.,  2.,
        1., -1.,  3.,  3.,  3.,  3.,  0.,  3.,  3.,  3.,  3.,  2.,  3.,
        3.,  1.,  3.,  0.,  1.,  3., -1.,  1.,  1.,  3.,  3.,  2.,  2.,
        0.,  1.,  1.,  1., -1.,  0.,  1.,  2., -1.,  2.,  0.,  1

In [13]:
train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]
print(np.unique(confident_labels))
print(np.unique(confident_labels[train_nodes]))
print(np.unique(confident_labels[test_nodes]))

[-1.  0.  1.  2.  3.]
[0. 1. 2. 3.]
[-1.]


In [14]:
real_y[test_nodes]

array([3, 0, 3, 0, 0, 2, 2, 2, 3, 0, 1, 3, 0, 2, 0, 2, 1, 3, 3, 3, 0, 0,
       3, 2, 1, 2, 1, 1, 0, 0, 3, 2, 0, 0, 1, 2, 2, 0, 1, 2, 0, 0, 0, 0,
       0, 3, 3, 2, 2, 3, 0, 2, 1, 3, 2, 2, 1, 3, 3, 0, 2, 2, 2, 0, 2, 1,
       0, 3, 0, 1, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 0, 0, 2, 3,
       0, 2, 3, 0, 1, 2, 0, 0, 2, 1, 0, 0, 2, 3, 3, 3, 2, 0, 0, 0, 2, 2,
       0, 0, 2, 2, 2, 0, 3, 0, 0, 3, 1, 0, 2, 2, 2, 0, 0, 2])

In [15]:
print(utilities.pred_accuracy(confident_labels[train_nodes], real_y[train_nodes]))

0.9781860113143921


In [16]:
len(test_nodes)

128

In [17]:
# tool accuracy on test
print(utilities.pred_accuracy(np.array(all_labels_factored['scina'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['sctype'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['scsorter'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['singler'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['scpred'][test_nodes]), real_y[test_nodes]))
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
print(utilities.pred_accuracy(max_pred[test_nodes], real_y[test_nodes]))

0.2265625
0.375
0.484375
0.5390625
0.1796875
0.6484375


In [18]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=35, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=35, shuffle=False)

In [19]:
m = GCNModel("configs/2_8.txt", 2, dropout=0.0)

In [20]:
m.train(dataloader, 100)

Loss in epoch 0 = 37.374233
Loss in epoch 10 = 0.193275
Loss in epoch 20 = 0.051210
Loss in epoch 30 = 0.018899
Loss in epoch 40 = 0.010107
Loss in epoch 50 = 0.005313
Loss in epoch 60 = 0.004084
Loss in epoch 70 = 0.002934
Loss in epoch 80 = 0.002424
Loss in epoch 90 = 0.001516


In [21]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9629629850387573,
 array([[219,   7,   2,   4],
        [  0, 265,   2,   1],
        [  1,   3, 221,   9],
        [  1,   4,   3, 257]]),
 0.9781860113143921,
 array([[177,   4,   1,   1],
        [  0, 249,   2,   1],
        [  0,   2, 187,   5],
        [  1,   1,   1, 239]]),
 0.859375,
 array([[42,  3,  1,  3],
        [ 0, 16,  0,  0],
        [ 1,  1, 34,  4],
        [ 0,  3,  2, 18]]))

In [32]:
#data_folders = ["/home/groups/ConradLab/daniel/sharp_sims/splat_0.5_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.6_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.7_de_rq/"]
data_folders = ["simulations/splat_0.6_de_rq/", "simulations/splat_0.7_de_rq/", "simulations/splat_0.8_de_rq/"]
tools = ["sctype","scsorter","scina","singler", "scpred"]
votes_necessary = 3
model_file = "configs/2_8.txt"
neighbors = 2
batch_size=35
training_epochs=200
random_inits = 3

In [33]:
results = test_model(data_folders, tools, votes_necessary, model_file, neighbors, batch_size, training_epochs, random_inits)

[0.8100000023841858, 0.7979999780654907, 0.8220000267028809]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9169999957084656, 0.9120000004768372, 0.9129999876022339]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9729999899864197, 0.9750000238418579, 0.9729999899864197]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


In [34]:
results

Unnamed: 0,data_name,method,total_accuracy,train_accuracy,test_accuracy,total_sd,train_sd,test_sd
0,splat_0.5_de_rq,GCN,0.81,0.976492,0.604027,0.012,0.0,0.026846
1,splat_0.5_de_rq,Max Col.,0.884,0.976492,0.769575,0.0,0.0,0.0
2,splat_0.5_de_rq,Confident Labels,,0.976492,,0.0,0.0,0.0
3,splat_0.5_de_rq,sctype,0.074,0.133816,0.0,0.0,0.0,0.0
4,splat_0.5_de_rq,scsorter,0.807,0.976492,0.597315,0.0,0.0,0.0
5,splat_0.5_de_rq,scina,0.656,0.929476,0.317673,0.0,0.0,0.0
6,splat_0.5_de_rq,singler,0.873,0.985533,0.733781,0.0,0.0,0.0
7,splat_0.5_de_rq,scpred,0.046,0.083183,0.0,0.0,0.0,0.0
0,splat_0.6_de_rq,GCN,0.914,0.967822,0.6875,0.002646,0.0,0.01378
1,splat_0.6_de_rq,Max Col.,0.922,0.967822,0.729167,0.0,0.0,0.0
