In [25]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os

In [26]:
# Root folder of the simulated dataset. Later cells build file paths with
# `data_folder + "<file>"`, so the value has to keep its trailing slash.
# Alternative (cluster) location, kept for reference:
#data_folder = "/home/groups/ConradLab/daniel/sharp_sims/splat_0.7_de_rq/"
data_folder = "simulations/splat_0.7_de_rq/"

In [27]:
# Is there already a saved predictions file for this dataset?
os.path.exists(f"{data_folder}preds.csv")

True

In [28]:
# get labels
# Input files all live under data_folder (which must end with a slash).
data_path = data_folder + "query_counts.csv"
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
preds_path = data_folder + "preds.csv"
if os.path.exists(preds_path):
    # Reuse the saved predictions instead of re-running the annotation tools.
    all_labels = pd.read_csv(preds_path, index_col=0)
    # Compare the actual column names, not just how many there are: a file
    # with the right number of columns but different tools must not be
    # accepted silently.
    if set(all_labels.columns) != set(tools):
        missing_tools = [tool for tool in tools if tool not in all_labels.columns]
        if missing_tools:
            raise KeyError(
                "preds.csv has no predictions for: " + ", ".join(missing_tools)
            )
        all_labels = all_labels[tools]
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

In [43]:
# Number of prediction columns currently held in all_labels.
len(all_labels.columns)

3

In [29]:
# Show the loaded predictions: one row per cell, one column per annotation tool.
all_labels

Unnamed: 0,scina,scsorter,sctype,singler,scpred
Cell1001,Group2,Group2,Group2,Group2,Group2
Cell1002,Group4,Group2,Group2,Group4,Group2
Cell1003,Group1,Group4,Group2,Group2,
Cell1004,Group4,Group4,Group4,Group4,Group1
Cell1005,Group1,Group1,Group1,Group1,Group1
...,...,...,...,...,...
Cell1996,Group2,Group2,Group4,Group2,Group1
Cell1997,Group1,Group1,Group1,Group1,Group1
Cell1998,Group3,Group3,Group3,Group3,Group3
Cell1999,Group4,Group4,Group2,Group4,


In [30]:
# read in dataset
# Load the query count table and hand its raw values to the project
# preprocessing step (scaling switched off). keep_cells is used further down
# to subset the labels and the ground truth to the same cells.
X, keep_cells = utilities.preprocess(
    np.array(pd.read_csv(data_path, index_col=0)),
    scale=False,
)
X.shape

(999, 500)

In [31]:
# Keep only the predictions for the cells selected by preprocessing.
# NOTE(review): all_labels is rebound here; reload the labels before running
# this cell a second time.
all_labels = all_labels.loc[keep_cells]

In [32]:
# Only the second value returned by read_marker_file is used in this notebook.
# The first one goes into a named throwaway instead of `_`: assigning to `_`
# in IPython shadows the last-output variable and stops it from updating.
_unused_markers, marker_names = utilities.read_marker_file(marker_path)
marker_names

['Group1', 'Group2', 'Group3', 'Group4']

In [10]:
# Inspect the predictions of a single tool.
all_labels.loc[:, 'scsorter']

Cell1001    Group2
Cell1002    Group2
Cell1003    Group4
Cell1004    Group4
Cell1005    Group1
             ...  
Cell1996    Group2
Cell1997    Group1
Cell1998    Group3
Cell1999    Group4
Cell2000    Group2
Name: scsorter, Length: 999, dtype: object

In [33]:
# Turn the tool predictions into a numeric form using marker_names, then
# encode them per cell. Judging by the displayed array, each row appears to
# hold the number of tool votes per class — TODO confirm against utilities.
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
encoded_labels

array([[0., 5., 0., 0.],
       [0., 3., 0., 2.],
       [1., 2., 0., 1.],
       ...,
       [0., 0., 5., 0.],
       [0., 1., 0., 3.],
       [0., 4., 0., 1.]])

In [34]:
# Ground truth: integer codes of the sorted 'Group' values from the query
# metadata, restricted to the cells selected by keep_cells.
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0][keep_cells]
real_y.shape

(999,)

In [61]:
# How many cells have true class code 0?
np.count_nonzero(real_y == 0)

245

In [35]:
# Accuracy of every individual tool over all cells (same order as before).
for tool_name in ("scina", "sctype", "scsorter", "singler", "scpred"):
    print(utilities.pred_accuracy(all_labels_factored[tool_name], real_y))


0.792792797088623
0.8138138055801392
0.826826810836792
0.8408408164978027
0.6026026010513306


In [36]:
# Majority vote: for each row of encoded_labels take the column with the
# largest value.
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
# pred_accuracy wraps its arguments in torch.tensor(); handing it a tensor
# triggers PyTorch's copy-construct UserWarning (visible in the old output),
# so pass the predictions as a NumPy array instead. The result is unchanged.
utilities.pred_accuracy(max_pred.numpy(), real_y)

  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


0.9309309124946594

In [37]:
# Derive one consensus label per cell from the encoded tool votes. Cells
# without a consensus come back as -1 (see the unique values printed in the
# next cell). necessary_vote is presumably the minimum number of agreeing
# tools — TODO confirm against utilities.get_consensus_labels.
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)
confident_labels.shape

(999,)

In [38]:
# Cells with a consensus label are used for training; cells marked -1 are the
# ones the model has to resolve.
train_nodes = np.flatnonzero(confident_labels != -1)
test_nodes = np.flatnonzero(confident_labels == -1)
# Sanity check: label values overall, in the training part and in the test part.
for label_subset in (
    confident_labels,
    confident_labels[train_nodes],
    confident_labels[test_nodes],
):
    print(np.unique(label_subset))

[-1.  0.  1.  2.  3.]
[0. 1. 2. 3.]
[-1.]


In [26]:
# True classes of the cells that have no consensus label.
np.take(real_y, test_nodes)

array([3, 0, 3, 0, 0, 0, 1, 2, 2, 2, 3, 0, 1, 1, 3, 2, 0, 1, 2, 3, 2, 1,
       0, 3, 3, 0, 3, 1, 2, 1, 1, 0, 0, 3, 2, 0, 0, 1, 2, 2, 0, 1, 1, 3,
       2, 0, 0, 0, 0, 3, 3, 3, 0, 3, 1, 0, 3, 3, 0, 1, 2, 2, 0, 2, 1, 0,
       3, 1, 0, 0, 2, 1, 3, 2, 0, 0, 0, 0, 2, 1, 0, 0, 2, 3, 0, 2, 3, 0,
       1, 2, 3, 2, 2, 2, 2, 0, 3, 0, 2, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 0,
       3, 1, 2, 2, 0, 0, 2, 0])

In [27]:
# How accurate are the consensus labels on the cells that received one?
consensus_train_accuracy = utilities.pred_accuracy(
    confident_labels[train_nodes],
    real_y[train_nodes],
)
print(consensus_train_accuracy)

0.9807037711143494


In [28]:
# Number of cells without a consensus label.
test_nodes.size

118

In [21]:
# tool accuracy on test
# test_nodes holds integer positions (from np.where), so select rows with
# .iloc. Plain `series[int_array]` on a labelled index only works through
# pandas' deprecated positional fallback.
for tool_name in ("scina", "sctype", "scsorter", "singler", "scpred"):
    tool_test_preds = np.array(all_labels_factored[tool_name].iloc[test_nodes])
    print(utilities.pred_accuracy(tool_test_preds, real_y[test_nodes]))
# Majority vote over the tools, evaluated on the same cells. The predictions
# are passed as a NumPy array because pred_accuracy wraps its inputs in
# torch.tensor(), which warns when given a tensor.
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
print(utilities.pred_accuracy(max_pred[test_nodes].numpy(), real_y[test_nodes]))

0.19491524994373322
0.3644067943096161
0.43220338225364685
0.5932203531265259
0.17796610295772552
0.6271186470985413


In [29]:
# Training data: features paired with the consensus labels, shuffled batches.
dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(confident_labels),
)
dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=35, shuffle=True)

# Evaluation data: the same features paired with the true classes, in fixed order.
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(real_y),
)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=35, shuffle=False)

In [30]:
# Build the model from a config file. The second positional argument is
# presumably the neighbour count (compare `neighbors = 2` passed to test_model
# further down) — TODO confirm against GCNModel.
m = GCNModel("configs/2_8.txt", 2, dropout=0.0)

In [31]:
# Fit the model on the consensus-label loader. The second argument is
# presumably the number of epochs (the log below is printed every 10 epochs up
# to 90) — TODO confirm against GCNModel.train.
m.train(dataloader, 100)

Loss in epoch 0 = 36.780918
Loss in epoch 10 = 0.142983
Loss in epoch 20 = 0.038523
Loss in epoch 30 = 0.018872
Loss in epoch 40 = 0.008604
Loss in epoch 50 = 0.006198
Loss in epoch 60 = 0.003754
Loss in epoch 70 = 0.002780
Loss in epoch 80 = 0.002784
Loss in epoch 90 = 0.001744


In [32]:
# Evaluate against the true classes. The returned tuple appears to be
# (accuracy, confusion matrix) for all cells, for train_nodes and for
# test_nodes, in that order — TODO confirm against GCNModel.validation_metrics.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9699699878692627,
 array([[222,   4,   1,   5],
        [  0, 267,   1,   0],
        [  2,   4, 223,   5],
        [  2,   5,   1, 257]]),
 0.9807037711143494,
 array([[183,   3,   1,   3],
        [  0, 249,   0,   0],
        [  1,   2, 195,   4],
        [  1,   1,   1, 237]]),
 0.8898305296897888,
 array([[39,  1,  0,  2],
        [ 0, 18,  1,  0],
        [ 1,  2, 28,  1],
        [ 1,  4,  0, 20]]))

In [22]:
# Settings for the test_model benchmark run in the next cell.
# Other dataset selections, kept for reference:
#data_folders = ["/home/groups/ConradLab/daniel/sharp_sims/splat_0.6_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.7_de_rq/", "/home/groups/ConradLab/daniel/sharp_sims/splat_0.8_de_rq/"]
#data_folders = ["simulations/splat_0.6_de_rq/", "simulations/splat_0.7_de_rq/", "simulations/splat_0.8_de_rq/"]
data_folders = [
    "simulations/splat_0.7_de_rq/",
]
tools = [
    "sctype",
    "scsorter",
    "scina",
    "singler",
    "scpred",
]

# Model definition.
model_file = "configs/2_15.txt"
neighbors = 2

# Labelling and training parameters.
votes_necessary = 3
batch_size, training_epochs, random_inits = 20, 150, 5

In [23]:
# Run the benchmark with the settings defined in the previous cell.
results = test_model(
    data_folders,
    tools,
    votes_necessary,
    model_file,
    neighbors,
    batch_size,
    training_epochs,
    random_inits,
)

[0.9619619846343994, 0.9639639854431152, 0.966966986656189, 0.9639639854431152, 0.965965986251831]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


In [24]:
# results from no feed forward or gcn labels
# (the summary table returned by test_model: one row per method and dataset)
results

Unnamed: 0,data_name,method,total_accuracy,train_accuracy,test_accuracy,total_sd,train_sd,test_sd
0,splat_0.7_de_rq,GCN,0.964565,0.972284,0.892783,0.001951,0.0,0.020096
1,splat_0.7_de_rq,Max Col.,0.930931,0.972284,0.546392,0.0,0.0,0.0
2,splat_0.7_de_rq,Confident Labels,,0.972284,,0.0,0.0,0.0
3,splat_0.7_de_rq,sctype,0.813814,0.850333,0.474227,0.0,0.0,0.0
4,splat_0.7_de_rq,scsorter,0.826827,0.871397,0.412371,0.0,0.0,0.0
5,splat_0.7_de_rq,scina,0.792793,0.851441,0.247423,0.0,0.0,0.0
6,splat_0.7_de_rq,singler,0.840841,0.886918,0.412371,0.0,0.0,0.0
7,splat_0.7_de_rq,scpred,0.602603,0.644124,0.216495,0.0,0.0,0.0


In [63]:
# Display the test_model summary table again.
results

Unnamed: 0,data_name,method,total_accuracy,train_accuracy,test_accuracy,total_sd,train_sd,test_sd
0,splat_0.6_de_rq,GCN,0.8392,0.944708,0.657221,0.021076,0.0,0.057428
1,splat_0.6_de_rq,Max Col.,0.798,0.944708,0.544959,0.0,0.0,0.0
2,splat_0.6_de_rq,Confident Labels,,0.944708,,0.0,0.0,0.0
3,splat_0.6_de_rq,sctype,0.232,0.235387,0.226158,0.0,0.0,0.0
4,splat_0.6_de_rq,scsorter,0.677,0.846761,0.384196,0.0,0.0,0.0
5,splat_0.6_de_rq,scina,0.467,0.668246,0.119891,0.0,0.0,0.0
6,splat_0.6_de_rq,singler,0.84,0.913112,0.713896,0.0,0.0,0.0
7,splat_0.6_de_rq,scpred,0.503,0.663507,0.226158,0.0,0.0,0.0
0,splat_0.7_de_rq,GCN,0.963764,0.980704,0.837288,0.004386,0.0,0.037134
1,splat_0.7_de_rq,Max Col.,0.938939,0.980704,0.627119,0.0,0.0,0.0


In [66]:
# get labels
# Rebuilds the complete pipeline state (labels, features, ground truth,
# loaders) for one dataset in a single cell, as the starting point for the
# iterative relabelling cells that follow.
data_folder = "simulations/splat_0.7_de_rq/"
data_path = data_folder + "query_counts.csv"
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
preds_path = data_folder + "preds.csv"
if os.path.exists(preds_path):
    # Reuse the saved predictions instead of re-running the annotation tools.
    all_labels = pd.read_csv(preds_path, index_col=0)
    # Compare the column names rather than the column count, so a file with
    # the right number of columns but different tools is not accepted silently.
    if set(all_labels.columns) != set(tools):
        missing_tools = [tool for tool in tools if tool not in all_labels.columns]
        if missing_tools:
            raise KeyError(
                "preds.csv has no predictions for: " + ", ".join(missing_tools)
            )
        all_labels = all_labels[tools]
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

# Features: raw query counts passed through the project preprocessing.
X = pd.read_csv(data_path, index_col=0)
X, keep_cells = utilities.preprocess(np.array(X), scale=False)
print(X.shape)

# Named throwaway instead of `_`, which IPython uses for the last output.
_unused_markers, marker_names = utilities.read_marker_file(marker_path)

# Consensus labels from the tool votes; -1 marks cells without a consensus.
all_labels = all_labels.loc[keep_cells,:]
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)
train_nodes = np.where(confident_labels != -1)[0]
# Saved under its own name because later cells recompute test_nodes.
original_test_nodes = np.where(confident_labels == -1)[0]

# Ground truth for the same cells. (The bare `real_y.shape` expression that
# used to follow was never displayed mid-cell and has been dropped.)
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y = real_y[keep_cells]

dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=20, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=20, shuffle=False)

print(len(original_test_nodes))

(999, 500)
97


In [67]:
# Fresh model, trained on the consensus labels and then evaluated against the
# true classes, with the test part taken from the original no-consensus cells.
# The numeric arguments are presumably the neighbour count (2) and the number
# of epochs (150) — TODO confirm against GCNModel.
m = GCNModel("configs/2_15.txt", 2, dropout=0.0)
m.train(dataloader, 150)
m.validation_metrics(test_dataloader, train_nodes, original_test_nodes)

Loss in epoch 0 = 60.686684
Loss in epoch 10 = 0.114549
Loss in epoch 20 = 0.023500
Loss in epoch 30 = 0.010405
Loss in epoch 40 = 0.005857
Loss in epoch 50 = 0.003666
Loss in epoch 60 = 0.002030
Loss in epoch 70 = 0.001588
Loss in epoch 80 = 0.001203
Loss in epoch 90 = 0.000762
Loss in epoch 100 = 0.000438
Loss in epoch 110 = 0.000338
Loss in epoch 120 = 0.000305
Loss in epoch 130 = 0.000162
Loss in epoch 140 = 0.000123


(0.9619619846343994,
 array([[215,   9,   2,   6],
        [  0, 265,   2,   1],
        [  0,   2, 228,   4],
        [  1,   4,   7, 253]]),
 0.9722838401794434,
 array([[172,   7,   2,   1],
        [  0, 264,   2,   1],
        [  0,   2, 206,   1],
        [  0,   3,   6, 235]]),
 0.8659793734550476,
 array([[43,  2,  0,  5],
        [ 0,  1,  0,  0],
        [ 0,  0, 22,  3],
        [ 1,  1,  1, 18]]))

In [68]:
# Model predictions for every cell. The second value returned by predict is
# not needed; it goes into a named throwaway instead of `_`, because assigning
# to `_` shadows IPython's last-output variable.
new_labels, _unused_output = m.predict(test_dataloader)
# Reduce the per-class outputs to the index of the highest-scoring class.
new_labels = new_labels.max(dim=1)[1]

In [69]:

# The evaluation loader is the same in every round (X and real_y are not
# modified by the loop), so it is built once here rather than per iteration.
test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=20, shuffle=False)

for i in range(1,3):
    print(i)
    # Add the latest model predictions as one more voting column, then
    # recompute the consensus labels and the train/test split from all votes.
    col_name = "gcn" + str(i)
    all_labels_factored[col_name] = new_labels
    encoded_labels = utilities.encode_predictions(all_labels_factored)
    confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)
    train_nodes = np.where(confident_labels != -1)[0]
    test_nodes = np.where(confident_labels == -1)[0]

    dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=20, shuffle=True)

    print(len(test_nodes))

    # The existing model keeps training; re-initialising it is disabled:
    #m = GCNModel("configs/2_15.txt", 2, dropout=0.0)
    m.train(dataloader, 25, verbose=True)
    print(m.validation_metrics(test_dataloader, train_nodes, test_nodes))

    # Predictions feeding the next round. The second return value is unused and
    # kept out of `_`, which IPython reserves for the last output.
    new_labels, _unused_output = m.predict(test_dataloader)
    new_labels = new_labels.max(dim=1)[1]
    

1
27
Loss in epoch 0 = 10.116533
Loss in epoch 10 = 0.040961
Loss in epoch 20 = 0.010100
(0.9609609842300415, array([[211,  10,   4,   7],
       [  0, 265,   2,   1],
       [  0,   2, 230,   2],
       [  2,   3,   6, 254]]), 0.9681069850921631, array([[202,   9,   2,   3],
       [  0, 265,   2,   1],
       [  0,   2, 225,   2],
       [  1,   3,   6, 249]]), 0.7037037014961243, array([[9, 1, 2, 4],
       [0, 0, 0, 0],
       [0, 0, 5, 0],
       [1, 0, 0, 5]]))
2
5
Loss in epoch 0 = 0.333755
Loss in epoch 10 = 0.004290
Loss in epoch 20 = 0.003589
(0.9609609842300415, array([[211,  10,   4,   7],
       [  0, 265,   2,   1],
       [  0,   2, 230,   2],
       [  2,   3,   6, 254]]), 0.9647887349128723, array([[211,   9,   3,   6],
       [  0, 265,   2,   1],
       [  0,   2, 229,   2],
       [  1,   3,   6, 254]]), 0.20000000298023224, array([[0, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0]]))


In [70]:
# Final evaluation on the cells that had no consensus label before the
# relabelling rounds (original_test_nodes), using the train_nodes left by the
# last round.
m.validation_metrics(test_dataloader, train_nodes, original_test_nodes)

(0.9609609842300415,
 array([[211,  10,   4,   7],
        [  0, 265,   2,   1],
        [  0,   2, 230,   2],
        [  2,   3,   6, 254]]),
 0.9647887349128723,
 array([[211,   9,   3,   6],
        [  0, 265,   2,   1],
        [  0,   2, 229,   2],
        [  1,   3,   6, 254]]),
 0.8556700944900513,
 array([[39,  3,  2,  6],
        [ 0,  1,  0,  0],
        [ 0,  0, 24,  1],
        [ 2,  0,  0, 19]]))