In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics

In [2]:
#data_folder = "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.8_de_rq/"
data_folder = "simulations/splat_0.7_de_rq/"
# get labels
data_path = data_folder + "query_counts.csv"
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
if os.path.exists(data_folder + "preds.csv"):
    all_labels = pd.read_csv(data_folder + "preds.csv", index_col=0)
    if all_labels.shape[1] != len(tools): 
        all_labels = all_labels[tools]
        #raise Exception("wrong amount of tools in file")
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

# read in dataset
X = pd.read_csv(data_path, index_col=0)
X, keep_cells,_,_ = utilities.preprocess(np.array(X), scale=False, comps=500)

all_labels = all_labels.loc[keep_cells,:]

_,marker_names = utilities.read_marker_file(marker_path)

all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)

meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y = real_y[keep_cells]

confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = .51)

train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]



In [4]:
X.shape

(999, 500)

In [3]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)


In [4]:
m = GCNModel("configs/2_40.txt", 2, dropout=0.0)
m.train(dataloader, 150)

Loss in epoch 0 = 25.843058
Loss in epoch 10 = 0.073158
Loss in epoch 20 = 0.017070
Loss in epoch 30 = 0.009762
Loss in epoch 40 = 0.004732
Loss in epoch 50 = 0.002694
Loss in epoch 60 = 0.002010
Loss in epoch 70 = 0.001146
Loss in epoch 80 = 0.000985
Loss in epoch 90 = 0.000842
Loss in epoch 100 = 0.000619
Loss in epoch 110 = 0.000556
Loss in epoch 120 = 0.000495
Loss in epoch 130 = 0.000373
Loss in epoch 140 = 0.000319


In [5]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9579579830169678,
 array([[214,  10,   2,   6],
        [  0, 265,   2,   1],
        [  0,   3, 227,   4],
        [  1,   5,   8, 251]]),
 0.9702315330505371,
 array([[173,   7,   2,   2],
        [  0, 264,   2,   1],
        [  0,   3, 208,   1],
        [  0,   3,   6, 235]]),
 0.8369565010070801,
 array([[41,  3,  0,  4],
        [ 0,  1,  0,  0],
        [ 0,  0, 19,  3],
        [ 1,  2,  2, 16]]))

In [7]:
# get tool weights by cell
ultra_confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = .9)
ultra_conf_nodes = np.where(ultra_confident_labels != -1)[0]
weights = np.zeros((4,5))
for type in [0,1,2,3]:
    
    type_indices = np.where(confident_labels[ultra_conf_nodes]==type)[0]
    for i, tool in enumerate(tools):
        weights[type,i] = utilities.pred_accuracy(all_labels_factored[tool].to_numpy()[ultra_conf_nodes][type_indices], confident_labels[ultra_conf_nodes][type_indices])
weights

array([[1.        , 1.        , 0.95918369, 1.        , 0.81632656],
       [1.        , 1.        , 0.97354496, 1.        , 0.78835976],
       [1.        , 1.        , 0.93129772, 1.        , 0.87022901],
       [1.        , 1.        , 1.        , 1.        , 0.76543212]])

In [8]:
new_encoded = utilities.weighted_encode(all_labels_factored, encoded_labels, weights,.5)
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = .51)

train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]

multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes
multiple maxes


In [14]:
len(test_nodes)

92

In [9]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)

In [12]:
m = GCNModel("configs/2_40.txt", 2, dropout=0.0)
m.train(dataloader, 150)

Loss in epoch 0 = 26.061798
Loss in epoch 10 = 0.071316
Loss in epoch 20 = 0.019366
Loss in epoch 30 = 0.006830
Loss in epoch 40 = 0.004786
Loss in epoch 50 = 0.002491
Loss in epoch 60 = 0.002129
Loss in epoch 70 = 0.001528
Loss in epoch 80 = 0.001012
Loss in epoch 90 = 0.000689
Loss in epoch 100 = 0.000599
Loss in epoch 110 = 0.000556
Loss in epoch 120 = 0.000393
Loss in epoch 130 = 0.000327
Loss in epoch 140 = 0.000350


In [13]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9589589834213257,
 array([[211,  11,   2,   8],
        [  0, 265,   2,   1],
        [  0,   4, 228,   2],
        [  1,   4,   6, 254]]),
 0.9702315330505371,
 array([[173,   7,   2,   2],
        [  0, 264,   2,   1],
        [  0,   3, 208,   1],
        [  0,   3,   6, 235]]),
 0.8478260636329651,
 array([[38,  4,  0,  6],
        [ 0,  1,  0,  0],
        [ 0,  1, 20,  1],
        [ 1,  1,  0, 19]]))

In [17]:
preds,_ = m.predict(test_dataloader)
final_preds = preds.max(dim=1)[1]

In [20]:
real_y[test_nodes]

array([0, 0, 0, 0, 3, 3, 3, 0, 3, 0, 3, 0, 0, 3, 2, 0, 3, 0, 0, 2, 0, 0,
       0, 0, 2, 2, 0, 2, 2, 2])

In [21]:
final_preds[test_nodes]

tensor([0, 0, 2, 1, 3, 3, 3, 0, 3, 2, 3, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 0, 0, 2,
        2, 2, 0, 2, 0, 2], device='cuda:0')

In [18]:
preds[test_nodes]
preds.max(dim=1)[0][test_nodes]

tensor([1.0000, 0.9968, 0.9425, 0.5502, 0.9494, 0.9739, 0.9985, 0.9989, 0.8702,
        0.7334, 0.9981, 0.9445, 0.5272, 0.5079, 0.9870, 0.9995, 0.9896, 0.9990,
        0.9985, 0.7850, 0.9972, 0.7456, 0.5415, 0.6565, 0.9933, 0.9968, 0.9990,
        0.9982, 0.7832, 0.9412], device='cuda:0', grad_fn=<IndexBackward>)

In [10]:
test_accuracy = [0]*5
for i in range(5):
    print(i)
    m = GCNModel("configs/test18.txt", 2, dropout=0.1)
    m.train(dataloader, 150, verbose=False)
    _,_,_,_,acc,_ = m.validation_metrics(test_dataloader, train_nodes, test_nodes)
    test_accuracy[i] = acc
print(statistics.mean(test_accuracy))
print(statistics.stdev(test_accuracy))

0
1
2
3
4
0.5888888955116272
0.06136311310462684
