In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the simulated dataset (counts, metadata, markers, predictions).
# Set the SHARP_DATA_FOLDER environment variable to point somewhere else
# (e.g. "simulations/splat_0.7_de_rq/") instead of editing this cell.
data_folder = os.path.join(
    os.environ.get(
        "SHARP_DATA_FOLDER",
        "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq/",
    ),
    "",  # guarantees a trailing separator: later cells build paths by string concatenation
)

In [3]:
# Collect every annotation tool's per-cell label predictions into `all_labels`,
# with exactly one column per entry of `tools`, in that order.
data_path = data_folder + "query_counts.csv"
tools = ["sctype", "scsorter", "scina", "singler", "scpred"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
preds_path = data_folder + "preds.csv"

if os.path.exists(preds_path):
    # A predictions file already exists: read it instead of re-running the tools.
    all_labels = pd.read_csv(preds_path, index_col=0)
    missing_tools = [tool for tool in tools if tool not in all_labels.columns]
    if missing_tools:
        raise KeyError(f"{preds_path} has no predictions for: {missing_tools}")
    # Always select by name (not only when the column count differs): later
    # cells convert the frame with .to_numpy() and pair it with per-tool
    # scores built in `tools` order, so the column order must match `tools`.
    all_labels = all_labels[tools]
else:
    all_labels = utilities.label_counts(data_path, tools, ref_path, ref_label_path, marker_path)

In [4]:
# Load the query count matrix and pass it through the shared preprocessing
# helper (scaling disabled). The helper also reports which cells and genes
# it kept; `keep_cells` is reused below to subset labels and metadata.
query_counts = pd.read_csv(data_path, index_col=0)
X, keep_cells, keep_genes, rpca = utilities.preprocess(
    np.array(query_counts),
    scale=False,
)
X.shape

(999, 500)

In [5]:
# Restrict the tool predictions to the cells reported as kept by
# utilities.preprocess, so that they can be compared with the filtered X.
# NOTE(review): `all_labels` is overwritten with its own subset, so running
# this cell a second time re-applies `keep_cells` to the already-filtered
# frame -- presumably that fails or mis-selects when any cell was dropped;
# re-run the label-loading cell first.
all_labels = all_labels.loc[keep_cells,:]
# Only the second value returned by read_marker_file is used here.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable.
_,marker_names = utilities.read_marker_file(marker_path)

In [6]:
# Turn the tools' string labels into integer codes, encode them, and derive a
# consensus label per cell. A consensus value of -1 marks cells for which no
# label was accepted (vote threshold passed as `necessary_vote`).
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
confident_labels = utilities.get_consensus_labels(
    encoded_labels,
    necessary_vote=0.51,
)

# Cells with a consensus label are used for training; the rest are held out.
has_consensus = confident_labels != -1
train_nodes = np.flatnonzero(has_consensus)
test_nodes = np.flatnonzero(~has_consensus)

In [7]:
# Score each tool by how well it agrees with the consensus label on the
# training cells. Note this is agreement with the consensus, not with the
# ground truth.
consensus_train = confident_labels[train_nodes]
scores = np.array(
    [
        utilities.pred_accuracy(
            all_labels_factored[tool_name].to_numpy()[train_nodes],
            consensus_train,
        )
        for tool_name in tools
    ],
    dtype=float,
)


In [8]:
# Per-tool agreement with the consensus labels, in the same order as `tools`.
scores

array([0.8512035 , 0.89168489, 0.85557985, 0.87527353, 0.68818378])

In [9]:
# Normalise the per-tool scores so they sum to 1, then take the log.
# Since softmax(log(p)) == p when p sums to 1, a softmax over these values
# gives back the normalised scores.
# NOTE(review): this overwrites `scores`, so the cell is not safe to run
# twice -- a second run would take the log of negative numbers.
scores = np.log(scores / scores.sum())
scores

array([-1.58708189, -1.54062031, -1.58195369, -1.55919668, -1.79967719])

In [10]:
# Sanity check: a softmax over the log-scores should reproduce the
# normalised per-tool scores.
torch.softmax(torch.tensor(scores), dim=0)

tensor([0.2045, 0.2142, 0.2056, 0.2103, 0.1654], dtype=torch.float64)

In [11]:
# One-hot encode every tool's integer prediction over the 4 class codes,
# giving a (rows, columns, 4) float tensor of training targets.
# Entries whose code is not in 0..3 keep an all-zero vector.
results = all_labels_factored.to_numpy()
one_hot_rows = np.eye(4)

results_exp = np.zeros(results.shape + (4,))
for class_id in range(4):
    results_exp[results == class_id, :] = one_hot_rows[class_id]

tY = torch.tensor(results_exp).float()

In [12]:
# Ground-truth labels: integer-code the 'Group' column (codes follow the
# sorted group names) and keep only the cells retained by preprocessing.
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
group_codes, _group_names = pd.factorize(metadata["Group"], sort=True)
real_y = group_codes[keep_cells]
real_y.shape

(999,)

In [13]:
# Ground truth (rows) vs consensus labels (columns).
# `confident_labels` contains -1 for cells without a consensus, so the matrix
# gets an extra leading row/column for -1: the first row is empty (no true
# label is -1) and the first column counts the cells left without a consensus.
# NOTE(review): assumes both label sets use the same integer coding -- confirm.
confusion_matrix(real_y, confident_labels)

array([[  0,   0,   0,   0,   0],
       [ 42, 178,   7,   2,   3],
       [  2,   0, 264,   2,   0],
       [ 16,   1,   2, 214,   1],
       [ 25,   0,   5,   4, 231]])

In [14]:
# Ground truth (rows) vs the raw predictions of the "sctype" tool (columns).
# NOTE(review): assumes both label sets use the same integer coding -- confirm.
confusion_matrix(real_y, all_labels_factored["sctype"])

array([[186,  12,   9,  25],
       [  2, 248,   5,  13],
       [  9,   8, 196,  21],
       [ 21,  31,  30, 183]])

In [15]:
# Training data: expression matrix paired with the one-hot tool predictions.
# `tY` is already a tensor, so it is passed as-is; re-wrapping it with
# torch.tensor(...) only triggers PyTorch's copy-construct UserWarning.
features = torch.tensor(X)
dataset = torch.utils.data.TensorDataset(features, tY)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

# Evaluation data: same cells paired with the ground-truth labels.
# shuffle=False keeps the original cell order.
test_dataset = torch.utils.data.TensorDataset(features, torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)

  dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(tY))


In [16]:
# Train a model on the per-tool one-hot targets, initialised with the
# log-scores and with weight learning switched on.
# NOTE(review): the meaning of the config file and of the positional `2` is
# defined by GCNModel and not visible here -- confirm against gcn_model.py.
m = GCNModel(
    "configs/2_40.txt",
    2,
    scores,
    weights_mode=True,
    learn_weights=True,
    dropout=0.0,
)
m.train(dataloader, 250, verbose=True)

Loss in epoch 0 = 12.690382
Loss in epoch 10 = 4.394219
Loss in epoch 20 = 4.067180
Loss in epoch 30 = 4.004829
Loss in epoch 40 = 3.942397
Loss in epoch 50 = 3.873939
Loss in epoch 60 = 3.844952
Loss in epoch 70 = 3.808426
Loss in epoch 80 = 3.789386
Loss in epoch 90 = 3.783088
Loss in epoch 100 = 3.779667
Loss in epoch 110 = 3.760251
Loss in epoch 120 = 3.760436
Loss in epoch 130 = 3.764865
Loss in epoch 140 = 3.756975
Loss in epoch 150 = 3.757146
Loss in epoch 160 = 3.747303
Loss in epoch 170 = 3.746152
Loss in epoch 180 = 3.740202
Loss in epoch 190 = 3.739523
Loss in epoch 200 = 3.731399
Loss in epoch 210 = 3.739038
Loss in epoch 220 = 3.732474
Loss in epoch 230 = 3.742904
Loss in epoch 240 = 3.726575


In [17]:
# Evaluate against the ground-truth labels.
# NOTE(review): judging by the output, the result is three
# (accuracy, confusion matrix) pairs -- presumably all cells, then
# `train_nodes`, then `test_nodes`; confirm against GCNModel.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.935935914516449,
 array([[198,  15,   8,  11],
        [  0, 265,   2,   1],
        [  2,   6, 223,   3],
        [  0,   7,   9, 249]]),
 0.970459520816803,
 array([[178,   7,   2,   3],
        [  0, 264,   2,   0],
        [  1,   2, 214,   1],
        [  0,   5,   4, 231]]),
 0.5647059082984924,
 array([[20,  8,  6,  8],
        [ 0,  1,  0,  1],
        [ 1,  4,  9,  2],
        [ 0,  2,  5, 18]]))

In [18]:
# Retrain the same configuration three times from scratch and print the
# metrics of each run to see the run-to-run variation.
for i in range(3):
    m = GCNModel(
        "configs/2_40.txt",
        2,
        scores,
        weights_mode=True,
        learn_weights=True,
        dropout=0.0,
    )
    m.train(dataloader, 250, verbose=False)
    run_metrics = m.validation_metrics(test_dataloader, train_nodes, test_nodes)
    print(run_metrics)

(0.935935914516449, array([[201,  14,   8,   9],
       [  0, 266,   2,   0],
       [  2,   5, 221,   6],
       [  4,   5,   9, 247]]), 0.9693654179573059, array([[178,   7,   2,   3],
       [  0, 264,   2,   0],
       [  1,   2, 214,   1],
       [  1,   5,   4, 230]]), 0.5764706134796143, array([[23,  7,  6,  6],
       [ 0,  2,  0,  0],
       [ 1,  3,  7,  5],
       [ 3,  0,  5, 17]]))
(0.9289289116859436, array([[198,  17,   9,   8],
       [  0, 266,   2,   0],
       [  3,   5, 221,   5],
       [  1,  11,  10, 243]]), 0.970459520816803, array([[178,   7,   2,   3],
       [  0, 264,   2,   0],
       [  1,   2, 214,   1],
       [  0,   5,   4, 231]]), 0.48235294222831726, array([[20, 10,  7,  5],
       [ 0,  2,  0,  0],
       [ 2,  3,  7,  4],
       [ 1,  6,  6, 12]]))
(0.9219219088554382, array([[196,  14,  12,  10],
       [  0, 264,   3,   1],
       [  4,   7, 218,   5],
       [  0,  10,  12, 243]]), 0.970459520816803, array([[178,   7,   2,   3],
       [  0, 264

In [19]:
# Baseline setup: train directly on the consensus labels instead of the
# per-tool one-hot targets. This rebinds `dataset` / `dataloader`.
# NOTE(review): `confident_labels` still contains -1 for cells without a
# consensus -- presumably the model skips those; confirm in GCNModel.
consensus_targets = torch.tensor(confident_labels)
dataset = torch.utils.data.TensorDataset(torch.tensor(X), consensus_targets)
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=50)

# Evaluation data: same cells paired with the ground-truth labels, in order.
true_targets = torch.tensor(real_y)
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X), true_targets)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=50)

In [20]:
# Train the baseline model on the consensus labels, with both weight options
# switched off.
# NOTE(review): `scores` is still passed; whether it is used when
# weights_mode=False is not visible here -- confirm against gcn_model.py.
m = GCNModel(
    "configs/2_40.txt",
    2,
    scores,
    weights_mode=False,
    learn_weights=False,
    dropout=0.0,
)
m.train(dataloader, 150)

Loss in epoch 0 = 26.129650
Loss in epoch 10 = 0.062354
Loss in epoch 20 = 0.017898
Loss in epoch 30 = 0.007746
Loss in epoch 40 = 0.004978
Loss in epoch 50 = 0.002958
Loss in epoch 60 = 0.001911
Loss in epoch 70 = 0.001794
Loss in epoch 80 = 0.001034
Loss in epoch 90 = 0.000938
Loss in epoch 100 = 0.000736
Loss in epoch 110 = 0.000646
Loss in epoch 120 = 0.000430
Loss in epoch 130 = 0.000282
Loss in epoch 140 = 0.000286


In [21]:
# Evaluate the consensus-label baseline against the ground-truth labels.
# NOTE(review): judging by the output, the result is three
# (accuracy, confusion matrix) pairs -- presumably all cells, then
# `train_nodes`, then `test_nodes`; confirm against GCNModel.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9579579830169678,
 array([[212,  10,   2,   8],
        [  0, 266,   2,   0],
        [  2,   3, 227,   2],
        [  1,   7,   5, 252]]),
 0.970459520816803,
 array([[178,   7,   2,   3],
        [  0, 264,   2,   0],
        [  1,   2, 214,   1],
        [  0,   5,   4, 231]]),
 0.8235294222831726,
 array([[34,  3,  0,  5],
        [ 0,  2,  0,  0],
        [ 1,  1, 13,  1],
        [ 1,  2,  1, 21]]))