In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [2]:
# Root folder of the simulated dataset. Keep the trailing "/": the input file
# paths in this notebook are built by plain string concatenation onto it.
#data_folder = "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq_v3/"
data_folder = "simulations/splat_0.7_de_rq/"

In [3]:
# Gather the per-tool cell-type predictions for the query dataset.
# Input locations (data_folder must end with "/": paths are concatenated).
data_path = data_folder + "query_counts.csv"
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
preds_path = data_folder + "preds.csv"

# Annotation tools whose predictions are combined. The order matters: the
# per-tool scores computed later follow this order.
# A subset such as ["scsorter", "scina", "singler"] can be used as well.
tools = ["sctype", "scsorter", "scina", "singler", "scpred"]

if os.path.exists(preds_path):
    # Cached predictions: reuse them instead of re-running every tool.
    all_labels = pd.read_csv(preds_path, index_col=0)
    missing_tools = [tool for tool in tools if tool not in all_labels.columns]
    if missing_tools:
        raise KeyError(
            f"{preds_path} has no predictions for: {missing_tools}; "
            "delete the cached file to recompute them"
        )
    # Always select (and order) the columns by `tools`. Comparing only the
    # number of columns lets a cache with the same count but a different
    # column order or different tools through unnoticed, which would
    # misalign the tool axis with the per-tool scores.
    all_labels = all_labels[tools]
else:
    # No cache: run the tools on the query counts.
    all_labels = utilities.label_counts(data_path, tools, ref_path, ref_label_path, marker_path)

In [4]:
# Load the query count table, then hand it to the project preprocessing step
# as a plain array (no scaling). Besides the processed matrix, preprocess
# also returns the cell / gene selections and `rpca`.
query_counts = pd.read_csv(data_path, index_col=0)
X, keep_cells, keep_genes, rpca = utilities.preprocess(
    np.array(query_counts),
    scale=False,
)
X.shape

(999, 500)

In [5]:
# Restrict the tool predictions to the cells selected by keep_cells (returned
# by utilities.preprocess).
# NOTE: this rebinds all_labels, so re-running the cell applies keep_cells to
# the already-filtered frame; re-run the label-loading cell first.
all_labels = all_labels.loc[keep_cells,:]
# Only the second value returned by read_marker_file is kept.
# presumably the cell-type names listed in the marker file — TODO confirm
_,marker_names = utilities.read_marker_file(marker_path)

In [6]:
# Turn the tool predictions into codes, encode them, and derive one consensus
# label per cell (vote threshold 0.51).
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
confident_labels = utilities.get_consensus_labels(
    encoded_labels,
    necessary_vote=0.51,
)

# -1 marks cells without a consensus label: those become the test nodes,
# every other cell is a train node.
has_consensus = confident_labels != -1
train_nodes = np.where(has_consensus)[0]
test_nodes = np.where(~has_consensus)[0]

In [7]:
# One score per tool, in the order of `tools`: how well the tool's own
# predictions match the consensus labels on the train nodes.
scores = np.array(
    [
        utilities.pred_accuracy(
            all_labels_factored[tool].to_numpy()[train_nodes],
            confident_labels[train_nodes],
        )
        for tool in tools
    ],
    dtype=np.float64,
)


In [8]:
# Show the per-tool scores (same order as `tools`).
scores

array([0.85556781, 0.88643879, 0.85997796, 0.88313121, 0.65711135])

In [9]:
# Normalise the scores so they sum to 1 and keep their natural log.
# Run this once per computation of `scores`: a second run would normalise and
# log the already-logged values.
scores = np.log(scores / scores.sum())
scores

array([-1.57722352, -1.5417768 , -1.57208211, -1.54551509, -1.84113539])

In [10]:
# Sanity check: when `scores` holds the log of weights that sum to 1, the
# softmax gives those weights back, i.e. each tool's relative weight.
torch.softmax(torch.tensor(scores),0)

tensor([0.2065, 0.2140, 0.2076, 0.2132, 0.1586], dtype=torch.float64)

In [11]:
# One-hot encode every tool prediction over the 4 class codes (0..3).
# results is (rows, tools); results_exp is (rows, tools, 4). Any value that
# is not one of the 4 codes keeps an all-zero vector.
results = all_labels_factored.to_numpy()
class_codes = np.arange(4)
results_exp = (results[:, :, np.newaxis] == class_codes).astype(np.float64)

# Float32 tensor version of the one-hot predictions.
tY = torch.tensor(results_exp).float()

In [12]:
# Ground-truth labels: integer codes of the sorted unique values in the
# metadata "Group" column, restricted with keep_cells.
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
group_codes = pd.factorize(metadata["Group"], sort=True)[0]
real_y = group_codes[keep_cells]
real_y.shape

(999,)

In [22]:
# Experiment 1 data: train on the per-tool one-hot predictions (tY) and
# evaluate against the ground-truth groups (real_y).
# tY is already a float tensor, so it is passed as-is: wrapping it in
# torch.tensor() again only copies it and triggers PyTorch's
# "To copy construct from a tensor ..." UserWarning.
dataset = torch.utils.data.TensorDataset(torch.tensor(X), tY)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

# The evaluation loader is not shuffled.
# presumably required because validation_metrics is given positional
# train_nodes / test_nodes indices — TODO confirm
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)

  dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(tY))


In [23]:
# Experiment 1 model: weights_mode=True, given the per-tool log scores, with
# learn_weights=False and no dropout.
# NOTE(review): the meaning of "configs/2_40.txt" and of the positional 2 is
# defined by gcn_model.GCNModel — confirm there.
m = GCNModel("configs/2_40.txt", 2, scores, weights_mode = True, learn_weights = False, dropout=0.0)

In [24]:
# Fit on the training loader; 150 is presumably the number of epochs (the log
# below reports the loss every 10 epochs, up to epoch 140).
m.train(dataloader, 150)

Loss in epoch 0 = 12.064449
Loss in epoch 10 = 4.499492
Loss in epoch 20 = 4.263627
Loss in epoch 30 = 4.201023
Loss in epoch 40 = 4.147159
Loss in epoch 50 = 4.122489
Loss in epoch 60 = 4.110283
Loss in epoch 70 = 4.088551
Loss in epoch 80 = 4.090820
Loss in epoch 90 = 4.075344
Loss in epoch 100 = 4.056178
Loss in epoch 110 = 4.065664
Loss in epoch 120 = 4.046824
Loss in epoch 130 = 4.044737
Loss in epoch 140 = 4.041814


In [25]:
# Evaluate against the ground truth carried by test_dataloader.
# NOTE(review): the result looks like three (accuracy, confusion matrix)
# pairs — all cells, train_nodes, test_nodes; the matrix totals recorded
# below (999 / 907 / 92) are consistent with that. Confirm against
# GCNModel.validation_metrics.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9319319128990173,
 array([[199,  17,  11,   5],
        [  0, 265,   2,   1],
        [  3,   6, 221,   4],
        [  2,   6,  11, 246]]),
 0.9702315330505371,
 array([[173,   7,   2,   2],
        [  0, 264,   2,   1],
        [  0,   3, 208,   1],
        [  0,   3,   6, 235]]),
 0.554347813129425,
 array([[26, 10,  9,  3],
        [ 0,  1,  0,  0],
        [ 3,  3, 13,  3],
        [ 2,  3,  5, 11]]))

In [17]:
# Experiment 2 data: train on the consensus labels instead of the per-tool
# predictions.
# NOTE(review): confident_labels holds -1 for cells without consensus (the
# test_nodes); presumably GCNModel.train ignores those — confirm.
dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(confident_labels),
)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=50,
    shuffle=True,
)

# Evaluation data: same features, ground-truth labels, fixed order.
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(real_y),
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=50,
    shuffle=False,
)

In [18]:
# Experiment 2 model: same config file and arguments as the weights_mode=True
# model, but with weights_mode=False. This rebinds `m`, replacing any model
# built earlier in the session.
m = GCNModel("configs/2_40.txt", 2, scores, weights_mode = False, learn_weights = False, dropout=0.0)
# 250 is presumably the number of epochs (the log below reports the loss every
# 10 epochs, up to epoch 240).
m.train(dataloader, 250)

Loss in epoch 0 = 26.071560
Loss in epoch 10 = 0.066667
Loss in epoch 20 = 0.014319
Loss in epoch 30 = 0.007804
Loss in epoch 40 = 0.004762
Loss in epoch 50 = 0.003824
Loss in epoch 60 = 0.002068
Loss in epoch 70 = 0.001413
Loss in epoch 80 = 0.001221
Loss in epoch 90 = 0.001055
Loss in epoch 100 = 0.000549
Loss in epoch 110 = 0.000465
Loss in epoch 120 = 0.000501
Loss in epoch 130 = 0.000378
Loss in epoch 140 = 0.000349
Loss in epoch 150 = 0.000510
Loss in epoch 160 = 0.000215
Loss in epoch 170 = 0.000199
Loss in epoch 180 = 0.000140
Loss in epoch 190 = 0.000132
Loss in epoch 200 = 0.000244
Loss in epoch 210 = 0.000103
Loss in epoch 220 = 0.000077
Loss in epoch 230 = 0.000095
Loss in epoch 240 = 0.000073


In [19]:
# Evaluate the consensus-trained model against the ground truth.
# NOTE(review): the result looks like three (accuracy, confusion matrix)
# pairs — all cells, train_nodes, test_nodes — confirm against
# GCNModel.validation_metrics. The train-node accuracy and matrix recorded
# below are identical to those of the weights_mode=True run; verify whether
# that part is computed from model predictions or from the consensus labels.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.955955982208252,
 array([[211,  10,   2,   9],
        [  0, 265,   2,   1],
        [  2,   3, 224,   5],
        [  1,   3,   6, 255]]),
 0.9702315330505371,
 array([[173,   7,   2,   2],
        [  0, 264,   2,   1],
        [  0,   3, 208,   1],
        [  0,   3,   6, 235]]),
 0.8152173757553101,
 array([[38,  3,  0,  7],
        [ 0,  1,  0,  0],
        [ 2,  0, 16,  4],
        [ 1,  0,  0, 20]]))