In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Settings ----------------------------------------------------------------
# Folder holding one simulated dataset: query/reference counts, reference
# labels, marker file, query metadata and (optionally) cached predictions.
# NOTE(review): absolute cluster path; edit here when running elsewhere.
data_folder = "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq/"
# Annotation tools whose per-cell predictions are combined below.
tools = ["sctype","scsorter","scina","singler", "scpred"]
# Forwarded to utilities.get_consensus_labels; presumably the number of tools
# that must agree before a cell receives a consensus label -- TODO confirm.
necessary_vote = 3

# os.path.join yields the same strings as the previous "+" concatenation
# (data_folder ends with "/") but keeps working if that slash is dropped.
data_path = os.path.join(data_folder, "query_counts.csv")
ref_path = os.path.join(data_folder, "ref_counts.csv")
ref_label_path = os.path.join(data_folder, "ref_labels.csv")
marker_path = os.path.join(data_folder, "markers.txt")
meta_path = os.path.join(data_folder, "query_meta.csv")
preds_path = os.path.join(data_folder, "preds.csv")

# --- Per-tool predictions ----------------------------------------------------
# Reuse cached predictions when present, otherwise compute them.
if os.path.exists(preds_path):
    all_labels = pd.read_csv(preds_path, index_col=0)
    # Always restrict the cache to exactly the requested tools, in the
    # requested order. Comparing only the column count would silently accept
    # a cache produced for a different tool set of the same size.
    missing_tools = [tool for tool in tools if tool not in all_labels.columns]
    if missing_tools:
        raise KeyError(f"{preds_path} has no predictions for: {missing_tools}")
    all_labels = all_labels[tools]
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

# --- Expression matrix -------------------------------------------------------
# preprocess returns the processed matrix together with `keep_cells`, which is
# used below to select the matching rows of the labels and the ground truth.
X = pd.read_csv(data_path, index_col=0)
X, keep_cells = utilities.preprocess(np.array(X), scale=False, comps=500)

all_labels = all_labels.loc[keep_cells,:]

# --- Encode the tool predictions ---------------------------------------------
# Only the second value returned by read_marker_file is needed here.
_,marker_names = utilities.read_marker_file(marker_path)

all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)

# --- Ground truth ------------------------------------------------------------
# Integer codes for the 'Group' column; sort=True makes the code assignment
# follow the sorted order of the group names.
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y = real_y[keep_cells]

# --- Consensus labels and node split -----------------------------------------
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = necessary_vote)

# Cells whose consensus label is not -1 are used for training; cells marked -1
# (presumably "no consensus" -- TODO confirm) form the held-out test nodes.
train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]

# The datasets built in the next cell pair X row-by-row with these label
# vectors, so a length mismatch is reported here rather than inside torch.
assert len(confident_labels) == X.shape[0], "consensus labels are not aligned with X"
assert len(real_y) == X.shape[0], "ground-truth labels are not aligned with X"



In [3]:
# Sanity check on the preprocessed matrix: the first dimension is the number of
# rows kept by utilities.preprocess, the second matches the comps=500 requested
# in that call.
X.shape

(999, 500)

In [4]:
# Both loaders deliver the whole matrix as one batch: the batch size is set to
# the number of rows in X.
n_rows = X.shape[0]

# Training view: features paired with the consensus labels.
dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(confident_labels),
)
dataloader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=n_rows,
)

# Evaluation view: the same features paired with the ground-truth labels,
# served in their original order (no shuffling).
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X),
    torch.tensor(real_y),
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=n_rows,
)


In [13]:
# Train one model on the consensus-labelled data for 150 epochs.
#
# The saved run of this cell ended in "CUDA out of memory" with almost the
# whole GPU already allocated by this process. Rebinding `m` directly keeps the
# previous model alive until the new one has been fully constructed, so the
# stale reference is dropped first and PyTorch's cached blocks are released.
# NOTE(review): this only frees memory that nothing else references; if the
# error persists, restart the kernel and run the notebook top to bottom.
# NOTE(review): the second positional argument is 10 here but 2 in the
# repeated-runs cell further down -- confirm which value is intended.
m = None
if torch.cuda.is_available():
    torch.cuda.empty_cache()

m = GCNModel("configs/test18.txt", 10, dropout=0.1)
m.train(dataloader, 150)

RuntimeError: CUDA out of memory. Tried to allocate 78.00 MiB (GPU 0; 10.76 GiB total capacity; 9.56 GiB already allocated; 41.44 MiB free; 9.70 GiB reserved in total by PyTorch)

In [8]:
# Evaluate the current model `m` against the ground-truth labels in
# test_dataloader. The call returns a 6-tuple of (accuracy, confusion matrix)
# pairs. In the displayed output the first matrix equals the element-wise sum
# of the second and third, so the pairs are presumably: all nodes, train_nodes,
# test_nodes -- TODO confirm against GCNModel.validation_metrics.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.7627627849578857,
 array([[173,  14,  10,  35],
        [  2, 232,   4,  30],
        [ 16,  23, 177,  18],
        [ 42,  29,  14, 180]]),
 0.7964796423912048,
 array([[149,  10,   5,  24],
        [  2, 232,   4,  28],
        [ 14,  17, 172,  12],
        [ 40,  20,   9, 171]]),
 0.42222222685813904,
 array([[24,  4,  5, 11],
        [ 0,  0,  0,  2],
        [ 2,  6,  5,  6],
        [ 2,  9,  5,  9]]))

In [10]:
# Repeat training from scratch several times and summarise the spread of the
# fifth value returned by validation_metrics (stored as the test accuracy).
n_runs = 5       # number of independent training runs
base_seed = 0    # run i is seeded with base_seed + i

test_accuracy = []
for i in range(n_runs):
    print(i)

    # Seed each run explicitly so the reported mean/stdev can be regenerated.
    # NOTE(review): GPU kernels may still introduce small nondeterminism.
    torch.manual_seed(base_seed + i)
    np.random.seed(base_seed + i)

    # Drop the previous model before building the next one; otherwise old and
    # new models coexist on the GPU while the constructor runs and memory
    # accumulates across iterations.
    m = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    m = GCNModel("configs/test18.txt", 2, dropout=0.1)
    m.train(dataloader, 150, verbose=False)

    # Index the result instead of unpacking into `_`, which would overwrite
    # IPython's last-output variable.
    metrics = m.validation_metrics(test_dataloader, train_nodes, test_nodes)
    acc = metrics[4]
    test_accuracy.append(acc)

print(statistics.mean(test_accuracy))
print(statistics.stdev(test_accuracy))

0
1
2
3
4
0.5888888955116272
0.06136311310462684
