In [1]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Root folder of the simulated dataset. Every input path below is built by
# concatenating a file name onto this string, so it must end with a separator.
# Set the SHARP_DATA_FOLDER environment variable to point the notebook at
# another copy of the data (e.g. a local "simulations/splat_0.7_de_rq/"
# checkout) without editing the code; the default is the cluster location.
data_folder = os.path.join(
    os.environ.get(
        "SHARP_DATA_FOLDER",
        "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq/",
    ),
    "",  # joining with "" appends a trailing separator only when it is missing
)

In [27]:
# Check whether a cached predictions file is present for this dataset.
preds_file = data_folder + "preds.csv"
os.path.exists(preds_file)

True

In [23]:
# Collect the per-tool annotations for the query cells.
data_path = data_folder + "query_counts.csv"
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
# Annotation tools to combine (reduced alternative: scsorter, scina, singler).
tools = ["sctype", "scsorter", "scina", "singler", "scpred"]

preds_path = data_folder + "preds.csv"
if not os.path.exists(preds_path):
    # No cached predictions file: let utilities.label_counts produce the labels
    # from the query counts, the reference data and the marker file.
    all_labels = utilities.label_counts(data_path, tools, ref_path, ref_label_path, marker_path)
else:
    all_labels = pd.read_csv(preds_path, index_col=0)
    if all_labels.shape[1] != len(tools):
        # The cached file holds a different number of columns than requested:
        # keep only the requested tools (a missing tool raises KeyError here).
        all_labels = all_labels[tools]

In [24]:
# Number of label columns (one per annotation tool).
len(all_labels.columns)

5

In [25]:
# Show the label table: one row per cell, one column per annotation tool.
all_labels

Unnamed: 0,scina,scsorter,sctype,singler,scpred
Cell1001,Group2,Group2,Group2,Group2,Group2
Cell1002,Group4,Group2,Group2,Group4,Group2
Cell1003,Group1,Group4,Group2,Group2,
Cell1004,Group4,Group4,Group4,Group4,Group1
Cell1005,Group1,Group1,Group1,Group1,Group1
...,...,...,...,...,...
Cell1996,Group2,Group2,Group4,Group2,Group1
Cell1997,Group1,Group1,Group1,Group1,Group1
Cell1998,Group3,Group3,Group3,Group3,Group3
Cell1999,Group4,Group4,Group2,Group4,Group4


In [26]:
# Load the query count matrix and preprocess it without scaling.
raw_counts = pd.read_csv(data_path, index_col=0)
# preprocess also returns keep_cells, the selector that is applied to the
# label table and to the ground truth further down so all three stay aligned.
X, keep_cells = utilities.preprocess(np.array(raw_counts), scale=False)
X.shape

(999, 500)

In [27]:
# Keep only the label rows of the cells that survived preprocessing.
# NOTE: this overwrites all_labels, so the cell is not safe to run twice
# without re-running the label-loading cell first.
all_labels = all_labels.loc[keep_cells]

In [28]:
# Only the second value returned by read_marker_file (the group names) is needed.
_unused, marker_names = utilities.read_marker_file(marker_path)
marker_names

['Group1', 'Group2', 'Group3', 'Group4']

In [10]:
# Inspect the labels assigned by a single tool.
all_labels.loc[:, "scsorter"]

Cell1001    Group2
Cell1002    Group2
Cell1003    Group4
Cell1004    Group4
Cell1005    Group1
             ...  
Cell1996    Group2
Cell1997    Group1
Cell1998    Group3
Cell1999    Group4
Cell2000    Group2
Name: scsorter, Length: 999, dtype: object

In [29]:
# Turn the string labels into codes relative to marker_names, then encode the
# per-tool predictions into one row per cell and one column per group.
# NOTE(review): the displayed rows look like per-group vote counts (most rows
# sum to the number of tools) — confirm against utilities.encode_predictions.
all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)
encoded_labels

array([[0., 5., 0., 0.],
       [0., 3., 0., 2.],
       [1., 2., 0., 1.],
       ...,
       [0., 0., 5., 0.],
       [0., 1., 0., 4.],
       [0., 4., 0., 0.]])

In [30]:
# Ground-truth group of every query cell, as integer codes (sorted group order),
# restricted to the cells kept by preprocessing.
meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
group_codes, _group_levels = pd.factorize(metadata["Group"], sort=True)
real_y = group_codes[keep_cells]
real_y.shape

(999,)

In [31]:
# Number of kept cells whose true group has code 0.
int(np.count_nonzero(real_y == 0))

232

In [32]:
# Accuracy of each individual tool against the ground truth, over all kept cells.
for tool_name in ("scina", "sctype", "scsorter", "singler", "scpred"):
    tool_accuracy = utilities.pred_accuracy(all_labels_factored[tool_name], real_y)
    print(tool_accuracy)


0.792792797088623
0.8138138055801392
0.826826810836792
0.8408408164978027
0.6386386156082153


In [33]:
# Majority vote: for every cell pick the column (group) with the highest value
# in the encoded prediction matrix.
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
# pred_accuracy wraps its inputs in torch.tensor(...), which emits a
# copy-construct warning when it is handed a tensor, so pass a NumPy array.
utilities.pred_accuracy(max_pred.numpy(), real_y)

  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


0.9309309124946594

In [34]:
# Derive one label per cell from the encoded predictions; cells that receive no
# confident label are marked -1 (the -1 entries are used as the test set below).
# NOTE(review): necessary_vote is presumably the minimum number of agreeing
# tools required for a confident label — confirm in utilities.
confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)
confident_labels.shape

(999,)

In [35]:
# Positions of the cells with a consensus label (train) and without one (test).
train_nodes = np.flatnonzero(confident_labels != -1)
test_nodes = np.flatnonzero(confident_labels == -1)
# Sanity check: distinct labels overall, in the train split and in the test split.
for label_subset in (confident_labels, confident_labels[train_nodes], confident_labels[test_nodes]):
    print(np.unique(label_subset))

[-1.  0.  1.  2.  3.]
[0. 1. 2. 3.]
[-1.]


In [26]:
# True groups of the cells that did not get a consensus label.
np.take(real_y, test_nodes)

array([3, 0, 3, 0, 0, 0, 1, 2, 2, 2, 3, 0, 1, 1, 3, 2, 0, 1, 2, 3, 2, 1,
       0, 3, 3, 0, 3, 1, 2, 1, 1, 0, 0, 3, 2, 0, 0, 1, 2, 2, 0, 1, 1, 3,
       2, 0, 0, 0, 0, 3, 3, 3, 0, 3, 1, 0, 3, 3, 0, 1, 2, 2, 0, 2, 1, 0,
       3, 1, 0, 0, 2, 1, 3, 2, 0, 0, 0, 0, 2, 1, 0, 0, 2, 3, 0, 2, 3, 0,
       1, 2, 3, 2, 2, 2, 2, 0, 3, 0, 2, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 0,
       3, 1, 2, 2, 0, 0, 2, 0])

In [36]:
# How often the consensus label agrees with the ground truth on the train cells.
consensus_accuracy = utilities.pred_accuracy(
    confident_labels[train_nodes],
    real_y[train_nodes],
)
print(consensus_accuracy)

0.9724972248077393


In [37]:
# Number of cells without a consensus label.
test_nodes.shape[0]

90

In [38]:
# Accuracy of every tool, and of the plain majority vote, restricted to the
# cells that did not reach consensus. test_nodes holds positional indices.
for tool_name in ("scina", "sctype", "scsorter", "singler", "scpred"):
    # Index the underlying array so test_nodes is always treated as positions,
    # whatever index all_labels_factored carries (integer keys on a Series with
    # a non-integer index rely on a deprecated positional fallback).
    tool_preds = all_labels_factored[tool_name].to_numpy()[test_nodes]
    print(utilities.pred_accuracy(tool_preds, real_y[test_nodes]))
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
# A NumPy input avoids the copy-construct warning raised inside pred_accuracy,
# which wraps its arguments in torch.tensor(...).
print(utilities.pred_accuracy(max_pred.numpy()[test_nodes], real_y[test_nodes]))

0.23333333432674408
0.4555555582046509
0.35555556416511536
0.46666666865348816
0.2222222238779068
0.5111111402511597


In [39]:
loader_lib = torch.utils.data

# Training pairs: features with the consensus labels (-1 where no consensus),
# served in shuffled mini-batches.
dataset = loader_lib.TensorDataset(
    torch.tensor(X),
    torch.tensor(confident_labels),
)
dataloader = loader_lib.DataLoader(dataset, shuffle=True, batch_size=35)

# Evaluation pairs: the same features with the ground-truth labels, kept in
# their original order.
test_dataset = loader_lib.TensorDataset(
    torch.tensor(X),
    torch.tensor(real_y),
)
test_dataloader = loader_lib.DataLoader(test_dataset, shuffle=False, batch_size=35)

In [65]:
# Build the model from a config file with dropout disabled.
# NOTE(review): the positional 2 is presumably the neighbour count (the
# benchmark cell passes `neighbors = 2` next to a config path) — confirm
# against the GCNModel signature.
m = GCNModel("configs/3_25.txt", 2, dropout=0.0)

In [66]:
# Fit on the consensus-label loader. The log prints the loss every 10 epochs
# from 0 to 190, so the second argument appears to be the number of epochs.
m.train(dataloader, 200)

Loss in epoch 0 = 36.518837
Loss in epoch 10 = 0.023115
Loss in epoch 20 = 0.006610
Loss in epoch 30 = 0.001642
Loss in epoch 40 = 0.001037
Loss in epoch 50 = 0.000560
Loss in epoch 60 = 0.000466
Loss in epoch 70 = 0.000263
Loss in epoch 80 = 0.000593
Loss in epoch 90 = 0.000179
Loss in epoch 100 = 0.000171
Loss in epoch 110 = 0.000103
Loss in epoch 120 = 0.000110
Loss in epoch 130 = 0.000090
Loss in epoch 140 = 0.000041
Loss in epoch 150 = 0.000036
Loss in epoch 160 = 0.000035
Loss in epoch 170 = 0.000045
Loss in epoch 180 = 0.000042
Loss in epoch 190 = 0.000014


In [67]:
# Evaluate against the ground truth. The result is a 6-tuple of three
# (accuracy, confusion matrix) pairs.
# NOTE(review): the pairs look like all cells / train_nodes / test_nodes, in
# that order (the second accuracy matches the consensus-label accuracy on the
# train cells) — confirm against GCNModel.validation_metrics.
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9619619846343994,
 array([[219,   7,   4,   2],
        [  1, 265,   2,   0],
        [  2,   3, 225,   4],
        [  2,   6,   5, 252]]),
 0.9724972248077393,
 array([[178,   6,   2,   2],
        [  0, 264,   2,   0],
        [  1,   2, 211,   1],
        [  0,   5,   4, 231]]),
 0.855555534362793,
 array([[41,  1,  2,  0],
        [ 1,  1,  0,  0],
        [ 1,  1, 14,  3],
        [ 2,  1,  1, 21]]))

In [None]:
# Next step: start feeding predicted labels back in for the test (no-consensus) cells.

In [2]:
# Settings handed to test_model in the next cell.
# Cluster location of the simulated datasets; a local checkout would use
# "simulations/" as the root instead, and a single-dataset run would keep only
# "splat_0.7_de_rq".
sims_root = "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/"
data_folders = [
    sims_root + sim_name + "/"
    for sim_name in ("splat_0.6_de_rq", "splat_0.7_de_rq", "splat_0.8_de_rq")
]
tools = ["sctype", "scsorter", "scina", "singler", "scpred"]
votes_necessary = 3
model_file = "configs/2_25.txt"
neighbors = 2
batch_size = 20
training_epochs = 150
random_inits = 5

In [3]:
# Run the benchmark over every dataset with the settings from the config cell.
results = test_model(
    data_folders,
    tools,
    votes_necessary,
    model_file,
    neighbors,
    batch_size,
    training_epochs,
    random_inits,
)

[0.7839999794960022, 0.7870000004768372, 0.7919999957084656, 0.777999997138977, 0.7749999761581421]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9639639854431152, 0.9619619846343994, 0.9629629850387573, 0.9619619846343994, 0.9599599838256836]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


[0.9869869947433472, 0.9879879951477051, 0.9869869947433472, 0.9869869947433472, 0.9879879951477051]


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


In [4]:
# Baseline benchmark table returned by test_model: per dataset and method, the
# total / train / test accuracy and their standard deviations.
# Original note: results from no feed forward or gcn labels (i.e. GCN
# predictions are not fed back in as labels, unlike the cells below).
results

Unnamed: 0,data_name,method,total_accuracy,train_accuracy,test_accuracy,total_sd,train_sd,test_sd
0,splat_0.6_de_rq,GCN,0.7832,0.920684,0.535574,0.006834,0.0,0.019142
1,splat_0.6_de_rq,Max Col.,0.792,0.920684,0.560224,0.0,0.0,0.0
2,splat_0.6_de_rq,Confident Labels,,0.920684,,0.0,0.0,0.0
3,splat_0.6_de_rq,sctype,0.301,0.443235,0.044818,0.0,0.0,0.0
4,splat_0.6_de_rq,scsorter,0.677,0.861586,0.344538,0.0,0.0,0.0
5,splat_0.6_de_rq,scina,0.467,0.645412,0.145658,0.0,0.0,0.0
6,splat_0.6_de_rq,singler,0.84,0.917574,0.70028,0.0,0.0,0.0
7,splat_0.6_de_rq,scpred,0.503,0.62675,0.280112,0.0,0.0,0.0
0,splat_0.7_de_rq,GCN,0.962162,0.972497,0.857778,0.001485,0.0,0.01648
1,splat_0.7_de_rq,Max Col.,0.930931,0.972497,0.511111,0.0,0.0,0.0


In [6]:
# Test putting labels back in.
# For every dataset and every random initialisation: train a GCN on the
# consensus labels of the annotation tools, then repeatedly append the GCN's
# own predictions as an extra vote column ("gcn1", "gcn2", ...), rebuild the
# consensus labels and keep training the SAME model on them.  The printed
# numbers are the mean and standard deviation, over the initialisations, of
# the accuracy on the cells that had no consensus label before any GCN vote
# was added.
random_inits = 5
data_folders = ["/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.6_de_rq/", "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq/", "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.8_de_rq/"]
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
consensus_votes = 3      # passed as necessary_vote to get_consensus_labels
loader_batch_size = 20
initial_epochs = 150     # first fit, on the tool-only consensus labels
feedback_epochs = 50     # additional fit after each fed-back GCN vote column
feedback_rounds = 4      # number of gcn<j> vote columns added per initialisation

for data_folder in data_folders:
    data_path = data_folder + "query_counts.csv"
    ref_path = data_folder + "ref_counts.csv"
    ref_label_path = data_folder + "ref_labels.csv"
    marker_path = data_folder + "markers.txt"
    meta_path = data_folder + "query_meta.csv"

    X = pd.read_csv(data_path, index_col=0)
    X, keep_cells = utilities.preprocess(np.array(X), scale=False)
    print(X.shape)
    _,marker_names = utilities.read_marker_file(marker_path)

    metadata = pd.read_csv(meta_path, index_col=0)
    real_y = pd.factorize(metadata['Group'], sort=True)[0]
    real_y = real_y[keep_cells]

    # The tool predictions do not depend on the random initialisation, so they
    # are loaded (or computed) once per dataset instead of once per init.
    preds_path = data_folder + "preds.csv"
    if os.path.exists(preds_path):
        all_labels = pd.read_csv(preds_path, index_col=0)
        if all_labels.shape[1] != len(tools):
            all_labels = all_labels[tools]
            #raise Exception("wrong amount of tools in file")
    else:
        all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)
    all_labels = all_labels.loc[keep_cells,:]

    # The features and the ground-truth loader never change within a dataset.
    features = torch.tensor(X)
    test_dataset = torch.utils.data.TensorDataset(features, torch.tensor(real_y))
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

    test_accuracy = []
    for i in range(random_inits):
        # Work on a copy so the gcn<j> columns added below (and anything
        # factorize_df might do to its input) cannot leak into the next init.
        all_labels_factored = utilities.factorize_df(all_labels.copy(), marker_names)
        encoded_labels = utilities.encode_predictions(all_labels_factored)
        confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote=consensus_votes)
        train_nodes = np.where(confident_labels != -1)[0]
        # Cells without a tool-only consensus label: the fixed evaluation set.
        original_test_nodes = np.where(confident_labels == -1)[0]

        dataset = torch.utils.data.TensorDataset(features, torch.tensor(confident_labels))
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=loader_batch_size, shuffle=True)

        m = GCNModel("configs/2_25.txt", 2, dropout=0.0)
        m.train(dataloader, initial_epochs, verbose=False)

        new_labels, _ = m.predict(test_dataloader)
        new_labels = new_labels.max(dim=1)[1]

        for j in range(1, feedback_rounds + 1):
            # Add the latest GCN predictions as one more vote column.  A NumPy
            # array is stored rather than a tensor so the column gets a plain
            # integer dtype regardless of how pandas handles tensor inputs.
            col_name = "gcn" + str(j)
            all_labels_factored[col_name] = new_labels.cpu().numpy()
            encoded_labels = utilities.encode_predictions(all_labels_factored)
            confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote=consensus_votes)
            train_nodes = np.where(confident_labels != -1)[0]
            test_nodes = np.where(confident_labels == -1)[0]  # not used below; kept for inspection

            dataset = torch.utils.data.TensorDataset(features, torch.tensor(confident_labels))
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=loader_batch_size, shuffle=True)

            # Continue training the same model on the updated consensus labels.
            m.train(dataloader, feedback_epochs, verbose=False)

            new_labels, _ = m.predict(test_dataloader)
            new_labels = new_labels.max(dim=1)[1]

        # Only the fifth returned value is kept: the accuracy on the node set
        # passed last, i.e. the original no-consensus cells.
        _,_,_,_,accuracy,_ = m.validation_metrics(test_dataloader, train_nodes, original_test_nodes)
        test_accuracy.append(accuracy)
    print(statistics.mean(test_accuracy))
    # statistics.stdev raises for fewer than two values; report nan instead.
    print(statistics.stdev(test_accuracy) if len(test_accuracy) > 1 else float("nan"))

(1000, 500)
0.6011204719543457
0.026691604570561273
(999, 500)
0.848888885974884
0.023040499079638933
(999, 500)
0.7612903237342834
0.02885248584038609


In [7]:
# Test putting labels back in, with a new model each time.
# For every dataset and every random initialisation: train a GCN on the
# consensus labels of the annotation tools, then repeatedly append the latest
# GCN predictions as an extra vote column ("gcn1", "gcn2", ...), rebuild the
# consensus labels and train a FRESH model on them.  The printed numbers are
# the mean and standard deviation, over the initialisations, of the accuracy
# of the last model on the cells that had no consensus label before any GCN
# vote was added.
random_inits = 3
data_folders = ["/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.6_de_rq/", "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq/", "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.8_de_rq/"]
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
consensus_votes = 3      # passed as necessary_vote to get_consensus_labels
loader_batch_size = 20
epochs_per_model = 150   # every model (initial and per round) is trained this long
feedback_rounds = 4      # number of gcn<j> vote columns added per initialisation

for data_folder in data_folders:
    data_path = data_folder + "query_counts.csv"
    ref_path = data_folder + "ref_counts.csv"
    ref_label_path = data_folder + "ref_labels.csv"
    marker_path = data_folder + "markers.txt"
    meta_path = data_folder + "query_meta.csv"

    X = pd.read_csv(data_path, index_col=0)
    X, keep_cells = utilities.preprocess(np.array(X), scale=False)
    print(X.shape)
    _,marker_names = utilities.read_marker_file(marker_path)

    metadata = pd.read_csv(meta_path, index_col=0)
    real_y = pd.factorize(metadata['Group'], sort=True)[0]
    real_y = real_y[keep_cells]

    # The tool predictions do not depend on the random initialisation, so they
    # are loaded (or computed) once per dataset instead of once per init.
    preds_path = data_folder + "preds.csv"
    if os.path.exists(preds_path):
        all_labels = pd.read_csv(preds_path, index_col=0)
        if all_labels.shape[1] != len(tools):
            all_labels = all_labels[tools]
            #raise Exception("wrong amount of tools in file")
    else:
        all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)
    all_labels = all_labels.loc[keep_cells,:]

    # The features and the ground-truth loader never change within a dataset.
    features = torch.tensor(X)
    test_dataset = torch.utils.data.TensorDataset(features, torch.tensor(real_y))
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

    test_accuracy = []
    for i in range(random_inits):
        # Work on a copy so the gcn<j> columns added below (and anything
        # factorize_df might do to its input) cannot leak into the next init.
        all_labels_factored = utilities.factorize_df(all_labels.copy(), marker_names)
        encoded_labels = utilities.encode_predictions(all_labels_factored)
        confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote=consensus_votes)
        train_nodes = np.where(confident_labels != -1)[0]
        # Cells without a tool-only consensus label: the fixed evaluation set.
        original_test_nodes = np.where(confident_labels == -1)[0]

        dataset = torch.utils.data.TensorDataset(features, torch.tensor(confident_labels))
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=loader_batch_size, shuffle=True)

        m = GCNModel("configs/2_25.txt", 2, dropout=0.0)
        m.train(dataloader, epochs_per_model, verbose=False)

        new_labels, _ = m.predict(test_dataloader)
        new_labels = new_labels.max(dim=1)[1]

        for j in range(1, feedback_rounds + 1):
            # Add the latest GCN predictions as one more vote column.  A NumPy
            # array is stored rather than a tensor so the column gets a plain
            # integer dtype regardless of how pandas handles tensor inputs.
            col_name = "gcn" + str(j)
            all_labels_factored[col_name] = new_labels.cpu().numpy()
            encoded_labels = utilities.encode_predictions(all_labels_factored)
            confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote=consensus_votes)
            train_nodes = np.where(confident_labels != -1)[0]
            test_nodes = np.where(confident_labels == -1)[0]  # not used below; kept for inspection

            dataset = torch.utils.data.TensorDataset(features, torch.tensor(confident_labels))
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=loader_batch_size, shuffle=True)

            # Start from a freshly built model on the updated labels.
            m = GCNModel("configs/2_25.txt", 2, dropout=0.0)
            m.train(dataloader, epochs_per_model, verbose=False)

            new_labels, _ = m.predict(test_dataloader)
            new_labels = new_labels.max(dim=1)[1]

        # Only the fifth returned value is kept: the accuracy on the node set
        # passed last, i.e. the original no-consensus cells.
        _,_,_,_,accuracy,_ = m.validation_metrics(test_dataloader, train_nodes, original_test_nodes)
        test_accuracy.append(accuracy)
    print(statistics.mean(test_accuracy))
    # statistics.stdev raises for fewer than two values; report nan instead.
    print(statistics.stdev(test_accuracy) if len(test_accuracy) > 1 else float("nan"))

(1000, 500)
0.6302521228790283
0.014005601406097412
(999, 500)
0.8814814885457357
0.03394500312454106
(999, 500)
0.8387096722920736
0.08534682520664126
