In [76]:
import numpy as np
import pandas as pd
import torch
from gcn_model import GCNModel
import utilities
from test_model import test_model
import os
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import random

In [77]:
data_folder = "/home/groups/ConradLab/daniel/sharp_data/sharp_sims/splat_0.7_de_rq_v3/"
# get labels
data_path = data_folder + "query_counts.csv"
tools = ["sctype","scsorter","scina","singler", "scpred"]
#tools = ["scsorter","scina","singler"]
ref_path = data_folder + "ref_counts.csv"
ref_label_path = data_folder + "ref_labels.csv"
marker_path = data_folder + "markers.txt"
if os.path.exists(data_folder + "preds.csv"):
    all_labels = pd.read_csv(data_folder + "preds.csv", index_col=0)
    if all_labels.shape[1] != len(tools): 
        all_labels = all_labels[tools]
        #raise Exception("wrong amount of tools in file")
else:
    all_labels = utilities.label_counts(data_path,tools,ref_path,ref_label_path,marker_path)

# read in dataset
X = pd.read_csv(data_path, index_col=0)
X, keep_cells = utilities.preprocess(np.array(X), scale=False, comps=500)

all_labels = all_labels.loc[keep_cells,:]

_,marker_names = utilities.read_marker_file(marker_path)

all_labels_factored = utilities.factorize_df(all_labels, marker_names)
encoded_labels = utilities.encode_predictions(all_labels_factored)

meta_path = data_folder + "query_meta.csv"
metadata = pd.read_csv(meta_path, index_col=0)
real_y = pd.factorize(metadata['Group'], sort=True)[0]
real_y = real_y[keep_cells]

confident_labels = utilities.get_consensus_labels(encoded_labels, necessary_vote = 3)

train_nodes = np.where(confident_labels != -1)[0]
test_nodes = np.where(confident_labels == -1)[0]

In [39]:
len(test_nodes)

43

In [78]:
confusion_matrix(real_y[train_nodes], confident_labels[train_nodes])

array([[357,   8,   7,   0],
       [  0, 281,   1,   0],
       [  1,   0, 187,   0],
       [  2,   8,   4,  10]])

In [79]:
confusion_matrix(real_y, all_labels_factored["scina"])

array([[  0,   0,   0,   0,   0],
       [236, 126,  20,  37,   5],
       [ 99,   6, 165,  14,   5],
       [ 49,   8,   7, 133,   2],
       [ 51,   7,   5,  10,  15]])

In [80]:
confusion_matrix(real_y, all_labels_factored["sctype"])

array([[400,  13,  11,   0],
       [  7, 270,  12,   0],
       [  9,   8, 182,   0],
       [ 19,  48,  21,   0]])

In [81]:
confusion_matrix(real_y, all_labels_factored["singler"])

array([[280, 106,  30,   8],
       [  0, 289,   0,   0],
       [  0,   3, 196,   0],
       [  2,  16,   6,  64]])

In [82]:
confusion_matrix(real_y, all_labels_factored["scpred"])

array([[  0,   0,   0,   0,   0],
       [ 34, 351,  18,  21,   0],
       [ 35,  13, 210,  31,   0],
       [ 28,   6,  17, 148,   0],
       [ 36,  13,  19,  20,   0]])

In [83]:
confusion_matrix(real_y, all_labels_factored["scsorter"])

array([[333,  46,  23,  22],
       [  3, 279,   4,   3],
       [  4,   8, 175,  12],
       [  5,  14,   6,  63]])

In [41]:
print(utilities.pred_accuracy(all_labels_factored['scina'], real_y))
print(utilities.pred_accuracy(all_labels_factored['sctype'], real_y))
print(utilities.pred_accuracy(all_labels_factored['scsorter'], real_y))
print(utilities.pred_accuracy(all_labels_factored['singler'], real_y))
print(utilities.pred_accuracy(all_labels_factored['scpred'], real_y))
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
utilities.pred_accuracy(max_pred, real_y)

0.8298298120498657
0.9319319128990173
0.9069069027900696
0.9479479193687439
0.8608608841896057


  return float((torch.tensor(preds) == torch.tensor(real)).type(torch.FloatTensor).mean().numpy())


0.9649649858474731

In [42]:
print(utilities.pred_accuracy(np.array(all_labels_factored['scina'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['sctype'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['scsorter'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['singler'][test_nodes]), real_y[test_nodes]))
print(utilities.pred_accuracy(np.array(all_labels_factored['scpred'][test_nodes]), real_y[test_nodes]))
max_pred = torch.tensor(encoded_labels).max(dim=1)[1]
print(utilities.pred_accuracy(max_pred[test_nodes], real_y[test_nodes]))

0.04651162773370743
0.302325576543808
0.5581395626068115
0.6976743936538696
0.1860465109348297
0.5116279125213623


In [43]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(confident_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X), torch.tensor(real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)

In [44]:
m = GCNModel("configs/2_40.txt", 2, dropout=0.0)
m.train(dataloader, 150)

Loss in epoch 0 = 24.393032
Loss in epoch 10 = 0.060813
Loss in epoch 20 = 0.012397
Loss in epoch 30 = 0.007144
Loss in epoch 40 = 0.003823
Loss in epoch 50 = 0.002240
Loss in epoch 60 = 0.001888
Loss in epoch 70 = 0.001163
Loss in epoch 80 = 0.000829
Loss in epoch 90 = 0.000842
Loss in epoch 100 = 0.000439
Loss in epoch 110 = 0.000487
Loss in epoch 120 = 0.000420
Loss in epoch 130 = 0.000350
Loss in epoch 140 = 0.000689


In [45]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9699699878692627,
 array([[421,   1,   2,   0],
        [  0, 288,   0,   0],
        [  0,   0, 198,   1],
        [  8,   3,  15,  62]]),
 0.9853556752204895,
 array([[413,   1,   2,   0],
        [  0, 286,   0,   0],
        [  0,   0, 198,   1],
        [  4,   1,   5,  45]]),
 0.6279069781303406,
 array([[ 8,  0,  0,  0],
        [ 0,  2,  0,  0],
        [ 0,  0,  0,  0],
        [ 4,  2, 10, 17]]))

In [None]:
# add copies of confidently labelled cell type 4 to boost numbers

In [59]:
X.shape

(999, 500)

In [60]:
conf_four_cells = np.where(confident_labels == 3)[0]
X[conf_four_cells,:]

array([[-5.661688  , -2.7192438 ,  1.2781072 , ...,  0.3024522 ,
         0.32953277,  2.059688  ],
       [ 5.551842  ,  4.5610647 ,  1.7842762 , ...,  0.58477086,
         0.43750235,  1.3204188 ],
       [ 5.095987  ,  3.1165342 ,  5.7744884 , ..., -1.1838729 ,
         0.24187106, -0.04682564],
       ...,
       [-1.3873566 , -1.4106914 ,  1.9305731 , ..., -1.2444688 ,
        -0.38206223, -0.56642795],
       [-1.5110993 , -0.22504108,  1.521301  , ...,  0.25146037,
        -1.7881571 ,  0.6317005 ],
       [ 4.829648  ,  3.8642364 ,  3.1150396 , ...,  0.01267185,
        -0.1364863 ,  1.3072101 ]], dtype=float32)

In [61]:
repeated_four = np.tile(X[conf_four_cells,:], (2,1))
repeated_four.shape

(92, 500)

In [62]:
test = np.array(X[confident_labels==0,18], dtype="float64")
statistics.stdev(test)

1.9519283509764285

In [63]:
random_mat = np.zeros(repeated_four.shape)
for i in range(X.shape[1]):
    temp_sd = statistics.stdev(np.array(X[confident_labels==0,i], dtype="float64"))
    random_column = np.random.normal(0,temp_sd, repeated_four.shape[0])
    random_mat[:,i] = random_column

In [64]:
random_mat

array([[-4.65346652e+00, -1.00392066e+00,  8.88266010e-01, ...,
        -1.38436306e+00, -1.11398303e+00, -1.15993156e+00],
       [ 2.09293517e+00,  1.96142170e-01, -6.12993509e-01, ...,
         1.27789133e+00,  4.57666604e-01,  3.60618482e-01],
       [-5.29424811e+00,  6.43562607e-02,  2.12086386e+00, ...,
        -8.32679063e-02,  2.76405603e-01,  2.58014296e-02],
       ...,
       [ 4.91545371e+00,  1.89572213e+00,  2.15827891e+00, ...,
         1.91989216e-01, -2.54953900e-02,  3.52407708e-01],
       [ 6.75285044e+00, -1.90097268e+00, -9.09240298e-01, ...,
         1.51190948e+00,  1.22089246e+00, -4.97614741e-01],
       [-6.63780164e-01, -1.56258455e+00, -1.27789867e+00, ...,
        -1.32820775e+00,  1.22304758e-03, -4.77185170e-01]])

In [65]:
#repeated_four = repeated_four + np.random.normal(0, 4, repeated_four.shape)
repeated_four = repeated_four + random_mat
repeated_four

array([[-10.31515437,  -3.72316443,   2.16637318, ...,  -1.08191085,
         -0.78445026,   0.89975653],
       [  7.64477738,   4.75720689,   1.17128274, ...,   1.86266219,
          0.89516896,   1.68103732],
       [ -0.19826127,   3.18089049,   7.89535231, ...,  -1.26714084,
          0.51827666,  -0.02102421],
       ...,
       [  3.52809707,   0.48503075,   4.08885202, ...,  -1.05247959,
         -0.40755762,  -0.21402024],
       [  5.2417511 ,  -2.12601376,   0.61206073, ...,   1.76336985,
         -0.56726464,   0.13408577],
       [  4.16586785,   2.3016518 ,   1.83714092, ...,  -1.3155359 ,
         -0.13526326,   0.83002492]])

In [66]:
X_extended = np.concatenate((X, repeated_four), axis=0)
X_extended.shape

(1091, 500)

In [67]:
confident_labels.shape

(999,)

In [68]:
extended_conf_labels = np.concatenate((confident_labels, np.array([3]*92)))
extended_conf_labels.shape

(1091,)

In [69]:
extended_real_y = np.concatenate((real_y, np.array([3]*92)))
extended_real_y.shape

(1091,)

In [70]:
# mix in fake nodes
random.seed(8)
shuffled = list(range(len(extended_real_y)))
random.shuffle(shuffled)
extended_real_y = extended_real_y[shuffled]
extended_conf_labels = extended_conf_labels[shuffled]
X_extended = X_extended[shuffled,:]
train_nodes = np.where(extended_conf_labels != -1)[0]
test_nodes = np.where(extended_conf_labels == -1)[0]

In [71]:
len(test_nodes)

43

In [72]:
extended_conf_labels[1:100]

array([ 1.,  0.,  0.,  1.,  0.,  0.,  3.,  0.,  1.,  0.,  0.,  1.,  0.,
        2.,  2.,  3.,  0.,  1.,  2., -1.,  2.,  2.,  3.,  2.,  0.,  0.,
        3.,  1.,  1.,  3.,  3.,  3.,  0.,  2.,  0.,  3.,  1.,  0.,  0.,
        0.,  1.,  3.,  0.,  3.,  1.,  0.,  2.,  0.,  3.,  2.,  0.,  1.,
        1.,  2.,  1.,  2.,  2.,  2.,  3.,  0.,  0.,  3.,  0.,  1.,  0.,
        1.,  2.,  1.,  2.,  0.,  3.,  1., -1.,  2.,  2.,  1.,  0.,  2.,
        3., -1.,  1., -1.,  1.,  0.,  2.,  2.,  0.,  0.,  2.,  2.,  0.,
        1.,  3.,  0.,  1.,  2.,  0.,  1.,  1.])

In [73]:
dataset  = torch.utils.data.TensorDataset(torch.tensor(X_extended), torch.tensor(extended_conf_labels))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

test_dataset  = torch.utils.data.TensorDataset(torch.tensor(X_extended), torch.tensor(extended_real_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=False)

In [74]:
m = GCNModel("configs/2_40.txt", 2, dropout=0.0)
m.train(dataloader, 150)

Loss in epoch 0 = 27.671219
Loss in epoch 10 = 0.084420
Loss in epoch 20 = 0.013673
Loss in epoch 30 = 0.007804
Loss in epoch 40 = 0.003306
Loss in epoch 50 = 0.002280
Loss in epoch 60 = 0.001337
Loss in epoch 70 = 0.001127
Loss in epoch 80 = 0.001028
Loss in epoch 90 = 0.000634
Loss in epoch 100 = 0.000479
Loss in epoch 110 = 0.000367
Loss in epoch 120 = 0.000379
Loss in epoch 130 = 0.000268
Loss in epoch 140 = 0.000334


In [75]:
m.validation_metrics(test_dataloader, train_nodes, test_nodes)

(0.9715856909751892,
 array([[421,   1,   2,   0],
        [  0, 287,   1,   0],
        [  0,   0, 198,   1],
        [  7,   4,  15, 154]]),
 0.9866412281990051,
 array([[413,   1,   2,   0],
        [  0, 286,   0,   0],
        [  0,   0, 198,   1],
        [  4,   1,   5, 137]]),
 0.604651153087616,
 array([[ 8,  0,  0,  0],
        [ 0,  1,  1,  0],
        [ 0,  0,  0,  0],
        [ 3,  3, 10, 17]]))