In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import random
import utilities
import torch
from label_sharpen_arm import LabelSharpen
from label_prop_arm import LabelProp
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

In [2]:
# Load every simulated dataset found under simulations/splat_<level>_de/
# for level = 0.1, 0.2, 0.3, 0.4. Each entry of `datasets` is a tuple
# (count matrix, integer-coded true groups, integer-coded tool predictions).
datasets = []
for sim_level in np.arange(.1, .5, .1):
    # np.arange accumulates floating-point error (e.g. 0.30000000000000004);
    # rounding restores the value that appears in the directory name.
    sim_level = round(sim_level, 5)
    file_path = "simulations/splat_" + str(sim_level) + "_de/"
    temp_data = pd.read_csv(file_path + "counts.csv", index_col=0)
    temp_meta = pd.read_csv(file_path + "meta.csv", index_col=0)
    temp_preds = pd.read_csv(file_path + "predictions.csv", index_col=0)
    temp_X = np.array(temp_data)
    temp_y = pd.factorize(temp_meta['Group'], sort=True)[0]

    # Integer-code each tool's column separately (missing values become -1).
    # The result has a fresh RangeIndex, one column per tool.
    # NOTE(review): because every column is factorized on its own, the codes
    # only line up with temp_y when a tool emits exactly the same label set as
    # meta['Group'] — worth verifying for tools that score far below the others.
    temp_preds = pd.DataFrame({
        tool: pd.factorize(temp_preds[tool], sort=True)[0]
        for tool in temp_preds.columns
    })
    datasets.append((temp_X, temp_y, temp_preds))

In [3]:
def randomize_encoding(row):
    """Soften a one-hot row by moving a random share of its mass elsewhere.

    A share ``change`` is drawn from N(0.5, 0.2) and clamped to [0, 1]. The
    arg-max entry becomes ``1 - change`` and ``change`` is split evenly
    between (up to) two other randomly chosen positions.

    Args:
        row: 1-D float array; it is modified in place.

    Returns:
        The same ``row`` object after modification. A row with a single entry
        is returned unchanged because there is nowhere to move mass to.
    """
    # Clamp on both sides: a draw above 1 would otherwise make the arg-max
    # entry negative.
    change = min(max(np.random.normal(.5, .2), 0), 1)
    max_index = np.argmax(row)
    other_indices = [idx for idx in range(len(row)) if idx != max_index]
    if not other_indices:
        return row

    # Rows with fewer than three entries cannot supply two distinct targets.
    selections = random.sample(other_indices, min(2, len(other_indices)))
    row[max_index] = 1 - change
    for idx in selections:
        row[idx] = change / len(selections)

    return row

In [28]:
def prep_data(dataset, normalize=True):
    """Aggregate the tools' predictions into one soft-label row per sample.

    Args:
        dataset: ``(features, labels, preds)`` tuple as stored in ``datasets``.
            ``preds`` is a DataFrame with one integer-coded column per tool;
            the code ``-1`` is treated as "unknown" and contributes no vote.
        normalize: when True (default) every row of vote counts is divided by
            its total so that it sums to 1. When False the raw counts are
            returned.

    Returns:
        ``(final_encoded, features, labels)`` where ``final_encoded`` has one
        row per sample and one column per non-unknown prediction code,
        ``features`` is the output of ``utilities.preprocess(features,
        scale=False)`` and ``labels`` is passed through untouched.
    """
    features, labels, preds = dataset[0], dataset[1], dataset[2]
    n_samples, n_tools = preds.shape

    # Stack the tool columns end to end: tool 0's rows, then tool 1's, ...
    all_preds = np.array(
        [preds.iloc[:, col].to_numpy() for col in range(n_tools)]
    ).flatten()

    # Append a sentinel -1 so the encoder always learns an "unknown" category,
    # even when no tool produced one. OneHotEncoder sorts its categories, so
    # with non-negative class codes -1 comes first and drop='first' removes
    # that column. The sentinel row itself is sliced off again.
    all_preds = np.append(all_preds, -1)
    enc = OneHotEncoder(drop='first')
    encoded_y = enc.fit_transform(all_preds.reshape(-1, 1)).toarray()[:-1, :]

    # Add the tools' one-hot blocks together to get vote counts per sample.
    final_encoded = np.zeros(shape=(n_samples, encoded_y.shape[1]))
    for tool in range(n_tools):
        final_encoded += encoded_y[tool * n_samples:(tool + 1) * n_samples, :]

    if normalize:
        # Turn counts into probabilities by dividing each row by its sum.
        vote_totals = final_encoded.sum(axis=1, keepdims=True)
        no_votes = vote_totals[:, 0] == 0
        # Dividing by 1 instead of 0 avoids the NaN rows that 0/0 would give.
        final_encoded = final_encoded / np.where(vote_totals == 0, 1, vote_totals)
        if no_votes.any() and final_encoded.shape[1] > 0:
            # Every tool said "unknown": fall back to a uniform distribution.
            final_encoded[no_votes, :] = 1.0 / final_encoded.shape[1]

    features = utilities.preprocess(features, scale=False)

    return final_encoded, features, labels


In [79]:
# prep 0.1 simulated training and validation sets
# (datasets[0] is read from "simulations/splat_0.1_de/", not the 0.4 simulation)
N_TRAIN = 800  # rows [0, N_TRAIN) are used for training, the rest for validation
encoded_y_one, features_one, y_one = prep_data(datasets[0])

train_features, test_features = features_one[:N_TRAIN, :], features_one[N_TRAIN:, :]
# NOTE: despite the "_X" suffix these hold the soft labels built from the tools'
# predictions, not the expression features.
train_X, test_X = encoded_y_one[:N_TRAIN, :], encoded_y_one[N_TRAIN:, :]
train_y, test_y = y_one[:N_TRAIN], y_one[N_TRAIN:]

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_features), torch.tensor(train_X), torch.tensor(train_y))
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=35, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_features), torch.tensor(test_X), torch.tensor(test_y))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=35, shuffle=True)

In [33]:
# prep test 0.3  (datasets[2] is read from "simulations/splat_0.3_de/")
encoded_y_three, features_three, y_three = prep_data(datasets[2])

# Each item is (features, soft labels from the tools, true label).
test_dataset_three = torch.utils.data.TensorDataset(
    torch.tensor(features_three),
    torch.tensor(encoded_y_three),
    torch.tensor(y_three),
)
test_dataloader_three = torch.utils.data.DataLoader(
    dataset=test_dataset_three,
    batch_size=35,
    shuffle=True,
)

In [34]:
# prep test 0.2  (datasets[1] is read from "simulations/splat_0.2_de/")
encoded_y_two, features_two, y_two = prep_data(datasets[1])

# Each item is (features, soft labels from the tools, true label).
test_dataset_two = torch.utils.data.TensorDataset(
    torch.tensor(features_two),
    torch.tensor(encoded_y_two),
    torch.tensor(y_two),
)
test_dataloader_two = torch.utils.data.DataLoader(
    dataset=test_dataset_two,
    batch_size=35,
    shuffle=True,
)

In [35]:
# prep test 0.1  (datasets[0] is read from "simulations/splat_0.1_de/")
# NOTE: this re-runs prep_data on datasets[0] and reassigns encoded_y_one /
# features_one / y_one. The loader built here spans every row of that dataset,
# including the first 800 rows that the training split is taken from.
encoded_y_one, features_one, y_one = prep_data(datasets[0])

test_dataset_one = torch.utils.data.TensorDataset(
    torch.tensor(features_one),
    torch.tensor(encoded_y_one),
    torch.tensor(y_one),
)
test_dataloader_one = torch.utils.data.DataLoader(
    dataset=test_dataset_one,
    batch_size=35,
    shuffle=True,
)

In [86]:
# Build the label-sharpening arm; the first argument is presumably the path of
# its configuration file.
# NOTE(review): the meaning of the second positional argument (2) is not visible
# in this notebook — confirm against label_sharpen_arm.LabelSharpen.
arm = LabelSharpen("configs/sharpen_basic.txt", 2)

In [87]:
# Fit the arm on the training loader. The second argument is presumably the
# number of epochs (the log below reports losses for epochs 0 to 90) — confirm
# in LabelSharpen.train.
arm.train(train_dataloader,100)

Loss in epoch 0 = 28.803392
Loss in epoch 10 = 13.313494
Loss in epoch 20 = 11.186023
Loss in epoch 30 = 10.507597
Loss in epoch 40 = 10.075891
Loss in epoch 50 = 9.643491
Loss in epoch 60 = 9.346367
Loss in epoch 70 = 8.955503
Loss in epoch 80 = 8.736804
Loss in epoch 90 = 8.839277


In [88]:
# Metrics on the training split. Judging by the output below, the call returns
# (accuracy, confusion matrix) — confirm in LabelSharpen.validation_metrics.
arm.validation_metrics(train_dataloader)

(0.9049999713897705,
 array([[329,  10,   2,   5],
        [ 11, 150,  10,   2],
        [  7,   9, 157,   4],
        [  5,   7,   4,  88]]))

In [97]:
# Baseline: accuracy of the SCINA column alone on the validation rows (800 onward).
(
    torch.tensor(test_y) == torch.tensor(datasets[0][2]['SCINA'].to_numpy()[800:])
).type(torch.FloatTensor).mean()

tensor(0.9150)

In [83]:
# Baseline on the training split: take the arg-max of the aggregated tool votes
# as the prediction and measure how often it equals the true label.
final_pred = torch.tensor(train_X).max(dim=1).indices
equality = final_pred == torch.tensor(train_y)
equality.type(torch.FloatTensor).mean()

tensor(0.8925)

In [89]:
# Metrics on the validation split (rows 800 onward of datasets[0]).
arm.validation_metrics(test_dataloader)

(0.8899999856948853,
 array([[82,  1,  1,  1],
        [ 6, 40,  3,  1],
        [ 3,  0, 36,  3],
        [ 2,  1,  0, 20]]))

In [90]:
# Baseline on the validation split: arg-max of the aggregated tool votes
# compared with the true label.
final_pred = torch.tensor(test_X).max(dim=1).indices
equality = final_pred == torch.tensor(test_y)
equality.type(torch.FloatTensor).mean()

tensor(0.8900)

In [62]:
# Metrics of the trained arm on the full 0.3 simulation (datasets[2]).
# NOTE(review): the execution count of this cell (62) is lower than that of the
# cell creating `arm` (86), so the output may stem from an earlier model — re-run to confirm.
arm.validation_metrics(test_dataloader_three)

(1.0,
 array([[431,   0,   0,   0],
        [  0, 223,   0,   0],
        [  0,   0, 219,   0],
        [  0,   0,   0, 127]]))

In [43]:
# Baseline for the 0.3 simulation: arg-max of the aggregated tool votes
# compared with the true label.
final_pred = torch.tensor(encoded_y_three).max(dim=1).indices
equality = final_pred == torch.tensor(y_three)
equality.type(torch.FloatTensor).mean()

tensor(1.)

In [63]:
# Metrics of the trained arm on the full 0.2 simulation (datasets[1]).
# NOTE(review): the execution count of this cell (63) is lower than that of the
# cell creating `arm` (86), so the output may stem from an earlier model — re-run to confirm.
arm.validation_metrics(test_dataloader_two)

(0.9959999918937683,
 array([[428,   3,   0,   0],
        [  0, 223,   0,   0],
        [  1,   0, 218,   0],
        [  0,   0,   0, 127]]))

In [45]:
# Baseline for the 0.2 simulation: arg-max of the aggregated tool votes
# compared with the true label.
final_pred = torch.tensor(encoded_y_two).max(dim=1).indices
equality = final_pred == torch.tensor(y_two)
equality.type(torch.FloatTensor).mean()

tensor(0.9980)

In [46]:
# Metrics of the trained arm on the full 0.1 simulation (datasets[0]).
# NOTE(review): this loader includes the first 800 rows that the training split
# is drawn from, so the figure is not a held-out score. The execution count (46)
# is also lower than that of the cell creating `arm` (86) — re-run to confirm.
arm.validation_metrics(test_dataloader_one)

(0.8090000152587891,
 array([[380,   8,  16,  27],
        [ 70, 120,  16,  17],
        [ 11,   4, 202,   2],
        [  6,   3,  11, 107]]))

In [47]:
# Baseline for the 0.1 simulation: arg-max of the aggregated tool votes
# compared with the true label.
final_pred = torch.tensor(encoded_y_one).max(dim=1).indices
equality = final_pred == torch.tensor(y_one)
equality.type(torch.FloatTensor).mean()

tensor(0.8920)

In [3]:
def prep_data_test(dataset):
    """Count, per sample, how many tools voted for each prediction code.

    Same pipeline as ``prep_data`` except that the vote counts are returned
    raw instead of being normalised to probabilities.

    Args:
        dataset: ``(features, labels, preds)`` tuple; ``preds`` is a DataFrame
            with one integer-coded column per tool.

    Returns:
        ``(final_encoded, features, labels)`` where ``final_encoded`` holds the
        raw vote counts, ``features`` is the output of
        ``utilities.preprocess(features, scale=False)`` and ``labels`` is
        passed through untouched.
    """
    features, labels, preds = dataset[0], dataset[1], dataset[2]
    n_rows = preds.shape[0]

    # Lay the tool columns end to end: tool 0's rows, then tool 1's, ...
    per_tool = [preds.iloc[:, col].to_numpy() for col in range(preds.shape[1])]
    all_preds = np.array(per_tool).flatten()

    #add -1 then remove so encoder takes into account unknowns even if there isn't any
    all_preds = np.append(all_preds, -1)
    enc = OneHotEncoder(drop='first')
    encoded_y = enc.fit_transform(all_preds.reshape(-1, 1)).toarray()
    encoded_y = encoded_y[:-1, :]

    # Sum the per-tool one-hot blocks, in tool order, into one count matrix.
    n_chunks = len(encoded_y) // n_rows
    final_encoded = sum(
        (encoded_y[k * n_rows:(k + 1) * n_rows, :] for k in range(n_chunks)),
        np.zeros(shape=(n_rows, encoded_y.shape[1])),
    )

    # Normalisation to probabilities is intentionally switched off here:
    #final_encoded = final_encoded / final_encoded.sum(axis=1, keepdims=True)
    """enc = OneHotEncoder()
    encoded_y = enc.fit_transform(labels.reshape(-1,1)).toarray()
    encoded_y = np.apply_along_axis(randomize_encoding, 1, encoded_y)"""

    features = utilities.preprocess(features, scale=False)

    return final_encoded, features, labels

In [4]:
# Rebuild the 0.1 simulation with raw (un-normalised) vote counts so that the
# number of agreeing tools per row can be thresholded in the cells below.
# NOTE: this reassigns encoded_y_one / features_one / y_one.
encoded_y_one, features_one, y_one = prep_data_test(datasets[0])

In [5]:
# Peek at rows 1-9 of the vote counts (the 1:10 slice skips row 0); each entry
# is the number of tools that voted for that class.
encoded_y_one[1:10,:]

array([[2., 0., 0., 1.],
       [0., 0., 2., 0.],
       [1., 0., 2., 0.],
       [2., 0., 0., 0.],
       [0., 2., 0., 0.],
       [3., 0., 0., 0.],
       [0., 2., 0., 0.],
       [0., 0., 2., 0.],
       [0., 1., 1., 0.]])

In [6]:
# A row gets a "confident" label when its most-voted class has more than one
# vote; every other row is marked -1. The result is a float array, one entry per row.
top_class = np.argmax(encoded_y_one, axis=1)
top_votes = encoded_y_one.max(axis=1)
confident_labels = np.where(top_votes > 1, top_class, -1).astype(float)


In [7]:
# Among the rows that received a confident label, count how many match the truth.
has_label = confident_labels != -1
correct = int(np.sum(confident_labels[has_label] == y_one[has_label]))
total = int(np.sum(has_label))

print(correct)
print(total)

788
840


In [13]:
# Row indices of the samples that did not receive a confident label.
test_nodes = np.flatnonzero(confident_labels == -1)

In [48]:
# Build the label-propagation arm; the first argument is presumably the path of
# its configuration file.
# NOTE(review): the meaning of the second positional argument (2) is not visible
# in this notebook — confirm against label_prop_arm.LabelProp.
prop_arm = LabelProp("configs/semi_basic_linear.txt", 2)

In [49]:
# Training pairs: features with the confident labels (-1 where no confident label exists).
prop_dataset = torch.utils.data.TensorDataset(
    torch.tensor(features_one),
    torch.tensor(confident_labels),
)
prop_dataloader = torch.utils.data.DataLoader(
    dataset=prop_dataset,
    batch_size=35,
    shuffle=True,
)

# Evaluation pairs: the same features with the true labels. Shuffling is off,
# presumably so that row positions still match the indices in test_nodes.
prop_test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(features_one),
    torch.tensor(y_one),
)
prop_test_dataloader = torch.utils.data.DataLoader(
    dataset=prop_test_dataset,
    batch_size=35,
    shuffle=False,
)

In [50]:
# Fit the propagation arm on the confident labels. The second argument is
# presumably the number of epochs (the log below reports losses for epochs
# 0 to 90) — confirm in LabelProp.train.
prop_arm.train(prop_dataloader, 100)

Loss in epoch 0 = 38.440067
Loss in epoch 10 = 3.111017
Loss in epoch 20 = 0.272159
Loss in epoch 30 = 0.074041
Loss in epoch 40 = 0.031537
Loss in epoch 50 = 0.020813
Loss in epoch 60 = 0.011673
Loss in epoch 70 = 0.009322
Loss in epoch 80 = 0.004970
Loss in epoch 90 = 0.004087


In [51]:
# Evaluate against the true labels; test_nodes holds the indices of the rows
# without a confident label. Judging by the output below, the call returns
# (overall accuracy, overall confusion matrix, accuracy on test_nodes,
# confusion matrix on test_nodes) — confirm in LabelProp.validation_metrics.
prop_arm.validation_metrics(prop_test_dataloader, test_nodes=test_nodes)

(0.9129999876022339,
 array([[398,  13,  13,   7],
        [ 11, 200,   8,   4],
        [  6,   8, 204,   1],
        [  6,   5,   5, 111]]),
 0.78125,
 array([[68,  3, 10,  3],
        [ 2, 27,  4,  2],
        [ 4,  2, 21,  1],
        [ 1,  1,  2,  9]]))

Confident labels: 788/840
Unconfident labels: 125/160
Total: 913/1000 = 91.3%

In [52]:
# Accuracy of each individual tool over all rows of the selected dataset.
# NOTE: `id` shadows the builtin of the same name; the name is kept because the
# following cell reads it. y_one belongs to datasets[0], so `id` must stay 0 here.
id = 0
for tool_name in ('scSorter', 'SCINA', 'scType'):
    tool_codes = torch.tensor(datasets[id][2][tool_name])
    print((tool_codes == torch.tensor(y_one)).type(torch.FloatTensor).mean())

tensor(0.7890)
tensor(0.9230)
tensor(0.0630)


In [53]:
# Accuracy of each individual tool restricted to the rows without a confident label.
truth_on_test_nodes = torch.tensor(y_one[test_nodes])
for tool_name in ('scSorter', 'SCINA', 'scType'):
    tool_codes = torch.tensor(datasets[id][2][tool_name])[test_nodes]
    print((tool_codes == truth_on_test_nodes).type(torch.FloatTensor).mean())

tensor(0.1063)
tensor(0.8438)
tensor(0.)
