In [1]:
from ACDC.random_walk_classifier import * 
from ACDC.cell_type_annotation import * 

In [2]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the gzip-compressed benchmark CSV from the relative data directory.
path = 'data/AML_benchmark/'
df = pd.read_csv(path + 'AML_benchmark.csv.gz', sep=',', header=0, compression='gzip')

# Remove these eight columns; everything left is treated below as channel
# columns followed by one final label column.
df = df.drop(['Time', 'Cell_length', 'file_number', 'event_number', 'DNA1(Ir191)Di',
              'DNA2(Ir193)Di', 'Viability(Pt195)Di', 'subject'], axis=1)

# Keep only the text before the first '(' in each channel column name.
# split() leaves a name without '(' untouched, whereas slicing with
# find() == -1 would silently chop off its last character.
channels = [item.split('(', 1)[0] for item in df.columns[:-1]]
df.columns = channels + ['cell_type']

# Discard rows whose label is 'NotDebrisSinglets'.
df = df.loc[df['cell_type'] != 'NotDebrisSinglets']

# Cell-type table: first CSV column becomes the index, missing entries become 0.
table = pd.read_csv(path + 'AML_table.csv', sep=',', header=0, index_col=0)
table = table.fillna(0)

# get_label is not defined in this notebook (presumably provided by the ACDC
# star imports). Its second return value replaces `channels` from here on.
cts, channels = get_label(table)


In [3]:
# Feature matrix: arcsinh((x - 1) / 5) applied to the selected channel columns.
X0 = np.arcsinh((df[channels].values - 1.0) / 5.0)

# Lookups between integer class index and cell-type name. The table's index
# supplies the known types; 'unknown' gets the next free index after them.
idx2ct = list(table.index)
idx2ct.append('unknown')

ct2idx = {key: idx for idx, key in enumerate(table.index)}
ct2idx['unknown'] = len(table.index)

# Row-wise sum of absolute table entries.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor already used for X0 above.
ct_score = np.abs(table.values).sum(axis=1)

## compute manual gated label
# Labels missing from the table map to the 'unknown' index. dtype=float keeps
# the same array type the original np.zeros(...) buffer produced.
y0 = np.array([ct2idx.get(ct, ct2idx['unknown']) for ct in df.cell_type], dtype=float)
        

In [4]:
# `time` is imported here because this cell is the first one to use it; the
# notebook's own `import time` only appears in a later cell, so a fresh
# Restart & Run All would otherwise depend on the star imports exporting it.
import time

import phenograph

# Time a PhenoGraph run on the full matrix. The clustering result itself is
# discarded; only the elapsed time is printed.
# time.clock() was removed in Python 3.8. time.process_time() reports the CPU
# time of this process (not wall-clock time), which is what time.clock()
# returned on Unix.
tic = time.process_time()
phenograph.cluster(X0)
toc = time.process_time()
print(toc - tic)

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 81.13759088516235 seconds
Jaccard graph constructed in 8.448960304260254 seconds
Wrote graph to binary file in 5.9023706912994385 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873828
After 11 runs, maximum modularity is Q = 0.875041
Louvain completed 31 runs in 172.93613123893738 seconds
PhenoGraph complete in 269.9322965145111 seconds
550.4218629999999


In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix


# Threshold handed unchanged to get_unique_index() and get_landmarks() in the
# landmark-construction cell.
thres = 0.5


In [6]:
import time

result = []
score_final = []
process_time = []

# Work on the full data set prepared earlier.
X = X0
y_true = y0

# NOTE(review): the original started a timer here as well, but it was
# overwritten before being read, so model fitting and scoring have never been
# part of `time0`. Move the `tic` below up here if they should be.
mk_model = compute_marker_model(pd.DataFrame(X, columns=channels), table, 0.0)

## compute posterior probs
score = get_score_mat(X, [], table, [], mk_model)
# Append one extra column equal to 1 - (row maximum). Presumably this is the
# score of the 'unknown' class that idx2ct lists last -- TODO confirm.
score = np.concatenate([score, 1.0 - score.max(axis=1)[:, np.newaxis]], axis=1)

## get indices
ct_index = get_unique_index(X, score, table, thres)

## running ACDC
# time.clock() was removed in Python 3.8; time.process_time() gives the CPU
# time of this process, matching what time.clock() returned on Unix.
tic = time.process_time()
res_c = get_landmarks(X, score, ct_index, idx2ct, phenograph, thres)

# list(idx2ct) passes a copy of the name list, as the original element-wise
# comprehension did.
landmark_mat, landmark_label = output_feature_matrix(res_c, list(idx2ct))

landmark_label = np.array(landmark_label)

toc = time.process_time()

# Landmark-construction time, reused as the fixed cost of every
# classification run in the next cell.
time0 = toc - tic

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10385346412658691 seconds
Jaccard graph constructed in 0.15057158470153809 seconds
Wrote graph to binary file in 0.040326833724975586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.605397
After 14 runs, maximum modularity is Q = 0.606565
Louvain completed 34 runs in 0.8924710750579834 seconds
PhenoGraph complete in 1.1986892223358154 seconds
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 6.7386720180511475 seconds
Jaccard graph constructed in 1.5939991474151611 seconds
Wrote graph to binary file in 0.6559884548187256 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.664857
After 4 runs, maximum modularity is Q = 0.668278
Louvain completed 24 runs in 15.0370032787323 seconds
PhenoGraph complete in 24.246049642562866 seconds
Finding 30 nearest neighbors using mink

In [7]:
# Classify every event from the landmarks for several neighbourhood sizes,
# recording total time (landmark time + classification time) and accuracy
# against the manually gated labels.
score_final = []
process_time = []
for n_neighbor in [10, 20, 30]:
    # time.clock() was removed in Python 3.8; time.process_time() gives the
    # CPU time of this process, matching what time.clock() returned on Unix.
    tic = time.process_time()

    lp, y_pred = rm_classify(X, landmark_mat, landmark_label, n_neighbor)

    toc = time.process_time()
    time1 = toc - tic

    process_time.append(time0 + time1)

    # y_pred holds cell-type names; map them to indices to compare with y_true.
    score_final.append(accuracy_score(y_true, [ct2idx[c] for c in y_pred]))

    # Printed on every iteration, so the lists grow by one entry per line pair.
    print(process_time)
    print(score_final)

[884.4606950000002]
[0.98361552637641092]
[884.4606950000002, 992.7849330000001]
[0.98361552637641092, 0.98305881901251635]
[884.4606950000002, 992.7849330000001, 1077.234291]
[0.98361552637641092, 0.98305881901251635, 0.98250211164862167]


In [8]:
# Show the (rows, columns) shape of df as this cell's output.
df.shape

(104184, 33)