In [None]:
import pandas as pd
from os import listdir
from os.path import isfile, join

In [146]:
cancer_type = "prostate"
treatments = ['rad', 'surg']
#treatments = ['chemorad', 'surg']
mypath = "/share/pi/rubin/jiaming/nlp_results/"

In [147]:
def load_results(mypath, cancer_type, data_arch="struct"):
    if data_arch == "embed":
        files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
        files = [f for f in files if f.startswith(cancer_type + "_doc2vec")] + [f for f in files if f.startswith(cancer_type + "_fasttext")]

        results = pd.DataFrame([])
        for f in files:
            results2 = pd.read_csv(mypath + f, index_col=0)
            if (len(results2) == 270):
                print ("{} finished.".format(f))
            else:
                print ("{} unfinished: {}".format(f, len(results2)))
            results = pd.concat([results, results2], ignore_index=True)

        results = results.drop([], axis=1)
    else:
        results = pd.read_csv(mypath + "{}_results.csv".format(cancer_type), index_col=0)
        results = results.rename(columns = {'model': 'method'})

    return results

def create_summary_table(results, treatments):
    results['f1_macro'] = (results['{}_f1_'.format(treatments[0])] + results['{}_f1_'.format(treatments[1])])/2
    methods_pd = pd.DataFrame(results.method.str.split("-").tolist(),columns = ['arch','ml_method'])
    results = pd.concat([methods_pd, results], axis = 1)
    
    idx_max = results.groupby(['arch'])['f1_macro'].transform(max) == results['f1_macro']
    
    best_results = results[idx_max].reset_index(drop=True)
    best_results = best_results.drop_duplicates(['arch', 'f1_macro'])
    # Columns to print
    metrics = ["recall", "precision", "f1"]
    output_columns = ['arch']
    for t in treatments:
        for m in metrics:
            prefix = '{}_{}_'.format(t, m)
            best_results[prefix + 'CI'] = best_results.apply(lambda x: "{:.2f} [{:.2f}, {:.2f}]".format(x[prefix], x[prefix+'lower'], x[prefix+'upper']), axis=1)
            output_columns.append(prefix + 'CI')
        
    return best_results[output_columns]

In [148]:
# Load structured and unstructured results
results_struct = load_results(mypath, cancer_type, "struct")
results_embed = load_results(mypath, cancer_type, "embed")

prostate_doc2vec_t0_vs300_w3_results.csv finished.
prostate_doc2vec_t0_vs500_w3_results.csv finished.
prostate_doc2vec_t1_vs500_w3_results.csv unfinished: 185
prostate_doc2vec_t0_vs100_w5_results.csv finished.
prostate_doc2vec_t0_vs300_w5_results.csv finished.
prostate_doc2vec_t1_vs300_w5_results.csv finished.
prostate_doc2vec_t1_vs300_w3_results.csv finished.
prostate_doc2vec_t0_vs500_w5_results.csv finished.
prostate_doc2vec_t0_vs100_w3_results.csv finished.
prostate_doc2vec_t1_vs100_w5_results.csv finished.
prostate_doc2vec_t1_vs500_w5_results.csv unfinished: 175
prostate_doc2vec_t1_vs100_w3_results.csv finished.
prostate_fasttext_t1_vs100_w5_results.csv finished.
prostate_fasttext_t0_vs100_w3_results.csv finished.
prostate_fasttext_t0_vs300_w3_results.csv finished.
prostate_fasttext_t0_vs100_w5_results.csv finished.
prostate_fasttext_t1_vs500_w3_results.csv finished.
prostate_fasttext_t1_vs300_w5_results.csv finished.
prostate_fasttext_t0_vs500_w5_results.csv finished.
prostate_fas

In [149]:
# Process and concatenate both together
best_struct = create_summary_table(results_struct, treatments)
best_embed = create_summary_table(results_embed, treatments)

In [150]:
best_summary = pd.concat([best_struct, best_embed], ignore_index=True)

In [151]:
reindex = ["structured", "bow", "doc2vec", "fasttext", "structured+bow", "structured+doc2vec", "structured+fasttext"]
best_summary.index = best_summary.arch
best_summary = best_summary.reindex(reindex)
best_summary.to_csv("{}_summary.csv".format(cancer_type), index=False)

In [152]:
best_summary

Unnamed: 0_level_0,arch,rad_recall_CI,rad_precision_CI,rad_f1_CI,surg_recall_CI,surg_precision_CI,surg_f1_CI
arch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
structured,structured,"0.83 [0.72, 0.92]","0.98 [0.93, 1.00]","0.90 [0.82, 0.95]","0.99 [0.98, 1.00]","0.94 [0.90, 0.97]","0.97 [0.94, 0.98]"
bow,bow,"0.98 [0.94, 1.00]","0.98 [0.94, 1.00]","0.98 [0.95, 1.00]","0.99 [0.98, 1.00]","0.99 [0.98, 1.00]","0.99 [0.98, 1.00]"
doc2vec,doc2vec,"0.98 [0.94, 1.00]","0.98 [0.95, 1.00]","0.98 [0.96, 1.00]","0.99 [0.98, 1.00]","0.99 [0.98, 1.00]","0.99 [0.98, 1.00]"
fasttext,fasttext,"0.80 [0.70, 0.90]","0.92 [0.83, 0.98]","0.86 [0.78, 0.92]","0.97 [0.95, 0.99]","0.93 [0.89, 0.97]","0.95 [0.93, 0.97]"
structured+bow,,,,,,,
structured+doc2vec,structured+doc2vec,"0.98 [0.94, 1.00]","1.00 [1.00, 1.00]","0.99 [0.97, 1.00]","1.00 [1.00, 1.00]","0.99 [0.98, 1.00]","1.00 [0.99, 1.00]"
structured+fasttext,structured+fasttext,"0.98 [0.95, 1.00]","0.95 [0.89, 1.00]","0.97 [0.93, 0.99]","0.98 [0.96, 1.00]","0.99 [0.98, 1.00]","0.99 [0.97, 1.00]"


In [39]:
methods_pd.data.unique()

array(['doc2vec', 'structured+doc2vec', 'fasttext', 'structured+fasttext'],
      dtype=object)

In [6]:
doc2vec = results.loc[(methods_pd.data == 'doc2vec') & (methods_pd.type == "test")]

In [27]:
doc2vec = results.loc[(methods_pd.data == 'structured+fasttext') & (methods_pd.type == "test")]

In [28]:
max(doc2vec["all"])

0.78125

In [29]:
doc2vec[doc2vec["all"] > 0.78]

Unnamed: 0,model,alg,vs,epochs,alpha,window,sample,ns_exponent,method,all,Surgery+others,"('CHEMO', 'RADIATION')",Surgery+others_precision,Surgery+others_recall,Surgery+others_f1,"('CHEMO', 'RADIATION')_precision","('CHEMO', 'RADIATION')_recall","('CHEMO', 'RADIATION')_f1"
9239,fasttext,0,500,5,0.25,5,0.0001,0.75,structured+fasttext-xgb-test,0.78125,0.761905,0.818182,0.888889,0.761905,0.820513,0.642857,0.818182,0.72
9911,fasttext,1,300,5,0.25,3,0.01,0.75,structured+fasttext-xgb-test,0.78125,0.761905,0.818182,0.888889,0.761905,0.820513,0.642857,0.818182,0.72
12743,fasttext,0,500,10,0.25,3,0.0,0.75,structured+fasttext-xgb-test,0.78125,0.761905,0.818182,0.888889,0.761905,0.820513,0.642857,0.818182,0.72
15095,fasttext,1,100,5,0.25,5,0.01,0.75,structured+fasttext-xgb-test,0.78125,0.714286,0.909091,0.9375,0.714286,0.810811,0.625,0.909091,0.740741


In [38]:
results.loc[(methods_pd.data == 'structured+fasttext') & (methods_pd.method == "xgb") & (results.alg == 0) & (results.vs == 500) & (results.epochs == 5) & (results.alpha == 0.25) & (results.window == 5)]

Unnamed: 0,model,alg,vs,epochs,alpha,window,sample,ns_exponent,method,all,Surgery+others,"('CHEMO', 'RADIATION')",Surgery+others_precision,Surgery+others_recall,Surgery+others_f1,"('CHEMO', 'RADIATION')_precision","('CHEMO', 'RADIATION')_recall","('CHEMO', 'RADIATION')_f1"


In [51]:
temp = doc2vec[doc2vec["all"] > 0.99].iloc[3]

In [52]:
temp

model                                      doc2vec
alg                                              0
vs                                             500
epochs                                           5
alpha                                        0.025
window                                           5
sample                                        0.01
ns_exponent                                   0.75
method                          doc2vec-ridge-test
all                                        0.99061
Radiation/+hormone                        0.982456
('SURGERY_1',)                             0.99359
Radiation/+hormone_precision              0.982456
Radiation/+hormone_recall                 0.982456
Radiation/+hormone_f1                     0.982456
('SURGERY_1',)_precision                   0.99359
('SURGERY_1',)_recall                      0.99359
('SURGERY_1',)_f1                          0.99359
Name: 4664, dtype: object

In [47]:
# Calculate the sensitivity [13] precision [12] recall [14] f1
sensitivity = temp.iloc[13]
precision = temp.iloc[12]
n_pos = 503
n_neg = 1642

In [48]:
TP = sensitivity * n_pos
FP = TP * (1 - precision) / precision
TN = n_neg - FP

In [49]:
TN / (TN + FP) # specificity

1.0

In [50]:
TP / n_pos # check sensitivity

0.9649122807017544

In [153]:
# Read baseline results
results_base = pd.read_csv(mypath + "{}_1_results.csv".format(cancer_type), index_col=0)
methods_base = pd.DataFrame(results_base.model.str.split("-").tolist(), columns = ['data', 'apdx', 'method', 'type'])

In [154]:
results_base.head()

Unnamed: 0,model,all,Surgery+others,"('CHEMO', 'RADIATION', 'SURGERY_1')",Surgery+others_precision,Surgery+others_recall,Surgery+others_f1,"('CHEMO', 'RADIATION', 'SURGERY_1')_precision","('CHEMO', 'RADIATION', 'SURGERY_1')_recall","('CHEMO', 'RADIATION', 'SURGERY_1')_f1"
0,structured-lr-train,0.867403,0.85,0.901639,0.944444,0.85,0.894737,0.753425,0.901639,0.820896
1,structured-lr-valid,0.809524,0.846154,0.75,0.846154,0.846154,0.846154,0.75,0.75,0.75
2,structured-lr-test,0.913043,0.882353,1.0,1.0,0.882353,0.9375,0.75,1.0,0.857143
3,structured-rf-train,0.977901,0.991667,0.95082,0.97541,0.991667,0.983471,0.983051,0.95082,0.966667
4,structured-rf-valid,0.904762,1.0,0.75,0.866667,1.0,0.928571,1.0,0.75,0.857143


In [161]:
structured = results_base.loc[(methods_base.data == 'bow') & (methods_base.method == "test")]

In [159]:
structured = results_base.loc[(methods_base.data == 'structured') & (methods_base.apdx == 'bow') & (methods_base.type == "test")]

In [162]:
structured

Unnamed: 0,model,all,Surgery+others,"('CHEMO', 'RADIATION', 'SURGERY_1')",Surgery+others_precision,Surgery+others_recall,Surgery+others_f1,"('CHEMO', 'RADIATION', 'SURGERY_1')_precision","('CHEMO', 'RADIATION', 'SURGERY_1')_recall","('CHEMO', 'RADIATION', 'SURGERY_1')_f1"
14,bow-lr-test,0.956522,0.941176,1.0,1.0,0.941176,0.969697,0.857143,1.0,0.923077
17,bow-rf-test,0.913043,0.941176,0.833333,0.941176,0.941176,0.941176,0.833333,0.833333,0.833333
20,bow-ridge-test,0.869565,0.941176,0.666667,0.888889,0.941176,0.914286,0.8,0.666667,0.727273
23,bow-xgb-test,0.913043,1.0,0.666667,0.894737,1.0,0.944444,1.0,0.666667,0.8
