In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Filesystem roots used by the rest of the notebook. All three end with a
# trailing slash because file names are appended by plain string concatenation.
project_dir = '/Users/cpd/Projects/strongcnn/'
datapath = '/Users/cpd/Desktop/batches/'
# Saved .npy binaries (cuts.npy, scores.npy) live under the project tree.
bin_dir = '{0}binarys/'.format(project_dir)

In [2]:
# Load the per-cutout feature database. Column 522 becomes the row index;
# NOTE(review): presumably this is the identifier the saved cuts refer to -- confirm.
data = pd.read_csv(datapath + 'database.csv',
                   index_col=522)

print('data loaded')
# load up catalog for cutouts
cat = pd.read_csv(project_dir + 'catalog/cluster_catalog.csv')

# Keep only the rows flagged as belonging to the training category.
data = data[data['category'] == 'training']
cat = cat[cat['category'] == 'training']

# Names of the 500 feature columns: nn0 ... nn499.
# `range` (not `xrange`) so the cell runs on both Python 2 and Python 3.
feature_cols = ['nn{0}'.format(i) for i in range(500)]

# cuts.npy / scores.npy each hold a pickled dict wrapped in a 0-d object array,
# hence `.item()`. NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
# Unpickling can execute arbitrary code: only load files this project produced.
cuts = np.load(bin_dir + 'cuts.npy', allow_pickle=True).item()
scores = np.load(bin_dir + 'scores.npy', allow_pickle=True).item()

# Each entry of `cuts` unpacks into a (train selector, test selector) pair.
cut_train, cut_test = cuts['all']
cut_train_1, cut_test_1 = cuts[1]
cut_train_2, cut_test_2 = cuts[2]


def _features_and_labels(frame, cut):
    """Return ``(X, y)`` for the rows of ``frame`` selected by ``cut``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the ``feature_cols`` columns and an ``alpha`` column.
    cut : selector accepted by ``frame.loc[...]``
        Rows to keep (one of the train/test selectors loaded from cuts.npy).

    Returns
    -------
    tuple
        ``X`` is the ndarray of feature values; ``y`` is a boolean Series
        that is True where ``alpha == 1``.
    """
    rows = frame.loc[cut]  # single lookup shared by features and labels
    return rows[feature_cols].values, rows['alpha'] == 1


# The frame each cut indexes into: everything, or a single stage only.
data_by_stage = {'all': data,
                 1: data[data['stage'] == 1],
                 2: data[data['stage'] == 2]}

# stage -> (feature matrix, boolean labels) for the train and test splits.
train = {'all': _features_and_labels(data_by_stage['all'], cut_train),
         1: _features_and_labels(data_by_stage[1], cut_train_1),
         2: _features_and_labels(data_by_stage[2], cut_train_2),
        }
test = {'all': _features_and_labels(data_by_stage['all'], cut_test),
        1: _features_and_labels(data_by_stage[1], cut_test_1),
        2: _features_and_labels(data_by_stage[2], cut_test_2),
       }


data loaded


In [15]:
# Baseline for comparison: ROC curves built from the catalog's
# mean_probability column, with kind == 'sim' as the positive class.
# Caveat: classifications are made per subject, and a subject can appear
# in the catalog an uneven number of times, so this is only approximate.
y_test_sw = cat['kind'].eq('sim')
y_score_sw = cat['mean_probability']

# Boolean row masks selecting each stage.
cond_sw1 = cat['stage'].eq(1)
cond_sw2 = cat['stage'].eq(2)

y_test_sw1, y_score_sw1 = y_test_sw[cond_sw1], y_score_sw[cond_sw1]
y_test_sw2, y_score_sw2 = y_test_sw[cond_sw2], y_score_sw[cond_sw2]

# Second caveat: the sw score was assigned to the whole image rather than
# the cutout, so the comparison is somewhat unfair.
# Thresholds (third return value) are not needed and are discarded.
fpr_sw1, tpr_sw1, _ = roc_curve(y_test_sw1, y_score_sw1)
fpr_sw2, tpr_sw2, _ = roc_curve(y_test_sw2, y_score_sw2)
fpr_swall, tpr_swall, _ = roc_curve(y_test_sw, y_score_sw)

In [5]:
# ROC curve pieces keyed by (classifier name, stage, 'train' | 'test').
fpr, tpr, thresh = {}, {}, {}
classifiers = ['RandomForest', 'Softmax', 'SVM']
for classifier in classifiers:
    for stage in train:
        key_train = (classifier, stage, 'train')
        key_test = (classifier, stage, 'test')
        # Same computation for both splits: true labels are element 1 of the
        # (features, labels) tuple, scores come from the preloaded dict.
        for key, split in ((key_train, train), (key_test, test)):
            fpr[key], tpr[key], thresh[key] = roc_curve(split[stage][1],
                                                        scores[key])


In [20]:
# Line color per classifier, line style per data split.
classifier_colors = dict(RandomForest='r', Softmax='g', SVM='m')
testtrain_style = dict(test='--', train=':')

# SpaceWarps baseline curves as (fpr, tpr, color, legend label).
sw_stage1 = (fpr_sw1, tpr_sw1, 'Blue', 'SpaceWarps Stage 1')
sw_stage2 = (fpr_sw2, tpr_sw2, 'DarkOrange', 'SpaceWarps Stage 2')
sw_all = (fpr_swall, tpr_swall, 'Crimson', 'SpaceWarps All')
# Which baselines accompany each stage's figure; 'all' shows every baseline.
sw_baselines = {1: [sw_stage1],
                2: [sw_stage2],
                'all': [sw_stage1, sw_stage2, sw_all]}

# One figure (and one saved PDF) per stage.
for stage in [1, 2]:
    plt.figure()
    for sw_fpr, sw_tpr, sw_color, sw_label in sw_baselines.get(stage, []):
        plt.plot(sw_fpr, sw_tpr, linestyle='-', color=sw_color, linewidth=3, label=sw_label)
    # Overlay every classifier's test and train curves for this stage.
    for testtrain in ['test', 'train']:
        for classifier in classifiers:
            key = (classifier, stage, testtrain)
            label = '{0} Stage {1} {2}'.format(*key)
            plt.plot(fpr[key], tpr[key],
                     linestyle=testtrain_style[testtrain],
                     color=classifier_colors[classifier],
                     linewidth=3,
                     label=label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    # Zoom in on the upper-left region of the ROC plane.
    plt.xlim(-0.005, 0.5)
    plt.ylim(0.5, 1.0)
    plt.savefig(project_dir + 'doc/roc_curve_stage_{0}.pdf'.format(stage))