In [2]:
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt

import scipy as sp
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import utils
from sklearn import metrics

# Testing OneClassSVM

In [4]:
#unsupervised outlier detection
#anomaly detection
#train only on protons

In [5]:
p_data = sp.sparse.load_npz('../data/tilt/20x20x20/pDisc_40000_20x20x20_tilt.npz')
p_labels = np.ones((p_data.shape[0],))

In [6]:
p_train, p_test, p_labels_train, p_labels_test = train_test_split(p_data, p_labels, test_size=0.25, random_state=42)

In [7]:
#import carbon and noise for test
C_data = sp.sparse.load_npz('../data/tilt/20x20x20/CDisc_40000_20x20x20_tilt.npz')
noise_data = sp.sparse.load_npz('../data/tilt/20x20x20/noiseDisc_40000_20x20x20.npz')

In [8]:
print(C_data.shape)
print(noise_data.shape)

C_test = C_data[0:10000]
noise_test  = noise_data[0:10000]

print(C_test.shape)
print(noise_test.shape)

(40001, 8000)
(40000, 8000)
(10000, 8000)
(10000, 8000)


In [9]:
full_test = sp.sparse.vstack([p_test, C_test, noise_test], format='csr')

print(p_test.shape)
print(C_test.shape)
print(noise_test.shape)

#third label array p, c, j

full_labels = np.hstack((p_labels_test, np.negative(np.ones(C_test.shape[0])), np.negative(np.ones(noise_test.shape[0]))))
#print(full_data_nocharge.shape)

full_charLabels = ['p']*p_test.shape[0] + ['c']*C_test.shape[0] + ['n']*noise_test.shape[0]

full_test, full_labels, full_charLabels = utils.shuffle(full_test, full_labels, full_charLabels)

print(full_test.shape)
print(full_labels.shape)
print(len(full_charLabels))

(10001, 8000)
(10000, 8000)
(10000, 8000)
(30001, 8000)
(30001,)
30001


In [19]:
clf = svm.OneClassSVM(nu=0.08, kernel='rbf')
clf.fit(p_train[0:-1])

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.08, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [20]:
subset_size = -1

In [21]:
y_pred_train = clf.predict(p_train[0:subset_size]) 
y_pred_test = clf.predict(full_test)

In [22]:
print("f1-score with proton training data: " + str(metrics.f1_score(p_labels_train[0:subset_size], y_pred_train)))
print("f1-score with mixed proton/carbon/noise test data: " + str(metrics.f1_score(full_labels, y_pred_test)))

f1-score with proton training data: 0.958313801066
f1-score with mixed proton/carbon/noise test data: 0.567796349309


# Looking at test set results

In [23]:
n_test_p = p_test.shape[0]
n_test_C = C_test.shape[0]
n_test_noise = noise_test.shape[0]

p_correct = 0              #protons predicted correctly (1)
C_correct = 0              #carbons predicted correctly (-1)
noise_correct = 0          #noise events predicted correctly = (-1)

for i in range(len(y_pred_test)):
    #if prediction was correct
    if(full_labels[i] == y_pred_test[i]):
        if (full_charLabels[i] == 'p'):
            p_correct += 1
        elif (full_charLabels[i] == 'c'):
            C_correct += 1
        elif (full_charLabels[i] == 'n'):
            noise_correct += 1

In [24]:
print("Proton accuracy: %.2f%%" % ((p_correct/n_test_p)*100))
print("Carbon accuracy: %.2f%%" % ((C_correct/n_test_C)*100))
print("Noise event accuracy: %.2f%%" % ((noise_correct/n_test_noise)*100))

Proton accuracy: 92.22%
Carbon accuracy: 64.34%
Noise event accuracy: 3.03%


In [25]:
print("protons correct = " + str(p_correct) + "   | protons incorrect = " + str(n_test_p-p_correct) )
print("carbons correct = " + str(C_correct) + "   | carbons incorrect = " + str(n_test_C-C_correct) )
print("noise   correct = " + str(noise_correct) + "    | noise incorrect = " + str(n_test_noise-noise_correct) )

protons correct = 9223   | protons incorrect = 778
carbons correct = 6434   | carbons incorrect = 3566
noise   correct = 303    | noise incorrect = 9697
