In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt

import scipy as sp
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import utils
from sklearn import metrics

# Testing OneClassSVM

# Tests with Simulated Data

In [3]:
#unsupervised outlier detection
#anomaly detection
#train only on protons

In [4]:
p_data = sp.sparse.load_npz('../data/tilt/20x20x20/pDisc_40000_20x20x20_tilt_largeEvts.npz')
p_labels = np.ones((p_data.shape[0],))

In [5]:
p_train, p_test, p_labels_train, p_labels_test = train_test_split(p_data, p_labels, test_size=0.25, random_state=42)

In [6]:
#import carbon and noise for test
C_data = sp.sparse.load_npz('../data/tilt/20x20x20/CDisc_40000_20x20x20_tilt_largeEvts.npz')
noise_data = sp.sparse.load_npz('../data/tilt/20x20x20/noiseDisc_40000_20x20x20.npz')

In [7]:
print(C_data.shape)
print(noise_data.shape)

C_test = C_data[0:10000]
noise_test  = noise_data[0:10000]

print(C_test.shape)
print(noise_test.shape)

(40001, 8000)
(40000, 8000)
(10000, 8000)
(10000, 8000)


In [8]:
full_test = sp.sparse.vstack([p_test, C_test, noise_test], format='csr')
print(full_test.shape)

full_labels = np.hstack((p_labels_test, np.negative(np.ones(C_test.shape[0])), np.negative(np.ones(noise_test.shape[0]))))

full_charLabels = ['p']*p_test.shape[0] + ['c']*C_test.shape[0] + ['n']*noise_test.shape[0]

full_test, full_labels, full_charLabels = utils.shuffle(full_test, full_labels, full_charLabels)

(30001, 8000)


In [9]:
clf = svm.OneClassSVM(nu=0.08, kernel='linear')
clf.fit(p_train[0:-1])

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto',
      kernel='linear', max_iter=-1, nu=0.05, random_state=None,
      shrinking=True, tol=0.001, verbose=False)

In [10]:
subset_size = -1

In [11]:
y_pred_train = clf.predict(p_train[0:subset_size]) 
y_pred_test = clf.predict(full_test)

In [12]:
print(metrics.classification_report(full_labels, y_pred_test))
print(metrics.confusion_matrix(full_labels, y_pred_test))

             precision    recall  f1-score   support

       -1.0       0.71      0.07      0.13     20000
        1.0       0.34      0.94      0.50     10001

avg / total       0.58      0.36      0.25     30001

[[ 1384 18616]
 [  577  9424]]


In [13]:
print("f1-score with proton training data: " + str(metrics.f1_score(p_labels_train[0:subset_size], y_pred_train)))
print("f1-score with mixed proton/carbon/noise test data: " + str(metrics.f1_score(full_labels, y_pred_test)))

f1-score with proton training data: 0.9741125777990561
f1-score with mixed proton/carbon/noise test data: 0.49546541889014484


In [14]:
n_test_p = p_test.shape[0]
n_test_C = C_test.shape[0]
n_test_noise = noise_test.shape[0]

p_correct = 0              #protons predicted correctly (1)
C_correct = 0              #carbons predicted correctly (-1)
noise_correct = 0          #noise events predicted correctly = (-1)

for i in range(len(y_pred_test)):
    #if prediction was correct
    if(full_labels[i] == y_pred_test[i]):
        if (full_charLabels[i] == 'p'):
            p_correct += 1
        elif (full_charLabels[i] == 'c'):
            C_correct += 1
        elif (full_charLabels[i] == 'n'):
            noise_correct += 1

In [15]:
print("Proton accuracy: %.2f%%" % ((p_correct/n_test_p)*100))
print("Carbon accuracy: %.2f%%" % ((C_correct/n_test_C)*100))
print("Noise event accuracy: %.2f%%" % ((noise_correct/n_test_noise)*100))

Proton accuracy: 94.23%
Carbon accuracy: 0.08%
Noise event accuracy: 13.76%


In [16]:
print("protons correct = " + str(p_correct) + "   | protons incorrect = " + str(n_test_p-p_correct) )
print("carbons correct = " + str(C_correct) + "   | carbons incorrect = " + str(n_test_C-C_correct) )
print("noise   correct = " + str(noise_correct) + "    | noise incorrect = " + str(n_test_noise-noise_correct) )

print(metrics.confusion_matrix(full_labels, y_pred_test))

protons correct = 9424   | protons incorrect = 577
carbons correct = 8   | carbons incorrect = 9992
noise   correct = 1376    | noise incorrect = 8624
[[ 1384 18616]
 [  577  9424]]


# Tests with Real Data

In [18]:
#loading real data
p_0130 = sp.sparse.load_npz('../data/real/20x20x20/run_0130_pDisc.npz')
C_0130 = sp.sparse.load_npz('../data/real/20x20x20/run_0130_CDisc.npz')
junk_0130 = sp.sparse.load_npz('../data/real/20x20x20/run_0130_junkDisc.npz')
p_0210 = sp.sparse.load_npz('../data/real/20x20x20/run_0210_pDisc.npz')
C_0210 = sp.sparse.load_npz('../data/real/20x20x20/run_0210_CDisc.npz')
junk_0210 = sp.sparse.load_npz('../data/real/20x20x20/run_0210_junkDisc.npz')

p_real = sp.sparse.vstack([p_0130, p_0210], format='csr')
C_real = sp.sparse.vstack([C_0130, C_0210], format='csr')
junk_real = sp.sparse.vstack([junk_0130, junk_0210], format='csr')

In [19]:
p_real_labels = np.ones((p_real.shape[0],))
C_real_labels = np.negative(np.ones((C_real.shape[0],)))
junk_real_labels = np.negative(np.ones((junk_real.shape[0],)))

In [20]:
p_real_train, p_real_test, p_real_labels_train, p_real_labels_test = train_test_split(p_real, p_real_labels, test_size=0.25, random_state=42)

In [28]:
real_test = sp.sparse.vstack([p_real_test, C_real, junk_real], format='csr')
real_test_labels = np.hstack((p_real_labels_test, C_real_labels, junk_real_labels))

In [29]:
clf_real = svm.OneClassSVM(nu=0.08, kernel='linear')
clf_real.fit(p_real_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto',
      kernel='linear', max_iter=-1, nu=0.08, random_state=None,
      shrinking=True, tol=0.001, verbose=False)

In [30]:
y_real_pred = clf_real.predict(real_test)

In [31]:
print(metrics.classification_report(real_test_labels, y_real_pred))
print(metrics.confusion_matrix(real_test_labels, y_real_pred))

             precision    recall  f1-score   support

       -1.0       0.95      0.42      0.58      2026
        1.0       0.09      0.72      0.16       166

avg / total       0.88      0.44      0.55      2192

[[ 853 1173]
 [  47  119]]


# Tests with Transfer Learning

In [34]:
real_full = sp.sparse.vstack([p_real, C_real, junk_real], format='csr')
real_full_labels = np.hstack((p_real_labels, C_real_labels, junk_real_labels))
print(real_full.shape)
print(real_full_labels.shape)

(2689, 8000)
(2689,)


In [35]:
clf_transfer = svm.OneClassSVM(nu=0.08, kernel='linear')
clf_transfer.fit(p_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto',
      kernel='linear', max_iter=-1, nu=0.08, random_state=None,
      shrinking=True, tol=0.001, verbose=False)

In [38]:
y_transfer_pred = clf_real.predict(real_full)

In [39]:
print(metrics.classification_report(real_full_labels, y_transfer_pred))
print(metrics.confusion_matrix(real_full_labels, y_transfer_pred))

             precision    recall  f1-score   support

       -1.0       0.86      0.42      0.56      2026
        1.0       0.31      0.79      0.44       663

avg / total       0.72      0.51      0.53      2689

[[ 853 1173]
 [ 142  521]]
