In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt

import scipy as sp
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
# Simulated events (with simulated noise): one sparse matrix per class
# (proton, Carbon, junk). Later cells treat each row as one event.
p_sim = sp.sparse.load_npz('../../data/tilt/20x20x20/pDisc_noise_40000_20x20x20_tilt.npz')
C_sim = sp.sparse.load_npz('../../data/tilt/20x20x20/CDisc_noise_40000_20x20x20_tilt.npz')
junk_sim = sp.sparse.load_npz('../../data/tilt/20x20x20/noiseDisc_40000_20x20x20.npz')

# Real events: every class is read from runs 0130 and 0210, and the two runs
# are then stacked row-wise into a single CSR matrix per class.
p_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_pDisc.npz')
p_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_pDisc.npz')
p_real = sp.sparse.vstack((p_0130, p_0210), format='csr')

C_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_CDisc.npz')
C_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_CDisc.npz')
C_real = sp.sparse.vstack((C_0130, C_0210), format='csr')

junk_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_junkDisc.npz')
junk_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_junkDisc.npz')
junk_real = sp.sparse.vstack((junk_0130, junk_0210), format='csr')

In [4]:
# Binary class labels, one per event (row) of each matrix:
# proton -> 0, Carbon -> 1, junk -> 1.
p_sim_labels = np.zeros(p_sim.shape[0])
C_sim_labels = np.ones(C_sim.shape[0])
junk_sim_labels = np.ones(junk_sim.shape[0])

# Same encoding for the real-data events.
p_real_labels = np.zeros(p_real.shape[0])
C_real_labels = np.ones(C_real.shape[0])
junk_real_labels = np.ones(junk_real.shape[0])

# Totaling Charges by Event

In [5]:
# Total charge per simulated event = sum over each row of the sparse matrix.
#
# A single vectorized `sum(axis=1)` replaces the per-row Python loop, which
# sliced out a new one-row sparse matrix for every event. The row sums are
# flattened to 1-D with `np.asarray(...).ravel()` and stored as plain Python
# lists on purpose: later cells join these with `+`, which must stay list
# concatenation (on NumPy arrays `+` would add element-wise instead).
p_sim_charge = list(np.asarray(p_sim.sum(axis=1)).ravel())
C_sim_charge = list(np.asarray(C_sim.sum(axis=1)).ravel())
junk_sim_charge = list(np.asarray(junk_sim.sum(axis=1)).ravel())

In [6]:
# Total charge per real event = sum over each row of the sparse matrix.
#
# A single vectorized `sum(axis=1)` replaces the per-row Python loop, which
# sliced out a new one-row sparse matrix for every event. The row sums are
# flattened to 1-D with `np.asarray(...).ravel()` and stored as plain Python
# lists on purpose: later cells join these with `+`, which must stay list
# concatenation (on NumPy arrays `+` would add element-wise instead).
p_real_charge = list(np.asarray(p_real.sum(axis=1)).ravel())
C_real_charge = list(np.asarray(C_real.sum(axis=1)).ravel())
junk_real_charge = list(np.asarray(junk_real.sum(axis=1)).ravel())

# Testing LR with ONE feature

# Predicting proton vs. Carbon (w/ Simulated Noise)

In [20]:
# Single-feature design matrices of shape (n_events, 1): proton charges
# followed by Carbon charges (`+` here is list concatenation).
# Labels are concatenated in the same order: proton = 0, Carbon = 1.
pC_sim = np.asarray(p_sim_charge + C_sim_charge).reshape(-1, 1)
pC_sim_labels = np.concatenate([p_sim_labels, C_sim_labels])

pC_real = np.asarray(p_real_charge + C_real_charge).reshape(-1, 1)
pC_real_labels = np.concatenate([p_real_labels, C_real_labels])

# Sanity check: one row per event, one column (the total charge).
print(pC_sim.shape, pC_real.shape, sep="\n")

(80002, 1)
(1003, 1)


In [21]:
# Hold out 25% of the simulated proton/Carbon events for testing.
# The fixed random_state makes the shuffle reproducible.
(
    pC_sim_train,
    pC_sim_test,
    pC_sim_labels_train,
    pC_sim_labels_test,
) = train_test_split(
    pC_sim,
    pC_sim_labels,
    test_size=0.25,
    random_state=42,
)

In [22]:
# Inverse regularisation strength, passed to LogisticRegression as C.
reg = 0.1
# The solver is pinned explicitly. The recorded run used liblinear as the
# implicit default (see the estimator repr in this cell's output), but newer
# scikit-learn releases default to lbfgs, which would silently change the
# fitted model and the reported metrics.
lr_pC = LogisticRegression(C=reg, solver='liblinear')
lr_pC.fit(pC_sim_train, pC_sim_labels_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Trained on simulated data, evaluated on the held-out simulated events.
pC_sim_pred = lr_pC.predict(pC_sim_test)

pC_sim_accuracy = metrics.accuracy_score(pC_sim_labels_test, pC_sim_pred)
print("Accuracy: " + str(pC_sim_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pC_sim_labels_test, pC_sim_pred))

Accuracy: 0.6246687665616719
             precision    recall  f1-score   support

        0.0       0.62      0.66      0.64     10007
        1.0       0.63      0.59      0.61      9994

avg / total       0.63      0.62      0.62     20001

[[6623 3384]
 [4123 5871]]


In [24]:
# Trained on simulated data, evaluated on all real proton/Carbon events.
pC_real_pred = lr_pC.predict(pC_real)

pC_real_accuracy = metrics.accuracy_score(pC_real_labels, pC_real_pred)
print("Accuracy: " + str(pC_real_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pC_real_labels, pC_real_pred))

Accuracy: 0.8404785643070788
             precision    recall  f1-score   support

        0.0       0.81      0.98      0.89       663
        1.0       0.95      0.56      0.70       340

avg / total       0.86      0.84      0.83      1003

[[652  11]
 [149 191]]


# Predicting proton vs. junk (w/ Simulated Noise)

In [36]:
# Class encoding: proton = 0, junk = 1.
# Single-feature design matrices of shape (n_events, 1): proton charges
# followed by junk charges (`+` here is list concatenation); labels are
# concatenated in the same order.
pjunk_sim = np.asarray(p_sim_charge + junk_sim_charge).reshape(-1, 1)
pjunk_sim_labels = np.concatenate([p_sim_labels, junk_sim_labels])

pjunk_real = np.asarray(p_real_charge + junk_real_charge).reshape(-1, 1)
pjunk_real_labels = np.concatenate([p_real_labels, junk_real_labels])

In [37]:
# Hold out 25% of the simulated proton/junk events for testing.
# The fixed random_state makes the shuffle reproducible.
(
    pjunk_sim_train,
    pjunk_sim_test,
    pjunk_sim_labels_train,
    pjunk_sim_labels_test,
) = train_test_split(
    pjunk_sim,
    pjunk_sim_labels,
    test_size=0.25,
    random_state=42,
)

In [38]:
# Inverse regularisation strength, passed to LogisticRegression as C.
reg = 0.001
# The solver is pinned explicitly. The recorded run used liblinear as the
# implicit default (see the estimator repr in this cell's output), but newer
# scikit-learn releases default to lbfgs, which would silently change the
# fitted model and the reported metrics.
lr_pjunk = LogisticRegression(C=reg, solver='liblinear')
lr_pjunk.fit(pjunk_sim_train, pjunk_sim_labels_train)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
# Trained on simulated data, evaluated on the held-out simulated events.
pjunk_sim_pred = lr_pjunk.predict(pjunk_sim_test)

pjunk_sim_accuracy = metrics.accuracy_score(pjunk_sim_labels_test, pjunk_sim_pred)
print("Accuracy: " + str(pjunk_sim_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pjunk_sim_labels_test, pjunk_sim_pred))

Accuracy: 0.544072796360182
             precision    recall  f1-score   support

        0.0       0.54      0.58      0.56     10002
        1.0       0.55      0.51      0.53      9999

avg / total       0.54      0.54      0.54     20001

[[5810 4192]
 [4927 5072]]


In [40]:
# Trained on simulated data, evaluated on all real proton/junk events.
pjunk_real_pred = lr_pjunk.predict(pjunk_real)

pjunk_real_accuracy = metrics.accuracy_score(pjunk_real_labels, pjunk_real_pred)
print("Accuracy: " + str(pjunk_real_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pjunk_real_labels, pjunk_real_pred))

Accuracy: 0.6287782034908471
             precision    recall  f1-score   support

        0.0       0.17      0.08      0.11       663
        1.0       0.70      0.84      0.77      1686

avg / total       0.55      0.63      0.58      2349

[[  54  609]
 [ 263 1423]]


# Predicting proton vs. Carbon + junk (w/ Simulated Noise)

In [25]:
# Class encoding: proton = 0, Carbon and junk together = 1.
# Single-feature design matrices of shape (n_events, 1): proton, then Carbon,
# then junk charges (`+` here is list concatenation); labels are concatenated
# in the same order.
pCjunk_sim = np.asarray(p_sim_charge + C_sim_charge + junk_sim_charge).reshape(-1, 1)
pCjunk_sim_labels = np.concatenate([p_sim_labels, C_sim_labels, junk_sim_labels])

pCjunk_real = np.asarray(p_real_charge + C_real_charge + junk_real_charge).reshape(-1, 1)
pCjunk_real_labels = np.concatenate([p_real_labels, C_real_labels, junk_real_labels])

In [26]:
# Hold out 25% of the simulated proton/Carbon/junk events for testing.
# The fixed random_state makes the shuffle reproducible.
(
    pCjunk_sim_train,
    pCjunk_sim_test,
    pCjunk_sim_labels_train,
    pCjunk_sim_labels_test,
) = train_test_split(
    pCjunk_sim,
    pCjunk_sim_labels,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Inverse regularisation strength, passed to LogisticRegression as C.
reg = 0.1
# The solver is pinned explicitly. The recorded run used liblinear as the
# implicit default (see the estimator repr in this cell's output), but newer
# scikit-learn releases default to lbfgs, which would silently change the
# fitted model and the reported metrics.
lr_pCjunk = LogisticRegression(C=reg, solver='liblinear')
lr_pCjunk.fit(pCjunk_sim_train, pCjunk_sim_labels_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Trained on simulated data, evaluated on the held-out simulated events.
pCjunk_sim_pred = lr_pCjunk.predict(pCjunk_sim_test)

pCjunk_sim_accuracy = metrics.accuracy_score(pCjunk_sim_labels_test, pCjunk_sim_pred)
print("Accuracy: " + str(pCjunk_sim_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pCjunk_sim_labels_test, pCjunk_sim_pred))

Accuracy: 0.6650111662944569
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00     10050
        1.0       0.67      1.00      0.80     19951

avg / total       0.44      0.67      0.53     30001

[[    0 10050]
 [    0 19951]]


  'precision', 'predicted', average, warn_for)


In [35]:
# Trained on simulated data, evaluated on all real proton/Carbon/junk events.
pCjunk_real_pred = lr_pCjunk.predict(pCjunk_real)

pCjunk_real_accuracy = metrics.accuracy_score(pCjunk_real_labels, pCjunk_real_pred)
print("Accuracy: " + str(pCjunk_real_accuracy))
# Per-class precision/recall report first, then the confusion matrix.
for make_report in (metrics.classification_report, metrics.confusion_matrix):
    print(make_report(pCjunk_real_labels, pCjunk_real_pred))

Accuracy: 0.7534399404983265
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00       663
        1.0       0.75      1.00      0.86      2026

avg / total       0.57      0.75      0.65      2689

[[   0  663]
 [   0 2026]]


  'precision', 'predicted', average, warn_for)
