In [2]:
%matplotlib inline

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.sparse  # explicit submodule import so `sp.sparse` does not depend on other packages' import side effects
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict

# Training and testing on real data (10-fold CV)

In [4]:
# Load the real-data sparse matrices for runs 0130 and 0210, one .npz file
# per run and per class (p = proton, C = carbon, junk).
p_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_pDisc.npz')
p_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_pDisc.npz')
C_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_CDisc.npz')
C_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_CDisc.npz')
junk_0130 = sp.sparse.load_npz('../../data/real/20x20x20/run_0130_junkDisc.npz')
junk_0210 = sp.sparse.load_npz('../../data/real/20x20x20/run_0210_junkDisc.npz')

# Stack the two runs row-wise into a single CSR matrix per class
# (rows of run 0130 first, then rows of run 0210).
p_real = sp.sparse.vstack((p_0130, p_0210), format='csr')
C_real = sp.sparse.vstack((C_0130, C_0210), format='csr')
junk_real = sp.sparse.vstack((junk_0130, junk_0210), format='csr')

In [5]:
# Binary target vectors, one entry per row of the matching class matrix:
#   protons -> 0
#   carbons -> 1
#   junk    -> 1
p_real_labels = np.zeros(p_real.shape[0])
C_real_labels = np.ones(C_real.shape[0])
junk_real_labels = np.ones(junk_real.shape[0])

# Proton vs. Carbon

In [6]:
# Proton rows (label 0) followed by carbon rows (label 1); the label vector
# is concatenated in the same order so rows and labels stay aligned.
pC_real = sp.sparse.vstack((p_real, C_real), format='csr')
pC_real_labels = np.concatenate([p_real_labels, C_real_labels])

In [36]:
# Logistic regression for proton vs. carbon; `reg` is handed to the
# estimator as its `C` parameter.
reg = 1e-3
lr_pC = LogisticRegression(C=reg)

# 10-fold cross-validated scores, one value per fold.
# (Pass scoring='f1' to cross_val_score to report F1 instead.)
scores = cross_val_score(estimator=lr_pC, X=pC_real, y=pC_real_labels, cv=10)
print(scores)
# Report the mean fold score with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[ 0.87128713  0.82178218  0.87128713  0.89        0.83        0.89        0.79
  0.87        0.83        0.83      ]
Accuracy: 0.85 (+/- 0.06)


In [37]:
# Cross-validated (cv=10) predictions for the proton/carbon set, then the
# overall accuracy of those predictions against the true labels.
pC_pred = cross_val_predict(estimator=lr_pC, X=pC_real, y=pC_real_labels, cv=10)
print(metrics.accuracy_score(y_true=pC_real_labels, y_pred=pC_pred))

0.849451645065


In [38]:
# Per-class precision / recall / F1 for the cross-validated predictions,
# followed by the confusion matrix (rows: true label, columns: predicted).
print(metrics.classification_report(y_true=pC_real_labels, y_pred=pC_pred))
print(metrics.confusion_matrix(y_true=pC_real_labels, y_pred=pC_pred))

             precision    recall  f1-score   support

        0.0       0.85      0.93      0.89       663
        1.0       0.84      0.69      0.76       340

avg / total       0.85      0.85      0.85      1003

[[617  46]
 [105 235]]


# Proton vs. Junk

In [45]:
# Proton rows (label 0) followed by junk rows (label 1); labels are
# concatenated in the same order as the feature rows.
pjunk_real = sp.sparse.vstack((p_real, junk_real), format='csr')
pjunk_real_labels = np.concatenate([p_real_labels, junk_real_labels])

In [46]:
# Logistic regression for proton vs. junk; `reg` is handed to the
# estimator as its `C` parameter.
reg = 1e-3
lr_pjunk = LogisticRegression(C=reg)

# 10-fold cross-validated scores, one value per fold.
scores = cross_val_score(estimator=lr_pjunk, X=pjunk_real, y=pjunk_real_labels, cv=10)
print(scores)
# Report the mean fold score with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[ 0.84322034  0.83050847  0.86864407  0.82978723  0.86808511  0.87659574
  0.82905983  0.85042735  0.84615385  0.8034188 ]
Accuracy: 0.84 (+/- 0.04)


In [47]:
# Cross-validated (cv=10) predictions for the proton/junk set, then the
# overall accuracy of those predictions against the true labels.
pjunk_pred = cross_val_predict(estimator=lr_pjunk, X=pjunk_real, y=pjunk_real_labels, cv=10)
print(metrics.accuracy_score(y_true=pjunk_real_labels, y_pred=pjunk_pred))

0.844614729672


In [49]:
# Per-class precision / recall / F1 for the cross-validated predictions,
# followed by the confusion matrix (rows: true label, columns: predicted).
print(metrics.classification_report(y_true=pjunk_real_labels, y_pred=pjunk_pred))
print(metrics.confusion_matrix(y_true=pjunk_real_labels, y_pred=pjunk_pred))

             precision    recall  f1-score   support

        0.0       0.91      0.50      0.64       663
        1.0       0.83      0.98      0.90      1686

avg / total       0.85      0.84      0.83      2349

[[ 330  333]
 [  32 1654]]


# Proton vs. Carbon+Junk

In [39]:
# Proton rows (label 0) followed by carbon and junk rows (both label 1);
# labels are concatenated in the same order as the feature rows.
pCjunk_real = sp.sparse.vstack((p_real, C_real, junk_real), format='csr')
pCjunk_real_labels = np.concatenate([p_real_labels, C_real_labels, junk_real_labels])

In [40]:
# Logistic regression for proton vs. (carbon + junk); `reg` is handed to
# the estimator as its `C` parameter.
reg = 1e-3
lr_pCjunk = LogisticRegression(C=reg)

In [41]:
# 10-fold cross-validated scores, one value per fold.
scores = cross_val_score(estimator=lr_pCjunk, X=pCjunk_real, y=pCjunk_real_labels, cv=10)
print(scores)
# Report the mean fold score with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[ 0.78148148  0.78148148  0.84444444  0.82527881  0.866171    0.86988848
  0.83208955  0.83955224  0.85074627  0.81343284]
Accuracy: 0.83 (+/- 0.06)


In [42]:
# Cross-validated (cv=10) predictions for the proton vs. carbon+junk set,
# then the overall accuracy of those predictions against the true labels.
pCjunk_pred = cross_val_predict(estimator=lr_pCjunk, X=pCjunk_real, y=pCjunk_real_labels, cv=10)
print(metrics.accuracy_score(y_true=pCjunk_real_labels, y_pred=pCjunk_pred))

0.830420230569


In [43]:
# Per-class precision / recall / F1 for the cross-validated predictions,
# followed by the confusion matrix (rows: true label, columns: predicted).
print(metrics.classification_report(y_true=pCjunk_real_labels, y_pred=pCjunk_pred))
print(metrics.confusion_matrix(y_true=pCjunk_real_labels, y_pred=pCjunk_pred))

             precision    recall  f1-score   support

        0.0       0.83      0.40      0.53       663
        1.0       0.83      0.97      0.90      2026

avg / total       0.83      0.83      0.81      2689

[[ 262  401]
 [  55 1971]]


# Proton vs. Carbon vs. Junk (multiclass)

In [50]:
# Three-class labelling:
#   protons -> 0
#   carbons -> 1
#   junk    -> 2

# Junk labels for the multiclass task: same shape as the binary junk
# labels, but filled with 2 instead of 1.
junkMC_real_labels = np.full(shape=junk_real_labels.shape, fill_value=2)

# Features and labels stacked in the same class order: proton, carbon, junk.
MC_real = sp.sparse.vstack((p_real, C_real, junk_real), format='csr')
MC_real_labels = np.concatenate([p_real_labels, C_real_labels, junkMC_real_labels])

In [53]:
# Logistic regression for the three-class task, configured with
# multi_class='ovr'; `reg` is handed to the estimator as its `C` parameter.
reg = 1e-3
lr_MC = LogisticRegression(multi_class='ovr', C=reg)

In [54]:
# 10-fold cross-validated scores for the multiclass model, one value per fold.
scores = cross_val_score(estimator=lr_MC, X=MC_real, y=MC_real_labels, cv=10)
print(scores)
# Report the mean fold score with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

[ 0.74814815  0.7         0.77777778  0.78066914  0.79925651  0.81040892
  0.75        0.77985075  0.76492537  0.69776119]
Accuracy: 0.76 (+/- 0.07)


In [55]:
# Cross-validated (cv=10) predictions for the three-class set, then the
# overall accuracy of those predictions against the true labels.
MC_pred = cross_val_predict(estimator=lr_MC, X=MC_real, y=MC_real_labels, cv=10)
print(metrics.accuracy_score(y_true=MC_real_labels, y_pred=MC_pred))

0.760877649684


In [56]:
# Per-class precision / recall / F1 for the cross-validated predictions,
# followed by the 3x3 confusion matrix (rows: true label, columns: predicted).
print(metrics.classification_report(y_true=MC_real_labels, y_pred=MC_pred))
print(metrics.confusion_matrix(y_true=MC_real_labels, y_pred=MC_pred))

             precision    recall  f1-score   support

        0.0       0.84      0.50      0.63       663
        1.0       0.60      0.33      0.42       340
        2.0       0.76      0.95      0.85      1686

avg / total       0.76      0.76      0.74      2689

[[ 330   15  318]
 [  42  111  187]
 [  21   60 1605]]
