In [1]:
# Import basic packages
import numpy as np
import scipy as sp
import pandas as pd
from astropy.io import fits


# ==== Scikit-learn =======================
# Preprocessing
from sklearn.preprocessing import StandardScaler #Standar scaler for standardization
from sklearn.model_selection import train_test_split # For random split

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# ==========================================
# Matplotlib, urlib etc 
import urllib
import urllib.request
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from PIL import Image
%matplotlib inline

In [2]:
from sklearn.metrics import confusion_matrix

def rates(y_true, y_pred):
    """Return the binary confusion-matrix counts as ``[TN, FN, TP, FP]``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. This notebook uses 0 for the negative class
        and 1 for the positive class.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    numpy.ndarray
        Float array of length 4 holding, in this order, the number of
        true negatives, false negatives, true positives and false positives.
    """
    # Pin the label order: without `labels`, a sample that contains only one
    # class yields a 1x1 matrix and indexing CM[1][0] raises IndexError.
    CM = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Row = true label, column = predicted label, so the flattened order is
    # TN, FP, FN, TP.
    TN, FP, FN, TP = CM.ravel()

    return np.array([TN, FN, TP, FP], dtype=float)

### Import feature matrix and labels

In [72]:
# Directories holding the saved feature matrices (X_*) and label vectors (y_*).
# Kept in one place so the notebook can be pointed at another location easily.
RANDOMS_DIR = "/data/des81.a/data/kherron/LSBG/trainingfiles/Randoms"
BURCIN_DIR = "/data/des80.b/data/burcinmp/y6_lsbg/y6/test_classifier/random_forest/v3"

# "40k" randoms sample.
# NOTE(review): `randoms` / `randoms_l` are not used by any cell visible in
# this notebook — confirm whether they are still needed.
randoms = np.load(RANDOMS_DIR + "/X_randoms_40k_feat.npy")
randoms_l = np.load(RANDOMS_DIR + "/y_randoms_40k_lab.npy")

# "v3" randoms sample (this is the one binned and used for training below)
randoms_e = np.load(RANDOMS_DIR + "/X_randoms_v3_feat.npy")
randoms_e_l = np.load(RANDOMS_DIR + "/y_randoms_v3_lab.npy")

# Labelled sample from the v3 random-forest test directory
burcin = np.load(BURCIN_DIR + "/X_mat_v4_a.npy")
burcin_l = np.load(BURCIN_DIR + "/y_lab_v4_a.npy")

In [73]:
# Keep only the rows of the `burcin` sample whose label equals 1.
sel = burcin_l == 1
burcin, burcin_l = burcin[sel], burcin_l[sel]

In [74]:
# Column 16 of the feature matrix is the quantity the samples are binned on
# (the evaluation cells further down refer to it as mu_g).
mu_g_randoms = randoms_e[:, 16]

# 24 <= mu_g < 25: keep at most the first 5000 rows
sel = (mu_g_randoms >= 24.0) & (mu_g_randoms < 25.0)
bin0_rands, bin0_lab = randoms_e[sel][:5000], randoms_e_l[sel][:5000]

# 25 <= mu_g < 26: keep at most the first 6000 rows
sel = (mu_g_randoms >= 25.0) & (mu_g_randoms < 26.0)
bin1_rands, bin1_lab = randoms_e[sel][:6000], randoms_e_l[sel][:6000]

# 26 <= mu_g <= 27 (upper edge inclusive): keep at most the first 12000 rows
sel = (mu_g_randoms >= 26.0) & (mu_g_randoms <= 27.0)
bin2_rands, bin2_lab = randoms_e[sel][:12000], randoms_e_l[sel][:12000]

In [75]:
# Sanity check: number of rows that ended up in the 25 <= mu_g < 26 bin
bin1_rands.shape[0]

6000

In [85]:
# Stack the label-1 `burcin` sample on top of the three binned random samples;
# features and labels are concatenated in the same order so rows stay aligned.
feature_parts = (burcin, bin0_rands, bin1_rands, bin2_rands)
label_parts = (burcin_l, bin0_lab, bin1_lab, bin2_lab)

X_feat_real = np.concatenate(feature_parts, axis=0)
y_lab_real = np.concatenate(label_parts, axis=0)

In [86]:
# Split LSBGs (label 1) and artifacts (label 0) separately into train and
# test sets (70% train / 30% test), so each class gets the same split fraction.

index_art = y_lab_real == 0
index_real = y_lab_real == 1

X_art, y_art = X_feat_real[index_art], y_lab_real[index_art]
X_lsbg, y_lsbg = X_feat_real[index_real], y_lab_real[index_real]

# Same fraction and seed for both classes
split_kwargs = dict(train_size=0.70, random_state=42)

X_train_art, X_test_art, y_train_art, y_test_art = train_test_split(
    X_art, y_art, **split_kwargs)
X_train_lsbg, X_test_lsbg, y_train_lsbg, y_test_lsbg = train_test_split(
    X_lsbg, y_lsbg, **split_kwargs)

print('===========Train==Test')
print("ARTIFACTS:", len(X_train_art), len(X_test_art))
print("LSBGs    :", len(X_train_lsbg), len(X_test_lsbg))


ARTIFACTS: 16099 6901
LSBGs    : 21332 9143


In [87]:
# Assemble the training arrays (artifacts first, then LSBGs)
train_features = (X_train_art, X_train_lsbg)
train_labels = (y_train_art, y_train_lsbg)

X_train = np.concatenate(train_features)
y_train = np.concatenate(train_labels)

# Fit the standardization on the training features only, then apply it to
# them; the fitted `scaler` is kept so the test set can reuse the same
# mean/variance.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [88]:
# Build and train the random-forest classifier.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All rebuilds the same forest and reproduces the same metrics.
model_rf_fin = RandomForestClassifier(n_estimators=100, random_state=42)
# Alternative hyper-parameters that were left commented out here:
#   criterion='entropy', max_depth=10, max_features=None,
#   min_samples_leaf=2, min_samples_split=12, n_estimators=240
model_rf_fin.fit(X_train, y_train)


In [89]:
# The metric computed here is the F1 score (not an AUC), and it is evaluated
# on the training set itself, so it is an optimistic figure rather than an
# estimate of generalization performance.
train_pred = model_rf_fin.predict(X_train)
print('F1 Score ={:.4f}'.format(metrics.f1_score(y_train, train_pred)))

AUC Score =0.9913


In [90]:
# Assemble the held-out test set (LSBGs first, then artifacts).
# X_test_1 keeps the unscaled features so column 16 can still be used for
# binning afterwards.
test_features = (X_test_lsbg, X_test_art)
test_labels = (y_test_lsbg, y_test_art)

X_test_1 = np.concatenate(test_features)
y_test = np.concatenate(test_labels)

# Standardize with the scaler fitted on the training data, then classify
X_test = scaler.transform(X_test_1)
y_pred = model_rf_fin.predict(X_test)

In [91]:
# Split the test-set truth and predictions into three bins of mu_g,
# read from column 16 of the unscaled test features.
mu_g_test = X_test_1[:, 16]

# Bin 1: 24 <= mu_g < 25
sel = (mu_g_test >= 24.0) & (mu_g_test < 25.0)
y_test_bin1, y_pred_bin1 = y_test[sel], y_pred[sel]

# Bin 2: 25 <= mu_g < 26
sel = (mu_g_test >= 25.0) & (mu_g_test < 26.0)
y_test_bin2, y_pred_bin2 = y_test[sel], y_pred[sel]

# Bin 3: 26 <= mu_g <= 27 (upper edge inclusive)
sel = (mu_g_test >= 26.0) & (mu_g_test <= 27.0)
y_test_bin3, y_pred_bin3 = y_test[sel], y_pred[sel]


In [92]:
# Confusion-matrix counts for bin 1, printed under a matching header line
rate_bin1 = rates(y_true=y_test_bin1, y_pred=y_pred_bin1)
print("  TN      FN     TP     FP", rate_bin1, sep="\n")

  TN      FN     TP     FP
[1078.  201. 5640.  411.]


In [93]:
# Bin-1 counts, in the order produced by rates(): TN, FN, TP, FP
TN, FN, TP, FP = rate_bin1

TPR_bin1_1 = TP/(TP+FN)              # sensitivity, hit rate, recall
TNR_bin1_1 = TN/(TN+FP)              # specificity
PPV_bin1_1 = TP/(TP+FP)              # precision
NPV_bin1_1 = TN/(TN+FN)              # negative predictive value
FPR_bin1_1 = FP/(FP+TN)              # fall-out
FNR_bin1_1 = FN/(TP+FN)              # miss rate
FDR_bin1_1 = FP/(TP+FP)              # false discovery rate
ACC_bin1_1 = (TP+TN)/(TP+FP+FN+TN)   # overall accuracy

# Report every metric as one "<label> <value>" line, in a fixed order
for label, value in (
    ("true positive rate:", TPR_bin1_1),
    ("true negative rate:", TNR_bin1_1),
    ("positive predictive value:", PPV_bin1_1),
    ("negative predictive value:", NPV_bin1_1),
    ("false positive rate:", FPR_bin1_1),
    ("false negative rate:", FNR_bin1_1),
    ("false discovery rate:", FDR_bin1_1),
    ("Overall accuracy:", ACC_bin1_1),
):
    print(label, value)

true positive rate: 0.965588084232152
true negative rate: 0.7239758226997985
positive predictive value: 0.932077342588002
negative predictive value: 0.8428459734167318
false positive rate: 0.27602417730020146
false negative rate: 0.034411915767847974
false discovery rate: 0.06792265741199802
Overall accuracy: 0.9165075034106412


### BIN2: 25 <= mu_g < 26

In [94]:
# Confusion-matrix counts for bin 2, printed under a matching header line
rate_bin2 = rates(y_true=y_test_bin2, y_pred=y_pred_bin2)
print("  TN      FN     TP     FP", rate_bin2, sep="\n")

  TN      FN     TP     FP
[1563.  117. 2662.  250.]


In [95]:
# Bin-2 counts, in the order produced by rates(): TN, FN, TP, FP
TN, FN, TP, FP = rate_bin2

TPR_bin2_1 = TP/(TP+FN)              # sensitivity, hit rate, recall
TNR_bin2_1 = TN/(TN+FP)              # specificity
PPV_bin2_1 = TP/(TP+FP)              # precision
NPV_bin2_1 = TN/(TN+FN)              # negative predictive value
FPR_bin2_1 = FP/(FP+TN)              # fall-out
FNR_bin2_1 = FN/(TP+FN)              # miss rate
FDR_bin2_1 = FP/(TP+FP)              # false discovery rate
ACC_bin2_1 = (TP+TN)/(TP+FP+FN+TN)   # overall accuracy

# Report every metric as one "<label> <value>" line, in a fixed order
for label, value in (
    ("true positive rate:", TPR_bin2_1),
    ("true negative rate:", TNR_bin2_1),
    ("positive predictive value:", PPV_bin2_1),
    ("negative predictive value:", NPV_bin2_1),
    ("false positive rate:", FPR_bin2_1),
    ("false negative rate:", FNR_bin2_1),
    ("false discovery rate:", FDR_bin2_1),
    ("Overall accuracy:", ACC_bin2_1),
):
    print(label, value)

true positive rate: 0.9578985246491544
true negative rate: 0.8621070049641478
positive predictive value: 0.9141483516483516
negative predictive value: 0.9303571428571429
false positive rate: 0.13789299503585217
false negative rate: 0.04210147535084563
false discovery rate: 0.08585164835164835
Overall accuracy: 0.9200783972125436


### BIN3: 26 <= mu_g <= 27

In [96]:
# Confusion-matrix counts for bin 3, printed under a matching header line
rate_bin3 = rates(y_true=y_test_bin3, y_pred=y_pred_bin3)
print("  TN      FN     TP     FP", rate_bin3, sep="\n")

  TN      FN     TP     FP
[3518.  104.  408.   81.]


In [97]:
# Bin-3 counts, in the order produced by rates(): TN, FN, TP, FP
TN, FN, TP, FP = rate_bin3

TPR_bin3_1 = TP/(TP+FN)              # sensitivity, hit rate, recall
TNR_bin3_1 = TN/(TN+FP)              # specificity
PPV_bin3_1 = TP/(TP+FP)              # precision
NPV_bin3_1 = TN/(TN+FN)              # negative predictive value
FPR_bin3_1 = FP/(FP+TN)              # fall-out
FNR_bin3_1 = FN/(TP+FN)              # miss rate
FDR_bin3_1 = FP/(TP+FP)              # false discovery rate
ACC_bin3_1 = (TP+TN)/(TP+FP+FN+TN)   # overall accuracy

# Report every metric as one "<label> <value>" line, in a fixed order
for label, value in (
    ("true positive rate:", TPR_bin3_1),
    ("true negative rate:", TNR_bin3_1),
    ("positive predictive value:", PPV_bin3_1),
    ("negative predictive value:", NPV_bin3_1),
    ("false positive rate:", FPR_bin3_1),
    ("false negative rate:", FNR_bin3_1),
    ("false discovery rate:", FDR_bin3_1),
    ("Overall accuracy:", ACC_bin3_1),
):
    print(label, value)

true positive rate: 0.796875
true negative rate: 0.9774937482634065
positive predictive value: 0.8343558282208589
negative predictive value: 0.9712865819988956
false positive rate: 0.0225062517365935
false negative rate: 0.203125
false discovery rate: 0.1656441717791411
Overall accuracy: 0.9549987837509122


In [18]:
# Total number of true LSBGs (FN + TP) that fall inside the three bins
n_lsbg_in_bins = 0
for counts in (rate_bin1, rate_bin2, rate_bin3):
    n_lsbg_in_bins = n_lsbg_in_bins + counts[1] + counts[2]
print(n_lsbg_in_bins)

9132.0


In [19]:
# Alias the per-bin true positive rates under the names that get stored.
# Note the index shift: the stored names are 0-based (bin0..bin2) while the
# metrics computed above are 1-based (bin1..bin3).
tpr_bin0_40k, tpr_bin1_40k, tpr_bin2_40k = TPR_bin1_1, TPR_bin2_1, TPR_bin3_1

In [20]:
# Persist the three per-bin true positive rates with IPython's %store magic
# (they can be read back in another session with `%store -r`).
%store tpr_bin0_40k tpr_bin1_40k tpr_bin2_40k

Stored 'tpr_bin0_40k' (float64)
Stored 'tpr_bin1_40k' (float64)
Stored 'tpr_bin2_40k' (float64)


In [21]:
# Alias the per-bin true negative rates under the names that get stored.
# Note the index shift: the stored names are 0-based (bin0..bin2) while the
# metrics computed above are 1-based (bin1..bin3).
tnr_bin0_40k, tnr_bin1_40k, tnr_bin2_40k = TNR_bin1_1, TNR_bin2_1, TNR_bin3_1

In [22]:
# Persist the three per-bin true negative rates with IPython's %store magic
# (they can be read back in another session with `%store -r`).
%store tnr_bin0_40k tnr_bin1_40k tnr_bin2_40k

Stored 'tnr_bin0_40k' (float64)
Stored 'tnr_bin1_40k' (float64)
Stored 'tnr_bin2_40k' (float64)


In [23]:
# Alias the per-bin false positive rates under the names that get stored.
# Note the index shift: the stored names are 0-based (bin0..bin2) while the
# metrics computed above are 1-based (bin1..bin3).
fpr_bin0_40k, fpr_bin1_40k, fpr_bin2_40k = FPR_bin1_1, FPR_bin2_1, FPR_bin3_1

In [24]:
# Persist the three per-bin false positive rates with IPython's %store magic
# (they can be read back in another session with `%store -r`).
%store fpr_bin0_40k fpr_bin1_40k fpr_bin2_40k

Stored 'fpr_bin0_40k' (float64)
Stored 'fpr_bin1_40k' (float64)
Stored 'fpr_bin2_40k' (float64)


In [25]:
# List every variable currently held in the %store database with its value.
# NOTE(review): the in-db values shown in the saved output differ from the
# rates printed by the evaluation cells above, and the storing cells carry
# lower execution counts (18-25) than the evaluation cells (85-97), so the
# stored numbers appear to come from an earlier run — re-run the notebook
# top to bottom before relying on them.
%store

Stored variables and their in-db values:
fpr_bin0_40k             -> 0.3002481389578164
fpr_bin0_def             -> 0.3138401559454191
fpr_bin1_40k             -> 0.17290192113245703
fpr_bin1_def             -> 0.18947368421052632
fpr_bin2_40k             -> 0.034482758620689655
fpr_bin2_def             -> 0.03735881841876629
tnr_bin0_40k             -> 0.6997518610421837
tnr_bin0_def             -> 0.6861598440545809
tnr_bin1_40k             -> 0.8270980788675429
tnr_bin1_def             -> 0.8105263157894737
tnr_bin2_40k             -> 0.9655172413793104
tnr_bin2_def             -> 0.9626411815812337
tpr_bin0_40k             -> 0.9806539976031502
tpr_bin0_def             -> 0.9863037151172744
tpr_bin1_40k             -> 0.973011874775099
tpr_bin1_def             -> 0.9787693414897445
tpr_bin2_40k             -> 0.861328125
tpr_bin2_def             -> 0.880859375
