In [1]:
# Import basic packages
import numpy as np
import scipy as sp
import pandas as pd
from astropy.io import fits


# ==== Scikit-learn =======================
# Preprocessing
from sklearn.preprocessing import StandardScaler #Standar scaler for standardization
from sklearn.model_selection import train_test_split # For random split

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# ==========================================
# Matplotlib, urlib etc 
import urllib
import urllib.request
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from PIL import Image
%matplotlib inline

In [2]:
from sklearn.metrics import confusion_matrix

def rates(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)

    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    
    # Initialize
    X_mat = np.zeros(4)
    
    # Populate
    X_mat[0] = TN
    X_mat[1] = FN
    X_mat[2] = TP
    X_mat[3] = FP
    
    return X_mat  

### Import feature matrix and labels

In [3]:
randoms = np.load("/data/des81.a/data/kherron/LSBG/trainingfiles/Randoms/X_randoms_feat.npy")
randoms_l = np.load("/data/des81.a/data/kherron/LSBG/trainingfiles/Randoms/y_randoms_lab.npy")


burcin = np.load("/data/des80.b/data/burcinmp/y6_lsbg/y6/test_classifier/random_forest/v3/X_mat_v4_a.npy")
burcin_l = np.load("/data/des80.b/data/burcinmp/y6_lsbg/y6/test_classifier/random_forest/v3/y_lab_v4_a.npy")


In [4]:
sel = (burcin_l == 1)
burcin = burcin[sel]
burcin_l = burcin_l[sel]

In [5]:
#Load in training files
X_feat_real = np.concatenate((randoms,burcin))
y_lab_real = np.concatenate((randoms_l,burcin_l))

In [6]:
#Split up LSBGs and aritfacts into train and test (70% train and 30% test)

index_real=(y_lab_real==1)
index_art=(y_lab_real==0)

X_art = X_feat_real[index_art]
X_lsbg = X_feat_real[index_real]


y_art = y_lab_real[index_art]
y_lsbg = y_lab_real[index_real]


X_train_art, X_test_art, y_train_art, y_test_art = train_test_split(X_art, y_art,
                                                                        train_size = 0.70, random_state = 42)
X_train_lsbg, X_test_lsbg, y_train_lsbg, y_test_lsbg = train_test_split(X_lsbg, y_lsbg,
                                                                        train_size = 0.70, random_state = 42)
print('===========Train==Test')
print("ARTIFACTS:",len(X_train_art), len(X_test_art))
print("LSBGs    :",len(X_train_lsbg), len(X_test_lsbg))


ARTIFACTS: 70000 30000
LSBGs    : 21332 9143


In [7]:
#Create training arrays

X_train = np.concatenate((X_train_art,X_train_lsbg))
y_train = np.concatenate((y_train_art,y_train_lsbg))

# Standardize the two sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [8]:
#Make the classifier
model_rf_fin = RandomForestClassifier(n_estimators=100)
#                                       criterion='entropy',
#                                       max_depth=10,
#                                       max_features=None)
#                                       min_samples_leaf=2,
#                                       min_samples_split=12)
#                                       n_estimators=240)
model_rf_fin.fit(X_train,y_train)


In [9]:
print('AUC Score ={:.4f}'.format(metrics.f1_score(y_train,model_rf_fin.predict(X_train))))

AUC Score =0.9883


In [10]:
# Make test set

X_test_1 = np.concatenate((X_test_lsbg,X_test_art)) 
y_test = np.concatenate((y_test_lsbg,y_test_art))

X_test = scaler.transform(X_test_1)

y_pred = model_rf_fin.predict(X_test)

In [11]:
#Make some binzzz
# 24 < mu_g < 25

sel = (X_test_1[:,16] >= 24.0) & (X_test_1[:,16] < 25.0)
X_test_bin1 = X_test[sel]
y_test_bin1 = y_test[sel]
y_pred_bin1 = y_pred[sel]

# 25 < mu_g < 26
sel = (X_test_1[:,16] >= 25.0) & (X_test_1[:,16] < 26.0)
X_test_bin2 = X_test[sel]
y_test_bin2 = y_test[sel]
y_pred_bin2 = y_pred[sel]

# 26 < mu_g < 27
sel = (X_test_1[:,16] >= 26.0) & (X_test_1[:,16] <= 27.0)
X_test_bin3 = X_test[sel]
y_test_bin3 = y_test[sel]
y_pred_bin3 = y_pred[sel]


### BIN1: 24. < mu_g < 25

In [12]:
rate_bin1_1 = rates(y_test_bin1,y_pred_bin1)
print("  TN      FN     TP     FP")
print(rate_bin1_1)


  TN      FN     TP     FP
[1372.  221. 5620.  549.]


In [13]:
TN = rate_bin1_1[0]
FN = rate_bin1_1[1]
TP = rate_bin1_1[2]
FP = rate_bin1_1[3]

# Sensitivity, hit rate, recall, or true positive rate
TPR_bin1_1 = TP/(TP+FN)
print("true positive rate:",TPR_bin1_1)
# Specificity or true negative rate
TNR_bin1_1 = TN/(TN+FP) 
print("true negative rate:",TNR_bin1_1)
# Precision or positive predictive value
PPV_bin1_1 = TP/(TP+FP)
print("positive predictive value:",PPV_bin1_1)
# Negative predictive value
NPV_bin1_1 = TN/(TN+FN)
print("negative predictive value:",NPV_bin1_1)
# Fall out or false positive rate
FPR_bin1_1 = FP/(FP+TN)
print("false positive rate:",FPR_bin1_1)
# False negative rate
FNR_bin1_1 = FN/(TP+FN)
print("false negative rate:",FNR_bin1_1)
# False discovery rate
FDR_bin1_1 = FP/(TP+FP)
print("false discovery rate:",FDR_bin1_1)

# Overall accuracy
ACC_bin1_1 = (TP+TN)/(TP+FP+FN+TN)
print("Overall accuracy:",ACC_bin1_1)

true positive rate: 0.9621640130114706
true negative rate: 0.7142113482561167
positive predictive value: 0.9110066461338953
negative predictive value: 0.8612680477087257
false positive rate: 0.2857886517438834
false negative rate: 0.037835986988529365
false discovery rate: 0.08899335386610471
Overall accuracy: 0.9007987632053595


### BIN2: 25. <= mu_g<= 26

In [14]:
rate_bin2 = rates(y_test_bin2,y_pred_bin2)

print("  TN      FN     TP     FP")
print(rate_bin2)


  TN      FN     TP     FP
[2236.  153. 2626.  282.]


In [15]:
TN = rate_bin2[0]
FN = rate_bin2[1]
TP = rate_bin2[2]
FP = rate_bin2[3]

# Sensitivity, hit rate, recall, or true positive rate
TPR_bin2_1 = TP/(TP+FN)
print("true positive rate:",TPR_bin2_1)
# Specificity or true negative rate
TNR_bin2_1 = TN/(TN+FP) 
print("true negative rate:",TNR_bin2_1)
# Precision or positive predictive value
PPV_bin2_1 = TP/(TP+FP)
print("positive predictive value:",PPV_bin2_1)
# Negative predictive value
NPV_bin2_1 = TN/(TN+FN)
print("negative predictive value:",NPV_bin2_1)
# Fall out or false positive rate
FPR_bin2_1 = FP/(FP+TN)
print("false positive rate:",FPR_bin2_1)
# False negative rate
FNR_bin2_1 = FN/(TP+FN)
print("false negative rate:",FNR_bin2_1)
# False discovery rate
FDR_bin2_1 = FP/(TP+FP)
print("false discovery rate:",FDR_bin2_1)

# Overall accuracy
ACC_bin2_1 = (TP+TN)/(TP+FP+FN+TN)
print("Overall accuracy:",ACC_bin2_1)

true positive rate: 0.9449442245412019
true negative rate: 0.8880063542494043
positive predictive value: 0.9030261348005502
negative predictive value: 0.9359564671410632
false positive rate: 0.11199364575059571
false negative rate: 0.055055775458798126
false discovery rate: 0.0969738651994498
Overall accuracy: 0.9178780441759486


### BIN3: 26. <= mu_g<= 27

In [16]:
rate_bin3 = rates(y_test_bin3,y_pred_bin3)

print("  TN      FN     TP     FP")
print(rate_bin3)


  TN      FN     TP     FP
[4628.  126.  386.   83.]


In [17]:
TN = rate_bin3[0]
FN = rate_bin3[1]
TP = rate_bin3[2]
FP = rate_bin3[3]

# Sensitivity, hit rate, recall, or true positive rate
TPR_bin3_1 = TP/(TP+FN)
print("true positive rate:",TPR_bin3_1)
# Specificity or true negative rate
TNR_bin3_1 = TN/(TN+FP) 
print("true negative rate:",TNR_bin3_1)
# Precision or positive predictive value
PPV_bin3_1 = TP/(TP+FP)
print("positive predictive value:",PPV_bin3_1)
# Negative predictive value
NPV_bin3_1 = TN/(TN+FN)
print("negative predictive value:",NPV_bin3_1)
# Fall out or false positive rate
FPR_bin3_1 = FP/(FP+TN)
print("false positive rate:",FPR_bin3_1)
# False negative rate
FNR_bin3_1 = FN/(TP+FN)
print("false negative rate:",FNR_bin3_1)
# False discovery rate
FDR_bin3_1 = FP/(TP+FP)
print("false discovery rate:",FDR_bin3_1)

# Overall accuracy
ACC_bin3_1 = (TP+TN)/(TP+FP+FN+TN)
print("Overall accuracy:",ACC_bin3_1)

true positive rate: 0.75390625
true negative rate: 0.98238165994481
positive predictive value: 0.8230277185501066
negative predictive value: 0.9734960033655868
false positive rate: 0.01761834005518998
false negative rate: 0.24609375
false discovery rate: 0.17697228144989338
Overall accuracy: 0.9599846831322995


In [18]:
print(rate_bin1_1[1]+rate_bin1_1[2]+rate_bin2[1]+rate_bin2[2]+rate_bin3[1]+rate_bin3[2])

9132.0


In [19]:
1-TNR_bin1_1 - FPR_bin1_1

-5.551115123125783e-17

In [20]:
(1-TNR_bin2_1) - FPR_bin2_1

-4.163336342344337e-17

In [21]:
(1-TNR_bin3_1) - FPR_bin3_1

4.85722573273506e-17

In [22]:
tpr_bin0_100k = TPR_bin1_1
tpr_bin1_100k = TPR_bin2_1
tpr_bin2_100k = TPR_bin3_1
tnr_bin0_100k = TNR_bin1_1
tnr_bin1_100k = TNR_bin2_1
tnr_bin2_100k = TNR_bin3_1
fpr_bin0_100k = FPR_bin1_1
fpr_bin1_100k = FPR_bin2_1
fpr_bin2_100k = FPR_bin3_1
%store tpr_bin0_100k tpr_bin1_100k tpr_bin2_100k
%store tnr_bin0_100k tnr_bin1_100k tnr_bin2_100k
%store fpr_bin0_100k fpr_bin1_100k fpr_bin2_100k

Stored 'tpr_bin0_100k' (float64)
Stored 'tpr_bin1_100k' (float64)
Stored 'tpr_bin2_100k' (float64)
Stored 'tnr_bin0_100k' (float64)
Stored 'tnr_bin1_100k' (float64)
Stored 'tnr_bin2_100k' (float64)
Stored 'fpr_bin0_100k' (float64)
Stored 'fpr_bin1_100k' (float64)
Stored 'fpr_bin2_100k' (float64)
