In [None]:
# Import basic packages
import numpy as np
import scipy as sp
import pandas as pd
from astropy.io import fits


# ==== Scikit-learn =======================
# Preprocessing
from sklearn.preprocessing import StandardScaler #Standar scaler for standardization
from sklearn.preprocessing import RobustScaler #Robust scaler for high dispersion
from sklearn.model_selection import train_test_split # For random split

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# ==========================================
# Matplotlib, urllib, etc.
import urllib
import urllib.request
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from PIL import Image
%matplotlib inline

In [None]:
from sklearn.metrics import confusion_matrix

def rates(y_true, y_pred):
    """Return the binary confusion-matrix counts as a flat float array.

    Parameters
    ----------
    y_true : array-like
        True class labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.

    Returns
    -------
    numpy.ndarray
        Four floats in the order ``[TN, FN, TP, FP]``.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when a sample contains only
    # one class; without it the matrix collapses to 1x1 and indexing the
    # missing row/column raises IndexError.
    CM = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Row = true label, column = predicted label, so ravel() yields
    # TN, FP, FN, TP.
    TN, FP, FN, TP = CM.ravel()

    return np.array([TN, FN, TP, FP], dtype=float)

### Import feature matrix and labels

In [None]:
# Training feature matrix and its labels, read from .npy files.
features_path = "/data/des81.a/data/kherron/LSBG/trainingfiles/training_new/X_mat_v4_a.npy"
labels_path = "/data/des81.a/data/kherron/LSBG/trainingfiles/training_new/y_lab_v4_a.npy"

burcin = np.load(features_path)
burcin_l = np.load(labels_path)

In [None]:
# Working names for the loaded feature matrix and label vector.
X_feat_real, y_lab_real = burcin, burcin_l

In [None]:
# Split LSBGs and artifacts separately into train and hold-out samples
# (70% train, 30% held out), so both classes are split in the same ratio.

# Boolean masks over the labels: 1 selects LSBGs, 0 selects artifacts.
index_art = (y_lab_real == 0)
index_real = (y_lab_real == 1)

X_lsbg, y_lsbg = X_feat_real[index_real], y_lab_real[index_real]
X_art, y_art = X_feat_real[index_art], y_lab_real[index_art]

# Same fraction and seed for both classes.
split_opts = dict(train_size=0.70, random_state=42)
X_train_art, X_test_art, y_train_art, y_test_art = train_test_split(
    X_art, y_art, **split_opts)
X_train_lsbg, X_test_lsbg, y_train_lsbg, y_test_lsbg = train_test_split(
    X_lsbg, y_lsbg, **split_opts)

print('===========Train==Test')
print("ARTIFACTS:", len(X_train_art), len(X_test_art))
print("LSBGs    :", len(X_train_lsbg), len(X_test_lsbg))

In [None]:
# Create training arrays: artifacts first, then LSBGs.
X_train = np.concatenate([X_train_art, X_train_lsbg], axis=0)
y_train = np.concatenate([y_train_art, y_train_lsbg], axis=0)

# Fit the robust scaler on the training features only, then scale them.
# The same fitted scaler is reused for every later sample.
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
#Make the classifier
# random_state is fixed (same seed as the train/test splits) so that a
# Restart & Run All rebuilds the same forest and reproduces the rates
# computed from its predictions.
# Previously commented-out settings, kept for reference: criterion='gini',
# max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=12,
# n_estimators=240.
model_rf_fin = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_fin.fit(X_train,y_train)


In [None]:
# Make test set and validation set: split each class's hold-out sample in half.
# The test halves get their own names so the inputs (X_test_art, X_test_lsbg,
# y_test_art, y_test_lsbg) are not overwritten; re-running this cell then
# reproduces the same split instead of halving the test sample again.
half_opts = dict(train_size=0.50, random_state=42)
X_tst_art, X_validate_art, y_tst_art, y_validate_art = train_test_split(
    X_test_art, y_test_art, **half_opts)
X_tst_lsbg, X_validate_lsbg, y_tst_lsbg, y_validate_lsbg = train_test_split(
    X_test_lsbg, y_test_lsbg, **half_opts)

# Test sample, LSBGs first and then artifacts. X_test_1 keeps the unscaled
# features; X_test is the scaled copy fed to the classifier.
X_test_1 = np.concatenate((X_tst_lsbg, X_tst_art))
y_test = np.concatenate((y_tst_lsbg, y_tst_art))
X_test = scaler.transform(X_test_1)

y_pred = model_rf_fin.predict(X_test)

In [None]:
# Bin the test sample on column 16 of the unscaled features
# (called mu_g in this notebook's bin labels).
mu_g_test = X_test_1[:, 16]

# Bin 1: 24 <= mu_g < 25
sel = (mu_g_test < 25.0) & (mu_g_test >= 24.0)
y_test_bin1, y_pred_bin1 = y_test[sel], y_pred[sel]

# Bin 2: 25 <= mu_g < 26
sel = (mu_g_test < 26.0) & (mu_g_test >= 25.0)
y_test_bin2, y_pred_bin2 = y_test[sel], y_pred[sel]

# Bin 3: 26 <= mu_g <= 27 (upper edge included)
sel = (mu_g_test <= 27.0) & (mu_g_test >= 26.0)
y_test_bin3, y_pred_bin3 = y_test[sel], y_pred[sel]


In [None]:
# Confusion counts for bin 1 of the test sample, printed under a header row.
rate_bin1_1 = rates(y_test_bin1, y_pred_bin1)
print("  TN      FN     TP     FP", rate_bin1_1, sep="\n")


In [None]:
# Unpack the bin-1 counts in the order rates() returns them.
TN, FN, TP, FP = rate_bin1_1

# Shared denominators: actual class sizes and predicted class sizes.
actual_pos, actual_neg = TP + FN, TN + FP
called_pos, called_neg = TP + FP, TN + FN

TPR_bin1_1 = TP / actual_pos   # sensitivity, hit rate, recall
TNR_bin1_1 = TN / actual_neg   # specificity
PPV_bin1_1 = TP / called_pos   # precision
NPV_bin1_1 = TN / called_neg   # negative predictive value
FPR_bin1_1 = FP / actual_neg   # fall out
FNR_bin1_1 = FN / actual_pos   # miss rate
FDR_bin1_1 = FP / called_pos   # false discovery rate
ACC_bin1_1 = (TP + TN) / (TP + FP + FN + TN)

for label, value in (
    ("true positive rate:", TPR_bin1_1),
    ("true negative rate:", TNR_bin1_1),
    ("positive predictive value:", PPV_bin1_1),
    ("negative predictive value:", NPV_bin1_1),
    ("false positive rate:", FPR_bin1_1),
    ("false negative rate:", FNR_bin1_1),
    ("false discovery rate:", FDR_bin1_1),
    ("Overall accuracy:", ACC_bin1_1),
):
    print(label, value)

### BIN2: 25 <= mu_g < 26

In [None]:
# Confusion counts for bin 2 of the test sample, printed under a header row.
rate_bin2 = rates(y_test_bin2, y_pred_bin2)
print("  TN      FN     TP     FP", rate_bin2, sep="\n")


In [None]:
# Unpack the bin-2 counts in the order rates() returns them.
TN, FN, TP, FP = rate_bin2

# Shared denominators: actual class sizes and predicted class sizes.
actual_pos, actual_neg = TP + FN, TN + FP
called_pos, called_neg = TP + FP, TN + FN

TPR_bin2_1 = TP / actual_pos   # sensitivity, hit rate, recall
TNR_bin2_1 = TN / actual_neg   # specificity
PPV_bin2_1 = TP / called_pos   # precision
NPV_bin2_1 = TN / called_neg   # negative predictive value
FPR_bin2_1 = FP / actual_neg   # fall out
FNR_bin2_1 = FN / actual_pos   # miss rate
FDR_bin2_1 = FP / called_pos   # false discovery rate
ACC_bin2_1 = (TP + TN) / (TP + FP + FN + TN)

for label, value in (
    ("true positive rate:", TPR_bin2_1),
    ("true negative rate:", TNR_bin2_1),
    ("positive predictive value:", PPV_bin2_1),
    ("negative predictive value:", NPV_bin2_1),
    ("false positive rate:", FPR_bin2_1),
    ("false negative rate:", FNR_bin2_1),
    ("false discovery rate:", FDR_bin2_1),
    ("Overall accuracy:", ACC_bin2_1),
):
    print(label, value)

### BIN3: 26 <= mu_g <= 27

In [None]:
# Confusion counts for bin 3 of the test sample, printed under a header row.
rate_bin3 = rates(y_test_bin3, y_pred_bin3)
print("  TN      FN     TP     FP", rate_bin3, sep="\n")


In [None]:
# Unpack the bin-3 counts in the order rates() returns them.
TN, FN, TP, FP = rate_bin3

# Shared denominators: actual class sizes and predicted class sizes.
actual_pos, actual_neg = TP + FN, TN + FP
called_pos, called_neg = TP + FP, TN + FN

TPR_bin3_1 = TP / actual_pos   # sensitivity, hit rate, recall
TNR_bin3_1 = TN / actual_neg   # specificity
PPV_bin3_1 = TP / called_pos   # precision
NPV_bin3_1 = TN / called_neg   # negative predictive value
FPR_bin3_1 = FP / actual_neg   # fall out
FNR_bin3_1 = FN / actual_pos   # miss rate
FDR_bin3_1 = FP / called_pos   # false discovery rate
ACC_bin3_1 = (TP + TN) / (TP + FP + FN + TN)

for label, value in (
    ("true positive rate:", TPR_bin3_1),
    ("true negative rate:", TNR_bin3_1),
    ("positive predictive value:", PPV_bin3_1),
    ("negative predictive value:", NPV_bin3_1),
    ("false positive rate:", FPR_bin3_1),
    ("false negative rate:", FNR_bin3_1),
    ("false discovery rate:", FDR_bin3_1),
    ("Overall accuracy:", ACC_bin3_1),
):
    print(label, value)

In [None]:
# Total number of true positives-class objects (FN + TP) across the three test bins.
print(sum(r[1] + r[2] for r in (rate_bin1_1, rate_bin2, rate_bin3)))

In [None]:
# Predict on the validation sample (LSBGs first, then artifacts).
# X_valid_1 keeps the unscaled features; X_valid is the scaled copy.
X_valid_1 = np.concatenate([X_validate_lsbg, X_validate_art], axis=0)
y_valid = np.concatenate([y_validate_lsbg, y_validate_art], axis=0)
X_valid = scaler.transform(X_valid_1)

# A disabled alternative in the original cell fed a `randoms` / `randoms_l`
# sample through the same three assignments instead.

# Note: y_pred now holds validation predictions, replacing the test-set ones.
y_pred = model_rf_fin.predict(X_valid)

In [None]:
# Bin the validation sample on column 16 of the unscaled features
# (called mu_g in this notebook's bin labels).
mu_g_valid = X_valid_1[:, 16]

# Bin 1: 24 <= mu_g < 25
sel = (mu_g_valid < 25.0) & (mu_g_valid >= 24.0)
y_valid_bin1, y_pred_bin1 = y_valid[sel], y_pred[sel]

# Bin 2: 25 <= mu_g < 26
sel = (mu_g_valid < 26.0) & (mu_g_valid >= 25.0)
y_valid_bin2, y_pred_bin2 = y_valid[sel], y_pred[sel]

# Bin 3: 26 <= mu_g <= 27 (upper edge included)
sel = (mu_g_valid <= 27.0) & (mu_g_valid >= 26.0)
y_valid_bin3, y_pred_bin3 = y_valid[sel], y_pred[sel]

In [None]:
# Count the predicted negatives (label 0) in each validation bin.
# The entries must be counted, not summed: summing the selected values only
# adds up zeros and always yields 0 regardless of how many there are.
pn_bin1 = len(y_pred_bin1[y_pred_bin1 == 0.])
pn_bin2 = len(y_pred_bin2[y_pred_bin2 == 0.])
pn_bin3 = len(y_pred_bin3[y_pred_bin3 == 0.])

In [None]:
# Show which distinct labels occur among the bin-3 predictions.
np.unique(y_pred_bin3)

In [None]:
# Estimate the number of true negatives per bin from the predicted-negative
# count and the test-set rates, then compare with the validation labels.
def _true_negatives(n_called_neg, n_total, tnr, fnr):
    """Solve n_called_neg = tnr*N + fnr*(n_total - N) for the negative count N."""
    return (n_called_neg - (fnr * n_total)) / (tnr - fnr)

N_bin1 = _true_negatives(pn_bin1, len(y_pred_bin1), TNR_bin1_1, FNR_bin1_1)
N_bin2 = _true_negatives(pn_bin2, len(y_pred_bin2), TNR_bin2_1, FNR_bin2_1)
N_bin3 = _true_negatives(pn_bin3, len(y_pred_bin3), TNR_bin3_1, FNR_bin3_1)

report = (
    ('BIN 1:', N_bin1, y_valid_bin1, y_pred_bin1),
    ('BIN 2:', N_bin2, y_valid_bin2, y_pred_bin2),
    ('BIN 3:', N_bin3, y_valid_bin3, y_pred_bin3),
)
for i, (title, n_est, truth, called) in enumerate(report):
    if i:
        print()  # blank line between bins
    print(title)
    print('Predicted negatives:', n_est)
    print('Actual negatives:', len(truth[truth == 0.]))
    print('Total sample size:', len(called))

In [None]:
# Evaluation features: load the unscaled matrix, then scale it with the
# scaler that was fitted on the training set.
eval_feat_path = '/data/des81.a/data/kherron/LSBG/Default_Robust/X_eval_feat.npy'
X_evaluation_feat = np.load(eval_feat_path)
X_eval = scaler.transform(X_evaluation_feat)

In [None]:
# Classify the scaled evaluation features with the fitted random forest.
y_eval_pred = model_rf_fin.predict(X_eval)

In [None]:
# Per-bin predicted-positive count (pp_*) and bin size (T_*) for the
# evaluation sample, binned on column 16 of the unscaled features.
mu_g_eval = X_evaluation_feat[:, 16]

# Bin 1: 24 <= mu_g < 25
sel = (mu_g_eval >= 24.0) & (mu_g_eval < 25.0)
pp_bin1 = sum(y_eval_pred[sel])
T_bin1 = len(y_eval_pred[sel])

# Bin 2: 25 <= mu_g < 26
sel = (mu_g_eval >= 25.0) & (mu_g_eval < 26.0)
pp_bin2 = sum(y_eval_pred[sel])
T_bin2 = len(y_eval_pred[sel])

# Bin 3: 26 <= mu_g <= 27 (upper edge included)
sel = (mu_g_eval >= 26.0) & (mu_g_eval <= 27.0)
# Must be pp_bin3: assigning to pp_bin2 here overwrote the bin-2 count and
# left pp_bin3 undefined for the estimate that uses it.
pp_bin3 = sum(y_eval_pred[sel])
T_bin3 = len(y_eval_pred[sel])

In [None]:
# Estimate the number of true positives per evaluation bin from the
# predicted-positive count and the test-set rates.
def _true_positives(n_called_pos, n_total, tpr, fpr):
    """Solve n_called_pos = tpr*P + fpr*(n_total - P) for the positive count P."""
    return (n_called_pos - (fpr * n_total)) / (tpr - fpr)

P_bin1 = _true_positives(pp_bin1, T_bin1, TPR_bin1_1, FPR_bin1_1)
P_bin2 = _true_positives(pp_bin2, T_bin2, TPR_bin2_1, FPR_bin2_1)
P_bin3 = _true_positives(pp_bin3, T_bin3, TPR_bin3_1, FPR_bin3_1)

In [None]:
# Report the estimated positives per bin. Each row carries its own estimate,
# so bin 3 is compared against P_bin3 (it previously reused P_bin2).
# NOTE(review): the error line compares validation-set positives and bin
# sizes with an estimate made on the evaluation sample - confirm that these
# are meant to be the same objects.
bins = (
    ('BIN 1:', P_bin1, T_bin1, y_valid_bin1, y_pred_bin1),
    ('BIN 2:', P_bin2, T_bin2, y_valid_bin2, y_pred_bin2),
    ('BIN 3:', P_bin3, T_bin3, y_valid_bin3, y_pred_bin3),
)
for i, (title, p_est, n_total, truth, called) in enumerate(bins):
    if i:
        print()  # blank line between bins
    print(title)
    print('Predicted positives:', p_est)
    print('Total sample size:', n_total)
    # Summing the entries equal to 1 counts the actual positives.
    print('% of sample error:', (sum(truth[truth == 1]) - p_est) / len(called))

In [None]:
# Display the bin-1 positive predictive value.
# NOTE(review): a dangling `*` with no right-hand operand was removed here
# because it made the cell a SyntaxError; add the intended factor if one was meant.
PPV_bin1_1

In [None]:
# Remove every variable currently held in IPython's %store database.
%store -z

In [None]:
tpr_bin0_def = TPR_bin1_1
tpr_bin1_def = TPR_bin2_1
tpr_bin2_def = TPR_bin3_1
tnr_bin0_def = TNR_bin1_1
tnr_bin1_def = TNR_bin2_1
tnr_bin2_def = TNR_bin3_1
fpr_bin0_def = FPR_bin1_1
fpr_bin1_def = FPR_bin2_1
fpr_bin2_def = FPR_bin3_1
%store tpr_bin0_def tpr_bin1_def tpr_bin2_def
%store tnr_bin0_def tnr_bin1_def tnr_bin2_def
%store fpr_bin0_def fpr_bin1_def fpr_bin2_def