In [1]:
# Import basic packages
import numpy as np
import scipy as sp
import pandas as pd
from astropy.io import fits


# ==== Scikit-learn =======================
# Preprocessing
from sklearn.preprocessing import StandardScaler #Standar scaler for standardization
from sklearn.preprocessing import RobustScaler #Robust scaler for high dispersion
from sklearn.model_selection import train_test_split # For random split

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# ==========================================
# Matplotlib, urlib etc 
import urllib
import urllib.request
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from PIL import Image
%matplotlib inline

In [2]:
from sklearn.metrics import confusion_matrix

def rates(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)

    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    
    # Initialize
    X_mat = np.zeros(4)
    
    # Populate
    X_mat[0] = TN
    X_mat[1] = FN
    X_mat[2] = TP
    X_mat[3] = FP
    
    return X_mat  

### Import feature matrix and labels

In [3]:
burcin = np.load("/data/des81.a/data/kherron/LSBG/trainingfiles/training_new/X_mat_v4_a.npy")
burcin_l = np.load("/data/des81.a/data/kherron/LSBG/trainingfiles/training_new/y_lab_v4_a.npy")


In [4]:
X_feat_real = burcin
y_lab_real = burcin_l

In [5]:
#Split up LSBGs and aritfacts into train and test (70% train and 30% test)

index_real=(y_lab_real==1)
index_art=(y_lab_real==0)

X_art = X_feat_real[index_art]
X_lsbg = X_feat_real[index_real]

y_art = y_lab_real[index_art]
y_lsbg = y_lab_real[index_real]

X_train_art, X_test_art, y_train_art, y_test_art = train_test_split(X_art, y_art,
                                                                        train_size = 0.70, random_state = 42)
X_train_lsbg, X_test_lsbg, y_train_lsbg, y_test_lsbg = train_test_split(X_lsbg, y_lsbg,
                                                                        train_size = 0.70, random_state = 42)

print('===========Train==Test')
print("ARTIFACTS:",len(X_train_art), len(X_test_art))
print("LSBGs    :",len(X_train_lsbg), len(X_test_lsbg))

ARTIFACTS: 25604 10974
LSBGs    : 21332 9143


In [6]:
#Create training arrays

X_train = np.concatenate((X_train_art,X_train_lsbg))
y_train = np.concatenate((y_train_art,y_train_lsbg))

# Standardize the two sets
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [8]:
#Make the classifier
model_rf_fin = RandomForestClassifier(n_estimators=100)
#                                       criterion='gini',
#                                       max_depth=10,
#                                       max_features=None)
#                                       min_samples_leaf=2,
#                                       min_samples_split=12)
#                                       n_estimators=240)
model_rf_fin.fit(X_train,y_train)


In [9]:
print('AUC Score ={:.4f}'.format(metrics.f1_score(y_train,model_rf_fin.predict(X_train))))

AUC Score =0.9995


In [10]:
# Make test set and validation set
X_test_art, X_validate_art, y_test_art, y_validate_art = train_test_split(X_test_art, y_test_art,
                                                                        train_size = 0.50, random_state = 42)
X_test_lsbg, X_validate_lsbg, y_test_lsbg, y_validate_lsbg = train_test_split(X_test_lsbg, y_test_lsbg,
                                                                        train_size = 0.50, random_state = 42)
X_test_1 = np.concatenate((X_test_lsbg,X_test_art)) 
y_test = np.concatenate((y_test_lsbg,y_test_art))
X_test = scaler.transform(X_test_1)

y_pred = model_rf_fin.predict(X_test)

In [11]:
len(X_validate_art)

5487

In [12]:
rate_bin1_1 = rates(y_test,y_pred)
print("  TN      FN     TP     FP")
print(rate_bin1_1)


  TN      FN     TP     FP
[5345.   91. 4480.  142.]


In [13]:
TN = rate_bin1_1[0]
FN = rate_bin1_1[1]
TP = rate_bin1_1[2]
FP = rate_bin1_1[3]

# Sensitivity, hit rate, recall, or true positive rate
TPR_bin1_1 = TP/(TP+FN)
print("true positive rate:",TPR_bin1_1)
# Specificity or true negative rate
TNR_bin1_1 = TN/(TN+FP) 
print("true negative rate:",TNR_bin1_1)
# Precision or positive predictive value
PPV_bin1_1 = TP/(TP+FP)
print("positive predictive value:",PPV_bin1_1)
# Negative predictive value
NPV_bin1_1 = TN/(TN+FN)
print("negative predictive value:",NPV_bin1_1)
# Fall out or false positive rate
FPR_bin1_1 = FP/(FP+TN)
print("false positive rate:",FPR_bin1_1)
# False negative rate
FNR_bin1_1 = FN/(TP+FN)
print("false negative rate:",FNR_bin1_1)
# False discovery rate
FDR_bin1_1 = FP/(TP+FP)
print("false discovery rate:",FDR_bin1_1)

# Overall accuracy
ACC_bin1_1 = (TP+TN)/(TP+FP+FN+TN)
print("Overall accuracy:",ACC_bin1_1)

true positive rate: 0.9800918836140888
true negative rate: 0.9741206488062694
positive predictive value: 0.9692773691042839
negative predictive value: 0.9832597498160413
false positive rate: 0.025879351193730638
false negative rate: 0.019908116385911178
false discovery rate: 0.03072263089571614
Overall accuracy: 0.9768343607078942


In [34]:
#Predict on validation set
X_valid_1 = np.concatenate((X_validate_lsbg,X_validate_art)) 
y_valid = np.concatenate((y_validate_lsbg,y_validate_art))
X_valid = scaler.transform(X_valid_1)

y_pred = model_rf_fin.predict(X_valid)

In [35]:
#Generate predicted positive population
pp = sum(y_pred[y_pred == 1.])

In [37]:
#Check it out and hope for the best :)
P = (pp - (FNR_bin1_1 * len(y_pred))) / (TNR_bin1_1 - FNR_bin1_1)

print('Predicted positive population:',P)
print('Classified positives:',len(y_pred[y_pred==1]))
print('Actual positives:', len(y_valid[y_valid == 1.]))
print('Total sample size:', len(y_pred))

Predicted positive population: 4671.647149683792
Classified positives: 4658
Actual positives: 4572
Total sample size: 10059


In [29]:
X_evaluation_feat = np.load('/data/des81.a/data/kherron/LSBG/Default_Robust/X_eval_feat.npy')
X_eval = scaler.transform(X_evaluation_feat)

In [30]:
y_eval_pred = model_rf_fin.predict(X_eval)

In [31]:
pp = len(y_eval_pred[y_eval_pred==1])
P = (pp - (FPR_bin1_1 * len(y_eval_pred))) / (TPR_bin1_1 - FPR_bin1_1)

In [33]:
print('Predicted positive population:',P)
print('Classified positives:', pp)
print('Total sample size:', len(y_eval_pred))

Predicted positive population: 32114.105350934813
Classified positives: 82834
Total sample size: 2016678
