# Library

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import AutoMinorLocator
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import preprocessing
from astroML.utils import completeness_contamination
from sklearn.metrics import precision_recall_curve, roc_curve
from astroML.utils import split_samples, completeness_contamination
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve#
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import itertools
import datetime
from pandas.plotting import scatter_matrix

# Load File

In [2]:
# Load the catalogue and separate the target column from the feature columns.
big_frame = pd.read_csv("big_frame_1.csv")
type_ID = big_frame['type_ID']
big_frame = big_frame.drop('type_ID', axis=1)

# Hold out a test set (train_test_split defaults; the split is not seeded).
X_train, X_test, y_train, y_test = train_test_split(big_frame, type_ID, random_state=None)

# Standardized copies of the features.
# The scaler is fitted on the training split ONLY and then applied unchanged
# to the test split; re-fitting on the test split would scale the two sets
# with different parameters and leak test statistics into the preprocessing.
standardizer = StandardScaler()
X_std_train = standardizer.fit_transform(X_train)
X_std_test = standardizer.transform(X_test)

# Min-max scaled copies of the features (same fit-on-train-only rule).
minmax = MinMaxScaler()
X_mm_train = minmax.fit_transform(X_train)
X_mm_test = minmax.transform(X_test)

# Function

In [3]:
def print_score(y_test, y_pred, time1, time2):
    """Print a classification report for one set of predictions.

    The following are written to stdout, separated by blank lines:

    * the confusion matrix of ``y_test`` versus ``y_pred``;
    * the accuracy score (for multilabel input scikit-learn computes subset
      accuracy: the predicted label set must match the true set exactly);
    * astroML's completeness and contamination, where
      completeness  = TP / (TP + FN) and
      contamination = FP / (TP + FP).
      Completeness is the fraction of true detections recovered by the
      classifier; contamination is the fraction of detections that are
      misclassified. One minus the contamination is often called the
      "efficiency" in astronomy; efficiency and completeness correspond to
      "precision" and "recall" in machine learning;
    * precision = TP / (TP + FP) and recall = TP / (TP + FN), computed with
      scikit-learn's default settings;
    * the F1 score, i.e. the harmonic mean of precision and recall, which
      weights low values more heavily than the arithmetic mean does;
    * the elapsed time ``time2 - time1``.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    time1, time2 : objects supporting subtraction
        Start and end markers; only ``time2 - time1`` is used (the notebook
        passes ``datetime.datetime`` values).

    Returns
    -------
    None
    """
    # Compute the matrix once, under a name that does not shadow the
    # module-level ``cm`` alias (matplotlib.cm).
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("confusion matrix = ")
    print(conf_matrix)

    print("")
    print('accuracy_score = ', accuracy_score(y_test, y_pred))

    print("")
    # astroML expects (predicted, true) in that order.
    completeness, contamination = completeness_contamination(y_pred, y_test)
    print("completeness = ", completeness)
    print("contamination = ", contamination)

    print("")
    print("precision score = ", precision_score(y_test, y_pred))
    print("recall score = ", recall_score(y_test, y_pred))

    print("")
    print("F1 score = ", f1_score(y_test, y_pred))

    print("")
    print("time span= ", time2 - time1)

In [4]:
def score(y_test, y_pred, time1, time2):
    """Collect the evaluation metrics for one set of predictions.

    Returns an 8-tuple, in this order: confusion matrix, accuracy,
    completeness, contamination, precision, recall, F1 score and the elapsed
    time ``time2 - time1``. Completeness and contamination come from astroML
    (called as ``(predicted, true)``) and are each reduced with ``np.max``.
    """
    comp, cont = completeness_contamination(y_pred, y_test)
    return (
        confusion_matrix(y_test, y_pred),
        accuracy_score(y_test, y_pred),
        np.max(comp),
        np.max(cont),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        time2 - time1,
    )

In [5]:
# Names of the 50 feature columns, in a fixed order:
#   7 shape descriptors, 4 profile statistics per image axis (x then y),
#   and 7 magnitude families, each expanded over the five bands u, g, r, i, z.
feature = (
    ['Width', 'Height', 'Orientation', 'Eccentricity',
     'Variance_x', 'Variance_y', 'Covariance']
    + ['{}_{}'.format(stat, axis)
       for axis in ('x', 'y')
       for stat in ('Intensity', 'mean', 'sigma', 'RMS')]
    + ['{}_{}'.format(family, band)
       for family in ('psfMag', 'psfMagErr',
                      'cModelMag', 'cModelMagErr',
                      'petroMag', 'petroMagErr',
                      'psf-cM_Mag')
       for band in ('u', 'g', 'r', 'i', 'z')]
)

# Bagging

### Gini

In [6]:
# Timed section: build, fit and evaluate the bagged Gini-criterion trees.
time1_tree_gini_bag = datetime.datetime.now()

# 500 decision trees, each trained on 100 rows drawn with replacement and on
# a bootstrap sample of the feature columns; all available cores are used.
bag_tree_gini = BaggingClassifier(
    DecisionTreeClassifier(criterion="gini", random_state=None),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=None,
    warm_start=True,
    verbose=True,
).fit(X_train, y_train)

# Hard labels, plus the probability reported in the second class column.
y_pred_tree_gini_bag = bag_tree_gini.predict(X_test)
y_probs_tree_gini_bag = bag_tree_gini.predict_proba(X_test)[:, 1]

time2_tree_gini_bag = datetime.datetime.now()
print_score(y_test, y_pred_tree_gini_bag, time1_tree_gini_bag, time2_tree_gini_bag)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    4.9s remaining:   15.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.5s remaining:    1.7s


comfusion matrix = 
[[2780   96]
 [  77 4140]]

accuracy_score =  0.975609756097561

completeness =  0.9817405738676784
contamination =  0.0226628895184136

precision score =  0.9773371104815864
recall score =  0.9817405738676784

F1 score =  0.9795338932923222

time span=  0:00:08.282913


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished


### entropy

In [7]:
# Timed section: build, fit and evaluate the bagged entropy-criterion trees.
time1_tree_ent_bag = datetime.datetime.now()

# Same ensemble configuration as the Gini run; only the split criterion differs.
bag_tree_ent = BaggingClassifier(
    DecisionTreeClassifier(criterion="entropy", random_state=None),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=None,
    warm_start=True,
    verbose=True,
).fit(X_train, y_train)

# Hard labels, plus the probability reported in the second class column.
y_pred_tree_ent_bag = bag_tree_ent.predict(X_test)
y_probs_tree_ent_bag = bag_tree_ent.predict_proba(X_test)[:, 1]

time2_tree_ent_bag = datetime.datetime.now()
print_score(y_test, y_pred_tree_ent_bag, time1_tree_ent_bag, time2_tree_ent_bag)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.1s remaining:    6.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    3.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.7s remaining:    2.4s


comfusion matrix = 
[[2782   94]
 [  77 4140]]

accuracy_score =  0.9758917242351615

completeness =  0.9817405738676784
contamination =  0.022201228153046763

precision score =  0.9777987718469532
recall score =  0.9817405738676784

F1 score =  0.979765708200213

time span=  0:00:05.364962


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished


# Out-of-Bag

### Gini

In [8]:
# Timed section: bagged Gini trees with out-of-bag scoring enabled.
time1_tree_gini_obb = datetime.datetime.now()

# 500 trees on 100-row bootstrap samples with bootstrapped feature columns;
# oob_score=True additionally requests the out-of-bag estimate during fit.
bag_gini_oob = BaggingClassifier(
    DecisionTreeClassifier(criterion="gini", random_state=None),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=None,
    verbose=True,
    oob_score=True,
).fit(X_train, y_train)

# Hard labels, plus the probability reported in the second class column.
y_pred_tree_gini_oob = bag_gini_oob.predict(X_test)
y_probs_tree_gini_oob = bag_gini_oob.predict_proba(X_test)[:, 1]

time2_tree_gini_obb = datetime.datetime.now()
print_score(y_test, y_pred_tree_gini_oob, time1_tree_gini_obb, time2_tree_gini_obb)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.2s remaining:    6.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.6s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    2.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.9s


comfusion matrix = 
[[2787   89]
 [  80 4137]]

accuracy_score =  0.9761736923727619

completeness =  0.9810291676547308
contamination =  0.02106010411736867

precision score =  0.9789398958826313
recall score =  0.9810291676547308

F1 score =  0.9799834182162739

time span=  0:00:21.316368


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished


In [9]:
# Timed section: bagged entropy trees with out-of-bag scoring enabled.
time1_tree_ent_obb = datetime.datetime.now()

# Same configuration as the Gini out-of-bag run; only the split criterion differs.
bag_ent_oob = BaggingClassifier(
    DecisionTreeClassifier(criterion="entropy", random_state=None),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=None,
    verbose=True,
    oob_score=True,
).fit(X_train, y_train)

# Hard labels, plus the probability reported in the second class column.
y_pred_tree_ent_oob = bag_ent_oob.predict(X_test)
y_probs_tree_ent_oob = bag_ent_oob.predict_proba(X_test)[:, 1]

time2_tree_ent_obb = datetime.datetime.now()
print_score(y_test, y_pred_tree_ent_oob, time1_tree_ent_obb, time2_tree_ent_obb)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.5s remaining:    7.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    3.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.7s remaining:    2.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.5s remaining:    1.8s


comfusion matrix = 
[[2781   95]
 [  72 4145]]

accuracy_score =  0.9764556605103624

completeness =  0.9829262508892578
contamination =  0.02240566037735849

precision score =  0.9775943396226415
recall score =  0.9829262508892578

F1 score =  0.9802530448149462

time span=  0:00:22.162783


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished


## Cross-validation

In [10]:
# 10-fold cross-validated accuracy of the bagged entropy trees on the training split.
scores = cross_val_score(
    estimator=bag_tree_ent,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
)

# Raw per-fold values, then their mean rounded to two decimals.
print(scores)
print("Average cross-validation score: {:.2f}".format(scores.mean()))

# Re-wrap the per-fold scores as a one-column frame for rich display.
scores = pd.DataFrame(scores)
display(scores)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.1s remaining:    6.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.8s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    2.2s remaining:    6.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Don

[0.97180451 0.98308271 0.97368421 0.97697368 0.97650376 0.97415414
 0.98214286 0.98167293 0.97601129 0.98071496]
Average cross-validation score: 0.98


Unnamed: 0,0
0,0.971805
1,0.983083
2,0.973684
3,0.976974
4,0.976504
5,0.974154
6,0.982143
7,0.981673
8,0.976011
9,0.980715


### Confusion Matrix

In [12]:
# Confusion matrix of the bagged entropy-tree predictions on the test split.
labels = ["non-star", "star"]
cms = confusion_matrix(y_test, y_pred_tree_ent_bag)

# Overall accuracy = diagonal counts / all counts; the remainder is misclassified.
accuracy = np.trace(cms) / float(np.sum(cms))
misclass = 1 - accuracy

figs, ax = plt.subplots(figsize=(5, 5))
cmap = plt.get_cmap('Blues')
ax.imshow(cms, interpolation='nearest', cmap=cmap)
ax.set_title('Confusion matrix\n')

# One tick per class on both axes.
tick_marks = np.arange(len(labels))
ax.set_xticks(tick_marks)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(labels)
# Explicit y limits: row 0 at the top, half a cell of margin at each end.
ax.set_ylim(len(cms) - 0.5, -0.5)

# Write the count into every cell; switch to white text on the darker cells.
thresh = cms.max() / 2
for row, col in np.ndindex(*cms.shape):
    text_color = "white" if cms[row, col] > thresh else "black"
    ax.text(col, row, "{:,}".format(cms[row, col]),
            horizontalalignment="center", color=text_color)

ax.set_xlabel('Predicted label\n\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
ax.set_ylabel('True')
figs.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### Result

In [13]:
# Metric tuples (see `score`) for the four fitted ensembles, each paired with
# the start/end timestamps recorded around its fit-and-predict cell.
score_tree_gini_bag = score(
    y_test=y_test, y_pred=y_pred_tree_gini_bag,
    time1=time1_tree_gini_bag, time2=time2_tree_gini_bag,
)
score_tree_ent_bag = score(
    y_test=y_test, y_pred=y_pred_tree_ent_bag,
    time1=time1_tree_ent_bag, time2=time2_tree_ent_bag,
)
score_tree_gini_oob = score(
    y_test=y_test, y_pred=y_pred_tree_gini_oob,
    time1=time1_tree_gini_obb, time2=time2_tree_gini_obb,
)
score_tree_ent_oob = score(
    y_test=y_test, y_pred=y_pred_tree_ent_oob,
    time1=time1_tree_ent_obb, time2=time2_tree_ent_obb,
)

In [14]:
# Summary table: one row per model, one column per metric.
# Each score tuple holds the confusion matrix at position 0 (not tabulated),
# followed by the seven metrics at positions 1..7 in the column order below.
m1_t = pd.DataFrame(
    {
        column: [
            result[position]
            for result in (score_tree_gini_bag, score_tree_ent_bag,
                           score_tree_gini_oob, score_tree_ent_oob)
        ]
        for position, column in enumerate(
            ['accuracy', 'completeness', 'contamination',
             'precision', 'recall', 'F1 score', 'time span'],
            start=1,
        )
    },
    index=["DT_gini_bag", "DT_ent_bag", "DT_gini_obb", "DT_ent_obb"],
)
m1_t

Unnamed: 0,accuracy,completeness,contamination,precision,recall,F1 score,time span
DT_gini_bag,0.97561,0.981741,0.022663,0.977337,0.981741,0.979534,00:00:08.282913
DT_ent_bag,0.975892,0.981741,0.022201,0.977799,0.981741,0.979766,00:00:05.364962
DT_gini_obb,0.976174,0.981029,0.02106,0.97894,0.981029,0.979983,00:00:21.316368
DT_ent_obb,0.976456,0.982926,0.022406,0.977594,0.982926,0.980253,00:00:22.162783


In [24]:
#------------------------------------------------------------
# ROC curves (left panel, ax1) and completeness vs. efficiency
# (right panel, ax2) for every model listed in the summary table.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15, top=0.9, wspace=0.25)

# Probability cut-offs 0.000, 0.001, ..., 0.999 used for the right panel.
thresholds = np.linspace(0, 1, 1000, endpoint=False)

names_model = list(m1_t.index)
probs_model = [y_probs_tree_gini_bag, y_probs_tree_ent_bag,
               y_probs_tree_gini_oob, y_probs_tree_ent_oob]
# One evenly spaced colour per model.
colors = cm.gist_rainbow(np.linspace(0, 1, len(names_model)))

for name, y_prob, c in zip(names_model, probs_model, colors):
    # Left panel: ROC curve with an extra (0, 0) point prepended.
    fpr, tpr, roc_thresholds = roc_curve(y_test, y_prob)
    ax1.plot(np.r_[0, fpr], np.r_[0, tpr], label=name, color=c)

    # Right panel: threshold the probabilities at every cut-off and record
    # the resulting completeness and contamination.
    comp = np.zeros_like(thresholds)
    cont = np.zeros_like(thresholds)
    for idx, cutoff in enumerate(thresholds):
        comp[idx], cont[idx] = completeness_contamination(y_prob >= cutoff, y_test)
    # Efficiency is one minus the contamination.
    ax2.plot(1 - cont, comp, label=name, color=c)

# Left panel cosmetics: zoom on the low false-positive region, then add the
# chance diagonal (drawn after the limits are fixed so it cannot rescale them).
ax1.set_xlim(-0.005, 0.04)
ax1.set_ylim(0, 1.02)
ax1.xaxis.set_major_locator(plt.MaxNLocator(5))
ax1.plot([0, 1], [0, 1], 'k--')
ax1.set(xlabel='false positive rate', ylabel='true positive rate')
ax1.legend(loc='lower right')

# Right panel cosmetics.
ax2.set(xlabel='efficiency', ylabel='completeness')
ax2.set_xlim(0.7, 1.0)
ax2.set_ylim(0.7, 1.02)

plt.show()

<IPython.core.display.Javascript object>