# Isolation Forest

In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import shap

import joblib

from sklearn.ensemble import IsolationForest

In [2]:
# Column names that are deliberately kept out of FEATURES.
# NOTE(review): presumably identifier / bookkeeping columns of the loaded
# tables; not referenced by any cell visible in this notebook — confirm.
NO_FEATURES = ['id', 'tile', 'cnt', 'ra_k', 'dec_k']

# Ordered list of the columns fed to the model. The order fixes the column
# order of every feature matrix built with `table[FEATURES]`, so it must not
# be shuffled. It has to stay a list: it is used as a column selector.
FEATURES = [
    'Amplitude', 'Autocor_length', 'Beyond1Std', 'Con', 'Eta_e',
    # FluxPercentileRatioMid*
    'FluxPercentileRatioMid20', 'FluxPercentileRatioMid35',
    'FluxPercentileRatioMid50', 'FluxPercentileRatioMid65',
    'FluxPercentileRatioMid80',
    # Freq1_harmonics_*
    'Freq1_harmonics_amplitude_0', 'Freq1_harmonics_amplitude_1',
    'Freq1_harmonics_amplitude_2', 'Freq1_harmonics_amplitude_3',
    'Freq1_harmonics_rel_phase_1', 'Freq1_harmonics_rel_phase_2',
    'Freq1_harmonics_rel_phase_3',
    # Freq2_harmonics_*
    'Freq2_harmonics_amplitude_0', 'Freq2_harmonics_amplitude_1',
    'Freq2_harmonics_amplitude_2', 'Freq2_harmonics_amplitude_3',
    'Freq2_harmonics_rel_phase_1', 'Freq2_harmonics_rel_phase_2',
    'Freq2_harmonics_rel_phase_3',
    # Freq3_harmonics_*
    'Freq3_harmonics_amplitude_0', 'Freq3_harmonics_amplitude_1',
    'Freq3_harmonics_amplitude_2', 'Freq3_harmonics_amplitude_3',
    'Freq3_harmonics_rel_phase_1', 'Freq3_harmonics_rel_phase_2',
    'Freq3_harmonics_rel_phase_3',
    # remaining un-prefixed columns
    'Gskew', 'LinearTrend', 'MaxSlope', 'Mean', 'MedianAbsDev', 'MedianBRP',
    'PairSlopeTrend', 'PercentAmplitude', 'PercentDifferenceFluxPercentile',
    'PeriodLS', 'Period_fit', 'Psi_CS', 'Psi_eta', 'Q31', 'Rcs', 'Skew',
    'SmallKurtosis', 'Std',
    # c89_*
    'c89_c3', 'c89_hk_color', 'c89_jh_color', 'c89_jk_color', 'c89_m2',
    'c89_m4',
    # n09_*
    'n09_c3', 'n09_hk_color', 'n09_jh_color', 'n09_jk_color', 'n09_m2',
    'n09_m4',
    'ppmb',
]

# Twice the number of feature columns.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
MIN_SAMPLES = 2 * len(FEATURES)

In [3]:
%%time

datas = {
    "b216": joblib.load("_data/blz_b216_3849_scaled.pkl.bz2"),
    "b277": joblib.load("_data/blz_b277_3041_scaled.pkl.bz2")
}

CPU times: user 1min 13s, sys: 476 ms, total: 1min 13s
Wall time: 1min 13s


In [None]:

# Fit one IsolationForest per dataset.
#   trees[k]   -> the fitted estimator for dataset k
#   results[k] -> the per-row labels returned by fit_predict
#                 (scikit-learn convention: +1 inlier, -1 outlier)
# Both dicts are rebuilt from scratch, so re-running the cell is idempotent.
results, trees = {}, {}
for k, v in datas.items():
    # Timestamped progress line so the duration per dataset can be read off.
    print(f"[{dt.datetime.now()}] {k}")

    clf = IsolationForest(
        contamination='auto',
        n_jobs=-1,
        random_state=42)  # fixed seed so a re-run rebuilds the same forest

    # Feature matrix with columns in FEATURES order.
    X = v[FEATURES].values

    results[k] = clf.fit_predict(X)
    # Register the estimator only after the fit succeeded, so that `trees`
    # never exposes an unfitted model to later cells if fitting raises.
    trees[k] = clf

In [7]:
%%time

shaps = {}
for k, v in datas.items():
    print(f"[{dt.datetime.now()}] {k}")
    clf = trees[k]
    shaps[k] = shap.TreeExplainer(clf).shap_values(v[FEATURES].values)

[2020-02-27 15:06:06.861022] b216


Setting feature_perturbation = "tree_path_dependent" because no background data was given.


[2020-02-27 15:19:22.936169] b277


Setting feature_perturbation = "tree_path_dependent" because no background data was given.


CPU times: user 39min 32s, sys: 1.54 s, total: 39min 33s
Wall time: 39min 30s


In [9]:
%%time
joblib.dump({
    "trees": trees,
    "results": results,
    "shaps": shaps
}, "results/results.pkl.bz2", compress=3)

CPU times: user 1min 17s, sys: 492 ms, total: 1min 17s
Wall time: 1min 17s


In [None]:
# Show the current timestamp as this cell's output.
# NOTE(review): presumably kept as a marker of when the run finished — confirm.
dt.datetime.now()