# Isolation Forest

In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import shap

import joblib

from sklearn.ensemble import IsolationForest

In [2]:
# Column names that are deliberately absent from FEATURES below.
# (Not referenced by any other cell visible in this notebook.)
NO_FEATURES = ['id', 'tile', 'cnt', 'ra_k', 'dec_k']

# Ordered list of the columns selected from every loaded table before it is
# handed to the model. The order fixes the column order of the resulting
# arrays, so entries must not be rearranged.
FEATURES = [
    # --- leading entries ---
    'Amplitude', 'Autocor_length', 'Beyond1Std', 'Con', 'Eta_e',

    # --- FluxPercentileRatioMid* ---
    'FluxPercentileRatioMid20', 'FluxPercentileRatioMid35',
    'FluxPercentileRatioMid50', 'FluxPercentileRatioMid65',
    'FluxPercentileRatioMid80',

    # --- Freq1 harmonics: amplitudes 0-3, then relative phases 1-3 ---
    'Freq1_harmonics_amplitude_0', 'Freq1_harmonics_amplitude_1',
    'Freq1_harmonics_amplitude_2', 'Freq1_harmonics_amplitude_3',
    'Freq1_harmonics_rel_phase_1', 'Freq1_harmonics_rel_phase_2',
    'Freq1_harmonics_rel_phase_3',

    # --- Freq2 harmonics: same layout ---
    'Freq2_harmonics_amplitude_0', 'Freq2_harmonics_amplitude_1',
    'Freq2_harmonics_amplitude_2', 'Freq2_harmonics_amplitude_3',
    'Freq2_harmonics_rel_phase_1', 'Freq2_harmonics_rel_phase_2',
    'Freq2_harmonics_rel_phase_3',

    # --- Freq3 harmonics: same layout ---
    'Freq3_harmonics_amplitude_0', 'Freq3_harmonics_amplitude_1',
    'Freq3_harmonics_amplitude_2', 'Freq3_harmonics_amplitude_3',
    'Freq3_harmonics_rel_phase_1', 'Freq3_harmonics_rel_phase_2',
    'Freq3_harmonics_rel_phase_3',

    # --- remaining single-valued entries ---
    'Gskew', 'LinearTrend', 'MaxSlope', 'Mean', 'MedianAbsDev',
    'MedianBRP', 'PairSlopeTrend', 'PercentAmplitude',
    'PercentDifferenceFluxPercentile', 'PeriodLS', 'Period_fit',
    'Psi_CS', 'Psi_eta', 'Q31', 'Rcs', 'Skew', 'SmallKurtosis', 'Std',

    # --- c89_* columns ---
    'c89_c3', 'c89_hk_color', 'c89_jh_color', 'c89_jk_color',
    'c89_m2', 'c89_m4',

    # --- n09_* columns ---
    'n09_c3', 'n09_hk_color', 'n09_jh_color', 'n09_jk_color',
    'n09_m2', 'n09_m4',

    # --- trailing entry ---
    'ppmb',
]

# Twice the number of selected features.
# (Not referenced by any other cell visible in this notebook.)
MIN_SAMPLES = 2 * len(FEATURES)

In [3]:
%%time

# Key -> on-disk archive for each dataset used in this notebook.
# (Keys are presumably tile identifiers -- confirm against the data source.)
tile_archives = {
    "b216": "_data/blz_b216_3849_scaled.pkl.bz2",
    "b277": "_data/blz_b277_3041_scaled.pkl.bz2",
}

# NOTE: joblib.load unpickles the file contents, so these archives must come
# from a trusted source.
datas = {
    tile: joblib.load(archive)
    for tile, archive in tile_archives.items()
}

CPU times: user 1min 15s, sys: 466 ms, total: 1min 16s
Wall time: 1min 16s


In [4]:
%%time
# Hyper-parameters shared by every forest; the fixed random_state keeps
# repeated runs comparable.
forest_params = dict(contamination='auto', n_jobs=-1, random_state=42)

trees = {}    # key -> IsolationForest instance fitted on that dataset
results = {}  # key -> array returned by fit_predict for that dataset

for k in datas:
    # Timestamped progress line, one per dataset.
    print(f"[{dt.datetime.now()}] {k}")

    forest = IsolationForest(**forest_params)

    # Only the FEATURES columns are used, as a plain array.
    feature_matrix = datas[k][FEATURES].values

    # The estimator is stored before fitting; fit_predict then fits that same
    # object in place, so trees[k] ends up holding the fitted forest.
    trees[k] = forest
    results[k] = forest.fit_predict(feature_matrix)
    

[2020-03-05 15:56:53.306277] b216
[2020-03-05 15:57:27.080018] b277
CPU times: user 3min 7s, sys: 54 s, total: 4min 2s
Wall time: 1min 46s


In [5]:
%%time

# key -> SHAP values computed for that dataset's fitted forest.
shaps = {}

for k in datas:
    # Timestamped progress line; this loop is the slowest step of the notebook.
    print(f"[{dt.datetime.now()}] {k}")

    # Build the explainer from the forest fitted earlier for this key ...
    explainer = shap.TreeExplainer(trees[k])

    # ... and explain the same FEATURES columns the forest was fitted on.
    feature_matrix = datas[k][FEATURES].values
    shaps[k] = explainer.shap_values(feature_matrix)

[2020-03-05 15:58:40.306763] b216


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
The sklearn.ensemble.iforest module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.ensemble. Anything that cannot be imported from sklearn.ensemble is now part of the private API.


[2020-03-05 16:11:56.470736] b277


Setting feature_perturbation = "tree_path_dependent" because no background data was given.


CPU times: user 39min 34s, sys: 1.6 s, total: 39min 35s
Wall time: 39min 32s


In [6]:
%%time
# Single compressed archive bundling the fitted forests, their predictions
# and the SHAP values computed above.
results_path = "results/00_iforest/results.pkl.bz2"

# joblib.dump does not create missing parent directories. Create the target
# folder up front so a fresh checkout does not fail with FileNotFoundError
# after the long-running cells above have already finished.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

joblib.dump({
    "trees": trees,
    "results": results,
    "shaps": shaps
}, results_path, compress=3)

CPU times: user 1min 20s, sys: 526 ms, total: 1min 20s
Wall time: 1min 22s


In [7]:
# Display the wall-clock time at which this (final) cell was executed.
dt.datetime.now()

datetime.datetime(2020, 3, 5, 16, 39, 35, 66367)