In [28]:
import sys
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tsfresh.feature_extraction import extract_features
from multiprocessing import Pool
tqdm.pandas(desc="apply progress")

In [2]:
data_dir = "/Users/hidehisa/.kaggle/competitions/plasticc"
train = pd.read_csv(data_dir + "/train_with_cluster.csv")
meta = pd.read_csv(data_dir + "/training_set_metadata.csv")

In [3]:
nova = [15, 42, 52, 62, 67, 90]
novaes = meta.query("target == @nova")
train["novae"] = 0

In [4]:
ind = train.query("object_id in @novaes.object_id").index
train.loc[ind, "novae"] = 1

In [6]:
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,cluster,novae
0,615,59750.4229,2,-544.810303,3.622952,1,2,0
1,615,59750.4306,1,-816.434326,5.55337,1,2,0
2,615,59750.4383,3,-471.385529,3.801213,1,2,0
3,615,59750.445,4,-388.984985,11.395031,1,2,0
4,615,59752.407,2,-681.858887,4.041204,1,2,0


In [13]:
train.groupby("object_id")["novae"].mean().head()

object_id
615     0
713     0
730     1
745     1
1124    1
Name: novae, dtype: int64

In [7]:
def basic(d):
    df = d.copy()
    df["flux_ratio_sq"] = np.power(df["flux"] / df["flux_err"], 2)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]

    aggs = {
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew'],
    }
    agg_df = df.groupby('object_id').agg(aggs)
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df[
        'flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (
        agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']

    per_passband_aggs = {
        "flux": ["min", "max", "mean", "std"],
        "flux_ratio_sq": ["sum", "skew"],
        "flux_by_flux_ratio_sq": ["sum", "skew"]
    }
    per_pass_agg_df = df.groupby(["object_id", "passband"]).agg(per_passband_aggs)
    per_pass_agg_df.columns = pd.Index([e[0] + "_" + e[1] for e in per_pass_agg_df.columns])
    per_pass_agg_df["flux_diff"] = per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]
    per_pass_agg_df["flux_diff2"] = (
        per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]) / per_pass_agg_df["flux_mean"]
    per_pass_agg_df["flux_w_mean"] = per_pass_agg_df["flux_by_flux_ratio_sq_sum"] / per_pass_agg_df[
        "flux_ratio_sq_sum"
    ]
    per_pass_agg_df["flux_dif3"] = (
    per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]) / per_pass_agg_df["flux_w_mean"]
    per_pass_agg_df = per_pass_agg_df.unstack()
    per_pass_agg_df.columns = pd.Index([str(e[1]) + "__" + e[0] for e in per_pass_agg_df.columns])
    
    basic_columns = [f"{i}__{j}" for i in range(6) for j in [
        "flux_min",
        "flux_max",
        "flux_mean",
        "flux_std",
        "flux_ratio_sq_sum",
        "flux_ratio_sq_skew",
        "flux_w_mean",
        "flux_diff2"
    ]]
    per_pass_agg_df.drop(basic_columns, axis=1, inplace=True)
    
    agg_df = pd.merge(agg_df, per_pass_agg_df, how="left", on="object_id")
    
    agg_flux_diff = agg_df.reset_index()[["object_id", "flux_diff"]]
    df2 = pd.merge(df, agg_df, how="left", on="object_id")
    df2["flux_norm"] = df2.flux / df2.flux_diff
    del df2["flux"]
    fcp = {
        'fft_coefficient': [{
            'coeff': 0,
            'attr': 'abs'
        }, {
            'coeff': 1,
            'attr': 'abs'
        }],
        'kurtosis':
        None,
        'skewness':
        None,
        "cid_ce": [{"normalize": True}]
    }
    fcp2 = {
        "fft_coefficient": [{
            "coeff": 0,
            "attr": "abs"
        }, {
            "coeff": 1,
            "attr": "abs"
        }],
        "abs_energy": None,
        "sample_entropy": None
    }
    fcp_flux = {
        "longest_strike_above_mean": None,
        "longest_strike_below_mean": None,
        "mean_change": None,
        "mean_abs_change": None,
        "cid_ce": [{"normalize": True}]
    }
    fcp_flux_by_flux_ratio_sq = {
        "longest_strike_above_mean": None,
        "longest_strike_below_mean": None
    }
    agg_df_ts = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp,
        n_jobs=6)
    agg_df_ts2 = extract_features(
        df2,
        column_id="object_id",
        column_sort="mjd",
        column_kind="passband",
        column_value="flux_norm",
        default_fc_parameters=fcp2,
        n_jobs=4
    )
    agg_df_flux = extract_features(
        df,
        column_id="object_id",
        column_value="flux",
        default_fc_parameters=fcp_flux,
        n_jobs=4
    )
    agg_df_ffrs = extract_features(
        df,
        column_id="object_id",
        column_value="flux_by_flux_ratio_sq",
        default_fc_parameters=fcp_flux_by_flux_ratio_sq,
        n_jobs=4
    )
    df_det = df[df['detected'] == 1].copy()

    agg_df_mjd = extract_features(
        df_det,
        column_id='object_id',
        column_value='mjd',
        default_fc_parameters={
            'maximum': None,
            'minimum': None
        },
        n_jobs=8)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd[
        'mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    agg_df_ts2.columns = pd.Index([e + "_norm" for e in agg_df_ts2.columns])
    agg_df_ts = pd.merge(agg_df_ts, agg_df_mjd, on='id')
    agg_df_ts = pd.merge(agg_df_ts, agg_df_ts2, on="id")
    agg_df_ts = pd.merge(agg_df_ts, agg_df_flux, on="id")
    agg_df_ts = pd.merge(agg_df_ts, agg_df_ffrs, on="id")
    # tsfresh returns a dataframe with an index name='id'
    agg_df_ts.index.rename('object_id', inplace=True)
    agg_df = pd.merge(agg_df, agg_df_ts, on='object_id')
    return agg_df


def cluster_mean_diff(df):
    new_df = df.groupby(["object_id", "cluster"]).agg({
        "flux": ["mean", "max", "min"]
    })
    new_df.columns = pd.Index(
        [e[0] + "_" + e[1] for e in new_df.columns.tolist()])
    new_df["normalized_mean"] = new_df["flux_mean"] / (
        new_df["flux_max"] - new_df["flux_min"])
    new_df.reset_index(inplace=True)
    return new_df.groupby("object_id").agg({"normalized_mean": "std"})


def passband_std_difference(df):
    std_df = df.groupby(["object_id", "cluster", "passband"]).agg({
        "flux": "std"
    }).reset_index().groupby(["object_id",
                              "passband"])["flux"].mean().reset_index()
    std_df_max = std_df.groupby("object_id")["flux"].max()
    std_df_min = std_df.groupby("object_id")["flux"].min()
    return (std_df_max / std_df_min).reset_index()


def num_outliers(df):
    new_df = df.groupby("object_id").agg({"flux": ["mean", "std"]})
    new_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_df.columns])
    new_df["upper_sigma"] = new_df["flux_mean"] + new_df["flux_std"]
    new_df["upper_2sigma"] = new_df["flux_mean"] + 2 * new_df["flux_std"]
    new_df["lower_sigma"] = new_df["flux_mean"] - new_df["flux_std"]
    new_df["lower_2sigma"] = new_df["flux_mean"] - 2 * new_df["flux_std"]
    new_df.drop(["flux_mean", "flux_std"], axis=1, inplace=True)
    new_df = pd.merge(df, new_df, how="left", on="object_id")
    new_df["outside_sigma"] = (
        (new_df["flux"] > new_df["upper_sigma"]) |
        (new_df["flux"] < new_df["lower_sigma"])).astype(int)
    new_df["outside_2sigma"] = (
        (new_df["flux"] > new_df["upper_2sigma"]) |
        (new_df["flux"] < new_df["lower_2sigma"])).astype(int)

    return_df = new_df.groupby("object_id").agg({
        "outside_sigma": "sum",
        "outside_2sigma": "sum"
    })
    return_df.reset_index(inplace=True)
    return return_df

In [8]:
def haversine_plus(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees) from 
    #https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
    """
    #Convert decimal degrees to Radians:
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    #Implementing Haversine Formula: 
    dlon = np.subtract(lon2, lon1)
    dlat = np.subtract(lat2, lat1)

    a = np.add(np.power(np.sin(np.divide(dlat, 2)), 2),  
                          np.multiply(np.cos(lat1), 
                                      np.multiply(np.cos(lat2), 
                                                  np.power(np.sin(np.divide(dlon, 2)), 2))))
    
    haversine = np.multiply(2, np.arcsin(np.sqrt(a)))
    return {
        'haversine': haversine, 
        'latlon1': np.subtract(np.multiply(lon1, lat1), np.multiply(lon2, lat2)), 
   }


def process_meta(meta_df):
    meta_dict = dict()
    # distance
    meta_dict.update(haversine_plus(meta_df['ra'].values, meta_df['decl'].values, 
                   meta_df['gal_l'].values, meta_df['gal_b'].values))
    #
    meta_dict['hostgal_photoz_certain'] = np.multiply(
            meta_df['hostgal_photoz'].values, 
             np.exp(meta_df['hostgal_photoz_err'].values))
    
    meta_df = pd.concat([meta_df, pd.DataFrame(meta_dict, index=meta_df.index)], axis=1)
    return meta_df


def add_rank_bottom_and_top(df, feature_name):
    objid = ["object_id"]
    columns = [f"{i}{feature_name}" for i in range(6)]
    partial = df[objid+columns]
    partial_values = partial.melt(id_vars=objid, value_vars=columns).sort_values(["object_id", "value"])
    
    top_and_bottom = partial_values.groupby("object_id").agg({
        "variable": ["first", "last"]
    })
    top_and_bottom.columns = ["top"+feature_name, "bottom"+feature_name]
    for i, n in zip(["0", "1", "2", "3", "4", "5"], columns):
        top_and_bottom = top_and_bottom.replace(n, i)
    top_and_bottom = top_and_bottom.astype(int)
    return top_and_bottom


def rank(df, feature_name, thres=20):
    objid = ["object_id"]
    columns = [f"{i}{feature_name}" for i in range(6)]
    partial = df[objid+columns]
    partial_values = partial.melt(id_vars=objid, value_vars=columns).sort_values(["object_id", "value"])
    for i, n in zip(["0", "1", "2", "3", "4", "5"], columns):
        partial_values.replace(n, i, inplace=True)
    partial_values["dummy"] = 1
    rank_feats = partial_values.groupby(["object_id", "dummy"]).agg({
        "variable": "sum"
    })
    d = dict()
    cnt = 0
    for i in rank_feats["variable"]:
        if i not in d.keys():
            d[i] = cnt
            cnt += 1
    rank_feats.reset_index(inplace=True)
    rank_feats.drop("dummy", axis=1, inplace=True)
    rank_feats.rename(columns={"variable": f"rank{feature_name}"}, inplace=True)
    rank_feats[f"rank{feature_name}"].replace(d, inplace=True)
    rank_dict = (rank_feats[f"rank{feature_name}"].value_counts() > thres).to_dict()
    rank_feats[f"rank{feature_name}"] = rank_feats[f"rank{feature_name}"].map(
        lambda x: x if rank_dict[x] else cnt+1
    )
    
    return rank_feats


def add_by_features(df, feature_name, new_feat_name):
    for i in range(5):
        for j in range(1, 6):
            if j > i:
                df[f"{new_feat_name}{j}_by_{i}"] = df[f"{j}{feature_name}"] / df[f"{i}{feature_name}"]
    return df


In [9]:
def add_per_passband(d):
    df = d.copy()
    df["flux_ratio_sq"] = np.power(df["flux"] / df["flux_err"], 2)
    df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]
    per_passband_aggs = {
        "flux": ["min", "max", "mean", "std"],
        "flux_ratio_sq": ["sum", "skew"],
        "flux_by_flux_ratio_sq": ["sum", "skew"]
    }
    per_pass_agg_df = df.groupby(["object_id", "passband"]).agg(per_passband_aggs)
    per_pass_agg_df.columns = pd.Index([e[0] + "_" + e[1] for e in per_pass_agg_df.columns])
    per_pass_agg_df["flux_diff"] = per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]
    per_pass_agg_df["flux_diff2"] = (
        per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]) / per_pass_agg_df["flux_mean"]
    per_pass_agg_df["flux_w_mean"] = per_pass_agg_df["flux_by_flux_ratio_sq_sum"] / per_pass_agg_df[
        "flux_ratio_sq_sum"
    ]
    per_pass_agg_df["flux_dif3"] = (
    per_pass_agg_df["flux_max"] - per_pass_agg_df["flux_min"]) / per_pass_agg_df["flux_w_mean"]
    per_pass_agg_df = per_pass_agg_df.unstack()
    per_pass_agg_df.columns = pd.Index([str(e[1]) + "__" + e[0] for e in per_pass_agg_df.columns])
    basic_columns = [f"{i}__{j}" for i in range(6) for j in [
        "flux_min",
        "flux_max",
        "flux_mean",
        "flux_std",
        "flux_ratio_sq_sum",
        "flux_ratio_sq_skew",
        "flux_w_mean",
        "flux_diff2"
    ]]
    per_pass_agg_df.drop(basic_columns, axis=1, inplace=True)
    return per_pass_agg_df

In [19]:
def get_full(df, meta):
    agg_basic = basic(df)
    cl_mean_diff = cluster_mean_diff(df)
    ps_std_diff = passband_std_difference(df)
    num_out = num_outliers(df)

    full = pd.merge(agg_basic, cl_mean_diff, how="left", on="object_id")
    full = pd.merge(full, ps_std_diff, how="left", on="object_id")
    full = pd.merge(agg_basic, num_out, how="left", on="object_id")
    meta = process_meta(meta)
    full = pd.merge(full, meta, how="left", on="object_id")
    full = add_by_features(full, "__fft_coefficient__coeff_0__attr_\"abs\"_norm", "flux_norm_fft_")
    full = add_by_features(full, "__abs_energy_norm", "abs_energy_")
    full = add_by_features(full, "__flux_diff", "flux_diff_")
    abs_en_rank = rank(full, "__abs_energy_norm", 0)
    flux_dif_rank = rank(full, "__flux_diff")
    
    flux_diff = add_rank_bottom_and_top(full, "__flux_diff")
    flux_dif3 = add_rank_bottom_and_top(full, "__flux_dif3")
    full = pd.merge(full, abs_en_rank, how="left", on="object_id")
    full = pd.merge(full, flux_dif_rank, how="left", on="object_id")
    full = pd.merge(full, flux_diff, how="left", on="object_id")
    full = pd.merge(full, flux_dif3, how="left", on="object_id")
    if "target" in full.columns:
        full.drop("target", axis=1, inplace=True)
    return full


def train_data(df, meta):
    full = get_full(df, meta)
    y = meta.target
    classes = sorted(y.unique())
    class_weight = {c: 1 for c in classes}

    for c in [64, 15]:
        class_weight[c] = 2
    oof_df = full[["object_id"]]
    del full['object_id'], full['distmod'], full['hostgal_specz']
    del full['ra'], full['decl'], full['gal_l'], full['gal_b'], full['ddf']
    return full, y, classes, class_weight, oof_df


def train_data_n(df, meta):
    full = get_full(df, meta)
    y = df.groupby("object_id").novae.mean()
    del full["object_id"], full["distmod"], full["hostgal_specz"]
    del full["ra"], full["decl"], full["gal_l"], full["gal_b"], full["ddf"]
    return full, y

In [20]:
%%time
full, y = train_data_n(train, meta)

Feature Extraction: 100%|██████████| 30/30 [00:07<00:00,  5.42it/s]
Feature Extraction: 100%|██████████| 20/20 [00:24<00:00,  1.66it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.26it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 14.73it/s]
Feature Extraction: 100%|██████████| 40/40 [00:00<00:00, 60.00it/s]


CPU times: user 59.2 s, sys: 5.35 s, total: 1min 4s
Wall time: 1min 15s


In [21]:
train_mean = full.mean(axis=0)
full.fillna(0, inplace=True)
full.head()

Unnamed: 0,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,flux_err_min,flux_err_max,flux_err_mean,flux_err_median,...,flux_diff_5_by_2,flux_diff_4_by_3,flux_diff_5_by_3,flux_diff_5_by_4,rank__abs_energy_norm,rank__flux_diff,top__flux_diff,bottom__flux_diff,top__flux_dif3,bottom__flux_dif3
0,-1100.440063,660.626343,-123.096998,-89.477524,394.109851,-0.34954,2.13051,12.845472,4.482743,3.835268,...,0.619088,0.82359,0.820379,0.996101,0,0,0,1,0,1
1,-14.735178,14.770886,-1.423351,-0.873033,6.471144,0.014989,0.639458,9.115748,2.35962,1.998217,...,1.407103,0.932131,1.221587,1.310531,1,548,2,0,5,1
2,-19.159811,47.310059,2.267434,0.409172,8.022239,3.177854,0.695106,11.281384,2.471061,1.990851,...,2.787751,1.204789,1.70401,1.414364,2,2,1,5,4,0
3,-15.494463,220.795212,8.909206,1.035895,27.558208,4.979826,0.56717,55.892746,2.555576,1.819875,...,0.680688,0.956449,0.728946,0.762138,3,3,0,2,1,5
4,-16.543753,143.600189,7.145702,1.141288,20.051722,4.406298,0.695277,11.38369,2.753004,2.214854,...,1.103552,1.122882,0.841532,0.749439,4,4,0,4,3,0


In [22]:
y.head()

object_id
615     0
713     0
730     1
745     1
1124    1
Name: novae, dtype: int64

In [47]:
def model_(full, seed=7):
    folds = StratifiedKFold(n_splits=7, shuffle=True, random_state=seed)
    clfs = []
    importances = pd.DataFrame()

    lgb_params = {
        'device': 'cpu', 
        'objective': 'binary',  
        'boosting_type': 'gbdt', 
        'n_jobs': -1, 
        'max_depth': 7, 
        'n_estimators': 500, 
        'subsample_freq': 2, 
        'subsample_for_bin': 5000, 
        'min_data_per_group': 100, 
        'max_cat_to_onehot': 4, 
        'cat_l2': 1.0, 
        'cat_smooth': 59.5, 
        'max_cat_threshold': 32, 
        'metric_freq': 10, 
        'verbosity': -1, 
        'metric': 'binary', 
        'xgboost_dart_mode': False, 
        'uniform_drop': False, 
        'colsample_bytree': 0.5, 
        'drop_rate': 0.173, 
        'learning_rate': 0.0267, 
        'max_drop': 5, 
        'min_child_samples': 10, 
        'min_child_weight': 100.0, 
        'min_split_gain': 0.1, 
        'num_leaves': 7, 
        'reg_alpha': 0.1, 
        'reg_lambda': 0.00023, 
        'skip_drop': 0.44, 
        'subsample': 0.75
    }

    # Compute weights
    w = y.value_counts()
    weights = {i: np.sum(w) / w[i] for i in w.index}
    oof_preds = np.zeros((len(full), np.unique(y).shape[0]))

    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            verbose=0,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights),
            categorical_feature=[
                "top__flux_diff",
                "bottom__flux_diff",
                "top__flux_dif3",
                "bottom__flux_dif3",
                "rank__abs_energy_norm",
                "rank__flux_diff"
            ]
        )
        oof_preds[val_, :] = clf.predict_proba(
            val_x, num_iteration=clf.best_iteration_)
        print(f1_score(val_y, np.argmax(oof_preds[val_, :], axis=1)))

        imp_df = pd.DataFrame()
        imp_df['feature'] = full.columns
        imp_df['gain'] = clf.feature_importances_
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

        clfs.append(clf)

    print('MULTI WEIGHTED LOG LOSS : %.5f ' % f1_score(
        y, np.argmax(oof_preds, axis=1)))
    return clfs, importances, oof_preds

In [48]:
def seed_av_feature_selection(features, seeds=[1,3,5,7,9]):
    clfs_ = []
    importances_ = []
    oofs = []
    for s in seeds:
        clfs, imp, oof = model_(features, s)
        clfs_.append(clfs)
        importances_.append(imp)
        oofs.append(oof)
    return clfs_, importances_, oofs

In [49]:
clfs_, importances_, oofs = seed_av_feature_selection(full, [7])

New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.9856938483547926


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.9857346647646219


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.9765124555160143


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.9801136363636365


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.977904490377762


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.984240687679083


New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
New categorical_feature is ['bottom__flux_dif3', 'bottom__flux_diff', 'rank__abs_energy_norm', 'rank__flux_diff', 'top__flux_dif3', 'top__flux_diff']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.9801699716713882
MULTI WEIGHTED LOG LOSS : 0.98147 


In [67]:
np.argwhere(np.logical_and(0.2 < oofs[0][:, 0], 0.8 > oofs[0][:, 0] )).shape

(488, 1)

In [65]:
oofs[0][7, :]

array([0.3028903, 0.6971097])