In [1]:
import sys
import os

# Make the parent directory importable.  The membership test keeps sys.path
# from accumulating duplicate entries when this cell is executed again.
if "../" not in sys.path:
    sys.path.append("../")

# Move the working directory to the parent of the directory the kernel started
# in, so the relative paths used further down ("configs/...", "./data/...")
# resolve.  The starting directory is remembered on the first execution: a bare
# os.chdir('../') climbs one more level every time the cell is re-run, which
# silently invalidates every relative path in the notebook.
_NOTEBOOK_START_DIR = globals().get("_NOTEBOOK_START_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

# Imported after the directory change, preserving the original statement order.
import matplotlib.pyplot as plt

In [2]:
import json
import argparse
import numpy as np
import pandas as pd
import copy
import gc
from collections import Counter, OrderedDict
from operator import itemgetter

from src.data.load_dataset import load_dataset

from src.utils.logger_functions import get_module_logger
from src.utils.json_dump import save_json
from src.utils.get_conf_mat import get_conf_mat

from src.features.drop_features import drop_features
from src.features.select_features import select_features
from src.features.base import load_features

from src.models.lightgbm import LightGBM, multi_weighted_logloss
from src.models.get_folds import get_StratifiedKFold

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/home/hakubishin3/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/hakubishin3/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/hakubishin3/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/hakubishin3/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/hakubishin3/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/hakubishin3/anaconda3/lib/p

In [3]:
# Read the experiment configuration.  The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until the
# garbage collector reclaims it.
with open("configs/lightgbm_35.json") as config_file:
    config = json.load(config_file)
logger = get_module_logger(__name__)
debug = False

# load dataset
# read only metadata.
logger.info('load dataset.')
train_meta, test_meta = load_dataset(config, 'meta', debug)
logger.debug(f'train_meta: {train_meta.shape}, test_meta: {test_meta.shape}')

# load features
logger.info('load features')
x_train, x_test = load_features(config, debug)
logger.debug(f'number of features: {x_train.shape[1]}')

2018-11-29 16:34:47,556 [__main__] [INFO] load dataset.
2018-11-29 16:34:52,470 [__main__] [DEBUG] train_meta: (7848, 12), test_meta: (3492890, 11)
2018-11-29 16:34:52,471 [__main__] [INFO] load features
  return feather.read_dataframe(path, nthreads=nthreads)
2018-11-29 16:38:14,503 [__main__] [DEBUG] number of features: 872


In [4]:
# Class labels and the weight attached to each label.
classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
class_weights = {
    6: 1.00104,
    15: 2.00189,
    16: 1.00104,
    42: 1.00104,
    52: 1.00104,
    53: 1.00000,
    62: 1.00104,
    64: 2.00710,
    65: 1.00104,
    67: 1.00104,
    88: 1.00104,
    90: 1.00104,
    92: 1.00104,
    95: 1.00104,
}

# Galactic vs. extra-galactic label groups, each with its own weight table.
# Both groups are listed in the same order as class_weights, so looking the
# labels up group-by-group yields the same dicts (same key order) as filtering
# class_weights by membership.
gal_classes = [6, 16, 53, 65, 92]
exgal_classes = [15, 42, 52, 62, 64, 67, 88, 90, 95]
gal_class_weights = {label: class_weights[label] for label in gal_classes}
exgal_class_weights = {label: class_weights[label] for label in exgal_classes}

In [5]:
# Split the training features into galactic / extra-galactic rows, using
# hostgal_photoz == 0 in the metadata as the galactic criterion.
train_gal_index = train_meta.query("hostgal_photoz == 0").index
train_exgal_index = train_meta.query("hostgal_photoz != 0").index

# Select the rows, renumber them from zero and drop every column whose values
# are all missing within that subset.
x_train_gal = (
    x_train.loc[train_gal_index]
    .reset_index(drop=True)
    .dropna(how="all", axis=1)
)
x_train_exgal = (
    x_train.loc[train_exgal_index]
    .reset_index(drop=True)
    .dropna(how="all", axis=1)
)

# Drop columns that hold a single repeated value, i.e. whose standard
# deviation over the non-missing entries is exactly zero.
std_gal = x_train_gal.apply(lambda col: np.std(col.dropna()))
x_train_gal = x_train_gal.drop(columns=std_gal[std_gal == 0].index)
std_exgal = x_train_exgal.apply(lambda col: np.std(col.dropna()))
x_train_exgal = x_train_exgal.drop(columns=std_exgal[std_exgal == 0].index)

logger.debug(f'x_train_gal: {x_train_gal.shape}')
logger.debug(f'x_train_exgal: {x_train_exgal.shape}')

2018-11-29 16:38:15,406 [__main__] [DEBUG] x_train_gal: (2325, 762)
2018-11-29 16:38:15,407 [__main__] [DEBUG] x_train_exgal: (5523, 834)


In [6]:
# Evaluate each metadata filter once and take both the row index and the
# object ids from the same filtered frame.
_test_meta_gal = test_meta.query("hostgal_photoz == 0")
_test_meta_exgal = test_meta.query("hostgal_photoz != 0")

test_gal_index = _test_meta_gal.index
test_exgal_index = _test_meta_exgal.index
test_gal_objectid = _test_meta_gal.object_id.values
test_exgal_objectid = _test_meta_exgal.object_id.values

# Test features for each group, renumbered from zero.
x_test_gal = x_test.loc[test_gal_index].reset_index(drop=True)
x_test_exgal = x_test.loc[test_exgal_index].reset_index(drop=True)

# The filtered metadata frames are only needed for the lookups above.
del _test_meta_gal, _test_meta_exgal

- About the `lightgbm_35.json` configuration
- LB (leaderboard) score: 0.975

In [7]:
# NOTE(review): the variable is named output_35 (and the surrounding notes
# mention lightgbm_35.json), but the file read here is output_42.json.
# Confirm which run is intended; the path is left unchanged.
# The context manager closes the file handle right after parsing.
with open("./data/output/output_42.json") as result_file:
    output_35 = json.load(result_file)

# Feature names are the keys of each model's feature-importance mapping.
gal_cols = [*output_35["evals_result_gal"]["feature_importance"].keys()]
exgal_cols = [*output_35["evals_result_exgal"]["feature_importance"].keys()]
len(gal_cols), len(exgal_cols)

(60, 100)

In [8]:
from IPython.display import display
# Missing-value count for each selected galactic feature, largest first:
# training set, then test set.
for _features in (x_train_gal, x_test_gal):
    display(_features[gal_cols].isnull().sum().sort_values(ascending=False))

fluxdiff_median_detected1      17
fluxdiff_min_detected1         17
fluxdiff_mean_detected1        17
minimum_y                       0
flux_err_min                    0
median_absolute_deviation_i     0
percent_close_to_median_y       0
median_absolute_deviation_y     0
skew_u                          0
median_u                        0
flux_median                     0
maximum_y                       0
flux_dif2                       0
period_fast_y                   0
stetson_j_y                     0
median_absolute_deviation_g     0
std_z                           0
period_fast_z                   0
flux_min_detected1              0
skew_z                          0
flux_by_flux_ratio_sq_skew      0
__freq_varrat___5_              0
period_fast_r                   0
skew_i                          0
skew_g                          0
qso_log_chi2_qsonu_u            0
median_absolute_deviation_r     0
flux_skew                       0
skew_r                          0
median_r      

fluxdiff_median_detected1      2196
fluxdiff_mean_detected1        2196
fluxdiff_min_detected1         2196
__freq_y_offset___3_              4
qso_log_chi2_qsonu_u              3
__freq_varrat___5_                2
period_fast_y                     0
flux_dif2                         0
maximum_y                         0
minimum_y                         0
stetson_j_y                       0
median_u                          0
skew_u                            0
median_absolute_deviation_y       0
percent_close_to_median_y         0
median_absolute_deviation_i       0
flux_median                       0
median_absolute_deviation_g       0
median_r                          0
period_fast_z                     0
flux_min_detected1                0
skew_z                            0
flux_by_flux_ratio_sq_skew        0
period_fast_r                     0
skew_i                            0
skew_g                            0
median_absolute_deviation_r       0
flux_skew                   

In [9]:
from IPython.display import display
# Missing-value count for each selected extra-galactic feature, largest first:
# training set, then test set.
for _features in (x_train_exgal, x_test_exgal):
    _null_counts = _features[exgal_cols].isnull().sum()
    display(_null_counts.sort_values(ascending=False))

fluxdiff_median_detected1                 79
fluxdiff_dif2_detected1                   79
fluxdiff_max_detected1                    79
fluxdiff_min_detected1                    79
__freq_varrat___0_                         1
flux_mean_detected1_fluxfactor             0
mean_y                                     0
maximum_u                                  0
flux_w_mean_1_fluxfactor                   0
median_absolute_deviation_r                0
flux_skew_fluxfactor                       0
flux_std_detected1                         0
weighted_average_y                         0
period_fast_i                              0
median_absolute_deviation_g                0
hostgal_photoz_certain                     0
__freq_varrat___5_                         0
period_fast_y                              0
distmod                                    0
skew_i                                     0
flux_by_flux_ratio_sq_sum_1_fluxfactor     0
qso_log_chi2_qsonu_y                       0
hostgal_ph

fluxdiff_dif2_detected1                   91239
fluxdiff_min_detected1                    91238
fluxdiff_median_detected1                 91238
fluxdiff_max_detected1                    91238
__freq_varrat___0_                          199
__freq_varrat___1_                          100
stetson_j_u                                  49
period_fast_u                                49
qso_log_chi2_qsonu_u                         49
fluxdiff_skew_3                              34
__freq_varrat___2_                           26
__freq_varrat___3_                           19
fluxdiff_skew_2                              15
__freq3_rel_phase4___5_                       6
__freq_varrat___4_                            4
__freq_varrat___5_                            4
fluxdiff_min_2                                3
fluxdiff_diff_2                               3
flux_skew_fluxfactor                          0
flux_std_detected1                            0
period_fast_i                           

In [48]:
# The path is written relative to the project root, like the other loads in
# this notebook ("configs/lightgbm_35.json", "./data/output/output_42.json").
# The previous "./kaggle_plasticc/..." prefix only resolved from one directory
# higher, so it could not work in the same top-to-bottom run as those cells.
# NOTE(review): assumes output_38.json sits next to output_42.json — confirm.
# The context manager closes the file handle right after parsing.
with open("./data/output/output_38.json") as result_file:
    output_38 = json.load(result_file)

# Feature names are the keys of each model's feature-importance mapping.
gal_cols = [*output_38["evals_result_gal"]["feature_importance"].keys()]
exgal_cols = [*output_38["evals_result_exgal"]["feature_importance"].keys()]
len(gal_cols), len(exgal_cols)

(30, 150)

In [49]:
from IPython.display import display
# Per-feature missing-value counts for the current galactic feature list,
# sorted from most to fewest missing; training frame first, test frame second.
for _frame in [x_train_gal, x_test_gal]:
    _missing_per_feature = _frame[gal_cols].isnull().sum()
    display(_missing_per_feature.sort_values(ascending=False))

flux_skew_detected1            423
percent_amplitude_r            111
fluxdiff_mean_detected1         17
fluxdiff_median_detected1       17
fluxdiff_min_detected1          17
skew_y                           0
skew_z                           0
weighted_average_y               0
median_absolute_deviation_r      0
period_fast_y                    0
median_absolute_deviation_y      0
std_y                            0
flux_by_flux_ratio_sq_skew       0
skew_i                           0
median_absolute_deviation_g      0
flux_skew                        0
skew_r                           0
percent_close_to_median_y        0
qso_log_chi2_qsonu_u             0
flux_ratio_sq_skew               0
stetson_j_y                      0
flux_dif2                        0
period_fast_z                    0
flux_min_detected1               0
period_fast_r                    0
median_r                         0
percent_close_to_median_z        0
period_fast_i                    0
skew_g              

percent_amplitude_r            73983
flux_skew_detected1            55133
fluxdiff_mean_detected1         2196
fluxdiff_median_detected1       2196
fluxdiff_min_detected1          2196
qso_log_chi2_qsonu_u               3
skew_y                             0
flux_skew                          0
weighted_average_y                 0
median_absolute_deviation_r        0
period_fast_y                      0
median_absolute_deviation_y        0
std_y                              0
flux_by_flux_ratio_sq_skew         0
skew_i                             0
median_absolute_deviation_g        0
skew_r                             0
skew_z                             0
percent_close_to_median_y          0
flux_ratio_sq_skew                 0
stetson_j_y                        0
flux_dif2                          0
period_fast_z                      0
flux_min_detected1                 0
period_fast_r                      0
median_r                           0
percent_close_to_median_z          0
p

In [39]:
from IPython.display import display
# NOTE(review): `cols` is not assigned by any cell above this one; the only
# visible assignment is in the following cell (from output_39), and the output
# recorded for this cell lists different features than that list produces.
# This cell therefore depends on stale kernel state — confirm which feature
# list was intended and define it before this point.
#
# Missing-value count for each feature in `cols`, largest first:
# extra-galactic training set, then test set.
for _features in (x_train_exgal, x_test_exgal):
    display(_features[cols].isnull().sum().sort_values(ascending=False))

peakpoint_pass0-pass4                              4071
peakpoint_pass0-pass3                              4040
peakpoint_pass1-pass5                              3121
time_from_peak_after_thres0.1_pass4                2746
peakpoint_pass2-pass5                              2697
time_from_peak_after_thres0.2_pass4                2679
peakpoint_pass3-pass5                              2630
time_from_peak_after_thres0.3_pass4                2624
time_from_peak_after_thres0.1_pass3                2587
time_from_peak_after_thres0.4_pass4                2576
time_from_peak_after_thres0.5_pass4                2526
time_from_peak_after_thres0.2_pass3                2517
time_from_peak_after_thres0.6_pass4                2477
time_from_peak_after_thres0.3_pass3                2453
time_from_peak_after_thres0.4_pass3                2407
time_from_peak_after_thres0.1_pass2                2397
peakpoint_pass1-pass4                              2375
time_from_peak_after_thres0.2_pass2             

peakpoint_pass0-pass4                              2967875
peakpoint_pass0-pass3                              2942949
peakpoint_pass1-pass5                              2812905
peakpoint_pass2-pass5                              2710778
peakpoint_pass3-pass5                              2698158
time_from_peak_after_thres0.1_pass4                2545741
time_from_peak_after_thres0.2_pass4                2538592
time_from_peak_after_thres0.3_pass4                2531684
time_from_peak_after_thres0.4_pass4                2525256
time_from_peak_after_thres0.5_pass4                2519029
time_from_peak_after_thres0.6_pass4                2513296
peakpoint_pass1-pass4                              2440311
time_from_peak_after_thres0.1_pass3                2129555
time_from_peak_after_thres0.2_pass3                2117038
time_from_peak_after_thres0.3_pass3                2105253
time_from_peak_after_thres0.4_pass3                2094011
peakpoint_pass2-pass4                              19710

In [40]:
# The path is written relative to the project root, like the other loads in
# this notebook ("configs/lightgbm_35.json", "./data/output/output_42.json").
# The previous "./kaggle_plasticc/..." prefix only resolved from one directory
# higher, so it could not work in the same top-to-bottom run as those cells.
# NOTE(review): assumes output_39.json sits next to output_42.json — confirm.
# The context manager closes the file handle right after parsing.
with open("./data/output/output_39.json") as result_file:
    output_39 = json.load(result_file)

# Extra-galactic feature names: keys of the feature-importance mapping.
cols = [*output_39["evals_result_exgal"]["feature_importance"].keys()]
len(cols)

100

In [41]:
from IPython.display import display
# Per-feature missing-value counts for the features in `cols`, sorted from
# most to fewest missing; extra-galactic training frame first, test frame second.
for _frame in [x_train_exgal, x_test_exgal]:
    _missing_per_feature = _frame[cols].isnull().sum()
    display(_missing_per_feature.sort_values(ascending=False))

peakpoint_pass1-pass5                              3121
peakpoint_pass2-pass5                              2697
peakpoint_pass3-pass5                              2630
peakpoint_pass1-pass4                              2375
peakpoint_pass2-pass4                              1550
peakpoint_pass3-pass4                              1430
fluxdiff_skew_detected1                             788
flux_by_flux_ratio_sq_skew_detected1                466
flux_skew_detected1_fluxfactor                      466
flux_by_flux_ratio_sq_skew_detected1_fluxfactor     466
flux_skew_detected1                                 466
fluxdiff_std_detected1                              329
fluxdiff_median_detected1                            79
fluxdiff_dif2_detected1                              79
fluxdiff_max_detected1                               79
fluxdiff_min_detected1                               79
__freq_varrat___0_                                    1
median_absolute_deviation_g                     

peakpoint_pass1-pass5                              2812905
peakpoint_pass2-pass5                              2710778
peakpoint_pass3-pass5                              2698158
peakpoint_pass1-pass4                              2440311
peakpoint_pass2-pass4                              1971063
peakpoint_pass3-pass4                              1944836
fluxdiff_skew_detected1                            1203297
flux_by_flux_ratio_sq_skew_detected1_fluxfactor     910812
flux_by_flux_ratio_sq_skew_detected1                910812
flux_skew_detected1                                 910812
flux_skew_detected1_fluxfactor                      910812
fluxdiff_std_detected1                              390131
fluxdiff_dif2_detected1                              91239
fluxdiff_median_detected1                            91238
fluxdiff_max_detected1                               91238
fluxdiff_min_detected1                               91238
__freq_varrat___0_                                     1

In [55]:
# Names of the extra-galactic training columns with fewer than 250 missing
# values.  The per-column null counts are computed once and reused for both
# the mask and the selection, instead of scanning the whole frame twice.
exgal_null_counts = x_train_exgal.isnull().sum()
exgal_null_counts[exgal_null_counts < 250].index.tolist()

['hostgal_photoz',
 'hostgal_photoz_err',
 'distmod',
 'mwebv',
 'haversine',
 'latlon1',
 'hostgal_photoz_certain',
 'flux_min',
 'flux_max',
 'flux_mean',
 'flux_median',
 'flux_std',
 'flux_skew',
 'flux_err_min',
 'flux_err_max',
 'flux_err_mean',
 'flux_err_median',
 'flux_err_std',
 'flux_err_skew',
 'detected_mean',
 'flux_ratio_sq_sum',
 'flux_ratio_sq_skew',
 'flux_by_flux_ratio_sq_sum',
 'flux_by_flux_ratio_sq_skew',
 'diff_mjd_maxmin_detected1',
 'flux_min_detected1',
 'flux_max_detected1',
 'flux_mean_detected1',
 'flux_median_detected1',
 'flux_std_detected1',
 'flux_err_min_detected1',
 'flux_err_max_detected1',
 'flux_err_mean_detected1',
 'flux_err_median_detected1',
 'flux_err_std_detected1',
 'flux_ratio_sq_sum_detected1',
 'flux_by_flux_ratio_sq_sum_detected1',
 'flux_diff',
 'flux_dif2',
 'flux_w_mean',
 'flux_dif3',
 'flux_diff_detected1',
 'flux_dif2_detected1',
 'flux_w_mean_detected1',
 'flux_dif3_detected1',
 'amplitude_u',
 'amplitude_g',
 'amplitude_r',
 'amp