In [1]:
import pandas as pd
import numpy as np

from datetime import datetime as dt
from functools import partial, wraps

import plasticc.lgbm_train as lgbm
from plasticc.featurize import process_meta
from plasticc.featurize import featurize
from plasticc.lgbm_train import lgbm_modeling_cross_validation
from plasticc.training import path_from_cv_score
from plasticc.final import featurize_test, predict_test

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell is executed (handy while editing the plasticc package).
%load_ext autoreload
%autoreload 2

In [3]:
# Per-series feature settings handed to featurize() / featurize_test() together
# with `aggs`. Outer keys name the series, inner keys name the calculators
# (None = no extra arguments; a list = one parameter dict per variant).
# NOTE(review): the calculator names look like tsfresh feature calculators — confirm.
fcp = {
    'flux': dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
        'mean_change',
        'mean_abs_change',
        'length',
    ]),

    'flux_by_flux_ratio_sq': dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
    ]),

    'flux_passband': {
        # absolute value of the first two FFT coefficients
        'fft_coefficient': [{'coeff': k, 'attr': 'abs'} for k in (0, 1)],
        'kurtosis': None,
        'skewness': None,
    },

    'mjd': dict.fromkeys([
        'maximum',
        'minimum',
        'mean_change',
        'mean_abs_change',
    ]),
}

In [4]:
# Aggregation spec handed to featurize() / featurize_test(): column name -> list
# of aggregation names to compute for that column.
aggs = {
    # full set of summary statistics for the flux and its reported error
    **{col: ['min', 'max', 'mean', 'median', 'std', 'skew']
       for col in ('flux', 'flux_err')},
    'detected': ['mean'],
    **{col: ['sum', 'skew']
       for col in ('flux_ratio_sq', 'flux_by_flux_ratio_sq')},
}

### Generate features for model training

In [5]:
%%time
# Load the training inputs from relative paths: the metadata goes through the
# project's process_meta() helper, the raw time series is read as a plain CSV.
meta_train = process_meta('../data/raw/training_set_metadata.csv')
train = pd.read_csv('../data/raw/training_set.csv')

CPU times: user 1.71 s, sys: 260 ms, total: 1.97 s
Wall time: 1.58 s


In [6]:
%%time
# Build the training feature table from the raw series and the metadata, using
# the `aggs` and `fcp` settings defined in the cells above (4 parallel jobs).
# The result is expected to carry a 'target' column, which the next cell splits off.
X = featurize(train, meta_train, aggs, fcp, n_jobs=4)

Feature Extraction: 100%|██████████| 20/20 [00:08<00:00,  2.50it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00,  9.15it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 15.68it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 19.81it/s]


CPU times: user 1min 5s, sys: 1.64 s, total: 1min 6s
Wall time: 28.2 s


In [7]:
# Split the label column off the feature table: `y` keeps the labels and `X`
# keeps only the features. Fail loudly with a descriptive error (instead of a
# forced ZeroDivisionError) when featurize() did not return a 'target' column.
# Note: re-running this cell without re-running featurize() raises, because
# 'target' has already been removed from X.
if 'target' not in X:
    raise KeyError(
        "featurize() output has no 'target' column; cannot build the training labels"
    )
# DataFrame.pop returns the column and removes it from X in one step.
y = X.pop('target')

In [8]:
# Class weights follow Giba's topic (https://www.kaggle.com/titericz):
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# combined with Kyle Boone's post (https://www.kaggle.com/kyleboone).
classes = sorted(y.unique())
# Every observed class starts at weight 1; classes 64 and 15 are set to 2.
class_weights = {**dict.fromkeys(classes, 1), 64: 2, 15: 2}
print('Unique classes : {}, {}'.format(len(classes), classes))
print(class_weights)

Unique classes : 14, [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
{6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}


In [9]:
%%time
train_mean = X.mean(axis=0)
#train_mean.to_hdf('train_data.hdf5', 'data')
# pd.set_option('display.max_rows', 500)
#import pdb; pdb.set_trace()
X.fillna(0, inplace=True)

CPU times: user 260 ms, sys: 0 ns, total: 260 ms
Wall time: 13.1 ms


### Train model with CV

In [10]:
# Everything the cross-validation routine needs except the model parameters,
# so that `eval_func(params)` runs one full CV for a given parameter set.
cv_fixed_kwargs = dict(
    X=X,
    y=y,
    classes=classes,
    class_weights=class_weights,
    nr_fold=5,
    random_state=1,
)
eval_func = partial(lgbm_modeling_cross_validation, **cv_fixed_kwargs)

In [11]:
# LightGBM parameter set that the CV cell passes to eval_func().
# Keys, values and insertion order are kept exactly as before.
lgbm_params = dict(
    # task / runtime
    device='cpu',
    objective='multiclass',
    num_class=14,
    boosting_type='gbdt',
    n_jobs=16,
    # tree size and number of boosting rounds
    max_depth=7,
    n_estimators=1024,
    subsample_freq=2,
    subsample_for_bin=5000,
    # categorical-feature handling
    min_data_per_group=100,
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    # logging / metric
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    # remaining tuned values
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0267,
    max_drop=5,
    min_child_samples=10,
    min_child_weight=100.0,
    min_split_gain=0.1,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
)


# XGBoost counterpart of the LightGBM settings (same depth, rounds, learning
# rate, sampling and regularisation values). Not used elsewhere in the visible
# notebook. Three values were LightGBM spellings that XGBoost does not accept
# and are replaced with the XGBoost equivalents:
#   objective 'multiclass' -> 'multi:softprob' (per-class probabilities)
#   booster   'gbdtree'    -> 'gbtree'         (valid: gbtree, gblinear, dart)
#   verbosity -1           -> 0                (XGBoost range is 0..3, 0 = silent)
# NOTE(review): with the native xgboost.train API, 'num_class' must also be
# supplied for multi-class objectives — confirm how this dict is consumed.
xgb_params = {
    'objective': 'multi:softprob',
    'booster': 'gbtree',
    'n_jobs': 16,
    'max_depth': 7,
    'n_estimators': 1024,
    'verbosity': 0,
    'colsample_bytree': 0.5,
    'learning_rate': 0.0267,
    'min_child_weight': 100.0,
    'reg_alpha': 0.1,
    'reg_lambda': 0.00023,
    'subsample': 0.75
}

In [12]:
%%time
# Run the cross-validated LightGBM training with the parameter set above.
# Returns the fitted models (presumably one per fold — confirm), the CV score
# (used below to name the submission file) and a feature-importance table.
clfs, score, importances = eval_func(lgbm_params)

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.717861	training's wloss: 0.711255	valid_1's multi_logloss: 1.04954	valid_1's wloss: 0.884715
[200]	training's multi_logloss: 0.462997	training's wloss: 0.453975	valid_1's multi_logloss: 0.824108	valid_1's wloss: 0.681722
[300]	training's multi_logloss: 0.363312	training's wloss: 0.353314	valid_1's multi_logloss: 0.745845	valid_1's wloss: 0.62637
[400]	training's multi_logloss: 0.306263	training's wloss: 0.296382	valid_1's multi_logloss: 0.708015	valid_1's wloss: 0.609718
[500]	training's multi_logloss: 0.265356	training's wloss: 0.255547	valid_1's multi_logloss: 0.681448	valid_1's wloss: 0.603735
Early stopping, best iteration is:
[487]	training's multi_logloss: 0.269997	training's wloss: 0.260174	valid_1's multi_logloss: 0.683963	valid_1's wloss: 0.602947
no 1-fold loss: 0.6029465964400008
Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.718033

In [13]:
# Show the 15 rows with the highest mean_gain. The table holds one row per
# (feature, fold), so the same feature appears once for every fold.
importances.sort_values(by='mean_gain', ascending=False).head(15)

Unnamed: 0,feature,gain,fold,mean_gain
60,hostgal_specz,2648,1,2844.8
60,hostgal_specz,3082,5,2844.8
60,hostgal_specz,2875,3,2844.8
60,hostgal_specz,2747,4,2844.8
60,hostgal_specz,2872,2,2844.8
54,mjd_diff_det,1727,3,1659.4
54,mjd_diff_det,1468,1,1659.4
54,mjd_diff_det,1863,5,1659.4
54,mjd_diff_det,1634,2,1659.4
54,mjd_diff_det,1605,4,1659.4


In [14]:
# Derive the submission output path from the CV score; it is passed to
# predict_test() as output_path further down.
submission_file_path = path_from_cv_score(score)

### Test set features

In [15]:
# Chunk size that covers the whole test set in about 100 chunks.
# NOTE(review): 453653105 is presumably the row count of test_set.csv — confirm.
# For a quick dry run, 1000001 was used as the total instead.
test_rows_total = 453653105
chunk_size_one_100 = test_rows_total // 100 + 1
chunk_size_one_100

4536532

In [None]:
%%time
# Compute the test-set features with the same `aggs` / `fcp` settings as for
# training and write them to output_path.
# Expected total runtime is roughly 100x the time of the first iteration
# (presumably one iteration per chunk of chunk_size_one_100 rows — confirm).
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# prediction cell reads the CSV written here, so that file must come from an
# earlier run — confirm before relying on Restart & Run All.
featurize_test(
    featurize_configs={'aggs': aggs, 'fcp': fcp}, 
    n_jobs=4,
    meta_path='../data/raw/test_set_metadata.csv',
    test_path='../data/raw/test_set.csv',
    output_path='../data/feature-selection/test-all-feat-from-kernel-repro.csv',
    id_colname='object_id',
    chunks=chunk_size_one_100,  # alternatively a fixed value: 5000000
)

### Test set predictions

In [16]:
%%time
# Predict on the pre-computed test features (the CSV written by featurize_test)
# with the CV models, and save the submission to submission_file_path.
# feature_colnames is the training feature columns minus 'object_id', which is
# passed separately as the id column.
submission = predict_test(
    clfs=clfs, 
    feature_colnames=X.drop(columns='object_id').columns, 
    id_colname='object_id', 
    input_path='../data/feature-selection/test-all-feat-from-kernel-repro.csv', 
    output_path=submission_file_path, 
    verbose=True
)

Loading data...


  0%|          | 0/5 [00:00<?, ?it/s]

Generating predictions...


100%|██████████| 5/5 [07:24<00:00, 88.84s/it]


Postprocessing...
Submission shape before grouping: (3492891, 16)
Submission shape after grouping: (3492890, 15)
Submission shape after postprocessing: (3492890, 15)
Validating submission file...
Saving submission...
Submission saved to f/home/kk385830/astronomical-classification/submissions/subm_0.609236_2018-12-10-14-21
CPU times: user 1h 55min 4s, sys: 1min 18s, total: 1h 56min 23s
Wall time: 9min 38s


In [17]:
# Preview the first rows of the submission returned by predict_test().
submission.head()

Unnamed: 0_level_0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13,0.000305,0.002167,0.000222,0.378942,0.26028,0.000546,0.091976,0.000201,0.000235,0.002533,0.000457,0.132778,0.000239,0.00028,0.128839
14,0.019575,0.037175,0.005286,0.166122,0.125653,0.002729,0.152813,0.011442,0.213547,0.03164,0.00338,0.082647,0.002471,0.003202,0.142318
17,0.020874,0.014936,0.004936,0.106407,0.12602,0.002214,0.068507,0.070296,0.062055,0.14121,0.006181,0.229678,0.001196,0.002432,0.143058
23,0.02361,0.006495,0.00794,0.104016,0.055123,0.001947,0.139985,0.10169,0.096011,0.228287,0.001821,0.083863,0.000935,0.005069,0.143208
34,0.000393,0.004274,0.000247,0.106649,0.084917,0.000521,0.021107,0.000271,0.00046,0.004701,0.000495,0.689042,0.000207,0.000524,0.086191
