In [1]:
import pandas as pd
import numpy as np

from datetime import datetime as dt
from functools import partial, wraps

import plasticc.xgb_train as xgb
import plasticc.lgbm_train as lgbm
from plasticc.featurize import process_meta
from plasticc.featurize import featurize
from plasticc.lgbm_train import lgbm_modeling_cross_validation
from plasticc.xgb_train import xgb_modeling_cross_validation
from plasticc.final import featurize_test, predict_test

In [2]:
# Enable IPython's autoreload extension in mode 2, which reloads modules
# before executing each cell, so edits to the imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Feature-calculator settings passed to featurize() / featurize_test().
# Outer keys name an input series; inner keys name a calculator and map to
# None (no parameters given) or to a list of parameter dicts.
# NOTE(review): the layout looks like tsfresh-style settings -- confirm
# against plasticc.featurize.
fcp = {
    'flux': dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
        'mean_change',
        'mean_abs_change',
        'length',
    ]),

    'flux_by_flux_ratio_sq': dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
    ]),

    'flux_passband': {
        # absolute value of FFT coefficients 0 and 1
        'fft_coefficient': [{'coeff': k, 'attr': 'abs'} for k in (0, 1)],
        'kurtosis': None,
        'skewness': None,
    },

    'mjd': dict.fromkeys([
        'maximum',
        'minimum',
        'mean_change',
        'mean_abs_change',
    ]),
}

In [4]:
# Aggregations to compute per column, passed to featurize() / featurize_test().
# Columns sharing the same statistic list are generated together; every
# column still gets its own list object.
aggs = {
    **{col: ['min', 'max', 'mean', 'median', 'std', 'skew']
       for col in ('flux', 'flux_err')},
    'detected': ['mean'],
    **{col: ['sum', 'skew']
       for col in ('flux_ratio_sq', 'flux_by_flux_ratio_sq')},
}

## Data processing

In [5]:
%%time
# Training metadata goes through the project's process_meta helper;
# the raw training set CSV is read as-is into a DataFrame.
meta_train = process_meta('../data/raw/training_set_metadata.csv')
train = pd.read_csv('../data/raw/training_set.csv')

CPU times: user 1.05 s, sys: 95.6 ms, total: 1.15 s
Wall time: 1.15 s


In [6]:
%%time
# Build the training feature table from the raw rows and the metadata,
# using the aggs / fcp settings defined above (n_jobs=4).
X = featurize(train, meta_train, aggs, fcp, n_jobs=4)
# Keep an independent copy of the freshly featurized table, since later
# cells change X.
X_backup = X.copy()

Feature Extraction: 100%|██████████| 20/20 [00:06<00:00,  3.73it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 17.24it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 20.49it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 34.98it/s]


CPU times: user 16.7 s, sys: 916 ms, total: 17.6 s
Wall time: 20.5 s


In [7]:
# Split the label column off the feature table.
# X and y are both rebuilt from X_backup (the copy taken right after
# featurize), so this cell gives the same result every time it is run
# instead of failing on a second run because 'target' was already removed.
if 'target' not in X_backup:
    # Stop with an explicit, descriptive error rather than a deliberate
    # division by zero.
    raise KeyError("'target' column is missing from the featurized training data")
y = X_backup['target']
X = X_backup.drop('target', axis=1)

In [8]:
# Class list and per-class weights.
# Weighting credited to Giba (https://www.kaggle.com/titericz), see
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# and Kyle Boone's post (https://www.kaggle.com/kyleboone).
classes = sorted(y.unique())
# Every observed class starts at weight 1 ...
class_weights = dict.fromkeys(classes, 1)
# ... and classes 64 and 15 are set to weight 2.
class_weights.update(dict.fromkeys((64, 15), 2))
print('Unique classes : {}, {}'.format(len(classes), classes))
print(class_weights)

Unique classes : 14, [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
{6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}


In [9]:
%%time
# Let pandas display up to 500 rows.
pd.set_option('display.max_rows', 500)
# Column means are taken before the fill below, while NaNs are still present.
train_mean = X.mean(axis='index')
# Missing feature values are replaced by 0 in place (train_mean is not used
# for the fill).
X.fillna(value=0, inplace=True)

CPU times: user 2.64 ms, sys: 3.23 ms, total: 5.87 ms
Wall time: 5.49 ms


## Train model with CV

In [10]:
# Pre-bind the data, class information and CV settings to the LightGBM
# cross-validation routine; the remaining argument (the parameter dict)
# is supplied when eval_func is called.
eval_func = partial(
    lgbm_modeling_cross_validation, 
    X=X, 
    y=y, 
    classes=classes, 
    class_weights=class_weights, 
    nr_fold=5, 
    random_state=1
)

In [11]:
# Parameter dict handed to eval_func (LightGBM cross-validation).
# Insertion order is kept as originally written.
lgbm_params = dict(
    device='cpu',
    objective='multiclass',
    num_class=14,  # matches the 14 unique classes printed above
    boosting_type='gbdt',
    n_jobs=16,
    max_depth=7,
    n_estimators=1024,
    subsample_freq=2,
    subsample_for_bin=5000,
    min_data_per_group=100,
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    # NOTE(review): the *_drop / dart entries look DART-specific while
    # boosting_type is 'gbdt' -- presumably inert here; confirm.
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0267,
    max_drop=5,
    min_child_samples=10,
    min_child_weight=100.0,
    min_split_gain=0.1,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
)


# Parameter dict for the XGBoost variant of the cross-validation.
# Three values had been copied from the LightGBM dict and are not valid
# XGBoost settings; they are replaced by their XGBoost equivalents:
#   objective 'multiclass' -> 'multi:softprob' (per-class probabilities)
#   booster   'gbdtree'    -> 'gbtree'         (typo; tree booster)
#   verbosity -1           -> 0                (XGBoost range is 0..3, 0 = silent)
# NOTE(review): if xgb_modeling_cross_validation uses the native xgb.train
# API rather than the sklearn wrapper, 'num_class' may also be required --
# confirm against plasticc.xgb_train.
xgb_params = {
    'objective': 'multi:softprob',
    'booster': 'gbtree',
    'n_jobs': 16,
    'max_depth': 7,
    'n_estimators': 1024,
    'verbosity': 0,
    'colsample_bytree': 0.5,
    'learning_rate': 0.0267,
    'min_child_weight': 100.0,
    'reg_alpha': 0.1,
    'reg_lambda': 0.00023,
    'subsample': 0.75
}

In [12]:
%%time
# Run the cross-validation with the LightGBM parameters.
# The call returns two values: clfs (passed to predict_test further down;
# presumably the fitted per-fold models -- confirm) and score (a number,
# used below in the submission file name).
clfs, score = eval_func(lgbm_params)

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.719458	training's wloss: 0.712412	valid_1's multi_logloss: 1.05447	valid_1's wloss: 0.884753
[200]	training's multi_logloss: 0.464389	training's wloss: 0.45543	valid_1's multi_logloss: 0.827313	valid_1's wloss: 0.682208
[300]	training's multi_logloss: 0.363731	training's wloss: 0.354034	valid_1's multi_logloss: 0.747372	valid_1's wloss: 0.625374
[400]	training's multi_logloss: 0.306134	training's wloss: 0.296357	valid_1's multi_logloss: 0.707577	valid_1's wloss: 0.607326
[500]	training's multi_logloss: 0.264831	training's wloss: 0.255135	valid_1's multi_logloss: 0.680097	valid_1's wloss: 0.601133
Early stopping, best iteration is:
[508]	training's multi_logloss: 0.262046	training's wloss: 0.252407	valid_1's multi_logloss: 0.677936	valid_1's wloss: 0.600375
no 1-fold loss: 0.6003746421112547
Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.719866

In [13]:
# Name of the submission file: CV score with six decimals followed by a
# minute-resolution timestamp. Nothing is written here; the path is only
# composed and echoed.
run_stamp = dt.now().strftime('%Y-%m-%d-%H-%M')
filename = 'subm_{:.6f}_{}.csv'.format(score, run_stamp)
print('save to {}'.format(filename))

save to subm_0.609619_2018-12-09-20-07.csv


## FINAL RUN

In [14]:
# Count the lines of the test sample file with the shell; the printed
# number is hard-coded in the next cell to derive the prediction chunk size.
!wc -l ../data/raw/test_set_sample.csv

1000001 ../data/raw/test_set_sample.csv


In [15]:
# Line count reported by `wc -l` for the test sample file.
# (A previous, commented-out variant of this cell used 453653105 instead.)
n_test_lines = 1000001
# One hundredth of the line count, plus one.
chunk_size_one_100 = n_test_lines // 100 + 1
chunk_size_one_100

10001

In [16]:
# Featurize the test sample with the same aggs / fcp settings used for
# training, joining on 'object_id' and writing the result to feat_test.csv.
# NOTE(review): `chunks` is presumably the number of raw rows processed per
# chunk -- confirm against plasticc.final.featurize_test.
featurize_test(
    featurize_configs={'aggs': aggs, 'fcp': fcp}, 
    n_jobs=4,
    meta_path='../data/raw/test_set_metadata.csv',
    test_path='../data/raw/test_set_sample.csv',
    output_path='feat_test.csv',
    id_colname='object_id',
    chunks=5000000,
)

Feature Extraction: 100%|██████████| 20/20 [00:02<00:00,  8.91it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 22.74it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 39.22it/s]
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 85.07it/s]


        5000000 done in   0.3 minutes


Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 2244.54it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 506.44it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 1188.52it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 1220.69it/s]


In [17]:
%%time
# Predict on the featurized test file with the CV models, restricted to the
# training feature columns, and write the predictions to `filename`.
# With chunks sized to about 1/100 of the input, a full run should take
# roughly 100x the time reported after the first chunk.
predict_test(
    clfs,
    features=X.columns, 
    output_path=filename,
    input_path='feat_test.csv',
    chunks=chunk_size_one_100,
    n_jobs=4
)

          10001 done in   0.0 minutes
CPU times: user 6.88 s, sys: 0 ns, total: 6.88 s
Wall time: 1.24 s


In [18]:
# Collapse the prediction file to one row per object_id by averaging rows
# that share an id, then save the result under a 'single_' prefix with
# object_id as the index column.
raw_preds = pd.read_csv(filename)
print("Shape BEFORE grouping: {}".format(raw_preds.shape))
z = raw_preds.groupby(by='object_id').mean()
print("Shape AFTER grouping: {}".format(z.shape))
single_path = 'single_{}'.format(filename)
z.to_csv(single_path, index=True)

Shape BEFORE grouping: (3036, 16)
Shape AFTER grouping: (3035, 15)
