In [14]:
import pandas as pd
import numpy as np

from datetime import datetime as dt
from functools import partial, wraps

import plasticc.xgb_train as xgb
import plasticc.lgbm_train as lgbm
from plasticc.training import process_meta
from plasticc.featurize import featurize
from plasticc.lgbm_train import lgbm_modeling_cross_validation
from plasticc.xgb_train import xgb_modeling_cross_validation

In [2]:
# Per-series feature-calculator settings handed to `featurize` below.
# The calculator names look like tsfresh feature calculators -- presumably the
# dict is forwarded to tsfresh; confirm in plasticc.featurize.
# A value of None means "no extra parameters for this calculator".
_strike_lengths = {
    'longest_strike_above_mean': None,
    'longest_strike_below_mean': None,
}
_change_stats = {
    'mean_change': None,
    'mean_abs_change': None,
}

fcp = {
    # Fresh dicts are built for every entry so no two series share state.
    'flux': {**_strike_lengths, **_change_stats, 'length': None},
    'flux_by_flux_ratio_sq': dict(_strike_lengths),
    'flux_passband': {
        # Magnitude of the first two FFT coefficients.
        'fft_coefficient': [{'coeff': k, 'attr': 'abs'} for k in (0, 1)],
        'kurtosis': None,
        'skewness': None,
    },
    'mjd': {'maximum': None, 'minimum': None, **_change_stats},
}

In [3]:
# Aggregations computed per column by `featurize` (column name -> list of
# aggregation names).
_distribution_stats = ('min', 'max', 'mean', 'median', 'std', 'skew')
_sum_and_skew = ('sum', 'skew')

aggs = {
    # Each entry gets its own list so later edits to one cannot leak to another.
    'flux': list(_distribution_stats),
    'flux_err': list(_distribution_stats),
    'detected': ['mean'],
    'flux_ratio_sq': list(_sum_and_skew),
    'flux_by_flux_ratio_sq': list(_sum_and_skew),
}

## Data processing

In [4]:
%%time
meta_train = process_meta('../data/raw/training_set_metadata.csv')
train = pd.read_csv('../data/raw/training_set.csv')

CPU times: user 1.77 s, sys: 703 ms, total: 2.47 s
Wall time: 2.49 s


In [5]:
%%time
X = featurize(train, meta_train, aggs, fcp)
X_backup = X.copy()

Feature Extraction: 100%|██████████| 20/20 [00:16<00:00,  1.88it/s]
Feature Extraction: 100%|██████████| 20/20 [00:03<00:00,  5.54it/s]
Feature Extraction: 100%|██████████| 20/20 [00:02<00:00,  7.12it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 16.63it/s]


CPU times: user 26.5 s, sys: 6.06 s, total: 32.6 s
Wall time: 43.4 s


In [6]:
# Split the label column off the feature matrix.
#
# Raises:
#     KeyError: if X has no 'target' column. This also fires when the cell is
#     re-run, because the first run removes the column from X; re-run the
#     featurize cell first in that case.
if 'target' not in X:
    # Fail with a real, descriptive error instead of printing a joke message
    # and forcing a ZeroDivisionError via `3//0`.
    raise KeyError(
        "Feature matrix X has no 'target' column; "
        "re-run the featurize cell before splitting off the labels."
    )

# pop() returns the column and removes it from X in one step.
y = X.pop('target')

In [7]:
# Class-weight scheme, credited by the original author to:
#   Giba's topic : https://www.kaggle.com/titericz
#   https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
#   with Kyle Boone's post https://www.kaggle.com/kyleboone
classes = sorted(y.unique())

# Every observed class starts at weight 1; classes 64 and 15 are then set to 2.
# Merging keeps the sorted key order for classes that are already present.
class_weights = {**dict.fromkeys(classes, 1), 64: 2, 15: 2}

print('Unique classes : {}, {}'.format(len(classes), classes))
print(class_weights)

Unique classes : 14, [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
{6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}


In [8]:
# Remove identifier / metadata columns so that only model features remain in X.
#
# Raises:
#     KeyError: if 'object_id' (or any other column listed below) is missing,
#     e.g. because this cell has already been run on the current X.
if 'object_id' not in X:
    # Fail with a real, descriptive error instead of printing a joke message
    # and forcing a ZeroDivisionError via `3//0`.
    raise KeyError(
        "Feature matrix X has no 'object_id' column; "
        "re-run the featurize cell before dropping metadata columns."
    )

# Keep the object ids on the side (presumably for out-of-fold predictions;
# `oof_df` is not used by any cell visible here). copy() detaches it from X.
oof_df = X[['object_id']].copy()

# NOTE: 'distmod' is deliberately NOT dropped -- its removal was already
# disabled (commented out) in the original version of this cell.
non_feature_cols = [
    'object_id',
    'hostgal_specz',
    'ra', 'decl', 'gal_l', 'gal_b',
    'ddf',
]
# A single drop() is all-or-nothing: if any column is missing it raises before
# X is changed, whereas a chain of `del` statements could leave X half-stripped.
X = X.drop(columns=non_feature_cols)

In [9]:
%%time
# Per-feature column means of the training matrix. ORDER MATTERS: this must run
# before the fill below so the means are taken over the data as it is before
# NaNs become zeros. `train_mean` is passed to `process_test` in the final-run
# section.
train_mean = X.mean(axis=0)
# Optional: persist the means for reuse outside the notebook (left disabled):
#train_mean.to_hdf('train_data.hdf5', 'data')
# Display-only setting: show up to 500 rows when a frame/series is rendered.
pd.set_option('display.max_rows', 500)
# Missing feature values are replaced with 0 (not with train_mean), in place.
X.fillna(0, inplace=True)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 8.05 ms


## Train model with CV

In [10]:
# Everything except the hyper-parameters is frozen here, so `eval_func(params)`
# runs one LightGBM cross-validation on the training matrix prepared above.
cv_fixed_kwargs = dict(
    X=X,
    y=y,
    classes=classes,
    class_weights=class_weights,
    nr_fold=5,
    random_state=1,
)
eval_func = partial(lgbm_modeling_cross_validation, **cv_fixed_kwargs)

In [11]:
# LightGBM hyper-parameters passed to `eval_func` for the CV run.
# NOTE(review): per the LightGBM parameter docs, drop_rate / max_drop /
# skip_drop / xgboost_dart_mode / uniform_drop only apply to DART boosting;
# boosting_type here is 'gbdt', so they are presumably inert -- confirm.
lgbm_params = dict(
    device='cpu',
    objective='multiclass',
    num_class=14,  # matches the 14 unique classes printed above
    boosting_type='gbdt',
    n_jobs=16,
    max_depth=7,
    n_estimators=1024,
    subsample_freq=2,
    subsample_for_bin=5000,
    min_data_per_group=100,
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0267,
    max_drop=5,
    min_child_samples=10,
    min_child_weight=100.0,
    min_split_gain=0.1,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
)


# XGBoost hyper-parameters, mirroring the LightGBM settings where the two
# libraries share a knob. Not consumed by any cell visible in this notebook.
#
# Fixes relative to the previous version (values XGBoost does not accept):
#   * objective 'multiclass' is LightGBM's name; XGBoost's probability-output
#     multiclass objective is 'multi:softprob'.
#   * booster 'gbdtree' was a typo for 'gbtree'.
#   * verbosity -1 is LightGBM's "silent"; XGBoost's valid range is 0..3,
#     with 0 meaning silent.
#   * num_class is required by XGBoost's multiclass objectives.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 14,  # same 14 classes as lgbm_params
    'booster': 'gbtree',
    'n_jobs': 16,
    'max_depth': 7,
    'n_estimators': 1024,
    'verbosity': 0,
    'colsample_bytree': 0.5,
    'learning_rate': 0.0267,
    'min_child_weight': 100.0,
    'reg_alpha': 0.1,
    'reg_lambda': 0.00023,
    'subsample': 0.75
}

In [12]:
%%time
# Run the cross-validated LightGBM training. `eval_func` is the partial of
# `lgbm_modeling_cross_validation` built above (X, y, classes, class_weights,
# nr_fold=5, random_state=1 already bound); only the hyper-parameters are
# supplied here.
# The call returns two values: `clfs` is later handed to `process_test`, and
# `score` is formatted as a float into the submission file name.
clfs, score = eval_func(lgbm_params)

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.74868	training's wloss: 0.740698	valid_1's multi_logloss: 1.09916	valid_1's wloss: 0.933424
[200]	training's multi_logloss: 0.490693	training's wloss: 0.480231	valid_1's multi_logloss: 0.877408	valid_1's wloss: 0.732898
[300]	training's multi_logloss: 0.388082	training's wloss: 0.37642	valid_1's multi_logloss: 0.798258	valid_1's wloss: 0.675027
[400]	training's multi_logloss: 0.328428	training's wloss: 0.316807	valid_1's multi_logloss: 0.75792	valid_1's wloss: 0.660313
[500]	training's multi_logloss: 0.286033	training's wloss: 0.274597	valid_1's multi_logloss: 0.730866	valid_1's wloss: 0.654883
[600]	training's multi_logloss: 0.25219	training's wloss: 0.241366	valid_1's multi_logloss: 0.710008	valid_1's wloss: 0.65427
Early stopping, best iteration is:
[569]	training's multi_logloss: 0.262032	training's wloss: 0.251051	valid_1's multi_logloss: 0.71524	valid_1's wloss: 0.653658
no 1-fold loss

In [13]:
# Submission file name: CV score (6 decimals) plus a minute-resolution
# timestamp. Relies on `dt` (datetime.datetime) from the import cell.
timestamp = dt.now().strftime('%Y-%m-%d-%H-%M')
filename = 'subm_{:.6f}_{}.csv'.format(score, timestamp)
print('save to {}'.format(filename))

NameError: name 'dt' is not defined

## FINAL RUN

In [None]:
# Count the lines of the raw test-set CSV (the count includes the header row,
# if the file has one). The chunk-size cell that follows hard-codes a line
# count -- presumably the number printed here; re-check it if the file changes.
!wc -l ../data/raw/test_set.csv

In [None]:
# Rows per chunk so that the test set is processed in roughly 100 chunks.
TEST_SET_LINE_COUNT = 453653105  # hard-coded line count of test_set.csv
N_CHUNKS = 100

# Floor division plus one, so the last partial chunk is always covered.
chunk_size_one_100 = TEST_SET_LINE_COUNT // N_CHUNKS + 1
chunk_size_one_100

In [None]:
%%time
# Predict on the full test set chunk by chunk and write the raw submission to
# `filename`, using the CV models, the training feature columns, the same
# featurize settings as for training, and the training column means.
# Author's runtime estimate: should take 100x (time after 1st iteration).
# NOTE(review): `process_test` is not imported by the import cell visible in
# this notebook -- on a fresh kernel this call raises NameError. Add the import
# from the module that defines it before running.
# NOTE(review): `chunks` receives a rows-per-chunk value (chunk_size_one_100),
# not a number of chunks -- confirm against process_test's signature.
process_test(
    clfs, 
    features=X.columns, 
    featurize_configs={'aggs': aggs, 'fcp': fcp}, 
    train_mean=train_mean, 
    filename=filename,
    chunks=chunk_size_one_100
)

In [None]:
# Collapse the raw submission to one row per object by averaging rows that
# share an object_id (presumably objects split across chunk boundaries appear
# more than once -- confirm against process_test), then save the result.
chunked_preds = pd.read_csv(filename)
print("Shape BEFORE grouping: {}".format(chunked_preds.shape))

z = chunked_preds.groupby('object_id').mean()
print("Shape AFTER grouping: {}".format(z.shape))

# index=True keeps object_id (now the index) as the first CSV column.
single_path = 'single_{}'.format(filename)
z.to_csv(single_path, index=True)