In [2]:
import pandas as pd
import numpy as np

from typing import Set
from datetime import datetime as dt

from plasticc.featurize import process_meta, featurize
from plasticc.training import path_from_cv_score, train_and_validate
from plasticc.final import featurize_test, predict_test

In [2]:
# Load IPython's autoreload extension in mode 2, so that edited modules
# (e.g. the plasticc package imported above) are reloaded before code runs.
%load_ext autoreload
%autoreload 2

In [3]:
# Per-series feature-calculator settings; passed to featurize() together with aggs.
# Top-level keys name a series, nested keys name the calculators requested for it.
# NOTE(review): the calculator names look like tsfresh's, with None presumably
# meaning "no parameters" — confirm against plasticc.featurize.
fcp = dict(
    flux=dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
        'mean_change',
        'mean_abs_change',
        'length',
    ]),
    flux_by_flux_ratio_sq=dict.fromkeys([
        'longest_strike_above_mean',
        'longest_strike_below_mean',
    ]),
    flux_passband=dict(
        # Absolute value of FFT coefficients 0 and 1.
        fft_coefficient=[{'coeff': k, 'attr': 'abs'} for k in (0, 1)],
        kurtosis=None,
        skewness=None,
    ),
    mjd=dict.fromkeys([
        'maximum',
        'minimum',
        'mean_change',
        'mean_abs_change',
    ]),
)

In [4]:
# Aggregation names requested per column; passed to featurize() together with fcp.
aggs = dict(
    flux=['min', 'max', 'mean', 'median', 'std', 'skew'],
    flux_err=['min', 'max', 'mean', 'median', 'std', 'skew'],
    detected=['mean'],
    flux_ratio_sq=['sum', 'skew'],
    flux_by_flux_ratio_sq=['sum', 'skew'],
)

### Generate features for model training

In [5]:
%%time
# Training metadata goes through the project's process_meta() helper.
meta_train = process_meta(
    '../data/raw/training_set_metadata.csv'
)
# The raw training observations are read as-is from CSV.
train = pd.read_csv(
    filepath_or_buffer='../data/raw/training_set.csv'
)

CPU times: user 1.92 s, sys: 304 ms, total: 2.22 s
Wall time: 1.59 s


In [6]:
%%time
# Build the training feature table from the raw series, the metadata and the
# two feature specifications defined above, using 4 jobs.
X = featurize(
    train,
    meta_train,
    aggs,
    fcp,
    n_jobs=4,
)

Feature Extraction: 100%|██████████| 20/20 [00:08<00:00,  2.92it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.65it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 15.01it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 16.52it/s]


CPU times: user 1min 4s, sys: 1.73 s, total: 1min 6s
Wall time: 28.5 s


In [7]:
# Split the label column off the feature table.
# A missing 'target' column is reported with a descriptive error instead of the
# previous print followed by a deliberate 3//0 (which raised ZeroDivisionError).
if 'target' not in X:
    raise KeyError(
        "'target' column is missing from the featurized training data; "
        "cannot build the label vector y"
    )
# pop() returns the column and removes it from X in a single step.
y = X.pop('target')

In [8]:
%%time
# Per-column means of the features, taken before any missing values are filled.
# NOTE(review): train_mean is not referenced again in the visible cells, and the
# gaps below are filled with 0 rather than with these means — confirm intent.
train_mean = X.mean(axis='index')
# Replace the remaining NaNs with 0, modifying X in place.
X.fillna(value=0, inplace=True)

CPU times: user 196 ms, sys: 0 ns, total: 196 ms
Wall time: 9.88 ms


### Train model with CV

In [9]:
# Model parameters passed as model_params to train_and_validate() with model='lgbm'.
# Keys, values and insertion order are exactly those of the original literal.
# NOTE(review): per the LightGBM parameter docs, min_child_samples is an alias of
# min_data_in_leaf and reg_alpha is an alias of lambda_l1; each pair is set to two
# different values here (10 vs 13, 0.1 vs 2) — confirm which one is intended.
# NOTE(review): drop_rate / max_drop / skip_drop / uniform_drop / xgboost_dart_mode
# look like DART-only options while boosting_type is 'gbdt' — confirm they matter.
lgbm_params = dict(
    device='cpu',
    objective='multiclass',
    num_class=14,
    boosting_type='gbdt',
    n_jobs=-1,
    max_depth=6,
    n_estimators=1024,
    subsample_freq=2,
    subsample_for_bin=5000,
    min_data_per_group=100,
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0267,
    max_drop=5,
    min_child_samples=10,
    min_child_weight=100.0,
    min_split_gain=0.126,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
    max_bin=32,
    min_data_in_leaf=13,
    lambda_l1=2,
)

In [10]:
# Base set of columns kept out of the model's feature list.
colnames_to_ignore = {
    'object_id',
    'hostgal_specz',
    'ra',
    'decl',
    'gal_l',
    'gal_b',
    'ddf',
}
# Each stricter set is the previous one plus a few more columns.
colnames_to_ignore_restrictive = colnames_to_ignore.union({
    'latlon1',
    'haversine',
})
colnames_to_ignore_very_restrictive = colnames_to_ignore_restrictive.union({
    'flux_err_skew',
    'flux_by_flux_ratio_sq_sum',
})
# Name of the identifier column, handed to the training and prediction helpers.
id_colname = 'object_id'
# Default feature list: every column of X outside the base ignore-set, in X's order.
feature_colnames = [
    name
    for name in X.columns
    if name not in colnames_to_ignore
]

In [1]:
def train_validate_predict(ignore_colnames: Set[str], name_suffix: str):
    """Cross-validate an 'lgbm' model on the training features, then predict the test set.

    Reads the notebook globals ``X``, ``y``, ``id_colname`` and ``lgbm_params``.
    Prints the submission path and the 15 rows of the importance table with the
    highest ``mean_gain``.

    Args:
        ignore_colnames: Columns of ``X`` to leave out of the feature list.
        name_suffix: Forwarded to ``path_from_cv_score`` as ``suffix`` when the
            submission file path is derived from the CV score.

    Returns:
        The value returned by ``predict_test``. It used to be assigned to an
        unused local and discarded, so the function returned ``None``.
    """
    # Keep X's column order; only drop the names the caller asked to ignore.
    feature_colnames = [col for col in X.columns if col not in ignore_colnames]
    clfs, score, importances = train_and_validate(
        X=X,
        y=y,
        feature_colnames=feature_colnames,
        id_colname=id_colname,
        model='lgbm',
        model_params=lgbm_params,
        nr_fold=5,
        random_state=1
    )
    submission_file_path = path_from_cv_score(score, suffix=name_suffix)
    print(submission_file_path)
    print(importances.sort_values(by='mean_gain', ascending=False).head(15))
    # Hand the prediction result back so the caller can inspect it.
    return predict_test(
        clfs=clfs,
        feature_colnames=feature_colnames,
        id_colname=id_colname,
        input_path='../data/features/test-all-feat-from-kernel-repro.csv',
        output_path=submission_file_path,
        verbose=True
    )

NameError: name 'Set' is not defined

In [None]:
# Run the train / validate / predict pipeline once per ignore-set, from the
# least to the most restrictive.
for i, (label, ign_cols) in enumerate([
    ('base', colnames_to_ignore),
    ('restrictive', colnames_to_ignore_restrictive),
    ('very-restrictive', colnames_to_ignore_very_restrictive),
]):
    print(f"Iteration {i}, ignoring columns: ", ign_cols)
    # name_suffix is a required parameter of train_validate_predict; leaving it
    # out raises TypeError. The label tags each submission file by ignore-set.
    # NOTE(review): the suffix format is an assumption — confirm how
    # path_from_cv_score embeds it in the file name.
    train_validate_predict(ign_cols, name_suffix=label)

Iteration 0, ignoring columnsL  {'object_id', 'ddf', 'decl', 'ra', 'hostgal_specz', 'gal_l', 'gal_b'}
Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.773799	training's wloss: 0.765137	valid_1's multi_logloss: 1.10973	valid_1's wloss: 0.934213
[200]	training's multi_logloss: 0.516243	training's wloss: 0.505244	valid_1's multi_logloss: 0.889687	valid_1's wloss: 0.722783
[300]	training's multi_logloss: 0.415734	training's wloss: 0.403802	valid_1's multi_logloss: 0.809795	valid_1's wloss: 0.66331
[400]	training's multi_logloss: 0.356298	training's wloss: 0.34419	valid_1's multi_logloss: 0.771195	valid_1's wloss: 0.645507
[500]	training's multi_logloss: 0.313686	training's wloss: 0.301852	valid_1's multi_logloss: 0.746665	valid_1's wloss: 0.642145
Early stopping, best iteration is:
[527]	training's multi_logloss: 0.303952	training's wloss: 0.292147	valid_1's multi_logloss: 0.741123	valid_1's wloss: 0.640836
no 1-fold loss: 0.640835852381601
Tr

In [14]:
# Marker printed when this final cell is executed.
print('Done')

Done
