In [26]:
from typing import List

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier


from plasticc.dataset import Dataset

import plasticc.metrics as metrics

In [4]:
!pip install lightgbm



In [5]:
# Open the tsfresh-sample feature set; 'target' is passed as the y column name.
# NOTE(review): the only consumers of ds_tsfresh in this notebook are commented out.
ds_tsfresh = Dataset('../data/sets/tsfresh-sample/', y_colname='target')

In [6]:
# Open the simple-12-01 feature set; 'target' is passed as the y column name.
ds_simple = Dataset('../data/sets/simple-12-01/', y_colname='target')

In [7]:
#Xt, yt = ds_tsfresh.train

In [8]:
# Training split of the simple set, unpacked into features (Xs) and target (ys).
# NOTE: Xs is modified in place by the cleaning cells that follow.
Xs, ys = ds_simple.train

### Eliminate null values

In [9]:
def null_values(X: pd.DataFrame) -> List[str]:
    """Report which columns of ``X`` contain missing values.

    Prints the total column count and the number of columns holding at least
    one NA. When fewer than 10 columns are affected, the per-column NA counts
    are printed as well.

    Parameters:
    - X: frame to inspect (it is not modified)

    Returns the names of the columns containing NA values, in column order.
    """
    print("Total columns:", len(X.columns))
    na_cols = []
    for name in X.columns:
        if X[name].isna().any():
            na_cols.append(name)
    print("Total NA columns: ", len(na_cols))
    # Only print the detailed breakdown when it stays short enough to read.
    if len(na_cols) < 10:
        print("NA values by column:")
        missing_per_column = {}
        for name in na_cols:
            missing_per_column[name] = X[name].isna().sum()
        print(missing_per_column)
    return na_cols

In [10]:
#na_tsfresh = null_values(Xt)

In [11]:
# Print the NA report for Xs and keep the names of the affected columns.
na_simple = null_values(Xs)

Total columns: 176
Total NA columns:  1
NA values by column:
{'distmod': 2325}


We fill null values with 0 and drop the duplicated metadata columns (those whose names end in `_meta`).

In [12]:
# Clean the feature frame in place: zero-fill missing values, verify nothing is
# left missing, then drop the `_meta`-suffixed columns.
# The loop variable keeps the name `X` because later cells read `X` at notebook scope.
for X in [Xs]:
    X.fillna(0, inplace=True)
    assert X.notna().all().all()
    meta_suffixed = {name for name in X.columns if name.endswith('_meta')}
    X.drop(columns=list(meta_suffixed), inplace=True)

### Eliminate infinite values

In [13]:
# Treat +/-inf as missing, then drop every column that now contains NaN.
# NOTE: `X` and `na_cols` stay bound at notebook scope after the loop; later cells
# pass `X` to select_features_from_trained_model and to_tsfresh_format, so
# renaming the loop variable here would break them.
for X in [Xs]:
    print("Before infinity removal:", X.shape)
    # In-place on Xs: infinities become NaN so that null_values() can report them.
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    na_cols = null_values(X)
    X.drop(columns=na_cols, inplace=True)
    print("After infinity removal:", X.shape)

Before infinity removal: (7848, 176)
Total columns: 176
Total NA columns:  0
NA values by column:
{}
After infinity removal: (7848, 176)


### Train models on simple dataset

In [14]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
def plot_feature_importances(model, feature_names: List[str], top_n: int = 50, figsize=(6, 15)):
    """Plot the most important features of a fitted model as a bar chart.

    Parameters:
    - model: fitted estimator exposing ``feature_importances_``
    - feature_names: names matching ``model.feature_importances_`` position by position
    - top_n: how many of the highest-importance features to draw (default 50)
    - figsize: matplotlib figure size in inches (default (6, 15))

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    features = pd.DataFrame({"Importance": model.feature_importances_, "Feature": feature_names})
    # Rank once, then keep only the head so the chart stays readable.
    top_features = features.sort_values(by='Importance', ascending=False).head(top_n)
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(ax=ax, x='Importance', y='Feature', data=top_features)
    sns.despine(left=True, bottom=True)
    plt.show()

In [16]:
# Hold out 15% of the rows as a test split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=0.15, random_state=42)

LightGBM: baseline configuration and hyper-parameter search

In [22]:
# Baseline LightGBM configuration for the 14-class problem.
# n_estimators is an upper bound only: the fit cell relies on early stopping.
# NOTE(review): the fitted estimator's repr shows subsample_freq=0; LightGBM
# presumably ignores `subsample` unless a bagging frequency is set — confirm.
lgb_params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=14,
    metric='multi_logloss',
    learning_rate=0.03,
    subsample=.9,
    colsample_bytree=0.5,
    reg_alpha=.01,
    reg_lambda=.01,
    min_split_gain=0.01,
    min_child_weight=10,
    n_estimators=10000,
    silent=-1,
    verbose=-1,
    max_depth=7,
)
lgb = LGBMClassifier(**lgb_params)

In [None]:
# Exhaustive 5-fold grid search. Only learning_rate, reg_alpha, reg_lambda and
# max_depth actually vary (2 * 4 * 4 * 3 = 96 candidates); the single-valued
# entries just pin the rest of the estimator configuration.
grid = GridSearchCV(
    LGBMClassifier(),
    param_grid={
        'boosting_type': ['gbdt'],
        'objective': ['multiclass'],
        'num_class': [14],
        'metric': ['multi_logloss'],
        'learning_rate': [0.03, 0.1],
        'subsample': [.9],
        'colsample_bytree': [0.5],
        'reg_alpha': [.01, 0.001, 0.1, 1],
        'reg_lambda': [.01, 0.001, 0.1, 1],
        'min_split_gain': [0.01],
        'min_child_weight': [10],
        'n_estimators': [1000],
        'silent': [-1],
        'verbose': [-1],
        'max_depth': [3, 5, 8],
    },
    cv=5,
    verbose=100,
)

# `np.int` was merely an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
grid.fit(X_train, y_train.values.astype(int))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] boosting_type=gbdt, colsample_bytree=0.5, learning_rate=0.03, max_depth=3, metric=multi_logloss, min_child_weight=10, min_split_gain=0.01, n_estimators=1000, num_class=14, objective=multiclass, reg_alpha=0.01, reg_lambda=0.01, silent=-1, subsample=0.9, verbose=-1 
[CV]  boosting_type=gbdt, colsample_bytree=0.5, learning_rate=0.03, max_depth=3, metric=multi_logloss, min_child_weight=10, min_split_gain=0.01, n_estimators=1000, num_class=14, objective=multiclass, reg_alpha=0.01, reg_lambda=0.01, silent=-1, subsample=0.9, verbose=-1, score=0.7708955223880597, total=  12.4s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.5s remaining:    0.0s
[CV] boosting_type=gbdt, colsample_bytree=0.5, learning_rate=0.03, max_depth=3, metric=multi_logloss, min_child_weight=10, min_split_gain=0.01, n_estimators=1000, num_class=14, objective=multiclas

In [None]:
# Parameter combination that scored best in the grid search above.
print(grid.best_params_)

In [23]:
%%time
# Fit the baseline model with early stopping: both eval sets are scored every
# round, progress is logged every 100 rounds, and training stops once the
# validation score has not improved for 50 rounds.
# NOTE(review): (X_test, y_test) is the validation set here, so the held-out
# split steers the stopping point; scoring on X_test afterwards is therefore
# optimistic — consider a separate validation split.
lgb.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        # eval_metric=lgb_multi_weighted_logloss,
        verbose=100,
        early_stopping_rounds=50,
        #sample_weight=trn_y.map(weights)
    )

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 0.544255	valid_1's multi_logloss: 0.769715
[200]	training's multi_logloss: 0.310562	valid_1's multi_logloss: 0.641509
[300]	training's multi_logloss: 0.206681	valid_1's multi_logloss: 0.615086
[400]	training's multi_logloss: 0.150204	valid_1's multi_logloss: 0.60715
[500]	training's multi_logloss: 0.113598	valid_1's multi_logloss: 0.605136
Early stopping, best iteration is:
[498]	training's multi_logloss: 0.114151	valid_1's multi_logloss: 0.604982
CPU times: user 14min 54s, sys: 1.94 s, total: 14min 56s
Wall time: 23 s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        importance_type='split', learning_rate=0.03, max_depth=7,
        metric='multi_logloss', min_child_samples=20, min_child_weight=10,
        min_split_gain=0.01, n_estimators=10000, n_jobs=-1, num_class=14,
        num_leaves=120, objective='multiclass', random_state=None,
        reg_alpha=0.01, reg_lambda=0.01, silent=-1, subsample=0.9,
        subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [None]:
# Predictions for the held-out rows (shown as the cell output).
lgb.predict(X_test)

In [None]:
# Score on the held-out split — the same split that was passed as eval_set during fitting.
lgb.score(X_test, y_test)

In [None]:
# Bar chart of the fitted model's highest feature importances.
plot_feature_importances(lgb, X_test.columns)

### Train model on tsfresh dataset

In [None]:
#TODO

### Calculate feature importance for selecting optimal tsfresh features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import describe
import tsfresh
import pandas as pd

In [None]:
def to_tsfresh_format(feature_set: set, train_X: pd.DataFrame) -> dict:
    """Build a tsfresh settings dict covering exactly the given feature columns.

    Parameters:
    - feature_set: names of the columns of train_X to keep
    - train_X: frame whose column names encode the tsfresh features

    Returns whatever ``tsfresh.feature_extraction.settings.from_columns``
    derives from the selected columns.
    """
    wanted_columns = list(feature_set)
    from_columns = tsfresh.feature_extraction.settings.from_columns
    return from_columns(train_X[wanted_columns])

def select_features_from_trained_model(model, train_X: pd.DataFrame, meta_columns: set, verbose=True) -> tuple:
    """
    Extracts a set of relevant features from trained model.

    A range of importance thresholds between the smallest and the largest
    importance is scanned. For each threshold the features strictly above it
    are scored by (count of selected features * smallest selected importance),
    and the best-scoring selection is returned.

    Parameters:
    - model: fitted estimator exposing ``feature_importances_``
    - train_X should be the X used to train the model
    - meta_columns should contain column names that will be excluded from the tsfresh selection (columns from metadata, not the time series)
    - verbose: when True, print diagnostics and plot the selection criterion

    Returns:
    - a tuple ``(selected_features, features)``: the set of selected column
      names (without meta_columns) and a Series of all importances indexed by
      column name
    """
    features = pd.Series(model.feature_importances_, index=train_X.columns)
    if verbose:
        print(describe(model.feature_importances_))
        print("Most relevant features for the model:", features.sort_values().tail(10))

    lowest, highest = features.min(), features.max()
    if not lowest < highest:
        # Constant (or empty) importances: no threshold separates the features,
        # so nothing is selected. Returned explicitly instead of through NaNs.
        if verbose:
            print("All feature importances are equal; no features selected.")
        return set(), features

    # calculating how much data is lost based on minimal importance level
    N_THRESHOLDS = 1000
    select_crit = np.zeros(N_THRESHOLDS)
    select_min = np.zeros(N_THRESHOLDS)
    for i, q in enumerate(np.linspace(lowest, highest, N_THRESHOLDS, endpoint=False)):
        above = features > q  # never empty: q stays strictly below the maximum
        smallest_selected = features[above].min()
        select_crit[i] = smallest_selected * above.sum() / len(features)
        select_min[i] = smallest_selected
    # choosing minimum importance level that maximizes (selected feature count * minimal selected feature importance)
    min_importance = select_min[np.argmax(select_crit)]
    # `>=` keeps the feature(s) sitting exactly at min_importance: they belong to
    # the selection the criterion was computed for. A strict `>` would drop them
    # and can even leave the result empty.
    selected_features = set(features[features >= min_importance].index) - meta_columns
    if verbose:
        print(f"Selected minimal importance: {min_importance}", f"Number of selected features: {len(selected_features)}")
        plt.plot(select_crit)
        plt.show()
    return selected_features, features

In [None]:
# Column names of the base set's training metadata; passed below as the columns to exclude from the selection.
meta_columns = set(Dataset('../data/sets/base/').train_meta.columns)

In [None]:
# NOTE(review): `model1` is not assigned in any cell visible in this notebook, and `X` is only bound as the
# leftover loop variable of the cleaning cells (i.e. Xs) — confirm both before running on a fresh kernel.
xgb_fset, xgb_feature_importance = select_features_from_trained_model(model1, X, meta_columns)

In [None]:
# NOTE(review): `model2` is not assigned in any cell visible in this notebook, and `X` is only bound as the
# leftover loop variable of the cleaning cells (i.e. Xs) — confirm both before running on a fresh kernel.
skl_fset, skl_feature_importance = select_features_from_trained_model(model2, X, meta_columns)

In [None]:
# Set intersection: features selected for both models.
important_for_both = xgb_fset & skl_fset

In [None]:
# Number of features selected for both models.
len(important_for_both)

In [None]:
# tsfresh settings restricted to the shared features; later cells index it with the keys "0".."5".
common_dict = to_tsfresh_format(important_for_both, X)

In [None]:
# Quick look at the container type returned by to_tsfresh_format.
type(common_dict)

#### Decrease number of features
There are some inconsistencies when it comes to which features are extracted for which series.
We will limit extracted features to those relevant for most of the 6 series for now.

In [None]:
# Union of the feature names selected for any of the six series ("0".."5").
keyset = set().union(*(common_dict[str(series)].keys() for series in range(6)))

In [None]:
# Number of distinct feature names across the six series.
len(keyset)

In [None]:
# Count, for every feature name, in how many of the 6 series it was selected.
# Exact key membership is used on purpose: a substring test would also count
# other names that merely contain this one (e.g. "mean" inside "mean_change"),
# pushing counts past the number of series and distorting the threshold applied later.
feature_counts = dict()
for feature in keyset:
    n_series = sum(feature in common_dict[str(i)] for i in range(6))
    if n_series:
        feature_counts[feature] = n_series

In [None]:
# Rebinds the name: feature_counts is a dict before this cell and a Series (indexed by feature name) after it.
feature_counts = pd.Series(feature_counts)

In [None]:
# Counts per feature, most frequent first.
feature_counts.sort_values(ascending=False)

In [None]:
# Average count per feature (the sort does not affect the mean).
feature_counts.sort_values(ascending=False).mean()

In [None]:
# Keep only the features whose count is greater than 4 (i.e. at least 5).
final_features = set(feature_counts[feature_counts > 4].index)

In [None]:
# Display the retained feature names.
final_features

### Save feature dict for tsfresh feature generator to use

In [None]:
import pickle

from tsfresh.feature_extraction.settings import ComprehensiveFCParameters

In [None]:
# Start from tsfresh's comprehensive settings and remove every entry whose key
# is not among the retained final_features.
settings = ComprehensiveFCParameters()
comprehensive_keys = set(settings.keys())
for unwanted in comprehensive_keys.difference(final_features):
    del settings[unwanted]

In [None]:
# Persist the reduced settings object so the tsfresh feature generator can load it.
with open('../data/config/tsfresh-settings.pkl', 'wb+') as settings_file:
    pickle.dump(settings, settings_file)

In [None]:
# NOTE(review): `model1` is not assigned in any cell visible in this notebook — confirm where it comes from.
model1.classes_