# XGBoost classification

## Current TODOS:
- Fix XGB objective

### Imports

In [2]:
from typing import List

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd

from plasticc.dataset import Dataset

import plasticc.metrics as metrics



### Loading data

In [4]:
# Open the tsfresh-based feature set; 'target' is passed as the label column name.
ds_tsfresh = Dataset('../data/sets/tsfresh-sample/', y_colname='target')

In [5]:
# Open the simple feature set with the same label column name.
ds_simple = Dataset('../data/sets/simple-12-01/', y_colname='target')

In [6]:
# `.train` unpacks into the feature frame and the matching labels.
Xt, yt = ds_tsfresh.train

In [7]:
Xs, ys = ds_simple.train

### Eliminate null values

In [8]:
def null_values(X: pd.DataFrame) -> List[str]:
    """Print a short missing-value report for ``X`` and return the affected columns.

    The report always contains the total number of columns and the number of
    columns holding at least one NA. The per-column NA counts are printed only
    when fewer than 10 columns are affected, to keep the output readable.

    Parameters:
    - X: frame to inspect (not modified)

    Returns:
    - names of the columns containing at least one NA, in column order
    """
    print("Total columns:", len(X.columns))

    na_cols = []
    for name in X.columns:
        if X[name].isna().any():
            na_cols.append(name)
    print("Total NA columns: ", len(na_cols))

    # Too many affected columns: skip the detailed listing.
    if len(na_cols) >= 10:
        return na_cols

    print("NA values by column:")
    counts_by_column = {}
    for name in na_cols:
        counts_by_column[name] = X[name].isna().sum()
    print(counts_by_column)
    return na_cols

In [9]:
# Report (and keep) the NA columns of the tsfresh feature frame.
na_tsfresh = null_values(Xt)

Total columns: 1065
Total NA columns:  1
NA values by column:
{'distmod': 2325}


In [10]:
# Report (and keep) the NA columns of the simple feature frame.
na_simple = null_values(Xs)

Total columns: 176
Total NA columns:  1
NA values by column:
{'distmod': 2325}


We will fill null values with 0 and remove the duplicated columns (those whose name ends with `_meta`).

In [11]:
# Clean both feature frames in place.
# NOTE: the loop variable `X` stays bound at notebook scope after the loop.
for X in [Xs, Xt]:
    # Replace every missing value with 0.
    X.fillna(0, inplace=True)
#     X.dropna(axis=1, inplace=True)
    # Sanity check: no cell may be missing after the fill.
    assert(X.notna().all().all())
    # Drop every column whose name ends with '_meta'.
    X.drop(columns=[col for col in set(X.columns) if col.endswith('_meta')], inplace=True)

### Eliminate infinite values

In [12]:
# Remove, in place, every column that contains +inf or -inf.
# NOTE: `X` and `na_cols` stay bound at notebook scope after the loop.
for X in [Xs, Xt]:
    print("Before infinity removal:", X.shape)
    # Turn infinities into NaN so the NA report below can locate them.
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Prints the NA report and returns the columns that now contain NaN.
    na_cols = null_values(X)
    X.drop(columns=na_cols, inplace=True)
    print("After infinity removal:", X.shape)

Before infinity removal: (7848, 176)
Total columns: 176
Total NA columns:  0
NA values by column:
{}
After infinity removal: (7848, 176)
Before infinity removal: (7848, 1064)
Total columns: 1064
Total NA columns:  0
NA values by column:
{}
After infinity removal: (7848, 1064)


### Train models on simple dataset

In [13]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
def plot_feature_importances(model, feature_names: List[str]):
    """Show a horizontal bar chart of the 50 most important features of ``model``.

    Parameters:
    - model: object exposing ``feature_importances_``
    - feature_names: feature names, aligned with ``model.feature_importances_``
    """
    importance_table = pd.DataFrame({"Importance": model.feature_importances_, "Feature": feature_names})
    # Highest importance first; only the top 50 rows are drawn.
    ranked = importance_table.sort_values(by='Importance', ascending=False)
    top_rows = ranked.head(50)

    fig, ax = plt.subplots(figsize=(6,15))
    sns.barplot(ax=ax, x='Importance', y='Feature', data=top_rows)
    sns.despine(left=True, bottom=True)
    plt.show()

In [15]:
# Hold out 15% of the simple dataset for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=0.15, random_state=42)

### Custom objective function

In [16]:
def xgb_kaggle_loss(y_true, y_pred, **kwargs):
    """One-hot encode the labels and delegate to ``metrics.wtf_xgb_kaggle_loss``.

    Parameters:
    - y_true: class labels (any array-like; flattened to 1-D before encoding)
    - y_pred: predictions, forwarded unchanged
    - **kwargs: forwarded unchanged

    Returns:
    - whatever ``metrics.wtf_xgb_kaggle_loss`` returns

    The encoded labels and the result are printed as debugging output.
    """
    labels = np.asarray(y_true).ravel()
    # np.unique returns the sorted distinct labels plus each sample's index
    # into them, so row i of the identity matrix picked by that index is the
    # one-hot row of sample i (float64, columns ordered by sorted label).
    # This avoids OneHotEncoder's `sparse` keyword, which newer scikit-learn
    # versions renamed to `sparse_output`.
    classes, codes = np.unique(labels, return_inverse=True)
    y_true = np.eye(len(classes))[codes.ravel()]
    # NOTE(review): columns are derived from the labels present in this call
    # only; presumably every class occurs in each call -- confirm, otherwise
    # the width will not match y_pred.
    print(y_true)
    out = metrics.wtf_xgb_kaggle_loss(y_true, y_pred, **kwargs)
    print(out)
    return out

In [17]:
# Hyper-parameters are gathered in dicts first and then unpacked into the
# estimators, so they can be inspected separately from the model objects.
xgb_params = dict(
    max_depth=7,
    min_child_weight=10,
    learning_rate=0.03,
    n_estimators=2,
    silent=True,
    # Custom objective under development; the built-in alternative
    # would be 'multi:softprob'.
    objective=xgb_kaggle_loss,
    # gamma (0.01) is intentionally left unset for now.
    max_delta_step=0,
    subsample=0.9,
    colsample_bytree=0.5,
    colsample_bylevel=1,
    reg_alpha=0.01,
    reg_lambda=0.01,
    scale_pos_weight=1,
    seed=1,
    missing=None,
)
xgb_model = XGBClassifier(**xgb_params)

cat_params = dict(
    learning_rate=0.3,
    loss_function='MultiClass',
    random_seed=3721,
    max_depth=7,
    n_estimators=1000,
    reg_lambda=0.1,
    logging_level='Verbose',
    # scale_pos_weight (1) is intentionally left unset for now.
)
cat_model = CatBoostClassifier(**cat_params)

In [18]:
# TODO: Class weights for training and for eval

In [None]:
%%time
# Fit the XGBoost model, evaluating on both the training and the held-out
# split; early stopping is configured with a patience of 50 rounds.
xgb_model.fit(X_train, y_train, verbose=True, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=50)
# The equivalent CatBoost fit is currently disabled.
#cat_model.fit(X_train, y=y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=50)

In [1]:
# NOTE(review): the recorded NameError shows this cell was run (execution
# count 1) before the cell defining `cat_model`; the CatBoost fit is also
# commented out, so the model needs to be fitted before scoring.
cat_model.score(X_test, y_test)

NameError: name 'cat_model' is not defined

In [None]:
# Plot the importances of the XGBoost model, labelled with the column names of the held-out split.
plot_feature_importances(xgb_model, X_test.columns)

### Train model on tsfresh dataset

In [None]:
#TODO

### Calculate feature importance for selecting optimal tsfresh features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import describe
import tsfresh
import pandas as pd

In [None]:
def to_tsfresh_format(feature_set: set, train_X: pd.DataFrame) -> dict:
    """Build a tsfresh settings dict from a set of feature (column) names.

    Parameters:
    - feature_set: names of the columns of train_X to keep
    - train_X: frame containing (at least) the columns named in feature_set

    Returns:
    - the result of tsfresh's ``from_columns`` applied to the selected columns
    """
    wanted_columns = list(feature_set)
    selected_frame = train_X[wanted_columns]
    settings_from_columns = tsfresh.feature_extraction.settings.from_columns
    return settings_from_columns(selected_frame)

def select_features_from_trained_model(model, train_X: pd.DataFrame, meta_columns: set, verbose=True) -> tuple:
    """
    Extracts a set of relevant features from trained model.

    The importance cut-off is chosen so that it maximizes
    (share of features kept) * (smallest importance among the kept features).

    Parameters:
    - model should expose `feature_importances_`, aligned with train_X.columns
    - train_X should be the X used to train the model
    - meta_columns should contain column names that will be excluded from the tsfresh selection (columns from metadata, not the time series)
    - verbose: when True, print importance statistics and the chosen cut-off, and plot the selection criterion

    Returns:
    - (selected_features, features): the set of selected column names with
      meta_columns removed, and a Series holding every importance indexed by
      column name

    Note: when all importances are equal, no threshold keeps any feature and
    the returned selection is empty.
    """
    features = pd.Series(model.feature_importances_, index=train_X.columns)
    if verbose:
        print(describe(model.feature_importances_))
        print("Most relevant features for the model:", features.sort_values().tail(10))
    # calculating how much data is lost based on minimal importance level
    N_THRESHOLDS = 1000
    thresholds = np.linspace(features.min(), features.max(), N_THRESHOLDS, endpoint=False)
    select_crit = np.zeros(N_THRESHOLDS)
    select_min = np.zeros(N_THRESHOLDS)
    n_features = len(features)
    for i, q in enumerate(thresholds):
        selected = features[features > q]
        smallest_kept = selected.min()
        select_crit[i] = smallest_kept * len(selected) / n_features
        select_min[i] = smallest_kept
    # choosing minimum importance level that maximizes (selected feature count * minimal selected feature importance)
    min_importance = select_min[np.argmax(select_crit)]
    # `>=` (not `>`): min_importance is the smallest importance *inside* the
    # winning set, so the feature(s) sitting exactly on it must be kept to
    # reproduce the set the criterion was computed for.
    selected_features = set(features[features >= min_importance].index) - meta_columns
    if verbose:
        print(f"Selected minimal importance: {min_importance}", f"Number of selected features: {len(selected_features)}")
        plt.plot(select_crit)
        plt.show()
    return selected_features, features

In [None]:
# Column names of the base dataset's metadata frame; these are excluded from the selection below.
meta_columns = set(Dataset('../data/sets/base/').train_meta.columns)

In [None]:
# NOTE(review): `model1` is not defined in the visible part of this notebook,
# and `X` is whatever an earlier loop left bound -- confirm both before running.
xgb_fset, xgb_feature_importance = select_features_from_trained_model(model1, X, meta_columns)

In [None]:
# NOTE(review): `model2` is not defined in the visible part of this notebook either.
skl_fset, skl_feature_importance = select_features_from_trained_model(model2, X, meta_columns)

In [None]:
# Keep only the features selected for both models.
important_for_both = xgb_fset & skl_fset

In [None]:
len(important_for_both)

In [None]:
# Convert the shared feature names into a tsfresh settings dict.
common_dict = to_tsfresh_format(important_for_both, X)

In [None]:
type(common_dict)

#### Decrease number of features
There are some inconsistencies in which features are extracted for which series.
For now, we will limit the extracted features to those that are relevant for most (more than 4) of the 6 series.

In [None]:
# Union of the feature names configured for any of the 6 series (keys "0" to "5").
keyset = set().union(*(common_dict[str(i)].keys() for i in range(6)))

In [None]:
# Number of distinct feature names across the 6 series.
len(keyset)

In [None]:
# For every feature name, count in how many of the 6 series' settings it occurs.
# Exact key membership is used on purpose: a substring test between names
# would also count e.g. "mean" for every "mean_change" key and could push a
# count above 6, which breaks the "most of the 6 series" filter applied later.
feature_counts = {
    feature: sum(feature in common_dict[str(i)] for i in range(6))
    for feature in keyset
}

In [None]:
# Turn the counts into a Series indexed by feature name for sorting and filtering.
feature_counts = pd.Series(feature_counts)

In [None]:
# Inspect the counts, most frequent feature first.
feature_counts.sort_values(ascending=False)

In [None]:
# Average count per feature (sorting does not affect the mean).
feature_counts.sort_values(ascending=False).mean()

In [None]:
# Keep the features whose count exceeds 4.
final_features = set(feature_counts[feature_counts > 4].index)

In [None]:
final_features

### Save feature dict for tsfresh feature generator to use

In [None]:
import pickle

from tsfresh.feature_extraction.settings import ComprehensiveFCParameters

In [None]:
# Start from tsfresh's comprehensive settings and remove every entry that is
# not one of the final features.
settings = ComprehensiveFCParameters()
comprehensive_keys = set(settings.keys())
for key in comprehensive_keys - final_features:
    del settings[key]

In [None]:
# Persist the reduced settings for the tsfresh feature generator.
# NOTE(review): 'wb+' also opens the file for reading, which is not used here; 'wb' would be enough.
with open('../data/config/tsfresh-settings.pkl', 'wb+') as file:
    pickle.dump(settings, file)

In [None]:
# NOTE(review): `model1` is not defined in the visible part of this notebook -- confirm which fitted model is meant.
model1.classes_