# xgboost.XGBClassifier with reduced features
This notebook demonstrates a xgboost.XGBClassifier that removes features based on their importance being above a threshold

In [1]:
# magic to help out Jupyter notebooks
import os, sys
sys.path.append(os.path.abspath('..\src'))

In [2]:
import pandas as pd 

# allow log messages in notebooks
import logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# use the pecarn module to bring the PECARN dataset into the notebook
from data import pecarn

# cleaned PECARN data
pecarn_cleaned = pecarn.clean(pecarn.load(fromCsv=False))

# processed data, ready for splitting into training and test sets
X = pecarn_cleaned.drop(columns='PosIntFinal')
y = pecarn.preprocess(pecarn_cleaned[['PosIntFinal']])

INFO:data.pecarn.load:Loading from Pickle file c:\Jan\Capstone\notebooks\PECARN_TBI.pkl


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, stratify=y,      random_state=1234)

First, lets take a quick look at feature importance that comes out of a simple XGBClassifier

In [4]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline 

selection_pipeline = Pipeline(steps=[
    ('data.pecarn.preprocess', pecarn.make_preprocess_pipeline()),
    ('feature_selection', XGBClassifier()),
])
selection_pipeline.fit(X_train, y_train)

[Pipeline] .. (step 1 of 1) Processing convert_to_float, total=   0.5s


Pipeline(steps=[('data.pecarn.preprocess',
                 Pipeline(steps=[('convert_to_float',
                                  FunctionTransformer(func=<function _convert_to_float at 0x00000287E4BA6B88>))],
                          verbose=True)),
                ('feature_selection', XGBClassifier())])

In [5]:
# construct a data frame to investigate the feature importance
feature_importance = pd.DataFrame({
    'Feature': list(X_train.columns),
    'Importance': selection_pipeline.named_steps['feature_selection'].feature_importances_
    }).sort_values(by='Importance', ascending=False)

# and show the top 5
feature_importance.head(5)

Unnamed: 0,Feature,Importance
22,GCSVerbal,0.168769
20,GCSEye,0.08477
6,ActNorm,0.083531
21,GCSMotor,0.081807
0,AMS,0.080055


For demonstration purposes, we will pick a more or less arbitrary threshold of the median of the features that have a non-zero importance value.

In [6]:
non_zero_importance = feature_importance[feature_importance['Importance'] > 0]
non_zero_importance.median()

Importance    0.010823
dtype: float64

And use the median as the threshold in a pipeline...

In [7]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline 

# the selector features from an XGBClassifier
feature_selector = SelectFromModel(XGBClassifier(), threshold=non_zero_importance.median())

# the classifier that will be trained on the selected features
clf = XGBClassifier()

pipeline = Pipeline(steps=[
    ('data.pecarn.preprocess', pecarn.make_preprocess_pipeline()),
    ('feature_selection', feature_selector),
    ('xgboost', clf)
])

In [8]:
pipeline.fit(X_train, y_train)

[Pipeline] .. (step 1 of 1) Processing convert_to_float, total=   0.0s


Pipeline(steps=[('data.pecarn.preprocess',
                 Pipeline(steps=[('convert_to_float',
                                  FunctionTransformer(func=<function _convert_to_float at 0x00000287E4BA6B88>))],
                          verbose=True)),
                ('feature_selection',
                 SelectFromModel(estimator=XGBClassifier(),
                                 threshold=Importance    0.010823
dtype: float64)),
                ('xgboost', XGBClassifier())])

In [9]:
pipeline.score(X_test, y_test)

0.9875507192917743

In [10]:
from sklearn.metrics import f1_score
y_pred = pipeline.predict(X_test)
f1_score(y_test, y_pred)

0.5090909090909091