# Feature Selection - XGBoost

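Reference: [Feature Importance and Feature Selection With XGBoost in Python](https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/) (Machine Learning Mastery).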

In [1]:
import pandas as pd
from xgboost import XGBClassifier
import xgboost
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the UNSW-NB15 training and testing partitions from the working directory.
train = pd.read_csv("UNSW_NB15_training-set.csv", sep=',', header=0)
test = pd.read_csv("UNSW_NB15_testing-set.csv", sep=',', header=0)

# Combine both partitions into one frame so the data can be split more evenly.
# `ignore_index=True` gives every row a unique 0..n-1 label; without it each
# partition keeps its own row index, so labels repeat after the concat and any
# later index-aligned operation becomes ambiguous. The "id" column is dropped.
combined_data = pd.concat([train, test], ignore_index=True).drop(["id"], axis=1)

# Split the dataset into the feature matrix X and the target y.
# y is kept as a one-column DataFrame holding 'label'.
X = combined_data.drop(['label', 'attack_cat'], axis=1)
y = combined_data.loc[:, ['label']]

In [3]:
# One LabelEncoder instance, plus the names of the feature columns that are
# run through it before modelling.
le = preprocessing.LabelEncoder()
cols = ["proto", "service", "state"]

In [4]:
# Replace each listed column of X with its integer label codes.
# NOTE: the single `le` instance is refit on every column, so after this cell it
# only remembers the classes of the last column in `cols`.
for column_name in cols:
    X[column_name] = le.fit_transform(X[column_name])

In [5]:
# OPTIONAL 2: standardise every feature column of X with StandardScaler.
# `fit_transform` returns a plain array, so the result is wrapped back into a
# DataFrame. Passing `index=X.index` keeps the original row labels; without it
# the frame gets a fresh 0..n-1 index and X no longer shares row labels with y.
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns, index=X.index)

## Feature selection - XGBoost Feature Importance Plot

In [6]:
# Fit an XGBoost classifier on the full feature matrix and target. Only
# `use_label_encoder` is set explicitly; all other hyper-parameters are defaults.
model = XGBClassifier(use_label_encoder=False).fit(X, y)
model



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [None]:
# Draw the fitted model's feature-importance chart on a 10x10-inch figure.
# `fig` is kept so the figure can be saved to disk afterwards.
canvas_size = (10, 10)
fig, ax = plt.subplots(figsize=canvas_size)
plot_importance(model, ax=ax)

In [None]:
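# Optional: uncomment to export the importance plot as a PNG.
# The figure size is fixed when the figure is created; `figsize` is not a
# `savefig` argument, so only the resolution (dpi) is given here.
# fig.savefig('xgboost.png', dpi=1000)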