# Initial experiments

This notebook contains some initial experiments with the data. We evaluate several feature selection techniques and evaluate the features chosen by them in training a plethora of classifiers.

In [1]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif
from sklearn.linear_model import ElasticNet, Lasso, LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from evaluation import evaluate

warnings.filterwarnings("ignore")


In [2]:
# load data from txt files
X_train = pd.DataFrame(np.loadtxt('../data/x_train.txt'))
X_test = pd.DataFrame(np.loadtxt('../data/x_test.txt'))
y_train = pd.DataFrame(np.loadtxt('../data/y_train.txt'))
y_train = y_train.values.ravel()

In [3]:
# evaluate the feature selection methods on this dataset: SelectKBest, RFE, SFS, Random Forest, PCA, Lasso

max_features = 4

# SelectKBest
selector = SelectKBest(f_classif, k=max_features)
selector.fit(X_train, y_train)
X_train_kbest = selector.transform(X_train)
X_test_kbest = selector.transform(X_test)

# RFE
estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=max_features)
selector.fit(X_train, y_train)
X_train_rfe = selector.transform(X_train)
X_test_rfe = selector.transform(X_test)

# Random Forest
estimator = RandomForestClassifier()
selector = SelectFromModel(estimator, max_features=max_features)
selector.fit(X_train, y_train)
X_train_rf = selector.transform(X_train)
X_test_rf = selector.transform(X_test)

# XGBoost
estimator = XGBClassifier()
selector = SelectFromModel(estimator, max_features=max_features)
selector.fit(X_train, y_train)
X_train_xgb = selector.transform(X_train)
X_test_xgb = selector.transform(X_test)

# Decision Tree
estimator = DecisionTreeClassifier()
selector = SelectFromModel(estimator, max_features=max_features)
selector.fit(X_train, y_train)
X_train_dt = selector.transform(X_train)
X_test_dt = selector.transform(X_test)

# Extra Trees Classifier
estimator = ExtraTreesClassifier()
selector = SelectFromModel(estimator, max_features=max_features)
selector.fit(X_train, y_train)
X_train_et = selector.transform(X_train)
X_test_et = selector.transform(X_test)




# PCA
estimator = PCA(n_components=max_features)
estimator.fit(X_train)
X_train_pca = estimator.transform(X_train)
X_test_pca = estimator.transform(X_test)

# Lasso
estimator = Lasso(alpha=0.1)
estimator.fit(X_train, y_train)
# filter out the features that are not important
important_features = estimator.coef_ != 0
X_train_lasso = X_train.loc[:, important_features]
X_test_lasso = X_test.loc[:, important_features]

In [4]:
# initialize classifiers
lr = LogisticRegression()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
svm = SVC(probability=True)
elastic = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5) # ElasticNet()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
nn = MLPClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier(verbose=-1)
cat = CatBoostClassifier(verbose=0)
et = ExtraTreesClassifier()

models = [lr, lda, qda, svm, elastic, dt, rf, gb, nn, xgb, lgb, cat, et]

In [5]:
# evaluate each model on each subset of features generated by the feature selection methods

data = {'SelectKBest': (X_train_kbest, X_test_kbest),
        'RFE': (X_train_rfe, X_test_rfe),
        'Random Forest': (X_train_rf, X_test_rf),
        'PCA': (X_train_pca, X_test_pca),
        'Lasso': (X_train_lasso, X_test_lasso),
        'XGBoost': (X_train_xgb, X_test_xgb),
        'Decision Tree': (X_train_dt, X_test_dt),
        'Extra Trees': (X_train_et, X_test_et)}


results = pd.DataFrame(index=[model.__class__.__name__ for model in models], columns=data.keys())

for model in models:
        for key, value in data.items():
                X_train, X_test = value
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                model.fit(X_train, y_train)
                print(model.__class__.__name__, key, evaluate(model, X_train, y_train))
                results.loc[model.__class__.__name__, key] = cross_val_score(model, X_train, y_train, cv=cv, scoring=evaluate).mean() * 5

print(results)

Number of customers who took the offer:  566.0
Number of variables used:  4
LogisticRegression SelectKBest 4860.0
Number of customers who took the offer:  113.0
Number of variables used:  4
Number of customers who took the offer:  107.0
Number of variables used:  4
Number of customers who took the offer:  108.0
Number of variables used:  4
Number of customers who took the offer:  110.0
Number of variables used:  4
Number of customers who took the offer:  115.0
Number of variables used:  4
Number of customers who took the offer:  552.0
Number of variables used:  4
LogisticRegression RFE 4720.0
Number of customers who took the offer:  108.0
Number of variables used:  4
Number of customers who took the offer:  107.0
Number of variables used:  4
Number of customers who took the offer:  111.0
Number of variables used:  4
Number of customers who took the offer:  112.0
Number of variables used:  4
Number of customers who took the offer:  115.0
Number of variables used:  4
Number of customers 

In [13]:
results.to_csv('results_4.csv')

In [21]:
results.sum().sort_values(ascending=False)

Random Forest    79800.0
Decision Tree    79350.0
Extra Trees      79280.0
XGBoost          75260.0
Lasso            64430.0
PCA              58800.0
SelectKBest      57930.0
RFE              55110.0
dtype: object

In [20]:
results.sum(axis=1).sort_values(ascending=False)

QuadraticDiscriminantAnalysis    46660.0
MLPClassifier                    46240.0
SVC                              45860.0
GradientBoostingClassifier       44670.0
CatBoostClassifier               44150.0
LGBMClassifier                   43230.0
RandomForestClassifier           42930.0
ExtraTreesClassifier             42850.0
XGBClassifier                    42210.0
LinearDiscriminantAnalysis       38230.0
LogisticRegression               38190.0
LogisticRegression               38190.0
DecisionTreeClassifier           36550.0
dtype: object

As we see, for feature selection, the best results are given for tree-based models feature importance methods, such as Random Forest, Extra Trees, and Decision tree.
When it comes to classifiers, SVM, QDA and MLP (neural network) stood out from the rest.
This is why we will investigate further the performance of combination of these methods in hyperparameter tuning.