In [28]:
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from utils import calculate_income_1000_customers
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SequentialFeatureSelector

from xgboost import XGBClassifier

In [7]:
# Seed NumPy's global RNG before the split; train_test_split below receives no
# explicit random_state, so this seed is presumably what makes the shuffle repeatable.
np.random.seed(0)
# Both files are read as space-separated text without a header row, so the
# resulting frames get integer column labels.
X = pd.read_csv('data/x_train.txt', sep=' ', header=None)
y = pd.read_csv('data/y_train.txt', sep=' ', header=None)

# NOTE(review): N_ITER is not referenced in the cells visible here — confirm it is still needed.
N_ITER = 1
# Fraction of rows assigned to the training split.
TRAIN_SIZE = 0.8

# Shuffled hold-out split of features and labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, shuffle=True)

### Example model on all features

In [8]:
# Baseline: an XGBClassifier built with no arguments, fitted on every column of X_train.
# The fit call is left as the cell's last expression so the estimator repr is displayed.
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [10]:
# Accuracy on the same rows the baseline model was fitted on.
y_train_pred = model.predict(X_train)
print(f"Accuracy train: {np.round(accuracy_score(y_train, y_train_pred), 4)}")

# Accuracy on the held-out rows. In the recorded output the gap is large
# (train 1.0 vs test 0.625), which motivates the feature selection that follows.
y_test_pred = model.predict(X_test)
print(f"Accuracy test: {np.round(accuracy_score(y_test, y_test_pred), 4)}")

Accuracy train: 1.0
Accuracy test: 0.625


### Methods for feature selection

In [26]:
def variance_treshold(df, df_test, t):
    """Filter columns with a ``VarianceThreshold`` selector fitted on ``df`` only.

    Args:
        df: Training features; the selector is fitted on this data.
        df_test: Test features; reduced with the already-fitted selector so
            both outputs keep the same columns.
        t: Threshold value handed to ``VarianceThreshold``.

    Returns:
        Tuple ``(train_reduced, test_reduced)`` exactly as produced by the
        selector's ``fit_transform`` and ``transform`` calls.
    """
    selector = VarianceThreshold(t)
    train_reduced = selector.fit_transform(df)
    test_reduced = selector.transform(df_test)
    return train_reduced, test_reduced

def mean_absolute_deviance(df, df_test, t):
    """Keep the columns whose mean absolute deviation on ``df`` exceeds ``t``.

    The deviation is computed per column on the training data only; the same
    column mask is then applied to both ``df`` and ``df_test``.

    Args:
        df: Training features, either a 2-D NumPy array or a pandas DataFrame
            with numeric columns.
        df_test: Test features with the same column layout as ``df``
            (array or DataFrame).
        t: Threshold; a column is kept only when its mean absolute deviation
            is strictly greater than ``t``.

    Returns:
        Tuple ``(df_reduced, df_test_reduced)``. Each output has the same
        container type as the corresponding input (array in, array out;
        DataFrame in, DataFrame out).
    """
    values = np.asarray(df)
    mad = np.sum(np.abs(values - np.mean(values, axis=0)), axis=0) / values.shape[0]
    # Compute the mask once so train and test are guaranteed to use the same columns.
    keep = mad > t

    def _take(data):
        # DataFrames do not support ``data[:, mask]``; select positionally via ``.loc``.
        if isinstance(data, pd.DataFrame):
            return data.loc[:, keep]
        return data[:, keep]

    return _take(df), _take(df_test)

def correlation(df, df_test, threshold=0.85):
    """Drop every column that is highly correlated with an earlier column.

    The correlation matrix is computed on ``df`` only. A column is dropped
    when the absolute value of its correlation with any column positioned
    before it is strictly greater than ``threshold``; the first column of a
    correlated group is therefore always kept. The same columns are removed
    from ``df_test``.

    Args:
        df: Training features as a pandas DataFrame.
        df_test: Test DataFrame containing at least the columns dropped from ``df``.
        threshold: Absolute-correlation cut-off (default 0.85).

    Returns:
        Tuple ``(df_new, df_test_new)`` of DataFrames without the dropped columns.
    """
    correlation_matrix = df.corr()
    abs_corr = np.abs(correlation_matrix.to_numpy())
    # Strictly-lower triangle: entry (i, j) with j < i compares column i to an earlier
    # column j. NaN correlations compare as False, so such pairs never trigger a drop.
    exceeds = np.tril(abs_corr > threshold, k=-1)
    # An ordered list (not a set) keeps the dropped labels in column order.
    columns_correlated = list(correlation_matrix.columns[exceeds.any(axis=1)])
    df_new = df.drop(columns=columns_correlated)
    df_test_new = df_test.drop(columns=columns_correlated)
    return df_new, df_test_new

def fisher_score(df, y_train, df_test, t):
    """Keep the ``t`` top-scoring columns chosen by ``SelectKBest``.

    Despite the function's name, the scoring function passed to
    ``SelectKBest`` is ``chi2``. The selector is fitted on the training data
    and its support mask is applied to both frames.

    Args:
        df: Training features (DataFrame; indexed with ``.loc``).
        y_train: Training labels used to fit the selector.
        df_test: Test features with the same column layout as ``df``.
        t: Number of columns to keep (``k`` of ``SelectKBest``).

    Returns:
        Tuple ``(df_selected, df_test_selected)`` restricted to the chosen columns.
    """
    selector = SelectKBest(chi2, k=t)
    selector.fit(df, y_train)
    keep = selector.get_support()
    train_selected = df.loc[:, keep]
    test_selected = df_test.loc[:, keep]
    return train_selected, test_selected


In [27]:
# Thresholds for the filter steps below, keyed by step name.
# "forward_feature_selection" is defined here but not read anywhere in this cell.
parameters = {
    "variance_threshold": 0.1,
    "mean_absolute_deviance": 2,
    "high_correlation": 0.8,
    "fisher_score": 5,
    "forward_feature_selection": 3,
}

# Wrap the splits as DataFrames (the original X_train / X_test names are rebound).
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

print('Original number of features: ', X_train.shape[1])

# Step 1: variance filter, fitted on the training split and applied to both splits.
X_train_new, X_test_new = variance_treshold(X_train,X_test, parameters["variance_threshold"])

print('Number of features after variance treshold: ', X_train_new.shape[1])

# Step 2: mean-absolute-deviation filter on the output of step 1.
X_train_new, X_test_new = mean_absolute_deviance(X_train_new,X_test_new, parameters["mean_absolute_deviance"])

print('Number of features after mean_absolute_deviance: ', X_train_new.shape[1])

# The next two steps use DataFrame APIs (.corr / .drop / .loc), so re-wrap the data.
# NOTE(review): this re-wrap presumably assigns fresh default column labels, in which case
# the original column numbers from X are no longer recoverable from X_train_new — confirm
# whether that matters downstream.
X_train_new = pd.DataFrame(X_train_new)
X_test_new = pd.DataFrame(X_test_new)

# Step 3: drop columns highly correlated with an earlier column.
X_train_new, X_test_new = correlation(X_train_new,X_test_new, parameters["high_correlation"])

print('Number of features after correlation: ', X_train_new.shape[1])

# Step 4: keep the top-k columns from the chi2-based SelectKBest inside fisher_score.
X_train_new, X_test_new = fisher_score(X_train_new,y_train,X_test_new, parameters["fisher_score"])

print('Number of features after fisher score: ', X_train_new.shape[1])

Original number of features:  500
Number of features after variance treshold:  300
Number of features after mean_absolute_deviance:  100
Number of features after correlation:  100
Number of features after fisher score:  5


In [33]:
def forward_feature_selection(df, y_train, df_test, t):
    """Select ``t`` columns with a ``SequentialFeatureSelector`` around XGBoost.

    The selector wraps a default ``XGBClassifier`` and is created without a
    ``direction`` argument, so it uses the library's default direction. It is
    fitted on the training data and its support mask is applied to both frames.

    Args:
        df: Training features (DataFrame; indexed with ``.loc``).
        y_train: Training labels used to fit the selector.
        df_test: Test features with the same column layout as ``df``.
        t: Number of features to select.

    Returns:
        Tuple ``(df_selected, df_test_selected)`` restricted to the chosen columns.
    """
    selector = SequentialFeatureSelector(XGBClassifier(), n_features_to_select=t)
    selector.fit(df, y_train)
    chosen = selector.get_support()
    return df.loc[:, chosen], df_test.loc[:, chosen]

def recursive_feature_eliminator(df, y_train, df_test, t):
    """Select ``t`` columns with recursive feature elimination around XGBoost.

    An ``RFE`` selector with ``step=1`` wraps a default ``XGBClassifier``. It
    is fitted on the training data and its support mask is applied to both
    frames.

    Args:
        df: Training features (DataFrame; indexed with ``.loc``).
        y_train: Training labels used to fit the selector.
        df_test: Test features with the same column layout as ``df``.
        t: Number of features to keep.

    Returns:
        Tuple ``(df_selected, df_test_selected)`` restricted to the kept columns.
    """
    eliminator = RFE(estimator=XGBClassifier(), n_features_to_select=t, step=1)
    eliminator.fit(df, y_train)
    kept = eliminator.get_support()
    return df.loc[:, kept], df_test.loc[:, kept]

In [34]:
# Wrapper-based selection on top of the filter pipeline's output: both calls start from
# the same X_train_new / X_test_new and each asks for 3 columns.
# NOTE(review): the literal 3 equals parameters["forward_feature_selection"] defined in the
# pipeline cell — consider reading it from there; confirm intent.
X_train_ffs, X_test_ffs = forward_feature_selection(X_train_new,y_train,X_test_new, 3)

print('Number of features after forward feature selection: ', X_train_ffs.shape[1])

# Same inputs and target size, but using recursive feature elimination instead.
X_train_rfe, X_test_rfe = recursive_feature_eliminator(X_train_new,y_train,X_test_new, 3)

print('Number of features after recursive_feature_eliminator: ', X_train_rfe.shape[1])

Number of features after forward feature selection:  3
Number of features after recursive_feature_eliminator:  3


In [44]:
# Refit a default XGBClassifier on each selected subset (forward selection first, then
# RFE) and report train accuracy, test accuracy and the calculate_income_1000_customers
# result, in that order.
# Looping over the two subsets ties the feature count passed to the helper to the subset
# actually being evaluated, so the RFE report cannot accidentally use the
# forward-selection subset's column count.
# After the loop, model / y_train_pred / y_test_pred / y_proba hold the RFE results.
for X_train_sel, X_test_sel in ((X_train_ffs, X_test_ffs), (X_train_rfe, X_test_rfe)):
    model = XGBClassifier()
    model.fit(X_train_sel, y_train)
    y_train_pred = model.predict(X_train_sel)
    print(f"Accuracy train: {np.round(accuracy_score(y_train, y_train_pred), 4)}")

    y_test_pred = model.predict(X_test_sel)
    y_proba = model.predict_proba(X_test_sel)
    print(f"Accuracy test: {np.round(accuracy_score(y_test, y_test_pred), 4)}")

    print('Accuracy for 1000 customers :', calculate_income_1000_customers(X_train_sel.shape[1], y_proba=y_proba, y_true=y_test, y_pred=y_test_pred))

Accuracy train: 0.9038
Accuracy test: 0.527
Accuracy for 1000 customers : (0.535, 4750.0)
Accuracy train: 0.9132
Accuracy test: 0.514
Accuracy for 1000 customers : (0.46, 4000.0)
