<a href="https://colab.research.google.com/github/kubrayigitbasi/data.kubra/blob/main/Exhaustive_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Source CSV hosted on GitHub; at most the first 20000 rows are read.
DATA_URL = r"https://raw.githubusercontent.com/kubrayigitbasi/data_kubra/main/akut_b%C3%B6brek_data.csv"

paribas_akut = pd.read_csv(DATA_URL, nrows=20000)
paribas_akut.shape

(458, 45)

In [4]:
# Keep only the columns whose dtype is one of the listed integer/float types.
num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = paribas_akut.select_dtypes(include=num_colums).columns.tolist()
paribas_akut = paribas_akut.loc[:, numerical_columns]
paribas_akut.shape

(458, 45)

In [6]:
# Hold out 20% of the rows for testing. 'BMI' is used as the label; 'HTA' is
# removed from the feature columns together with it.
# NOTE(review): later cells fit classifiers and score them with ROC AUC, which
# presumably requires a binary label -- confirm 'BMI' is encoded that way here.
train_features, test_features, train_labels, test_labels = train_test_split(
    paribas_akut.drop(columns=['BMI', 'HTA']),
    paribas_akut['BMI'],
    random_state=41,
    test_size=0.2,
)

In [7]:
# Flag redundant features: for every pair of columns whose absolute pairwise
# correlation exceeds the threshold, the later column of the pair is marked.
#
# The correlation is computed on the training features only. Using the full
# frame would (a) include the label columns 'BMI'/'HTA', so a feature could be
# discarded merely for correlating with the target, or a label column could be
# flagged and then fail to drop from the feature frames, and (b) let held-out
# test rows influence the feature selection.
CORRELATION_THRESHOLD = 0.8

correlation_matrix = train_features.corr().abs()
correlated_features = set()
for i, colname in enumerate(correlation_matrix.columns):
    # Row i restricted to columns 0..i-1 is the lower triangle, so each pair is
    # inspected once and the diagonal (self-correlation) is never considered.
    if (correlation_matrix.iloc[i, :i] > CORRELATION_THRESHOLD).any():
        correlated_features.add(colname)

In [8]:
# Remove the columns flagged as highly correlated from both splits.
# Reassigning instead of using inplace=True keeps this cell re-runnable (a second
# in-place drop would raise KeyError because the columns are already gone), and
# errors='ignore' skips any flagged name that is not a feature column -- the
# correlation matrix is built from paribas_akut, which still contains the
# 'BMI'/'HTA' columns that the split removed from the feature frames.
train_features = train_features.drop(columns=list(correlated_features), errors='ignore')
test_features = test_features.drop(columns=list(correlated_features), errors='ignore')

In [9]:
# Shapes of the train and test feature frames at this point in the notebook.
(train_features.shape, test_features.shape)

((366, 38), (92, 38))

In [None]:
conda install -c conda-forge mlxtend

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

# Forward sequential selection (forward=True) of 15 features, scoring each
# candidate subset by 4-fold cross-validated ROC AUC of a random forest.
# The forest is seeded with the same value used elsewhere in this notebook so
# that the selected subset is reproducible across runs.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=41),
    k_features=15,
    forward=True,
    verbose=2,
    scoring='roc_auc',
    cv=4)

In [None]:
# Run the selector on the training split: missing values are replaced with 0 and
# the frame is handed over as a plain NumPy array.
features = feature_selector.fit(
    train_features.fillna(0).to_numpy(),
    train_labels,
)

In [None]:
# Translate the selected positional indices back into column names.
selected_positions = list(features.k_feature_idx_)
filtered_features = train_features.columns[selected_positions]
filtered_features

In [None]:
Index(['v4', 'v10', 'v14', 'v15', 'v18', 'v20', 'v23', 'v34', 'v38', 'v42',
       'v50', 'v51', 'v69', 'v72', 'v129'],
      dtype='object')

In [None]:
# Refit a shallow random forest on the selected columns only and report the
# ROC AUC on both splits. The printed label previously said "Accuracy", but the
# value comes from roc_auc_score, so the label now names the actual metric.
clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

# Column 1 of predict_proba is passed as the score -- presumably the positive
# class of a binary label; confirm against the encoding of train_labels.
train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

In [None]:
Accuracy on training set: 0.7072327148174093
Accuracy on test set: 0.7096973252804142

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector

# Backward sequential selection (forward=False) down to 15 features, scoring
# each candidate subset by 4-fold cross-validated ROC AUC of a random forest.
# The forest is seeded with the same value used elsewhere in this notebook so
# that the selected subset is reproducible across runs.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=41),
    k_features=15,
    forward=False,
    verbose=2,
    scoring='roc_auc',
    cv=4)

# Missing values are replaced with 0 before the frame is passed as an array.
features = feature_selector.fit(np.array(train_features.fillna(0)), train_labels)

In [None]:
# Translate the selected positional indices back into column names.
selected_positions = list(features.k_feature_idx_)
filtered_features = train_features.columns[selected_positions]
filtered_features

In [None]:
# Same lookup as the preceding cell: selected positional indices -> column names.
chosen_idx = list(features.k_feature_idx_)
filtered_features = train_features.columns[chosen_idx]
filtered_features

In [None]:
# Refit a shallow random forest on the selected columns only and report the
# ROC AUC on both splits. The printed label previously said "Accuracy", but the
# value comes from roc_auc_score, so the label now names the actual metric.
clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

# Column 1 of predict_proba is passed as the score -- presumably the positive
# class of a binary label; confirm against the encoding of train_labels.
train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))

In [None]:
Accuracy on training set: 0.7095207938140247
Accuracy on test set: 0.7114624676445211

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Exhaustive search over every feature subset of size 2 to 4, scoring each one
# by 2-fold cross-validated ROC AUC of a random forest. The number of subsets
# grows combinatorially with the number of columns, so this can be slow.
# The forest is seeded with the same value used elsewhere in this notebook so
# that the winning subset is reproducible across runs.
feature_selector = ExhaustiveFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=41),
    min_features=2,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2)

In [None]:
# Run the selector on the training split: missing values are replaced with 0 and
# the frame is handed over as a plain NumPy array.
features = feature_selector.fit(
    train_features.fillna(0).to_numpy(),
    train_labels,
)

In [None]:
# Translate the winning subset's positional indices back into column names.
# A fitted ExhaustiveFeatureSelector exposes them as `best_idx_`;
# `k_feature_idx_` exists only on SequentialFeatureSelector and would raise
# AttributeError here.
filtered_features = train_features.columns[list(features.best_idx_)]
filtered_features

In [None]:
# Refit a shallow random forest on the selected columns only and report the
# ROC AUC on both splits. The printed label previously said "Accuracy", but the
# value comes from roc_auc_score, so the label now names the actual metric.
clf = RandomForestClassifier(n_estimators=100, random_state=41, max_depth=3)
clf.fit(train_features[filtered_features].fillna(0), train_labels)

# Column 1 of predict_proba is passed as the score -- presumably the positive
# class of a binary label; confirm against the encoding of train_labels.
train_pred = clf.predict_proba(train_features[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(train_labels, train_pred[:, 1])))

test_pred = clf.predict_proba(test_features[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(test_labels, test_pred[:, 1])))