In [52]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np

In [3]:
# Load the dataset; literal '?' entries in the CSV are parsed as missing values (NaN).
csv_path = 'data/DataPreprocessingGraded_dataset.csv'
df = pd.read_csv(csv_path, na_values=['?'])
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,Target
0,2.0,50.0,12500.0,98.0,NEGATIVE,YES
1,0.0,13.0,3250.0,28.0,NEGATIVE,YES
2,,,4000.0,35.0,NEGATIVE,YES
3,,20.0,5000.0,45.0,NEGATIVE,YES
4,1.0,24.0,6000.0,77.0,NEGATIVE,NO


In [4]:
# Summarise column dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      743 non-null    float64
 1   V2      743 non-null    float64
 2   V3      748 non-null    float64
 3   V4      748 non-null    float64
 4   V5      748 non-null    object 
 5   Target  748 non-null    object 
dtypes: float64(4), object(2)
memory usage: 35.2+ KB


In [48]:
# Column positions below refer to `df` as loaded above (see df.info()):
# 0-3 are the numeric features V1-V4, 4 is the categorical V5, 5 is Target.

# Mean-impute V1 and V2, the only columns with missing values. The remaining
# columns are passed through so later steps can still address them; because
# the imputed columns are already the first two, positions 0-3 keep their meaning.
imputer = ColumnTransformer([
    ('imputer', SimpleImputer(strategy='mean'), [0, 1])
], remainder='passthrough')

# Ordinal-encode the categorical column V5. Every other column is dropped
# (ColumnTransformer's default remainder).
ord_enc = ColumnTransformer([
    ('encoder', OrdinalEncoder(), [4])
])

# Numeric branch: impute, then standardise the four numeric columns only.
pre_pipe = Pipeline([
    ('imputer', imputer),
    ('scaler', ColumnTransformer([('scaler', StandardScaler(), [0, 1, 2, 3])])),
])

# Place the scaled numeric features and the encoded V5 side by side.
feat_un = FeatureUnion([
    ('pipe', pre_pipe),
    ('enc', ord_enc)
])

# Full preprocessing: both branches, then remove every feature whose variance
# does not exceed 0.1.
pipe = Pipeline([
    ('union', feat_un),
    ('varthresh', VarianceThreshold(threshold=0.1))
])

# Fit the *full* pipeline. Fitting only `pre_pipe` would skip the encoder branch
# and the variance filter, and would agree with the step-by-step computation
# only when the filter happens to discard exactly the encoded column.
x_red = pipe.fit_transform(df)
x_red.shape

(748, 4)

In [53]:
# Step-by-step version of the same preprocessing, without Pipeline/ColumnTransformer,
# kept as an independent cross-check of the pipeline output.
raw_values = df.values

# Mean-impute the first two columns, then re-attach columns 2 and 3 untouched.
si = SimpleImputer(strategy='mean')
imputed_block = si.fit_transform(raw_values[:, :2])
numeric_block = np.hstack([imputed_block, raw_values[:, 2:4]])

# Standardise the four numeric columns.
scaled_block = StandardScaler().fit_transform(numeric_block)

# Ordinal-encode column 4 (the encoder needs 2-D input, hence the list index),
# append it, and drop every feature whose variance does not exceed 0.1.
encoded_block = OrdinalEncoder().fit_transform(raw_values[:, [4]])
x = np.hstack([scaled_block, encoded_block])
x = VarianceThreshold(threshold=0.1).fit_transform(x)

In [54]:
# Shape of the manually preprocessed feature matrix (rows, retained features).
x.shape

(748, 4)

In [55]:
# Check that the manual preprocessing and the pipeline output agree element-wise
# within NumPy's default floating-point tolerances.
np.allclose(x, x_red)

True

In [58]:
# Encode the Target labels as ordinal codes. OrdinalEncoder expects 2-D input,
# so select the column with double brackets to get an (n_rows, 1) array.
y = OrdinalEncoder().fit_transform(df[['Target']].values)

In [60]:
# Flatten the (n_rows, 1) encoded target into the 1-D vector that estimators expect.
y = np.ravel(y)

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
from sklearn.feature_selection import RFE

In [64]:
# Recursive feature elimination with a logistic-regression estimator, keeping
# two features. The fitted selector is bound to `ref` here instead of being
# recovered from IPython's `_` (last output) in a later cell: `_` points at
# whatever cell produced output most recently, so that pattern breaks as soon
# as cells are run out of order or another output-producing cell runs in between.
ref = RFE(LogisticRegression(), n_features_to_select=2).fit(x_red, y)
ref

RFE(estimator=LogisticRegression(), n_features_to_select=2)

In [66]:
# Boolean mask over the columns of x_red: True marks a feature kept by RFE.
ref.support_

array([ True, False,  True, False])

In [67]:
from sklearn.feature_selection import SequentialFeatureSelector

In [68]:
# Forward sequential selection: start from no features and add them one at a
# time until two are selected. `fit` returns the selector itself, so the
# chained call leaves the fitted object in `sfs`.
sfs = SequentialFeatureSelector(
    LogisticRegression(),
    n_features_to_select=2,
    direction='forward',
).fit(x_red, y)
sfs

SequentialFeatureSelector(estimator=LogisticRegression(),
                          n_features_to_select=2)

In [69]:
# Boolean mask over the columns of x_red: True marks a feature chosen by the selector fitted above.
sfs.support_

array([False,  True, False,  True])

In [70]:
# Display the preprocessed feature matrix (NumPy elides the middle rows in the repr).
x_red

array([[-9.38169390e-01,  7.70986653e+00,  7.62334626e+00,
         2.61563344e+00],
       [-1.18627754e+00,  1.30454949e+00,  1.28273826e+00,
        -2.57880900e-01],
       [ 0.00000000e+00, -1.53758496e-16,  1.79684161e+00,
         2.94705348e-02],
       ...,
       [ 1.66696622e+00, -4.26617275e-01, -4.30939574e-01,
         1.13782607e+00],
       [ 3.65183145e+00, -7.72850628e-01, -7.73675141e-01,
         1.93671355e-01],
       [ 7.74561598e+00, -7.72850628e-01, -7.73675141e-01,
         1.54832812e+00]])

In [71]:
# Backward sequential selection: start from all features and remove them one
# at a time until two remain. This rebinds `sfs`, replacing the forward selector.
sfs = SequentialFeatureSelector(
    LogisticRegression(),
    n_features_to_select=2,
    direction='backward',
).fit(x_red, y)
sfs.support_

array([False, False,  True,  True])