In [7]:
import numpy as np #import package numpy and named it as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
# import the RandomForestClassifier class from the ensemble subpackage of sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import os


In [8]:
# Locate the white-wine quality CSV under the parent of the current working
# directory and load it; the file is read with ';' as the field separator.
parent_dir = os.path.abspath(os.pardir)
npath = parent_dir + "/Data/winequality_white.csv"
df = pd.read_csv(npath, sep=';')


In [9]:
# Compute summary statistics for the loaded data and save them as a CSV in
# the Results folder under the parent of the current working directory.
dpath = os.path.abspath(os.pardir) + "/Results/PH_Description_02_18_2023.csv"
df1 = df.describe()
df1.to_csv(dpath)

In [10]:
# Split the data into 75% training and 25% testing rows.
# Every column except the last is used as a feature; the last column is the
# target (kept 2-D here by the -1: slice). random_state pins the shuffle.
wine_values = df.values
X_train, X_test, y_train, y_test = train_test_split(
    wine_values[:, :-1],
    wine_values[:, -1:],
    random_state=42,
    test_size=0.25,
)

In [11]:
# Flatten the single-column 2-D target arrays into 1-D arrays.
y_train, y_test = y_train.ravel(), y_test.ravel()


In [12]:
# Report the (features, target) shapes of each split.
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('Training dataset shape:', *train_shapes)
print('Testing dataset shape:', *test_shapes)

Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


In [13]:
# The shapes printed above show that the training X contains 3673 observations and 11 features,
# and the training y contains 3673 observations.

In [14]:
# Random forest classifier (named clf) used as the estimator inside the
# sequential feature selector.
# n_estimators is the number of decision trees in the forest.
# n_jobs is the number of jobs run in parallel; -1 means use all processors.
# random_state is fixed so the forest is built the same way on every run;
# without it the cross-validated scores, and therefore the selected features,
# can change from one run to the next.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [15]:
# Configure sequential forward selection around clf: pick 5 features,
# non-floating, scored by accuracy under 5-fold cross-validation.
forward_sfs_options = dict(
    k_features=5,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=5,
)
sfs1 = sfs(clf, **forward_sfs_options)


In [55]:
# Build the step-forward feature selector.
# clf: the classifier used to score feature subsets
# k_features=5: the number of features to select
# forward=True: use the forward selection method
# floating=False: do not use the floating selection variant
# verbose=2: the verbosity level of the progress log
# scoring='accuracy': the scoring method is accuracy
# cv=5: the number of cross-validation folds

In [16]:
# Run the sequential forward selection (non-floating) on the training split.
sfs1 = sfs1.fit(X=X_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    4.7s finished

[2023-02-18 16:16:37] Features: 1/5 -- score: 0.49849487478915266[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.4s finished

[2023-02-18 16:16:41] Features: 2/5 -- score: 0.5434276816993828[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.3s finished

[2023-02-18 16:16:46] Features: 3/5 -- score: 0.6019544384511297[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1

In [18]:
# Column indices of the 5 features chosen by the forward selection run.
feat_cols = [idx for idx in sfs1.k_feature_idx_]
print(feat_cols)

[1, 5, 6, 7, 10]


In [19]:
# I ran the SFS several times; the feature scores changed on every attempt, so different best features were selected, but the selection always contained features 1, 7, and 10.

In [20]:
# Train the final model (1000 shallow trees, seeded) on only the columns
# picked by feature selection.
clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
X_train_selected = X_train[:, feat_cols]
clf.fit(X_train_selected, y_train)

RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)

In [21]:
# Score the selected-feature model on the training split.
y_train_pred = clf.predict(X_train[:, feat_cols])
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_accuracy)

# Score the same model on the held-out test split.
y_test_pred = clf.predict(X_test[:, feat_cols])
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_accuracy)

Training accuracy on selected features: 0.559
Testing accuracy on selected features: 0.518


In [22]:
# The accuracy changes whenever I rerun the SFS, because a different feature set is selected.

In [23]:
# Baseline for comparison: the same forest configuration trained on every
# feature column instead of the selected subset.
clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train, y_train)


RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)

In [24]:
# Score the all-features model on the training split.
y_train_pred = clf.predict(X_train)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on all features: %.3f' % train_accuracy)

# Score the same model on the held-out test split.
y_test_pred = clf.predict(X_test)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on all features: %.3f' % test_accuracy)

Training accuracy on all features: 0.566
Testing accuracy on all features: 0.509


In [25]:
# Random forest estimator for the backward selection run.
# random_state is fixed so the forest is built the same way on every run;
# without it the cross-validated scores, and therefore the selected features,
# can change from one run to the next.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Sequential backward selection (forward=False, floating=False): features are
# removed one at a time until 6 remain, scored by accuracy under 6-fold
# cross-validation.
sfs2 = sfs(clf,
           k_features=6,
           forward=False,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=6)

# Run the backward selection on the training split. This is plain sequential
# selection, not the floating (SFFS/SBFS) variant, because floating=False.
sfs2 = sfs2.fit(X_train, y_train)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    8.2s finished

[2023-02-18 16:18:29] Features: 10/6 -- score: 0.6613080069802073[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.4s finished

[2023-02-18 16:18:37] Features: 9/6 -- score: 0.6610347873062583[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    6.0s finished

[2023-02-18 16:18:43] Features: 8/6 -- score: 0.6637598758916291[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    5.3s finished

[2023-02-18 16:18:48] Features: 7/6 -- score: 0.6594052429744781[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.6s finished

[2023-02-18 16:18:53] Features: 6/6 -- score: 0.6547796116815404

In [26]:
# Column indices of the 6 features kept by the backward selection run.
feat_cols = [*sfs2.k_feature_idx_]
print(feat_cols)

[1, 2, 4, 5, 8, 10]


In [28]:
# Train the final model on the 6 selected feature columns, then report its
# accuracy on the training and test splits.
X_train_selected = X_train[:, feat_cols]
X_test_selected = X_test[:, feat_cols]

clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train_selected, y_train)

y_train_pred = clf.predict(X_train_selected)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_accuracy)

y_test_pred = clf.predict(X_test_selected)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_accuracy)

Training accuracy on selected features: 0.560
Testing accuracy on selected features: 0.513
