In [46]:
import numpy as np #import package numpy and named it as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
# import the RandomForestClassifier class from the ensemble subpackage of sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import os


In [47]:
# The dataset lives in the Data folder under the parent of the working directory.
# os.path.join builds the path portably instead of hand-concatenating "/" separators.
npath = os.path.join(os.path.abspath(os.pardir), "Data", "winequality_white.csv")
# The file is semicolon-delimited, so the separator is passed explicitly.
df = pd.read_csv(npath, sep=';')


In [48]:
# Summary statistics of the dataset, saved to Results/Description.csv
df1 = df.describe()
# to_csv does not create missing folders, so make sure the Results directory
# exists first; exist_ok=True keeps the cell safe to re-run.
results_dir = os.path.join(os.path.abspath(os.pardir), "Results")
os.makedirs(results_dir, exist_ok=True)
dpath = os.path.join(results_dir, "Description.csv")
df1.to_csv(dpath)

In [49]:
# Hold out 25% of the rows for testing (75% for training); the fixed seed
# keeps the split repeatable.
# Every column except the last is a feature; the last column is the target,
# sliced with -1: so it is still two-dimensional at this point.
data = df.values
X_train, X_test, y_train, y_test = train_test_split(
    data[:, :-1], data[:, -1:], random_state=42, test_size=0.25
)

In [50]:
# Flatten both target arrays to one dimension
y_train, y_test = np.ravel(y_train), np.ravel(y_test)


In [51]:
# Report the shapes of the feature matrix and target vector for each split
for label, features, target in (
    ('Training dataset shape:', X_train, y_train),
    ('Testing dataset shape:', X_test, y_test),
):
    print(label, features.shape, target.shape)

Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


In [52]:
# The output above shows the dataset shapes: the training features contain 3673 observations with 11 features each,
# and the training target contains 3673 observations.

In [53]:
# Random forest classifier (named clf) used as the estimator during feature selection.
# n_estimators is the number of decision trees in the forest.
# n_jobs is the number of parallel jobs; -1 means use all available processors.
# random_state fixes the forest's randomness so repeated runs score features
# identically; without it the result changes on every run.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [54]:
# Sequential feature selector wrapped around the classifier clf:
# forward (non-floating) selection of 5 features, scored by accuracy
# with 5-fold cross-validation; verbose=2 logs progress while fitting.
sfs1 = sfs(
    estimator=clf,
    k_features=5,
    forward=True,
    floating=False,
    scoring='accuracy',
    cv=5,
    verbose=2,
)


In [55]:
# Build the step-forward feature selector.
# clf: the estimator is the classifier clf
# k_features=5: the number of features to select is 5
# forward=True: use the forward selection method
# floating=False: do not use the floating selection variant
# verbose=2: the verbosity level is 2
# scoring='accuracy': the scoring method is accuracy
# cv=5: the desired number of cross-validation folds is 5

In [64]:
# Run the sequential forward selection on the training data
# (the selector was built with floating=False, so this is plain SFS, not SFFS)
sfs1 = sfs1.fit(X=X_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    3.7s finished

[2023-02-18 15:59:03] Features: 1/5 -- score: 0.4941396504105729[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.4s finished

[2023-02-18 15:59:07] Features: 2/5 -- score: 0.5447904502400416[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.2s finished

[2023-02-18 15:59:11] Features: 3/5 -- score: 0.6038614246788634[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [65]:
# Column indices of the 5 features chosen by the selector (the best-scoring subset)
feat_cols = [*sfs1.k_feature_idx_]
print(feat_cols)

[1, 2, 5, 7, 10]


In [None]:
# I ran the SFS several times; the feature scores differ from run to run, so a different set of best features can be selected each time.

In [66]:
# Fit the final forest (1000 trees, depth capped at 4, fixed seed)
# using only the columns picked by the feature selection
X_train_selected = X_train[:, feat_cols]
clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train_selected, y_train)

RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)

In [67]:
# Accuracy of the selected-feature model on the training split
y_train_pred = clf.predict(X_train[:, feat_cols])
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_accuracy)

# Accuracy of the same model on the held-out test split
y_test_pred = clf.predict(X_test[:, feat_cols])
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_accuracy)

Training accuracy on selected features: 0.559
Testing accuracy on selected features: 0.518


In [None]:
# The accuracy changes whenever a different SFS run selects a different set of features.

In [68]:
# Comparison model: same forest settings, but fitted on every feature column
forest_params = dict(n_estimators=1000, random_state=42, max_depth=4)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)


RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)

In [69]:
# Accuracy of the all-features model on the training split
y_train_pred = clf.predict(X_train)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on all features: %.3f' % train_accuracy)

# Accuracy of the same model on the held-out test split
y_test_pred = clf.predict(X_test)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on all features: %.3f' % test_accuracy)

Training accuracy on all features: 0.566
Testing accuracy on all features: 0.509
