In [None]:
Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

Read data

In [3]:
# Load the wine-quality CSV; fields in this file are separated by semicolons.
df = pd.read_csv(
    'winequality-white.csv',
    sep=';',
)

Compute descriptive statistics for the dataset

In [16]:
# Destination for the summary table; a bare filename resolves against the
# current working directory.
output_file = 'IJ_Description_02_17_2025.csv'

# Summary statistics of the dataset as produced by DataFrame.describe(),
# then persisted to CSV.
desc_stats = df.describe()
desc_stats.to_csv(output_file)

Train/test split

In [7]:
# Every column except the last is a feature; the last column is the target.
# The target is sliced as a 1-D vector up front, so the label arrays come out
# of the split already flat.
X_train, X_test, y_train, y_test = train_test_split(
    df.values[:, :-1],
    df.values[:, -1],
    test_size=0.25,
    random_state=42,
)

# Sanity check: report the sizes of both partitions.
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


Build the random forest (RF) classifier and configure step-forward feature selection to pick a subset of 6 features

In [12]:
# Base estimator scored inside the feature-selection loop. The seed is fixed
# so that Restart & Run All selects the same features every time; without it
# the forest, and therefore the cross-validated scores that drive the
# selection, change from run to run.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Sequential forward selection: forward=True adds features one at a time
# until k_features=6 are chosen. floating=False disables the conditional
# exclusion step, so this is plain SFS rather than the floating variant
# (SFFS). Candidate subsets are ranked by 5-fold cross-validated accuracy.
sfs1 = sfs(clf,
           k_features=6,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Run the selection on the training split only.
sfs1 = sfs1.fit(X_train, y_train)


[2025-02-17 16:53:09] Features: 1/6 -- score: 0.4938705073309978
[2025-02-17 16:53:29] Features: 2/6 -- score: 0.5439744944299246
[2025-02-17 16:53:47] Features: 3/6 -- score: 0.602778550112143
[2025-02-17 16:54:08] Features: 4/6 -- score: 0.6221012437672616
[2025-02-17 16:54:28] Features: 5/6 -- score: 0.6354460694359487
[2025-02-17 16:54:48] Features: 6/6 -- score: 0.6417012363528517

Identify selected features

In [13]:
# Column indices of the features the fitted selector ended up with,
# converted to a list so they can be used for array indexing.
feat_cols = [*sfs1.k_feature_idx_]
print(feat_cols)

[1, 3, 6, 7, 8, 10]


Build a classifier using only the selected subset of features

In [14]:
# Fit the final random forest on the selected feature columns only.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train[:, feat_cols], y_train)

# Accuracy on the same data the model was fitted on.
y_train_pred = clf.predict(X_train[:, feat_cols])
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy on selected features: %.3f' % train_accuracy)

# Accuracy on the held-out test split.
y_test_pred = clf.predict(X_test[:, feat_cols])
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy on selected features: %.3f' % test_accuracy)

# Collect both scores in a two-row table (one row per split) and save it.
results_df = pd.DataFrame(
    [('Train', train_accuracy), ('Test', test_accuracy)],
    columns=['Dataset', 'Accuracy'],
)
results_df.to_csv('model_results.csv', index=False)
print("Results written to model_results.csv")


Training accuracy on selected features: 0.562
Testing accuracy on selected features: 0.510
Results written to model_results.csv
