# Wine Quality Analysis

## Process and Clean Data

In [21]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Get data from csv file
red_wine_df = pd.read_csv('resources/winequality-red.csv', sep=';')
white_wine_df = pd.read_csv('resources/winequality-white.csv', sep=';')

# Create a new column 'color' and assign '1' to all rows for red wine
red_wine_df['color'] = 1

# Create a new column 'color' and assign '0' to all rows for white wine
white_wine_df['color'] = 0

# Create a new dataframe 'wine_df' by combining red_wine_df and white_wine_df and reset the index
wine_df = pd.concat([red_wine_df, white_wine_df], ignore_index=True)

# Display wine_df
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [None]:
# Describe the dataframe
wine_df.describe()

In [None]:
# Get the dataframe's info
wine_df.info()

In [None]:
# Look for null values
wine_df.isnull().sum()

In [3]:
# Drop any null values
wine_df = wine_df.dropna().reset_index(drop=True)

wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [None]:
# Check for duplicate rows
wine_df.duplicated().sum()

In [4]:
# Drop duplicate rows and reset index
wine_df = wine_df.drop_duplicates().reset_index(drop=True)

wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


## Explore the Data

In [5]:
# Get a count of the unique values in the quality column
wine_df['quality'].value_counts()

quality
6    2323
5    1752
7     856
4     206
8     148
3      30
9       5
Name: count, dtype: int64

In [None]:
# Export the cleaned data to a new csv file
wine_df.to_csv('resources/winequality-cleaned.csv', index=False)

In [None]:
import seaborn as sns
plt.figure(figsize=(12, 8))
sns.heatmap(wine_df.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data=wine_df, x='quality', hue='color')
plt.title("Wine Quality Distribution by Type")
plt.xlabel("Quality")
plt.ylabel("Count")
plt.legend(title="Wine Color", labels=["White", "Red"])
plt.show()

In [6]:
# drop the color column due to it not being relevant to quality, drop the free sulfur dioxide column due to it being highly correlated with total sulfur dioxide
cleaned_wine = wine_df.drop(['color', 'free sulfur dioxide'], axis=1)

In [None]:
# do a value counts on the quality column
cleaned_wine['quality'].value_counts()

In [None]:
features_to_plot = ['alcohol', 'volatile acidity', 'citric acid']
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='quality', y=feature, data=wine_df)
    plt.title(f"{feature.title()} by Wine Quality")
    plt.show()

In [None]:
top_features = ['alcohol', 'volatile acidity', 'density', 'quality']
sns.pairplot(wine_df[top_features], hue='quality', palette='coolwarm')
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.violinplot(x='quality', y='alcohol', data=wine_df)
plt.title("Alcohol Distribution by Wine Quality")
plt.show()

## Split the Test and Training Data

In [7]:
# Create bins for the quality column


# Two Bins for 0-5 and 6-10
bins = (0, 5, 10)

# Name the bins 0 for low quality and 1 for high quality
group_names = [0, 1]

# Rename teh values in the quality column to the bin names
wine_df['quality'] = pd.cut(wine_df['quality'], bins=bins, labels=group_names)

# List unique values in the quality column
wine_df['quality'].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [8]:
wine_df['quality'].value_counts()

quality
1    3332
0    1988
Name: count, dtype: int64

In [9]:
# Separate features and target
X = wine_df.drop(columns= ['quality'])
y= wine_df['quality']

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=1)


In [11]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(

    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

In [12]:
class_counts = y_train.value_counts()
print(class_counts)


quality
1    2687
0    1569
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter   
# Apply SMOTE on the scaled training data
smote_model = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote_model.fit_resample(X_train_scaled, y_train)

# Count distinct values for the resampled target data
print(y_resampled_smote.value_counts())

quality
0    2687
1    2687
Name: count, dtype: int64




In [14]:
from imblearn.combine import SMOTEENN
# Apply SMOTEENN on the scaled training data
smoteenn_model = SMOTEENN(random_state=42)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn_model.fit_resample(X_train, y_train)
# Count distinct values for the resampled target data
print(y_resampled_smoteenn.value_counts())

quality
0    1318
1     993
Name: count, dtype: int64




## Compare Different Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix



## AUC RUC

In [None]:
#import confusion_matrix
from sklearn.metrics import confusion_matrix
# calculate  a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
#print the results of the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Import AUC ROC curve tools
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt


In [None]:
# Calculate a ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
# calculate the ROC AUC
roc_auc = roc_auc_score(y_test, y_pred)
# Plot the ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % roc_auc)

## Random Forest

In [None]:
# Instantiate a new RandomForestClassier model
rf_model = RandomForestClassifier()
# Fit the resampled data the new model
rf_model.fit(X_train, y_train)
# Predict labels for resampled testing features
rf_y_pred = rf_model.predict(X_test)

# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, rf_y_pred))

In [None]:
# Instantiate a new RandomForestClassier model with SMOTE data
rf_smote_model = RandomForestClassifier()
# Fit the resampled data the new model
rf_smote_model.fit(X_resampled_smote, y_resampled_smote)
# Predict labels for resampled testing features
rf_smote_y_pred = rf_smote_model.predict(X_test)

# Print classification reports
print(f"Classification Report - SMOTE Data")
print(classification_report(y_test, rf_smote_y_pred))

In [None]:
# Instantiate a new RandomForestClassier model with SMOTEENN data
rf_smoteenn_model = RandomForestClassifier()
# Fit the resampled data the new model
rf_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# Predict labels for resampled testing features
rf_smoteenn_y_pred = rf_smoteenn_model.predict(X_test)

# Print classification reports
print(f"Classification Report - SMOTE Data")
print(classification_report(y_test, rf_smoteenn_y_pred))

In [None]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, rf_y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, rf_smote_y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, rf_smoteenn_y_pred))

In [None]:
# Print the class distribution before SMOTE
print("Original training class distribution:", Counter(y_train))

# Print the class distribution after SMOTE
print("Resampled training class distribution:", Counter(y_resampled))

## XG Boost

In [None]:
#Now do a XGboost model
from xgboost import XGBClassifier

XGB_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
XGB_model.fit(X_train, y_train)
xgb_y_pred = XGB_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_y_pred))


In [None]:
#Now do a XGboost model
from xgboost import XGBClassifier

xgb_smote_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smote_model.fit(X_train_resampled, y_train_resampled)
xgb_smote_y_pred = xgb_smote_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_smote_y_pred))


In [None]:
#Now do a XGboost model
from xgboost import XGBClassifier

xgb_smoteeen_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smoteeen_model.fit(X_resampled, y_resampled)
xgb_smoteeen_y_pred = xgb_smoteeen_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_smoteeen_y_pred))


In [None]:
print(f"Classification Report - Original Data")
print(classification_report(y_test, xgb_y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, xgb_smote_y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, xgb_smoteeen_y_pred))

## Logistic Regression

In [15]:
#now do a logistic regression model
from sklearn.linear_model import LogisticRegression

lr_model_scaled = LogisticRegression(max_iter=500, random_state=42)
lr_model_scaled.fit(X_train_scaled, y_train)
lr_y_pred_scaled = lr_model_scaled.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, lr_y_pred_scaled))

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.62      0.67       419
           1       0.77      0.85      0.81       645

    accuracy                           0.76      1064
   macro avg       0.75      0.73      0.74      1064
weighted avg       0.75      0.76      0.75      1064



In [None]:
#now do a logistic regression model
from sklearn.linear_model import LogisticRegression

lr_model_unscaled = LogisticRegression(max_iter=500, random_state=42)
lr_model_unscaled.fit(X_train, y_train)
lr_y_pred_unscaled = lr_model_unscaled.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_y_pred_unscaled))


In [None]:
#now do a logistic regression model
from sklearn.linear_model import LogisticRegression

lr_smote_model = LogisticRegression(max_iter=500, random_state=42)
lr_smote_model.fit(X_resampled_smote, y_resampled_smote)
lr_smote_y_pred = lr_smote_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_smote_y_pred))


In [None]:
#now do a logistic regression model
from sklearn.linear_model import LogisticRegression

lr_smoteenn_model = LogisticRegression(max_iter=500, random_state=42)
lr_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
lr_smoteenn_y_pred = lr_smoteenn_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_smoteenn_y_pred))


In [None]:
print(f"Classification Report - Original Data - Scaled")
print(classification_report(y_test, lr_y_pred_scaled))
print("---------")
print(f"Classification Report - Original Data - Unscaled")
print(classification_report(y_test, lr_y_pred_unscaled))
print("---------")
print(f"Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, lr_smote_y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, lr_smoteenn_y_pred))

## LGBMC

In [None]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred))

## SVC

In [None]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred))

## KNC

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("KNN Classification Report:")
print(classification_report(y_test, y_pred))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Gradient Boosting Report:")
print(classification_report(y_test, y_pred))

## MLPC

In [None]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("MLP (Neural Network) Report:")
print(classification_report(y_test, y_pred))

## Identify Most and Least Important Features

In [16]:
# P-values

import statsmodels.api as sm

# Assume X_train_scaled is a numpy array with your scaled features,
# and y_train is your target variable.
# Also assume you have a list of feature names corresponding to X_train_scaled.
feature_names = X.columns  # for example, if X was your original features DataFrame

# Convert the scaled training data back into a DataFrame with column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)

# Add a constant term to account for the intercept in the model
X_train_scaled_df = sm.add_constant(X_train_scaled_df)

# Fit a logistic regression model using statsmodels
logit_model = sm.Logit(y_train, X_train_scaled_df)
result = logit_model.fit()

# Print the full summary, which includes coefficients and p-values
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.518228
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                quality   No. Observations:                 4256
Model:                          Logit   Df Residuals:                     4243
Method:                           MLE   Df Model:                           12
Date:                Tue, 25 Mar 2025   Pseudo R-squ.:                  0.2127
Time:                        21:29:15   Log-Likelihood:                -2205.6
converged:                       True   LL-Null:                       -2801.5
Covariance Type:            nonrobust   LLR p-value:                1.033e-247
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.7342      0.039     18.679      0.000       0.657       0.811

In [17]:
lr = sm.OLS(y_train, X_train).fit()
pvals = lr.pvalues.sort_values()
for index, value in pvals.items():
    print(f"{index}: {value:4f}")

alcohol: 0.000000
volatile acidity: 0.000000
density: 0.000000
free sulfur dioxide: 0.000000
sulphates: 0.000000
total sulfur dioxide: 0.000000
residual sugar: 0.000005
pH: 0.004808
chlorides: 0.034103
fixed acidity: 0.483961
citric acid: 0.837452
color: 0.976420


In [19]:
# feature_importances_ from the trained RandomForest model
feature_importances = lr_model_scaled.feature_importances_

# Create a DataFrame that pairs each feature with its importance score
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importances
})

# Sort by importance in descending order (most important at the top)
importance_df.sort_values(by='importance', ascending=False, inplace=True)

# Display the entire list
print(importance_df)

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [22]:


# Assuming X.columns holds the feature names and your target is binary
coefficients = lr_model_scaled.coef_[0]  # For binary classification, shape is (1, n_features)
intercept = lr_model_scaled.intercept_

# Create a DataFrame for easy viewing
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Odds_Ratio': np.exp(coefficients)
})

# Sort by absolute coefficient (or odds ratio) to see the most influential features
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

print(coef_df)


                 Feature  Coefficient  Odds_Ratio  Abs_Coefficient
10               alcohol     0.928325    2.530268         0.928325
1       volatile acidity    -0.661761    0.515942         0.661761
3         residual sugar     0.374183    1.453804         0.374183
6   total sulfur dioxide    -0.336486    0.714276         0.336486
7                density    -0.334790    0.715488         0.334790
5    free sulfur dioxide     0.318799    1.375475         0.318799
9              sulphates     0.285347    1.330224         0.285347
8                     pH     0.159434    1.172847         0.159434
11                 color     0.157807    1.170940         0.157807
0          fixed acidity     0.102435    1.107865         0.102435
4              chlorides    -0.074617    0.928099         0.074617
2            citric acid     0.001303    1.001304         0.001303


## Hyperparameter Optimization

## Conclusions