# Wine Quality Analysis

## Process and Clean Data

In [None]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the two semicolon-delimited source files and tag every row with a
# numeric 'color' flag: 1 for red wine, 0 for white wine
red_wine_df = pd.read_csv('resources/winequality-red.csv', sep=';').assign(color=1)
white_wine_df = pd.read_csv('resources/winequality-white.csv', sep=';').assign(color=0)

# Stack red on top of white into a single dataframe with a fresh 0..n-1 index
wine_df = pd.concat([red_wine_df, white_wine_df], ignore_index=True)

# Display wine_df
wine_df

In [None]:
# Show summary statistics for the columns of the combined dataframe
wine_df.describe()

In [None]:
# Print the dataframe's structural summary (columns, dtypes, non-null counts)
wine_df.info()

In [None]:
# Count the null values in each column
wine_df.isnull().sum()

In [None]:
# Discard every row that contains a null value...
wine_df = wine_df.dropna()
# ...then renumber the remaining rows from 0, discarding the old index
wine_df = wine_df.reset_index(drop=True)

wine_df

In [None]:
# Count the rows flagged as duplicates of another row
wine_df.duplicated().sum()

In [None]:
# Remove the duplicated rows...
wine_df = wine_df.drop_duplicates()
# ...then renumber the remaining rows from 0, discarding the old index
wine_df = wine_df.reset_index(drop=True)

wine_df

## Explore the Data

In [None]:
# Count how many wines received each quality score
wine_df['quality'].value_counts()

In [None]:
# Export the cleaned data (nulls and duplicates removed) to a new csv file;
# index=False keeps the row index out of the file
wine_df.to_csv('resources/winequality-cleaned.csv', index=False)

In [None]:
import seaborn as sns

# Pairwise correlations between the numeric columns
correlations = wine_df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix, values shown to two decimals
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlations,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Bar chart of how many wines received each quality score, split by wine color
plt.figure(figsize=(10, 6))
sns.countplot(data=wine_df, x='quality', hue='color')
plt.title("Wine Quality Distribution by Type")
plt.xlabel("Quality")
plt.ylabel("Count")
# 'color' is 0 for white and 1 for red, so the legend entries are relabelled in that order
# NOTE(review): the labels are matched to the hue levels by position — confirm the legend order in the rendered plot
plt.legend(title="Wine Color", labels=["White", "Red"])
plt.show()

In [None]:
# Build the modelling dataframe by dropping five columns:
#   - 'color': treated as not relevant to quality
#   - 'free sulfur dioxide': highly correlated with 'total sulfur dioxide'
#   - 'chlorides', 'citric acid', 'fixed acidity': also removed here
# NOTE(review): the reason for dropping the last three columns is not recorded — confirm and document it
cleaned_wine = wine_df.drop(['color', 'free sulfur dioxide', 'chlorides', 'citric acid', 'fixed acidity'], axis=1)

In [None]:
# Count the wines per quality score in the reduced dataframe
cleaned_wine['quality'].value_counts()

In [None]:
# Box plots of selected features grouped by quality score.
# The data comes from wine_df rather than cleaned_wine: 'citric acid' is one of
# the columns removed when cleaned_wine was built, so selecting it there raises
# a KeyError. wine_df holds the same rows with every original column.
features_to_plot = ['alcohol', 'volatile acidity', 'citric acid']
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='quality', y=feature, data=wine_df)
    plt.title(f"{feature.title()} by Wine Quality")
    plt.show()

In [None]:
# Pairwise scatter plots of three features, colored by quality
# ('quality' is included in the selection because it is used as the hue column)
top_features = ['alcohol', 'volatile acidity', 'density', 'quality']
sns.pairplot(cleaned_wine[top_features], hue='quality', palette='coolwarm')
# y=1.02 places the overall title just above the grid of subplots
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

In [None]:
# Violin plot of the alcohol distribution for each quality value
plt.figure(figsize=(10, 6))
sns.violinplot(x='quality', y='alcohol', data=cleaned_wine)
plt.title("Alcohol Distribution by Wine Quality")
plt.show()

## Split the Test and Training Data

In [None]:
# Convert the quality score into a binary label

# Two bins: scores up to 5, and scores above 5 up to 10
bins = (0, 5, 10)

# Name the bins 0 for low quality and 1 for high quality
group_names = [0, 1]

# Replace the values in the quality column with the bin names.
# The raw scores are read from wine_df (same rows and index as cleaned_wine)
# rather than from cleaned_wine's own column, so re-running this cell does not
# re-bin the already-converted 0/1 labels into wrong or missing values.
cleaned_wine['quality'] = pd.cut(wine_df['quality'], bins=bins, labels=group_names)

# List unique values in the quality column
cleaned_wine['quality'].unique()

In [None]:
# Count the wines in each quality class to see how balanced the target is
cleaned_wine['quality'].value_counts()

In [None]:
# Target: the quality column
y = cleaned_wine['quality']
# Features: every remaining column
X = cleaned_wine.drop(columns=['quality'])

In [None]:
# Split the data into training and testing sets:
# 20% of the rows are held out for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=1)


In [None]:
import statsmodels.api as sm

In [None]:
# Fit an ordinary least squares model on the training split and list each
# feature's p-value, smallest first
# NOTE(review): no constant column is added to X_train, so presumably the model
# is fit without an intercept — confirm this is intended
lr = sm.OLS(y_train, X_train).fit()
pvals = lr.pvalues.sort_values()
for index, value in pvals.items():
    # '.4f' prints four decimal places ('4f' only set a minimum field width)
    print(f"{index}: {value:.4f}")

In [None]:
# Fit a multinomial logit model and list each feature's p-value, smallest first.
# The model is fit on the training split, like the OLS and Logit cells, so the
# held-out test rows do not influence the feature screening.
mnl = sm.MNLogit(y_train, X_train).fit()
pvals = mnl.pvalues
# pvalues is a table here; take its first column
pvals_sorted = pvals.iloc[:, 0].sort_values()
for index, value in pvals_sorted.items():
    # '.4f' prints four decimal places ('4f' only set a minimum field width)
    print(f"{index}: {value:.4f}")

In [None]:
# Fit a binary logit model on the training split and list each feature's
# p-value, smallest first
logit = sm.Logit(y_train, X_train).fit()
pvals = logit.pvalues.sort_values()
for index, value in pvals.items():
    # '.4f' prints four decimal places ('4f' only set a minimum field width)
    print(f"{index}: {value:.4f}")
    

In [None]:
# Standardize the features: the scaler is fitted on the training split only and
# then reused, unchanged, to transform the test split
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Wrap the scaled arrays back into dataframes that keep the original column
# names and row indices
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test.columns, index=X_test.index)

In [None]:
# Show how many training rows fall into each quality class (checks class balance)
class_counts = y_train.value_counts()
print(class_counts)


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
# Apply SMOTE to the training data
# Note: this resamples X_train (the unscaled features), not X_train_scaled
smote_model = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote_model.fit_resample(X_train, y_train)

# Count distinct values for the resampled target data
print(y_resampled_smote.value_counts())

In [None]:
from imblearn.combine import SMOTEENN
# Apply SMOTEENN to the training data
# Note: this resamples X_train (the unscaled features), not X_train_scaled
smoteenn_model = SMOTEENN(random_state=42)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn_model.fit_resample(X_train, y_train)
# Count distinct values for the resampled target data
print(y_resampled_smoteenn.value_counts())

## Compare Different Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix



## AUC ROC

In [None]:
# NOTE(review): y_pred is only assigned by the single-model cells further down
# the notebook, so this cell must be run after one of them; it reports on
# whichever model wrote y_pred most recently
# Import confusion_matrix
from sklearn.metrics import confusion_matrix
# Calculate a confusion matrix for the test set
conf_matrix = confusion_matrix(y_test, y_pred)
# Print the results of the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Import AUC ROC curve tools
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt


In [None]:
# Calculate a ROC curve and its AUC for the current predictions
# NOTE(review): y_pred is only assigned by the single-model cells further down
# the notebook and appears to hold predicted class labels rather than scores —
# confirm whether predicted probabilities were intended here
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
# Calculate the ROC AUC
roc_auc = roc_auc_score(y_test, y_pred)

# Plot the ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line for a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Chance')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
# The curve labels are only drawn once a legend is requested
plt.legend(loc="lower right")
plt.show()

## Random Forest

In [None]:
# Baseline random forest trained on the original (non-resampled) training data.
# A fixed seed makes the reported metrics reproducible, in line with the other
# seeded models in this notebook.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Predict labels for the held-out test features
rf_y_pred = rf_model.predict(X_test)

# Print the classification report
print("Classification Report - Original Data")
print(classification_report(y_test, rf_y_pred))

In [None]:
# Random forest trained on the SMOTE-resampled training data.
# A fixed seed makes the reported metrics reproducible, in line with the other
# seeded models in this notebook.
rf_smote_model = RandomForestClassifier(random_state=42)
rf_smote_model.fit(X_resampled_smote, y_resampled_smote)
# Predict labels for the held-out test features (the test set is never resampled)
rf_smote_y_pred = rf_smote_model.predict(X_test)

# Print the classification report
print("Classification Report - SMOTE Data")
print(classification_report(y_test, rf_smote_y_pred))

In [None]:
# Random forest trained on the SMOTEENN-resampled training data.
# A fixed seed makes the reported metrics reproducible, in line with the other
# seeded models in this notebook.
rf_smoteenn_model = RandomForestClassifier(random_state=42)
rf_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# Predict labels for the held-out test features (the test set is never resampled)
rf_smoteenn_y_pred = rf_smoteenn_model.predict(X_test)

# Print the classification report; the heading names SMOTEENN so this output is
# not mistaken for the SMOTE model's report
print("Classification Report - SMOTEENN Data")
print(classification_report(y_test, rf_smoteenn_y_pred))

In [None]:
# Print the three random forest classification reports one after another,
# separated by a dashed line
rf_reports = [
    ("Classification Report - Original Data", rf_y_pred),
    ("Classification Report - Resampled Data - SMOTE", rf_smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", rf_smoteenn_y_pred),
]
for position, (heading, predictions) in enumerate(rf_reports):
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

In [None]:
# Print the class distribution before SMOTE
print("Original training class distribution:", Counter(y_train))

# Print the class distribution after SMOTE
# (the resampled target is named y_resampled_smote; no y_resampled exists in this notebook)
print("Resampled training class distribution:", Counter(y_resampled_smote))

## XGBoost

In [None]:
# XGBoost classifier trained on the original (non-resampled) training data
from xgboost import XGBClassifier

# NOTE(review): "mlogloss" is a multi-class metric while the target here has two
# classes — confirm whether "logloss" was intended
XGB_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
XGB_model.fit(X_train, y_train)
# Predict labels for the held-out test features
xgb_y_pred = XGB_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_y_pred))


In [None]:
# XGBoost classifier trained on the SMOTE-resampled training data
from xgboost import XGBClassifier

# NOTE(review): "mlogloss" is a multi-class metric while the target here has two
# classes — confirm whether "logloss" was intended
xgb_smote_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smote_model.fit(X_resampled_smote, y_resampled_smote)
# Predict labels for the held-out test features (the test set is never resampled)
xgb_smote_y_pred = xgb_smote_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_smote_y_pred))


In [None]:
# XGBoost classifier trained on the SMOTEENN-resampled training data
from xgboost import XGBClassifier

# NOTE(review): "mlogloss" is a multi-class metric while the target here has two
# classes — confirm whether "logloss" was intended
xgb_smoteeen_model= XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smoteeen_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# Predict labels for the held-out test features (the test set is never resampled)
xgb_smoteeen_y_pred = xgb_smoteeen_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_smoteeen_y_pred))


In [None]:
# Print the three XGBoost classification reports one after another,
# separated by a dashed line
xgb_reports = [
    ("Classification Report - Original Data", xgb_y_pred),
    ("Classification Report - Resampled Data - SMOTE", xgb_smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", xgb_smoteeen_y_pred),
]
for position, (heading, predictions) in enumerate(xgb_reports):
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

## Logistic Regression

In [None]:
# Logistic regression trained and evaluated on the standardized features
from sklearn.linear_model import LogisticRegression

lr_model_scaled = LogisticRegression(max_iter=500, random_state=42)
lr_model_scaled.fit(X_train_scaled, y_train)
lr_y_pred_scaled = lr_model_scaled.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, lr_y_pred_scaled))

In [None]:
# Logistic regression trained and evaluated on the unscaled features,
# for comparison with the standardized version
from sklearn.linear_model import LogisticRegression

lr_model_unscaled = LogisticRegression(max_iter=500, random_state=42)
lr_model_unscaled.fit(X_train, y_train)
lr_y_pred_unscaled = lr_model_unscaled.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_y_pred_unscaled))


In [None]:
# Logistic regression trained on the SMOTE-resampled training data and
# evaluated on the unscaled test features
from sklearn.linear_model import LogisticRegression

lr_smote_model = LogisticRegression(max_iter=500, random_state=42)
lr_smote_model.fit(X_resampled_smote, y_resampled_smote)
lr_smote_y_pred = lr_smote_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_smote_y_pred))


In [None]:
# Logistic regression trained on the SMOTEENN-resampled training data and
# evaluated on the unscaled test features
from sklearn.linear_model import LogisticRegression

lr_smoteenn_model = LogisticRegression(max_iter=500, random_state=42)
lr_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
lr_smoteenn_y_pred = lr_smoteenn_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, lr_smoteenn_y_pred))


In [None]:
# Print the four logistic regression classification reports one after another,
# separated by a dashed line
lr_reports = [
    ("Classification Report - Original Data - Scaled", lr_y_pred_scaled),
    ("Classification Report - Original Data - Unscaled", lr_y_pred_unscaled),
    ("Classification Report - Resampled Data - SMOTE", lr_smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", lr_smoteenn_y_pred),
]
for position, (heading, predictions) in enumerate(lr_reports):
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

## LightGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier (only the seed is set) trained on the unscaled training data
# Note: `model` and `y_pred` are shared names that each single-model cell overwrites
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred))

## SVC

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained and evaluated on the standardized features
# Note: `model` and `y_pred` are shared names that each single-model cell overwrites
model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred))

## K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbors classifier trained and evaluated on the standardized features
# Note: `model` and `y_pred` are shared names that each single-model cell overwrites
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("KNN Classification Report:")
print(classification_report(y_test, y_pred))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier (100 estimators, learning rate 0.1) trained on the unscaled training data
# Note: `model` and `y_pred` are shared names that each single-model cell overwrites
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Gradient Boosting Report:")
print(classification_report(y_test, y_pred))

## MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural network with a single hidden layer of 100 units, trained and evaluated on the standardized features
# Note: `model` and `y_pred` are shared names that each single-model cell overwrites
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("MLP (Neural Network) Report:")
print(classification_report(y_test, y_pred))

## Identify Most and Least Important Features

In [None]:
# feature_importances_ from the trained RandomForest model
# (rf_model is the forest fitted on the original training data, whose columns are X.columns)
feature_importances = rf_model.feature_importances_

# Create a DataFrame that pairs each feature with its importance score
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importances
})

# Sort by importance in descending order (most important at the top)
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Display the entire list
print(importance_df)

## Hyperparameter Optimization

## Conclusions