# Wine Quality Analysis

## Process and Clean Data

In [None]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the red and white wine datasets (semicolon-delimited files) and tag
# each one with a numeric 'color' flag: 1 for red wine, 0 for white wine
red_wine_df = pd.read_csv('resources/winequality-red.csv', sep=';').assign(color=1)
white_wine_df = pd.read_csv('resources/winequality-white.csv', sep=';').assign(color=0)

# Stack both datasets into one dataframe, red rows first, with a fresh index
wine_df = pd.concat([red_wine_df, white_wine_df], ignore_index=True)

# Display wine_df
wine_df

In [None]:
# Show descriptive statistics for the combined wine dataframe
wine_df.describe()

In [None]:
# Print the dataframe's info summary to check column types and non-null counts
wine_df.info()

In [None]:
# Count the missing values in each column
wine_df.isna().sum()

In [None]:
# Remove every row that contains a missing value, then renumber the index
wine_df = wine_df.dropna()
wine_df = wine_df.reset_index(drop=True)

wine_df

In [None]:
# Count the rows that are duplicates of another row in the dataframe
wine_df.duplicated().sum()

In [None]:
# Discard duplicate rows; ignore_index renumbers the remaining rows from 0
wine_df = wine_df.drop_duplicates(ignore_index=True)

wine_df

## Explore the Data

In [None]:
# Count how many wines received each quality score
wine_df['quality'].value_counts()

In [None]:
# Export the cleaned data (null and duplicate rows removed) to a new csv file;
# index=False keeps the row index out of the file
wine_df.to_csv('resources/winequality-cleaned.csv', index=False)

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between the numeric columns
correlation_matrix = wine_df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Plot how many wines received each quality score, split by wine color.
# The numeric 'color' flag (0 = white, 1 = red) is mapped to readable names
# before plotting so that seaborn builds the legend itself. Relabeling
# afterwards with plt.legend(labels=[...]) pairs the labels with the first
# artists on the axes purely by position, which can attach the names to bars
# of the wrong color.
plot_df = wine_df[['quality', 'color']].copy()
plot_df['Wine Color'] = plot_df['color'].map({0: 'White', 1: 'Red'})

plt.figure(figsize=(10, 6))
# hue_order keeps the original ordering: white (0) first, then red (1)
sns.countplot(data=plot_df, x='quality', hue='Wine Color', hue_order=['White', 'Red'])
plt.title("Wine Quality Distribution by Type")
plt.xlabel("Quality")
plt.ylabel("Count")
plt.show()

In [None]:
# Build a reduced dataframe without two columns:
#   - 'color': judged not relevant to quality
#   - 'free sulfur dioxide': highly correlated with total sulfur dioxide
columns_to_drop = ['color', 'free sulfur dioxide']
cleaned_wine = wine_df.drop(columns=columns_to_drop)

In [None]:
# Count how many wines have each quality score in the reduced dataframe
cleaned_wine['quality'].value_counts()

In [None]:
# Draw one boxplot per feature, showing its spread at each quality score
features_to_plot = ['alcohol', 'volatile acidity', 'citric acid']
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    box_ax = sns.boxplot(data=wine_df, x='quality', y=feature)
    box_ax.set_title(f"{feature.title()} by Wine Quality")
    plt.show()

In [None]:
# Pairwise plots of a few selected features, colored by quality score
top_features = ['alcohol', 'volatile acidity', 'density', 'quality']
pairplot_data = wine_df.loc[:, top_features]
sns.pairplot(pairplot_data, hue='quality', palette='coolwarm')
# y=1.02 lifts the title just above the top row of the grid
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

In [None]:
# Violin plot of the alcohol distribution at each quality score
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(data=wine_df, x='quality', y='alcohol')
violin_ax.set_title("Alcohol Distribution by Wine Quality")
plt.show()

## Split the Test and Training Data

In [None]:
# Convert the quality score into a two-class label

# Bin edges: pd.cut uses right-inclusive intervals by default, so scores in
# (0, 5] fall into the first bin and scores in (5, 10] into the second
bins = (0, 5, 10)

# Labels for the bins: 0 for low quality and 1 for high quality
group_names = [0, 1]

# Replace the values in the quality column with their bin labels
binned_quality = pd.cut(wine_df['quality'], bins=bins, labels=group_names)
wine_df['quality'] = binned_quality

# List unique values in the quality column
wine_df['quality'].unique()

In [None]:
# Count how many wines fall into each quality class after binning
wine_df['quality'].value_counts()

In [None]:
# Separate features and target.
# The features are restricted to the columns kept in cleaned_wine, so the
# 'color' and 'free sulfur dioxide' columns that were excluded during
# exploration are not fed to the models. wine_df still contains those
# columns, so dropping only 'quality' from it would bring them back.
feature_columns = cleaned_wine.columns.drop('quality')
X = wine_df[feature_columns]

# The target is the binned quality label stored in wine_df
y = wine_df['quality']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Standardize the features. The scaler is fitted on the training set only and
# then applied to both sets, so no statistics from the test rows are used.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled values in DataFrames that reuse the original column names
# and row index of each split
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Compare Different Models

## Identify Most and Least Important Features

## Hyperparameter Optimization

## Conclusions