In [None]:
import pandas as pd

# Locations of the two wine-quality CSV exports (relative to the working directory)
red_csv_path = 'visualization/qualityRedWineDF.csv'
white_csv_path = 'visualization/qualityWhiteWineDF.csv'

# Read each export into its own DataFrame
df_red = pd.read_csv(red_csv_path)
df_white = pd.read_csv(white_csv_path)

# Preview the top rows of both frames as the cell output
(df_red.head(), df_white.head())

In [None]:
# Summary statistics for the red and the white wine data, computed side by side
red_desc, white_desc = df_red.describe(), df_white.describe()

# Show both summaries as the cell output
(red_desc, white_desc)

In [None]:
# Drop the 'Unnamed: 0' column from the datasets.
# errors='ignore' keeps this cell idempotent: because df_red/df_white are
# rebound here, running the cell a second time would otherwise raise a
# KeyError for the already-removed column.
df_red = df_red.drop(columns=['Unnamed: 0'], errors='ignore')
df_white = df_white.drop(columns=['Unnamed: 0'], errors='ignore')

# Display the first few rows of the DataFrames to confirm
df_red.head(), df_white.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlaid histograms of the quality scores for the red and white wines.
#
# discrete=True gives every score its own unit-wide bar centred on the value,
# so both wines share identical bin edges. With a fixed bin count instead,
# each series is binned over its own min..max range, which misaligns the two
# histograms and produces bins that do not line up with the scores.
# NOTE(review): assumes 'quality' holds integer scores -- confirm against the data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_red['quality'], color='xkcd:red wine', label='Red Wine', discrete=True, ax=ax)
sns.histplot(df_white['quality'], color='xkcd:pale gold', label='White Wine', discrete=True, ax=ax)
ax.set_title('Distribution of Quality Scores for Red and White Wines')
ax.set_xlabel('Quality Score')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# One grid of per-column histograms for each dataset, red first and then white
histogram_jobs = (
    (df_red, 'Histograms of all features - Red Wine Data'),
    (df_white, 'Histograms of all features - White Wine Data'),
)
for wine_df, grid_title in histogram_jobs:
    wine_df.hist(bins=15, figsize=(20, 15))
    plt.suptitle(grid_title)
    plt.show()

In [None]:
# Columns to compare between the red and the white wines
variables = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

# One stacked panel per column
fig, axs = plt.subplots(nrows=len(variables), figsize=(5, 30))

# Draw the red/white pair of boxes for each column on its own panel
for panel, variable in zip(axs, variables):
    red_and_white = [df_red[variable], df_white[variable]]
    sns.boxplot(data=red_and_white, ax=panel)
    panel.set_xticklabels(['Red Wine', 'White Wine'])
    panel.set_title(f'Boxplot of {variable}')

# Keep titles and tick labels from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the red wine columns
corr_red = df_red.corr()

# Annotated heatmap of those correlations
red_heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_red, **red_heatmap_style)
plt.title('Correlation Matrix for Red Wine Variables')
plt.show()

In [None]:
# Pairwise correlations between the white wine columns
corr_white = df_white.corr()

# Annotated heatmap of those correlations
white_heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_white, **white_heatmap_style)
plt.title('Correlation Matrix for White Wine Variables')
plt.show()

In [None]:
import plotly.express as px

# Interactive alcohol-vs-density scatter for red wine; marker colour encodes quality
# and the remaining measurements are exposed in the hover tooltip.
red_hover_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates',
]
red_axis_labels = {'alcohol': 'Alcohol', 'density': 'Density', 'quality': 'Quality'}

fig_red = px.scatter(
    df_red,
    x='alcohol',
    y='density',
    color='quality',
    title='Alcohol vs. Density for Red Wine',
    labels=red_axis_labels,
    color_continuous_scale='Viridis',
    hover_data=red_hover_columns,
)

# Render the figure
fig_red.show()

In [None]:
# Interactive alcohol-vs-density scatter for white wine; marker colour encodes quality
# and the remaining measurements are exposed in the hover tooltip.
white_hover_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates',
]
white_axis_labels = {'alcohol': 'Alcohol', 'density': 'Density', 'quality': 'Quality'}

fig_white = px.scatter(
    df_white,
    x='alcohol',
    y='density',
    color='quality',
    title='Alcohol vs. Density for White Wine',
    labels=white_axis_labels,
    color_continuous_scale='Viridis',
    hover_data=white_hover_columns,
)

# Render the figure
fig_white.show()

In [None]:
import matplotlib.pyplot as plt

# List of features to plot against quality
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

# Lay the panels out on a grid that is at most three columns wide
num_features = len(features)
num_rows = (num_features + 2) // 3  # Ceiling of num_features / 3
num_cols = min(3, num_features)  # Maximum 3 columns

# Number of grid cells that will not hold a plot
num_empty = num_rows * num_cols - num_features

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row, so the [row, col] indexing below works for any length of `features`.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5*num_rows), squeeze=False)

# Remove the unused trailing panels (the cells after the last feature)
for unused_ax in axs.flat[num_features:]:
    fig.delaxes(unused_ax)

# Quality levels present in the red wine data, in ascending order
quality_levels = sorted(df_red['quality'].unique())

# For each feature, overlay one histogram per quality level
for i, feature in enumerate(features):
    ax = axs[i // num_cols, i % num_cols]

    # Bin every quality group over the feature's full range. Without a shared
    # range each group would get its own 30 bins spanning only its own
    # min..max, making bar widths and heights incomparable across groups.
    value_range = (df_red[feature].min(), df_red[feature].max())

    for quality in quality_levels:
        data = df_red.loc[df_red['quality'] == quality, feature]
        ax.hist(data, bins=30, range=value_range, alpha=0.5, label=f'Quality {quality}')

    ax.set_title(f'Histogram of {feature} grouped by Quality')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Adjust the layout
plt.tight_layout()
plt.show()

In [None]:
# List of features to plot against quality
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

# Lay the panels out on a grid that is at most three columns wide
num_features = len(features)
num_rows = (num_features + 2) // 3  # Ceiling of num_features / 3
num_cols = min(3, num_features)  # Maximum 3 columns

# Number of grid cells that will not hold a plot
num_empty = num_rows * num_cols - num_features

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row, so the [row, col] indexing below works for any length of `features`.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5*num_rows), squeeze=False)

# Remove the unused trailing panels (the cells after the last feature)
for unused_ax in axs.flat[num_features:]:
    fig.delaxes(unused_ax)

# Quality levels present in the white wine data, in ascending order
quality_levels = sorted(df_white['quality'].unique())

# For each feature, overlay one histogram per quality level
for i, feature in enumerate(features):
    ax = axs[i // num_cols, i % num_cols]

    # Bin every quality group over the feature's full range. Without a shared
    # range each group would get its own 30 bins spanning only its own
    # min..max, making bar widths and heights incomparable across groups.
    value_range = (df_white[feature].min(), df_white[feature].max())

    for quality in quality_levels:
        data = df_white.loc[df_white['quality'] == quality, feature]
        ax.hist(data, bins=30, range=value_range, alpha=0.5, label=f'Quality {quality}')

    ax.set_title(f'Histogram of {feature} grouped by Quality')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Adjust the layout
plt.tight_layout()
plt.show()

In [None]:
# Visual 1: pairwise relationships between a handful of red wine features
selected_features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
]
red_selected = df_red[selected_features]
sns.pairplot(red_selected)
# y=1.02 lifts the title just above the grid of panels
plt.suptitle('Pairplot of Selected Features - Red Wine Data', y=1.02)
plt.show()

In [None]:
# Visual 2: pairwise relationships between the same selected features, white wine.
# Relies on `selected_features` already being defined in the notebook.
white_selected = df_white[selected_features]
sns.pairplot(white_selected)
# y=1.02 lifts the title just above the grid of panels
plt.suptitle('Pairplot of Selected Features - White Wine Data', y=1.02)
plt.show()

In [None]:
# Visual 3: one violin per selected feature for red wine, laid out in a single row
fig, axs = plt.subplots(nrows=1, ncols=len(selected_features), figsize=(20, 5))
for panel, feature in zip(axs, selected_features):
    sns.violinplot(data=df_red, y=feature, ax=panel)
    panel.set_title(f'Violin plot of {feature} - Red Wine Data')
plt.tight_layout()
plt.show()

In [None]:
# Visual 4: one violin per selected feature for white wine, laid out in a single row
fig, axs = plt.subplots(nrows=1, ncols=len(selected_features), figsize=(20, 5))
for panel, feature in zip(axs, selected_features):
    sns.violinplot(data=df_white, y=feature, ax=panel)
    panel.set_title(f'Violin plot of {feature} - White Wine Data')
plt.tight_layout()
plt.show()

In [None]:
# Visual 5: fixed acidity against volatile acidity for red wine, coloured by quality
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=df_red,
    x='fixed acidity',
    y='volatile acidity',
    hue='quality',
    palette='viridis',
)
scatter_ax.set_title('Scatter plot of Fixed Acidity vs Volatile Acidity - Red Wine Data')
plt.show()

In [None]:
# Visual 6: fixed acidity against volatile acidity for white wine, coloured by quality
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=df_white,
    x='fixed acidity',
    y='volatile acidity',
    hue='quality',
    palette='viridis',
)
scatter_ax.set_title('Scatter plot of Fixed Acidity vs Volatile Acidity - White Wine Data')
plt.show()

In [None]:
# Visual 7: fixed acidity against citric acid for red wine, coloured by quality
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=df_red,
    x='fixed acidity',
    y='citric acid',
    hue='quality',
    palette='viridis',
)
scatter_ax.set_title('Scatter plot of Fixed Acidity vs Citric Acid - Red Wine Data')
plt.show()

In [None]:
# Visual 8: fixed acidity against citric acid for white wine, coloured by quality
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=df_white,
    x='fixed acidity',
    y='citric acid',
    hue='quality',
    palette='viridis',
)
scatter_ax.set_title('Scatter plot of Fixed Acidity vs Citric Acid - White Wine Data')
plt.show()

In [None]:
# Visual 9: spread of the quality scores for red wine as a single box
plt.figure(figsize=(10, 6))
quality_ax = sns.boxplot(data=df_red, y='quality')
quality_ax.set_title('Boxplot of Quality - Red Wine Data')
plt.show()

In [None]:
# Visual 10: spread of the quality scores for white wine as a single box
plt.figure(figsize=(10, 6))
quality_ax = sns.boxplot(data=df_white, y='quality')
quality_ax.set_title('Boxplot of Quality - White Wine Data')
plt.show()