In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the red and white wine-quality tables from their CSV exports
red_wine_df = pd.read_csv('visualization/qualityRedWineDF.csv')
white_wine_df = pd.read_csv('visualization/qualityWhiteWineDF.csv')

# Report the dimensions of each table, followed by its per-column dtypes
print('Red Wine Data')
print('Shape:', red_wine_df.shape)
print('Data Types:', red_wine_df.dtypes, sep='\n')
print('\n')
print('White Wine Data')
print('Shape:', white_wine_df.shape)
print('Data Types:', white_wine_df.dtypes, sep='\n')

In [None]:
# Preview the leading rows of each table
print('Red Wine Data', red_wine_df.head(), sep='\n')
print('\n')
print('White Wine Data', white_wine_df.head(), sep='\n')

# Descriptive statistics as produced by DataFrame.describe() with its defaults
print('\nRed Wine Data - Summary Statistics', red_wine_df.describe(), sep='\n')
print('\n')
print('White Wine Data - Summary Statistics', white_wine_df.describe(), sep='\n')

In [None]:
# Count missing entries per column (isna is the same check as isnull)
print('Red Wine Data - Missing Values', red_wine_df.isna().sum(), sep='\n')
print('\n')
print('White Wine Data - Missing Values', white_wine_df.isna().sum(), sep='\n')

In [None]:
# Per-column skewness of the raw tables; the 'Unnamed: 0' column is dropped first
red_wine_skewness = red_wine_df.drop('Unnamed: 0', axis=1).skew()
print('Red Wine Data - Skewness:', red_wine_skewness, sep='\n')
print('\n')

white_wine_skewness = white_wine_df.drop('Unnamed: 0', axis=1).skew()
print('White Wine Data - Skewness:', white_wine_skewness, sep='\n')

In [None]:
def remove_outliers(df, columns, factor=1.5):
    """Drop rows whose value in any of ``columns`` lies outside the IQR fences.

    For each column, in the order given, the fences are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``; rows strictly outside
    them are removed. The quartiles of each column are computed on the rows
    that survived the previous columns, so the result can depend on the
    order of ``columns``.

    Rows with a missing value in a column compare False against both fences
    and are therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Column labels to filter on. An empty iterable returns ``df`` as is.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall within the fences of every column.
    """
    for col in columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        reach = factor * (q3 - q1)
        lower_fence, upper_fence = q1 - reach, q3 + reach
        is_outlier = (df[col] < lower_fence) | (df[col] > upper_fence)
        # Rebind instead of mutating so the caller's frame is left untouched
        df = df[~is_outlier]
    return df

# Features that are passed through the IQR outlier filter
outlier_columns = [
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
]

# Filter the red wines, then summarise what is left
red_wine_df_clean = remove_outliers(red_wine_df, outlier_columns)
print('Red Wine Data - Cleaned', red_wine_df_clean.describe(), sep='\n')
print('\n')

# Same treatment for the white wines
white_wine_df_clean = remove_outliers(white_wine_df, outlier_columns)
print('White Wine Data - Cleaned', white_wine_df_clean.describe(), sep='\n')

In [None]:
# Per-column skewness after outlier removal; 'Unnamed: 0' is dropped first
red_wine_skewness_clean = red_wine_df_clean.drop('Unnamed: 0', axis=1).skew()
print('Red Wine Data (Cleaned) - Skewness:', red_wine_skewness_clean, sep='\n')
print('\n')

white_wine_skewness_clean = white_wine_df_clean.drop('Unnamed: 0', axis=1).skew()
print('White Wine Data (Cleaned) - Skewness:', white_wine_skewness_clean, sep='\n')

In [None]:
import seaborn as sns

# Annotated correlation heatmap of the cleaned red-wine columns
plt.figure(figsize=(12, 10))
correlation_matrix = red_wine_df_clean.drop('Unnamed: 0', axis=1).corr()
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap - Red Wine Data')
plt.show()

# Annotated correlation heatmap of the cleaned white-wine columns
# (correlation_matrix is rebound to the white-wine matrix here)
plt.figure(figsize=(12, 10))
correlation_matrix = white_wine_df_clean.drop('Unnamed: 0', axis=1).corr()
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap - White Wine Data')
plt.show()

In [None]:
# Grid of per-column histograms for the cleaned red wines
red_wine_df_clean.drop('Unnamed: 0', axis=1).hist(figsize=(20, 15), bins=15)
plt.suptitle('Histograms of all features - Red Wine Data')
plt.show()

# Grid of per-column histograms for the cleaned white wines
white_wine_df_clean.drop('Unnamed: 0', axis=1).hist(figsize=(20, 15), bins=15)
plt.suptitle('Histograms of all features - White Wine Data')
plt.show()

In [None]:
def plot_histograms(df, title):
    """Show, for every feature, one figure of overlaid count histograms split by quality.

    Every column other than ``'Unnamed: 0'`` and ``'quality'`` is treated as a
    feature and gets its own figure, with one histogram per quality level
    drawn on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``'Unnamed: 0'`` and ``'quality'``
        (``Index.drop`` raises ``KeyError`` if either is missing).
    title : str
        Prefix used in each figure title.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    features = df.columns.drop(['Unnamed: 0', 'quality'])
    # Series.unique() yields values in order of first appearance, which would
    # make legend and colour order depend on row order. Sort the levels once
    # so every figure (and every dataset) uses the same ordering; missing
    # quality values can never match the equality filter, so they are skipped.
    quality_levels = sorted(df['quality'].dropna().unique())
    for feature in features:
        plt.figure(figsize=(10,6))
        for quality in quality_levels:
            values = df.loc[df['quality'] == quality, feature]
            sns.histplot(values, kde=False, label=str(quality))
        plt.title(f'{title} - {feature} to Quality')
        plt.legend(title='Quality')
        plt.show()

# Overlaid per-quality histograms: cleaned red wines first, then cleaned whites
plot_histograms(df=red_wine_df_clean, title='Red Wine Data')
plot_histograms(df=white_wine_df_clean, title='White Wine Data')

In [None]:
def plot_improved_histograms(df, title):
    """Show, for every feature, overlaid density histograms with KDE curves split by quality.

    Works like a per-quality overlay of histograms, but each histogram is
    drawn with ``stat='density'``, 30 bins and a KDE curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``'Unnamed: 0'`` and ``'quality'``
        (``Index.drop`` raises ``KeyError`` if either is missing); every
        other column is plotted as a feature.
    title : str
        Prefix used in each figure title.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    features = df.columns.drop(['Unnamed: 0', 'quality'])
    # Series.unique() yields values in order of first appearance, which would
    # make legend and colour order depend on row order. Sort the levels once
    # so every figure (and every dataset) uses the same ordering; missing
    # quality values can never match the equality filter, so they are skipped.
    quality_levels = sorted(df['quality'].dropna().unique())
    for feature in features:
        plt.figure(figsize=(10,6))
        for quality in quality_levels:
            values = df.loc[df['quality'] == quality, feature]
            sns.histplot(values, kde=True, stat='density', bins=30, label=str(quality))
        plt.title(f'{title} - {feature} to Quality')
        plt.legend(title='Quality')
        plt.show()

# Density + KDE overlays by quality: cleaned red wines first, then cleaned whites
plot_improved_histograms(df=red_wine_df_clean, title='Red Wine Data')
plot_improved_histograms(df=white_wine_df_clean, title='White Wine Data')

In [None]:
def plot_facet_grids(df, title):
    """Show, for every feature, a grid of histograms with one panel per quality value.

    Each column other than ``'Unnamed: 0'`` and ``'quality'`` gets its own
    ``FacetGrid`` (three panels per row), each panel holding a 30-bin
    histogram with a KDE curve. Nothing is returned; every grid is displayed
    with ``plt.show()``.

    ``df`` must contain both ``'Unnamed: 0'`` and ``'quality'``; ``title`` is
    the prefix of the figure-level title.
    """
    for column_name in df.columns.drop(['Unnamed: 0', 'quality']):
        grid = sns.FacetGrid(df, col='quality', col_wrap=3, height=4, aspect=1)
        grid.map(sns.histplot, column_name, kde=True, bins=30)
        # y > 1 lifts the figure title above the top row of panels
        plt.suptitle(f'{title} - {column_name} to Quality', y=1.02)
        plt.show()

# One facet grid per feature: cleaned red wines first, then cleaned whites
plot_facet_grids(df=red_wine_df_clean, title='Red Wine Data')
plot_facet_grids(df=white_wine_df_clean, title='White Wine Data')

In [None]:
# Rank the cleaned red-wine columns by their correlation with quality (highest first)
red_wine_corr_with_quality = (
    red_wine_df_clean
    .drop('Unnamed: 0', axis=1)
    .corr()['quality']
    .sort_values(ascending=False)
)
print('Red Wine Data - Correlation of Features with Quality:', red_wine_corr_with_quality, sep='\n')
print('\n')

# Same ranking for the cleaned white wines
white_wine_corr_with_quality = (
    white_wine_df_clean
    .drop('Unnamed: 0', axis=1)
    .corr()['quality']
    .sort_values(ascending=False)
)
print('White Wine Data - Correlation of Features with Quality:', white_wine_corr_with_quality, sep='\n')

In [None]:
def plot_scatter_plots(df, title):
    """Show one scatter plot per feature, with the feature on x and quality on y.

    Every column other than ``'Unnamed: 0'`` and ``'quality'`` is plotted in
    its own 10x6 figure with half-transparent markers. ``df`` must contain
    both of those columns; ``title`` is the prefix of each figure title.
    Nothing is returned; figures are displayed with ``plt.show()``.
    """
    feature_names = df.columns.drop(['Unnamed: 0', 'quality'])
    for name in feature_names:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=name, y='quality', data=df, alpha=0.5)
        plt.title(f'{title} - {name} vs Quality')
        plt.show()

# Feature-vs-quality scatter plots: cleaned red wines first, then cleaned whites
plot_scatter_plots(df=red_wine_df_clean, title='Red Wine Data')
plot_scatter_plots(df=white_wine_df_clean, title='White Wine Data')

In [None]:
def plot_boxplots(df, title):
    """Show one boxplot figure per feature, with one box per quality value.

    Every column other than ``'Unnamed: 0'`` and ``'quality'`` is drawn in
    its own 10x6 figure with quality on the x axis and the feature on the
    y axis. ``df`` must contain both of those columns; ``title`` is the
    prefix of each figure title. Nothing is returned; figures are displayed
    with ``plt.show()``.
    """
    feature_names = df.columns.drop(['Unnamed: 0', 'quality'])
    for name in feature_names:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='quality', y=name, data=df)
        plt.title(f'{title} - Boxplot of {name} by Quality')
        plt.show()

# Per-quality boxplots of every feature: cleaned red wines first, then cleaned whites
plot_boxplots(df=red_wine_df_clean, title='Red Wine Data')
plot_boxplots(df=white_wine_df_clean, title='White Wine Data')

In [None]:
# Boxplots of each feature by quality on the RAW frames (before outlier
# removal). plot_boxplots is already defined earlier in this notebook with an
# identical body, so it is reused here instead of being defined a second
# time; a duplicate definition would only shadow the first one.

# Raw red wine data
plot_boxplots(red_wine_df, 'Red Wine Data')

# Raw white wine data
plot_boxplots(white_wine_df, 'White Wine Data')