In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.utils.multiclass import unique_labels
from scipy.stats import ttest_rel

Join the red wine and white wine datasets by adding the rows of one to the other. Assign the joined data to a data frame and name it wine_data_all


In [None]:
# Source URLs of the two semicolon-delimited UCI wine-quality CSV files
white = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
red = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Load the white wine data and report how many rows it has
white_df = pd.read_csv(white, sep=';')
white_row_count = len(white_df)
print(f"White Wine dataset has {white_row_count} rows")

# Load the red wine data and report how many rows it has
red_df = pd.read_csv(red, sep=';')
red_row_count = len(red_df)
print(f"Red Wine dataset has {red_row_count} rows")

# Stack the red rows on top of the white rows under a fresh 0..n-1 index, then
# print the expected and the actual totals so they can be compared
wine_data_all = pd.concat([red_df, white_df], ignore_index = True)
print(f"Sum of rows in White Wine and Red Wine dataframes: {white_row_count + red_row_count} ")
print(f"All wine dataset has {len(wine_data_all)} rows\n")

# Preview the combined frame (the final expression is rendered as the cell output)
print("First 5 rows of All Wine dataset:\n")
wine_data_all.head()

Check the data types of the attributes.

In [None]:
# Show the dtype pandas assigned to each column of the combined dataset
print("Data types of All Wine dataset:", wine_data_all.dtypes, sep="\n\n")

Are there any missing values in the dataset? How many?

In [None]:
# Boolean mask that is True wherever a cell of the dataset is null
missing_values = wine_data_all.isnull()

# Null count for each column
missing_per_column = missing_values.sum()
print(f"Missing values per column in All Wine dataset:\n{missing_per_column}")

# The grand total over all columns decides which summary line is printed
total_missing = missing_per_column.sum()
if total_missing <= 0:
    print("\nThere are no missing values in the dataset")
else:
    print(f"\nThere are {total_missing} missing values in the dataset")

What is the correlation between the attributes other than Quality?

In [None]:
# Keep the quality score as the prediction target and build a feature-only
# frame without it; wine_data_all itself is left unchanged
target = wine_data_all.loc[:, 'quality']
wine_data_all_df = wine_data_all.drop('quality', axis='columns')

In [None]:
# Summary statistics for every column of the combined dataset, shown as the cell output
print("Descriptive Statistics Summary of All Wine dataset")
descriptive_stats = wine_data_all.describe()
descriptive_stats

In [None]:
# Pairwise correlation between the wine attributes.
# The question asks for the attributes *other than* quality, so the matrix is
# computed on the feature-only frame (wine_data_all_df) instead of the full
# dataset, which still carries the 'quality' column.
cor_matrix = wine_data_all_df.corr()
print("Correlation Matrix for numeric attributes in All Wine dataset")
cor_matrix

In [None]:
# Render the correlation matrix as an annotated heat map; the colour scale is
# pinned to [-1, 1] so that zero correlation sits at the middle of the palette
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cor_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Between Wine Attributes')
plt.show()

Plot the frequency distribution of wine quality by using the Quality attribute.

In [None]:
# Frequency distribution of the raw quality scores.
# The scores are read straight from wine_data_all so the chart is the same no
# matter what `target` currently holds (a later cell rebinds it to the
# three-level version).
quality_scores = wine_data_all['quality']

fig, ax = plt.subplots()
# discrete=True draws one unit-wide bar centred on each integer score.
# The previous bins=6 over the range 3..9 produced edges 3,4,...,9, which put
# scores 8 and 9 into the same (last) bin and shifted every bar off its tick.
sns.histplot(quality_scores, discrete=True, binrange=(3, 9), kde=False, color='blue', ax=ax)
ax.set_title('Frequency Distribution of Quality Attribute in Wine Dataset', fontsize=14)
ax.set_xlabel('Quality Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_xticks(range(3, 10))  # one tick per possible quality score (3..9)
ax.grid(alpha=0.5)  # light grid for readability
plt.show()

Reduce the levels of rating for quality to three levels, i.e., high (2), medium (1), and low (0). Assign the levels 3 and 4 to level 0; 5 and 6 to level 1; and 7, 8, and 9 to level 2.

In [None]:
#Bin edges for the quality rating: scores 3-4 fall in the first bin, 5-6 in the second, 7-9 in the third
bins = [2, 4, 6, 9]

#Level assigned to each bin: 0 = low, 1 = medium, 2 = high
labels = [0, 1, 2]

#Categorizing the raw quality scores according to the bins created.
#The input is the original 'quality' column rather than `target` itself, so
#re-running this cell never tries to re-bin values that were already binned.
target = pd.cut(wine_data_all['quality'], bins=bins, labels=labels, include_lowest=True, ordered=True)

#A score outside the bin edges would silently become NaN; fail loudly instead
assert target.notna().all(), "Some quality scores fall outside the defined bins"

print(f"Updated target type:\n{target.dtype}")

In [None]:
# Share of each quality level, expressed as a percentage of all rows
level_share_pct = target.value_counts(normalize=True) * 100
print(level_share_pct)

# Histogram of the three quality levels, configured through the returned Axes
ax = sns.histplot(target, bins=3, binrange=(0, 2), kde=False, color='blue')
ax.set_title('Frequency Distribution of Levels of Quality Attribute in Wine Dataset', fontsize=14)
ax.set_xlabel('Quality Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_xticks(range(0, 3))  # one tick per quality level (0, 1, 2)
ax.grid(alpha = 0.5)  # light grid for readability
plt.show()

Normalize the numeric attributes. 

In [None]:
#Using the MinMax Scaler to normalize the features.
#Only the feature-only frame (wine_data_all_df) is scaled: scaling wine_data_all
#would carry the 'quality' column into the normalized data, and from there into
#the model inputs, leaking the value the model is supposed to predict.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(wine_data_all_df)

#Converting the NumPy array back to a dataframe, keeping the original column
#names and row index so the rows stay aligned with the target series
normalized_wine_data = pd.DataFrame(
    normalized_data,
    columns=wine_data_all_df.columns,
    index=wine_data_all_df.index,
)

#Printing descriptive statistics to confirm normalization (every column should span 0 to 1)
print("Normalized All Wine Dataset Descriptive Statistics:")
normalized_wine_data.describe()

Divide the dataset into training and test sets.

In [None]:
# Feature matrix and target vector.
# 'quality' is dropped defensively (errors='ignore' makes this a no-op when the
# column is absent): the target is derived from it, so leaving it among the
# features would let the model read the answer directly.
X = normalized_wine_data.drop(columns=['quality'], errors='ignore')
y = target

# Split data into training (70%) and testing (30%) sets.
# stratify=y keeps the proportion of each quality level the same in both
# parts, which matters because the levels are imbalanced; random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

Use the Logistic Regression algorithm to predict the quality of wine using its attributes.

In [None]:
# Logistic Regression with class_weight='balanced' to help address the
# imbalance between the classes of the target variable
log_reg = LogisticRegression(class_weight='balanced')

# 5-fold cross-validation on the training set, scored by accuracy
log_cv_scores = cross_val_score(estimator=log_reg, X=X_train, y=y_train, scoring='accuracy', cv=5)

# Per-fold scores followed by their mean
print("Cross-Validation Accuracy Scores on Logistic Regression Model Training Set:", log_cv_scores)
print("Average Cross-Validation Accuracy on Logistic Regression Model Training Set:", np.mean(log_cv_scores))