# Wine Analysis



In [80]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from scipy.stats import ttest_ind

# Load the data
data = pd.read_csv('winequality-red.csv', delimiter=',')

## Data Cleaning

In [81]:
# Check for missing values in dataframe
print(data.isnull().sum())

fixed acidity            0
volatile acidity        12
citric acid              2
residual sugar          25
chlorides               12
free sulfur dioxide      2
total sulfur dioxide     4
density                 27
pH                       0
sulphates                0
alcohol                  0
quality                  0
dtype: int64


In [82]:
# Fill missing values with mean
data = data.fillna(data.mean())

# Fill missing values with median
data = data.fillna(data.median())

# Fill missing values with mode
data = data.fillna(data.mode().iloc[0])

  data = data.fillna(data.mean())
  data = data.fillna(data.median())


In [83]:
#Checking Dupilcate Records
data.duplicated().sum()

232

In [84]:
# Removing Duplicate Records
data.drop_duplicates(keep = 'first', inplace = True, ignore_index = True)
data.duplicated().sum()

0

In [85]:
data.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                  object
dtype: object

In [86]:
unique_values = data['quality'].unique()
print(unique_values)

['Average' 'Below Average' 'Poor' 'Good' 'Very Poor' 'Excellent']


In [87]:
# Map quality values to numeric values
quality_map = {'Excellent': 8, 'Good': 7, 'Average': 6,
               'Below Average': 5, 'Poor': 4, 'Very Poor': 3}
data['quality'] = data['quality'].map(quality_map)

In [88]:
unique_values = data['quality'].unique()
print(unique_values)
data.head()

[6 5 4 7 3 8]


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,6
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,6
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,6
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,5
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,6


# Classification Model 1: Naive Bayes

In [89]:
# Define our features (X) and target variable (y)
X = data.drop('quality', axis=1)
y = data['quality']

# Split our data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train our classifier on the training data
gnb.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = gnb.predict(X_test)


# Calculate the accuracy of our model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error (MSE):', mse)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print('Root Mean Squared Error (RMSE):', rmse)

Accuracy: 0.45255474452554745
Mean Squared Error (MSE): 1.0547445255474452
Root Mean Squared Error (RMSE): 1.0270075586613008


# Classification Model 2 : Random Forest

In [90]:


# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print('Random Forest Accuracy:', accuracy_rf)

# Calculate MSE and RMSE
mse_rf = mean_squared_error(y_test, y_pred_rf)
print('Random Forest MSE:', mse_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
print('Random Forest RMSE:', rmse_rf)


Random Forest Accuracy: 0.6240875912408759
Random Forest MSE: 0.48175182481751827
Random Forest RMSE: 0.6940834422585791


In [91]:
# Get feature importance from random forest modal
feature_importance = rf.feature_importances_
print("Feature Importance:", feature_importance)

Feature Importance: [0.07604659 0.10334015 0.07066846 0.07316718 0.07868406 0.0693764
 0.1035735  0.09179205 0.07669507 0.10746551 0.14919103]


In [92]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

# Hypothesis Testing 

In [93]:
# Separate the features from the target variable
features = data.drop('quality', axis=1)
target = data['quality']

# Split the data into a training set and a testing set
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [94]:

from scipy.stats import ttest_ind

# Perform the t-test
t_statistic, p_value = ttest_ind(features_train['alcohol'], target_train)

# Print the results of the t-test
print(f"T-statistic: {t_statistic}, p-value: {p_value}")

# Define the significance level (alpha)
alpha = 0.05

# Check if the p-value is less than the significance level
if p_value < alpha:
    print("Reject the null hypothesis.")
    print("There is a significant relationship between the alcohol and quality variables.")
else:
    print("Fail to reject the null hypothesis.")
    print("There is no significant relationship between the alcohol and quality variables.")


T-statistic: 122.01522130357249, p-value: 0.0
Reject the null hypothesis.
There is a significant relationship between the alcohol and quality variables.
