#### Test

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [47]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV


In [48]:
# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# available to later cells as a `random_state` value.
random_seed = 42
np.random.seed(seed=random_seed)

In [49]:
# Load the white-wine dataset into a DataFrame; the CSV uses ';' as its
# field separator.
wine_data = pd.read_csv("winequality-white.csv", sep=";")


In [50]:
# Summary statistics (count, mean, std, min/max, quartiles) for each numeric column.
wine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [51]:
# Column dtypes and non-null counts: a quick check for missing values.
wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


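#### Imputing
Do we need imputing? No: `wine_data.info()` above reports 4898 non-null values for every one of the 12 columns, so there are no missing values to fill.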

#### Target binning and train/test split

In [52]:
# Number of wines per raw quality score.
wine_data['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [53]:
# transform wine quality into 3 categories
bins = [0, 4, 6, 10]  # Specify the bin edges
labels = [1, 2, 3]    # Specify the labels for each category
wine_data["new_quality"] = pd.cut(wine_data["quality"], bins=bins, labels=labels, include_lowest=True)


In [54]:
# Split the frame into predictors and target. Both the raw score and its
# binned version are dropped from the predictors, leaving only the
# measurement columns.
wine_feature = wine_data.drop(["quality", "new_quality"], axis=1)
wine_output = wine_data["new_quality"]

In [55]:
# Number of wines per binned quality category.
wine_data['new_quality'].value_counts()

2    3655
3    1060
1     183
Name: new_quality, dtype: int64

In [56]:
# Summary of the categorical target: count, number of distinct classes,
# most frequent class and how often it occurs.
wine_output.describe()

count     4898
unique       3
top          2
freq      3655
Name: new_quality, dtype: int64

In [57]:
# Summary statistics of the feature columns only.
wine_feature.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2


In [58]:
# train and test split
test_size_ratio = 0.2
wine_feature_train, wine_feature_test, wine_output_train, wine_output_test = train_test_split(wine_feature, wine_output, test_size=test_size_ratio, random_state=random_seed)


In [59]:
# Summary statistics of the training features.
wine_feature_train.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,6.865046,0.279338,0.332731,6.450702,0.045734,35.094564,138.001149,0.994071,3.189293,0.489781,10.50884
std,0.844483,0.101606,0.119758,5.139311,0.021797,16.676958,42.067667,0.003022,0.150183,0.11359,1.227887
min,3.8,0.08,0.0,0.6,0.009,3.0,10.0,0.98711,2.72,0.22,8.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.99174,3.09,0.41,9.5
50%,6.8,0.26,0.32,5.2,0.043,33.0,134.0,0.9938,3.18,0.47,10.4
75%,7.3,0.33,0.38,10.0,0.05,46.0,167.0,0.9962,3.28,0.55,11.4
max,11.8,1.1,1.66,65.8,0.346,146.5,313.0,1.03898,3.82,1.08,14.2


In [60]:
# Summary statistics of the held-out test features, for comparison with the
# training split.
wine_feature_test.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,6.813776,0.273857,0.340031,6.154388,0.045926,36.161735,139.797959,0.993854,3.184163,0.490112,10.535963
std,0.840584,0.097411,0.125833,4.788953,0.022059,18.251705,44.169785,0.002859,0.154235,0.116302,1.241884
min,3.9,0.08,0.0,0.7,0.014,2.0,9.0,0.98722,2.8,0.25,8.0
25%,6.2,0.21,0.27,1.7,0.036,24.0,110.0,0.99165,3.08,0.41,9.5
50%,6.8,0.26,0.32,5.0,0.042,34.5,134.0,0.993655,3.17,0.48,10.4
75%,7.3,0.32,0.39,9.325,0.05,46.25,168.0,0.99597,3.27,0.55,11.4
max,14.2,0.76,0.99,22.6,0.24,289.0,440.0,1.001,3.8,1.01,14.0


In [61]:
# Summary of the training target: count, number of distinct classes,
# most frequent class and how often it occurs.
wine_output_train.describe()


count     3918
unique       3
top          2
freq      2932
Name: new_quality, dtype: int64

In [62]:
# Number of training rows per quality category.
wine_output_train.value_counts()

2    2932
3     833
1     153
Name: new_quality, dtype: int64

In [63]:
# Summary of the test target: count, number of distinct classes,
# most frequent class and how often it occurs.
wine_output_test.describe()

count     980
unique      3
top         2
freq      723
Name: new_quality, dtype: int64

In [64]:
# Number of test rows per quality category.
wine_output_test.value_counts()

2    723
3    227
1     30
Name: new_quality, dtype: int64

### Logistic Regression

In [65]:
# logistic regression model
log_reg = LogisticRegression(max_iter=50_000)

log_reg.fit(wine_feature_train, wine_output_train)

In [68]:
# Score the fitted classifier (for classifiers, score() is mean accuracy) on
# the training data and on the held-out test data, then report both.
log_reg_train_score = log_reg.score(X=wine_feature_train, y=wine_output_train)
log_reg_test_score = log_reg.score(X=wine_feature_test, y=wine_output_test)

print(f"logistic regression training score:  {log_reg_train_score} ")
print(f"logistic regression test score:  {log_reg_test_score} ")

logistic regression training score:  0.7751403777437468 
logistic regression test score:  0.7571428571428571 
