In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df=pd.read_csv("wine-quality-white-and-red.csv")

In [31]:
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [32]:
df['type']=df['type'].map({'white':0,'red':1})
df['quality']=df['quality'].map({
    3:0,
    4:1,
    5:2,
    6:3,
    7:4,
    8:5,
    9:6
})

In [33]:
df['quality'].unique()

array([3, 2, 4, 5, 1, 0, 6])

In [34]:
X = df.drop("quality", axis=1)
y = df["quality"]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,stratify=y
)

In [36]:
xgb_clf = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)

In [37]:
xgb_clf.fit(X_train, y_train)

In [38]:
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

In [39]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6476923076923077


In [40]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [41]:
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

print("Best Parameters:", grid.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Test Accuracy: 0.6646153846153846


In [42]:
xgb_clf = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42
)

In [43]:
xgb_clf.fit(X_train, y_train)

In [44]:
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

In [45]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6646153846153846


In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.70      0.16      0.26        43
           2       0.68      0.73      0.71       428
           3       0.65      0.73      0.69       567
           4       0.66      0.53      0.59       216
           5       0.75      0.31      0.44        39
           6       0.00      0.00      0.00         1

    accuracy                           0.66      1300
   macro avg       0.49      0.35      0.38      1300
weighted avg       0.66      0.66      0.65      1300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [52]:
new_df={
    'type':0,
    'fixed acidity':8.9,     # fixed acidity
    'volatile acidity':0.70,    # volatile acidity
    'citric acid':1.00,    # citric acid
    'residual sugar':2.0,     # residual sugar
    'chlorides':0.076,   # chlorides
    'free sulfur dioxide':15.0,    # free sulfur dioxide
    'total sulfur dioxide':50.0,    # total sulfur dioxide
    'density':0.9978,  # density
    'pH':4.51,    # pH
    'sulphates':0.56,    # sulphates
    'alcohol':12.9
}

In [53]:
new=pd.DataFrame(['new_df'])

In [54]:
new_df = new.reindex(columns=X.columns)

In [55]:
predicted_price = xgb_clf.predict(new_df)

print("Predicted Wine Quality:", predicted_price[0])

Predicted Wine Quality: 2
