In [2]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 
  
# data (as pandas dataframes) 
X = wine_quality.data.features 
y = wine_quality.data.targets 
  
# metadata 
print(wine_quality.metadata) 
  
# variable information 
print(wine_quality.variables) 




{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

In [3]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch dataset
wine_quality = fetch_ucirepo(id=186)

# Extract components
X = wine_quality.data.features       # Feature data
y = wine_quality.data.targets        # Target/label
variables_df = wine_quality.variables  # Metadata about variables

# Optionally, combine X and y into one dataframe
data_df = pd.concat([X, y], axis=1)

# Save to Excel (use openpyxl engine)
with pd.ExcelWriter('wine_quality_dataset.xlsx', engine='openpyxl') as writer:
    data_df.to_excel(writer, sheet_name='Data', index=False)
    variables_df.to_excel(writer, sheet_name='Variables Metadata', index=False)


In [None]:
print(X)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0               7.4              0.70         0.00             1.9      0.076   
1               7.8              0.88         0.00             2.6      0.098   
2               7.8              0.76         0.04             2.3      0.092   
3              11.2              0.28         0.56             1.9      0.075   
4               7.4              0.70         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
6492            6.2              0.21         0.29             1.6      0.039   
6493            6.6              0.32         0.36             8.0      0.047   
6494            6.5              0.24         0.19             1.2      0.041   
6495            5.5              0.29         0.30             1.1      0.022   
6496            6.0              0.21         0.38             0.8      0.020   

      free_sulfur_dioxide  

In [None]:
print(y)

      quality
0           5
1           5
2           5
3           6
4           5
...       ...
6492        6
6493        5
6494        6
6495        7
6496        6

[6497 rows x 1 columns]


In [None]:
print(variables_df)

                    name     role         type demographic  \
0          fixed_acidity  Feature   Continuous        None   
1       volatile_acidity  Feature   Continuous        None   
2            citric_acid  Feature   Continuous        None   
3         residual_sugar  Feature   Continuous        None   
4              chlorides  Feature   Continuous        None   
5    free_sulfur_dioxide  Feature   Continuous        None   
6   total_sulfur_dioxide  Feature   Continuous        None   
7                density  Feature   Continuous        None   
8                     pH  Feature   Continuous        None   
9              sulphates  Feature   Continuous        None   
10               alcohol  Feature   Continuous        None   
11               quality   Target      Integer        None   
12                 color    Other  Categorical        None   

               description units missing_values  
0                     None  None             no  
1                     None  Non

In [None]:
print(data_df)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0               7.4              0.70         0.00             1.9      0.076   
1               7.8              0.88         0.00             2.6      0.098   
2               7.8              0.76         0.04             2.3      0.092   
3              11.2              0.28         0.56             1.9      0.075   
4               7.4              0.70         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
6492            6.2              0.21         0.29             1.6      0.039   
6493            6.6              0.32         0.36             8.0      0.047   
6494            6.5              0.24         0.19             1.2      0.041   
6495            5.5              0.29         0.30             1.1      0.022   
6496            6.0              0.21         0.38             0.8      0.020   

      free_sulfur_dioxide  

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Assuming data_df already exists from your previous code
# Split features and target
X = data_df.drop(columns=["quality"])  # Features
y = data_df["quality"]                 # Target variable

# Optionally standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Confusion Matrix:
 [[  0   0   2   4   0   0   0]
 [  0   5  25  13   0   0   0]
 [  0   1 287 111   3   0   0]
 [  0   1 113 446  37   0   0]
 [  0   0   2  94 118   1   0]
 [  0   0   0  15   9  12   0]
 [  0   0   0   0   1   0   0]]

Classification Report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.71      0.12      0.20        43
           5       0.67      0.71      0.69       402
           6       0.65      0.75      0.70       597
           7       0.70      0.55      0.62       215
           8       0.92      0.33      0.49        36
           9       0.00      0.00      0.00         1

    accuracy                           0.67      1300
   macro avg       0.52      0.35      0.38      1300
weighted avg       0.67      0.67      0.66      1300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Assuming data_df is already defined and contains the wine dataset
# Separate features and target
X = data_df.drop(columns=["quality"])
y = data_df["quality"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the entire dataset
data_df["predicted_quality"] = model.predict(X)

# Sort by predicted quality to find best combinations
best_mixes = data_df.sort_values(by="predicted_quality", ascending=False)

# Show top 5 mixes
print("Top 5 predicted best wine mixes:\n")
print(best_mixes.head(5))


Top 5 predicted best wine mixes:

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
2419            6.6              0.36         0.29             1.6      0.021   
2426            7.4              0.24         0.36             2.0      0.031   
2475            6.9              0.36         0.34             4.2      0.018   
5934            7.3              0.19         0.27            13.9      0.057   
5936            7.3              0.19         0.27            13.9      0.057   

      free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
2419                 24.0                  85.0  0.98965  3.41       0.61   
2426                 27.0                 139.0  0.99055  3.28       0.48   
2475                 57.0                 119.0  0.98980  3.28       0.36   
5934                 45.0                 155.0  0.99807  2.94       0.41   
5936                 45.0                 155.0  0.99807  2.94       0.41   

      alcohol  q

In [5]:
import matplotlib.pyplot as plt

# Get the top 1 predicted best mix (excluding the predicted_quality column)
top_mix = best_mixes.iloc[0].drop("predicted_quality")
ingredients = top_mix.drop("quality")

# Plot as a pie chart
plt.figure(figsize=(8, 8))
plt.pie(ingredients, labels=ingredients.index, autopct='%1.1f%%', startangle=140)
plt.title(f"Top Predicted Wine MixComposition (Quality: {top_mix['quality']})")
plt.tight_layout()
plt.show()



NameError: name 'best_mixes' is not defined