In [1]:

# Take-Home Exam

#To obtain the 1 ECTS, submit a `<last_name>_<first_name>_PythonML.ipynb` file to `christian.kauth@unifr.ch` by **March 10th**, featuring:

#- The names of the authors (max. 3 per group)
#- Download one dataset from the URL (
#  [iris](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data),
#  [pima](https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv),
#  [wine](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv),
#  [housing](https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv),
#  [penguin](https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv))
#- Mask a few values in the dataset
#- Impute missing values
#- Encode categorical & target variables
#- Apply a transformation
#- Craft new feature(s)
#- Select some features
#- Pick a metric
#- Train-test split the data, **do not leak data**
#- Train **two** models on the training data
#- Evaluate them on the test data
#- Wrap the better model into a Gradio app
#- Write a conclusion, **emphasizing that one thing that makes your project super cool 😎**.




In [2]:
# 1. Name of Author

# Laura Dekker
# Matrikelnr.: 22-112-346

# Xiaoyue Deng
# Matrikelnr.: 22-118-205


In [4]:
# 2. Download dataset

from requests import get
import pandas as pd

def download_save(url, filename):
  res = get(url)
  if res.status_code != 200:
    print(f"Couldn't fetch data from {url}")
  else:
    csv_file = open(filename, 'wb')
    csv_file.write(res.content)
    csv_file.close()

download_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 'wine_quality_red.csv')
df = pd.read_csv('wine_quality_red.csv', header=0, low_memory=False, sep=';')

df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [5]:
# 3. Mask values in dataset

dfmasked = df.mask(df > 10)
dfmasked = dfmasked.mask(df["residual sugar"] > 2.5)
dfmasked = dfmasked.mask(df["alcohol"] > 9.5)
dfmasked.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,,,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
7,,,,,,,,,,,,
8,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0
9,,,,,,,,,,,,


In [6]:
# 4. Impute missing values

#Import module
from sklearn.impute import SimpleImputer

#Count NaNs
print(f'NaNs before filling: {dfmasked.isnull().values.any()}')

#Copy into new variable
dffilled = dfmasked.copy()

#Fill NaNs with forward filling
dffilled.fillna(method='ffill', inplace=True)
dffilled.head(10)

#Fill remaining NaNs with average imputer
#Loop over columns
#Check for Nans
#If so impute average for Nans based on values in rest of column

col_names = dffilled.columns

for i in range(len(col_names)):
    current_col = col_names[i]
    if dffilled[current_col].isnull().values.any() == True:
        my_imputer = SimpleImputer(strategy='mean')
        dffilled[[current_col]] = my_imputer.fit_transform(dffilled[[current_col]].values)

#Check for remaining NaNs
print(f'NaNs after filling: {dffilled.isnull().values.any()}')

dffilled.head(10)


NaNs before filling: True
NaNs after filling: False


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
1,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
2,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
3,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
4,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
7,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
8,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0
9,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0


In [7]:
## Library Preparation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [8]:
# 5. Encode categorical & target variables
# The wine quality dataset does not explicitly contain categorical features as it's mostly chemical properties of the wine and a numerical quality rating. However, if we were to treat 'quality' as a categorical target variable for classification, we could encode it. Since 'quality' is already numerical, we don't need to encode it for regression tasks.
# Maybe substitute the numerical categories with string ones so we show we know how to work with it.

# Overview of categories
counts = dffilled['quality'].value_counts() 
print(counts) # scores from 3-7

# Define substitute values
replacements = {3.0: 'Horrible', 4.0: 'Bad', 5.0: 'Mediocre', 6.0: 'Decent', 7.0: 'Good'}

# Make copy of dataframe
df_cat = dffilled.copy()

# Substitute the values
df_cat['quality'] = df_cat['quality'].map(replacements).fillna(df_cat['quality'])
df_cat.head(10)



5.0    1019
6.0     523
4.0      36
7.0      13
3.0       8
Name: quality, dtype: int64


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
1,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
2,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
3,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
4,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
5,7.4,0.66,0.0,1.8,0.075,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
6,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,Mediocre
7,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,Mediocre
8,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,Good
9,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,Good


In [9]:
# 6. Transforms
scaler = MinMaxScaler()

# Apply scaler to the processed dataset
df_scaled = pd.DataFrame(scaler.fit_transform(dffilled),
                         columns=dffilled.columns,
                         index=dffilled.index)
df_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,0.554044,0.339063,0.361122,0.555503,0.11106,0.668133,0.855143,0.499011,0.557686,0.238153,0.835863,0.577705
std,0.18109,0.14714,0.21661,0.240554,0.133314,0.257353,0.249549,0.156629,0.183697,0.152754,0.178828,0.138227
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.416667,0.219048,0.236842,0.416667,0.061189,0.444444,0.855143,0.394452,0.424658,0.162791,0.727273,0.5
50%,0.541667,0.32381,0.342105,0.5,0.06993,0.666667,1.0,0.485792,0.575342,0.20155,0.909091,0.5
75%,0.666667,0.452381,0.5,0.75,0.092657,0.888889,1.0,0.566982,0.719178,0.248062,1.0,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
#7. Craft new feature(s)
# Since we want a feature that makes sense, we decied to calculate the sulfur dioxide ratio (free/total) as the new feature.
df_scaled['sulfur dioxide ratio'] = df_scaled['free sulfur dioxide'] / df_scaled['total sulfur dioxide']
df_scaled.head(10)
df_scaled.replace([np.inf, -np.inf], np.nan, inplace=True)
df_scaled.fillna(df_scaled.mean(), inplace=True)


In [16]:
#8. Select some features
# calculate and return mutual information scores between features and target
def make_mi_scores(X, y):
    # Calculate mi for all features in X relative to target y
    mi_scores = mutual_info_regression(X, y)
    # Convert into pandas Series
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    # Sort the Series so that highest Mi appears on top.
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

X = df_scaled.copy()

# Select the target variable and remove it from the features dataset
# We played around this feature a bit to see what is relevant.
y = X.pop('density')
#y = X.pop('pH')

# Calculate mutual information scores between each feature in X and the target y
mi_scores = make_mi_scores(X, y)


# Features with higher scores are more informative about the target
top_features_str = mi_scores.head(10).to_string()
print("Top 10 features based on mutual information scores:\n", top_features_str)


Top 10 features based on mutual information scores:
 volatile acidity        2.823841
chlorides               2.777545
citric acid             2.769897
pH                      2.717865
sulphates               2.560971
fixed acidity           2.309503
sulfur dioxide ratio    1.847675
residual sugar          1.593043
free sulfur dioxide     1.415773
alcohol                 1.141902


In [14]:
#9. Pick a metric 
#For a regression task, common metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), or R2 score.
# For this task, we picked MSE as our matrix. 
# This is because MSE provides straight forward representation of the difference between the estimated values and true values. 
# The best model was selected based on the lowest MSE scores, since that means higher accuracy.

In [17]:
# 10 . Train the data, and prevent leakage

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Logistic Regression Task -  prevent leaking data
#
#
X_log = df.copy() 
y_log = X_log.pop('quality')  # Use 'quality' as the target for logistic regression

df.head(10) # A checkpoint for data 

# Setup cross-validation for logistic regression
kfold = KFold(n_splits=5, shuffle=True, random_state=198)

# Define a logistic regression model within the pipeline
model_log = Pipeline([
    ('std', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Perform cross-validation for logistic regression
results_log = cross_val_score(model_log, X_log, y_log, cv=kfold)
print("Logistic Regression Accuracy: %.2f%% (%.2f%%)" % (results_log.mean() * 100, results_log.std() * 100))

# Regression Task with 'pH' as the target
# Split the data for regression task with 'pH' as the target
X_reg_encoded = df.drop(columns=['pH'])  # Make sure to drop 'pH' from features
y_reg_encoded = df['pH']  # Target variable for regression

# Split the data into training, validation, and test sets for regression task
seed = 8
X_train_reg_encoded, X_test_reg_encoded, y_train_reg_encoded, y_test_reg_encoded = train_test_split(X_reg_encoded, y_reg_encoded, test_size=0.2, random_state=seed)
X_train_reg_encoded, X_valid_reg_encoded, y_train_reg_encoded, y_valid_reg_encoded = train_test_split(X_train_reg_encoded, y_train_reg_encoded, test_size=0.25, random_state=seed)

# Proceed to train models as before with the encoded dataset
print('Train: ', X_train_reg_encoded.shape, y_train_reg_encoded.shape)
print('Validation: ', X_valid_reg_encoded.shape, y_valid_reg_encoded.shape)
print('Test:  ', X_test_reg_encoded.shape, y_test_reg_encoded.shape)

# Perform cross-validation and print the results with MSE
scores = cross_val_score(model_log, X_log, y_log, cv=kfold, scoring='neg_mean_squared_error')
mean_score = -np.mean(scores)
std_dev = np.std(scores)
print(f"Cross-Validation MSE: {mean_score:.6f} ± {std_dev:.6f}")

Logistic Regression Accuracy: 59.41% (1.39%)
Train:  (959, 11) (959,)
Validation:  (320, 11) (320,)
Test:   (320, 11) (320,)
Cross-Validation MSE: 0.522839 ± 0.025515


In [19]:
# 11. Train two models on the training data 
# 
# First, we are interested in how it will turn out without leakage prevention, so we tried it. 
#
#
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X_lin = df.copy()  # Make a copy to avoid altering original DataFrame
y_lin = X_lin.pop('quality')  # Use 'quality' as the target for logistic regression
df.head(10)
# Setup cross-validation for logistic regression
kfold = KFold(n_splits=5, shuffle=True, random_state=198)

# Train a Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_lin, y_lin)

# Predict on the test set
y_pred_linear = linear_reg.predict(X_lin)

# Evaluate the model
mse_linear = mean_squared_error(y_lin, y_pred_linear)
print(f"Linear Regression MSE: {mse_linear:.6f}")

# Train a Random Forest Regressor
random_forest_reg = RandomForestRegressor(n_estimators=100, random_state=8)
random_forest_reg.fit(X_lin, y_lin)

# Predict on the test set
y_pred_rf = random_forest_reg.predict(X_lin)

# Evaluate the model
mse_rf = mean_squared_error(y_lin, y_pred_rf)
print(f"Random Forest Regressor MSE: {mse_rf:.6f}")


Linear Regression MSE: 0.416767
Random Forest Regressor MSE: 0.044442


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X = df.drop(columns=['quality'])  # Features
y = df['quality']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=198)

# Train a Linear Regression model on the training data
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_linear = linear_reg.predict(X_test)

# Evaluate the model on the test set
mse_linear = mean_squared_error(y_test, y_pred_linear)
print(f"Linear Regression MSE: {mse_linear:.6f}")

# Train a Random Forest Regressor on the training data
random_forest_reg = RandomForestRegressor(n_estimators=100, random_state=8)
random_forest_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = random_forest_reg.predict(X_test)

# Evaluate the model on the test set
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Regressor MSE: {mse_rf:.6f}")

# Compared to previous case where data was leaked, we have much higher MSE in the random forest regressor.
# However, it still remaind the one with lowest MSE.


Linear Regression MSE: 0.407802
Random Forest Regressor MSE: 0.350903


In [15]:
#12. Evaluate them on the test data
# see above
# Decision Tree Regressor Cross Validation MSE is the best one.
# Despite a huge increase in MSE after preventing data leakage, it remained as the model with lowest MSE.

In [21]:
#13. Wrap the better model into a Gradio app
# Part 1: standard model

# Predict category quality based upon df_cat
# use all possible options to predict the quality

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import gradio as gr
import numpy as np


df_gr = df_cat.copy()
df_gr['quality'].head()

# Encode the previously made labels
label_encoder = LabelEncoder()
label_encoder.fit(df_gr['quality'])
labels = label_encoder.classes_
df_gr['quality'] = label_encoder.fit_transform(df_gr['quality'])


# Train-test split
X = df_gr.copy()
y = X.pop('quality')
X, y = X.values, y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model = KNeighborsClassifier(n_neighbors=5)

# Train model
model.fit(X_train, y_train)

# Gradio app function
def infer_quality(fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol):
    # Make prediction
    y_hat = model.predict(np.array([[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol]])).astype(int)
    predicted = label_encoder.inverse_transform(y_hat)
    return predicted

# Make Gradio app
wine_demo = gr.Interface(fn=infer_quality, 
                        inputs=[gr.Slider(label="Fixed acidity", minimum=5.2, maximum=10),
                               gr.Slider(label="Volatile acidity", minimum=0.19, maximum=1.24),
                               gr.Slider(label="Citirc acid", minimum=0, maximum=0.76),
                               gr.Slider(label="Residual sugar", minimum=1.3, maximum=2.5),
                               gr.Slider(label="Chlorides", minimum=0.03, maximum=0.611),
                               gr.Slider(label="Free sulfur dioxide", minimum=1, maximum=10),
                               gr.Slider(label="Total sulfur dioxide", minimum=8, maximum=11),
                               gr.Slider(label="Density", minimum=0.993410, maximum=1),
                               gr.Slider(label="pH", minimum=2.86, maximum=3.59),
                               gr.Slider(label="Sulphates", minimum=0.33, maximum=1.62),
                               gr.Slider(label="Alcohol", minimum=8.4, maximum=9.5)],
                        outputs="text",
                        examples=[[7.8, 0.58, 0.02, 2, 0.073, 9, 9.71, 0.9968, 3.36, 0.57, 9.5],[7.4, 0.7, 0, 1.9, 0.076, 7, 9.71, 0.9978, 3.51, 0.56, 9.4]])

wine_demo.launch(share = True)

Running on local URL:  http://127.0.0.1:7864
Running on public URL: https://0158d9825ed5246491.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [22]:
#13. Wrap the better model into a Gradio app
# Part 2: our best tested model

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import gradio as gr
import numpy as np

df_gr = df_cat.copy()
df_gr['quality'].head()

# Encode the previously made labels
label_encoder = LabelEncoder()
label_encoder.fit(df_gr['quality'])
labels = label_encoder.classes_
df_gr['quality'] = label_encoder.fit_transform(df_gr['quality'])

# Train-test split
X = df_gr.copy()
y = X.pop('quality')
X, y = X.values, y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Train model
model_dtree.fit(X_train, y_train)

# Gradio app function
def infer_quality(fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol):
    # Make prediction
    y_hat = model_dtree.predict(np.array([[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol]])).astype(int)
    predicted = label_encoder.inverse_transform(y_hat)
    return predicted

# Make Gradio app
wine_demo = gr.Interface(fn=infer_quality, 
                        inputs=[gr.Slider(label="Fixed acidity", minimum=5.2, maximum=10),
                               gr.Slider(label="Volatile acidity", minimum=0.19, maximum=1.24),
                               gr.Slider(label="Citirc acid", minimum=0, maximum=0.76),
                               gr.Slider(label="Residual sugar", minimum=1.3, maximum=2.5),
                               gr.Slider(label="Chlorides", minimum=0.03, maximum=0.611),
                               gr.Slider(label="Free sulfur dioxide", minimum=1, maximum=10),
                               gr.Slider(label="Total sulfur dioxide", minimum=8, maximum=11),
                               gr.Slider(label="Density", minimum=0.993410, maximum=1),
                               gr.Slider(label="pH", minimum=2.86, maximum=3.59),
                               gr.Slider(label="Sulphates", minimum=0.33, maximum=1.62),
                               gr.Slider(label="Alcohol", minimum=8.4, maximum=9.5)],
                        outputs="text",
                        examples=[[7.8, 0.58, 0.02, 2, 0.073, 9, 9.71, 0.9968, 3.36, 0.57, 9.5],[7.4, 0.7, 0, 1.9, 0.076, 7, 9.71, 0.9978, 3.51, 0.56, 9.4]])

wine_demo.launch(share = True)


NameError: name 'model_dtree' is not defined

In [None]:
#14. Write a conclusion, emphasizing that one thing that makes your project super cool

# To conclude, we started our project by downloading the dataset, artificially deleting some values,
# and imputing them back through a forward fill and average imputing method. Then, we also 
# encoded some categorical values for the quality labels to use later on in the Gradio app.
# After that we separately scaled the dataset, made some new features and selected on to evaluate 
# our models with. The model that gave the lowest MSE score was the decision tree regressor, and 
# therefore we chose this one and the standard model used in the lectures to make our Gradio app.

# What makes this project really cool is that the final product can be used to infer wine quality
# based upon quantitative characteristics of the wines themselves. This can be used to make quality
# judgement of wine an easier process or a guide for wine makers on how to make their wine more 
# according to the consumers taste. A potential future tool could be a program that can analyse 
# wine content and give the producer a recommendation on how to improve it.