In [1]:

# Take-Home Exam

#To obtain the 1 ECTS, submit a `<last_name>_<first_name>_PythonML.ipynb` file to `christian.kauth@unifr.ch` by **March 10th**, featuring:

#- The names of the authors (max. 3 per group)
#- Download one dataset from the URL (
#  [iris](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data),
#  [pima](https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv),
#  [wine](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv),
#  [housing](https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv),
#  [penguin](https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv))
#- Mask a few values in the dataset
#- Impute missing values
#- Encode categorical & target variables
#- Apply a transformation
#- Craft new feature(s)
#- Select some features
#- Pick a metric
#- Train-test split the data, **do not leak data**
#- Train **two** models on the training data
#- Evaluate them on the test data
#- Wrap the better model into a Gradio app
#- Write a conclusion, **emphasizing that one thing that makes your project super cool 😎**.




In [2]:
# 1. Name of Author

# Laura Dekker
# Matrikelnr.: 22-112-346

# Xiaoyue Deng
# Matrikelnr.: 22-118-205


In [3]:
# 2. Download dataset

from requests import get
import pandas as pd

def download_save(url, filename):
  res = get(url)
  if res.status_code != 200:
    print(f"Couldn't fetch data from {url}")
  else:
    csv_file = open(filename, 'wb')
    csv_file.write(res.content)
    csv_file.close()

download_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 'wine_quality_red.csv')
df = pd.read_csv('wine_quality_red.csv', header=0, low_memory=False, sep=';')

df.head(10)
#max(df['quality']) -> 8

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [4]:
# 3. Mask values in dataset

dfmasked = df.mask(df > 10)
dfmasked = dfmasked.mask(df["residual sugar"] > 2.5)
dfmasked = dfmasked.mask(df["alcohol"] > 9.5)
dfmasked.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,,,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
7,,,,,,,,,,,,
8,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0
9,,,,,,,,,,,,


In [19]:
# 4. Impute missing values

#Import module
from sklearn.impute import SimpleImputer

#Count NaNs
print(f'NaNs before filling: {dfmasked.isnull().values.any()}')

#Copy into new variable
dffilled = dfmasked.copy()

#Fill NaNs with forward filling
dffilled.fillna(method='ffill', inplace=True)
dffilled.head(10)

#Fill remaining NaNs with average imputer
#Loop over columns
#Check for Nans
#If so impute average for Nans based on values in rest of column

col_names = dffilled.columns

for i in range(len(col_names)):
    current_col = col_names[i]
    if dffilled[current_col].isnull().values.any() == True:
        my_imputer = SimpleImputer(strategy='mean')
        dffilled[[current_col]] = my_imputer.fit_transform(dffilled[[current_col]].values)

#Check for remaining NaNs
print(f'NaNs after filling: {dffilled.isnull().values.any()}')

dffilled.head(10)


NaNs before filling: True
NaNs after filling: False


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
1,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
2,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
3,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
4,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
7,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
8,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0
9,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0


In [6]:
## Library Preparation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [134]:
# 5. Encode categorical & target variables
# The wine quality dataset does not explicitly contain categorical features as it's mostly chemical properties of the wine and a numerical quality rating. However, if we were to treat 'quality' as a categorical target variable for classification, we could encode it. Since 'quality' is already numerical, we don't need to encode it for regression tasks.
# Maybe substitute the numerical categories with string ones so we show we know how to work with it.

# Overview of categories
counts = dffilled['quality'].value_counts() 
print(counts) # scores from 3-7

# Define substitute values
replacements = {3.0: 'Horrible', 4.0: 'Bad', 5.0: 'Mediocre', 6.0: 'Decent', 7.0: 'Good'}

# Make copy of dataframe
df_cat = dffilled.copy()

# Substitute the values
df_cat['quality'] = df_cat['quality'].map(replacements).fillna(df_cat['quality'])
df_cat.head(10)



5.0    1019
6.0     523
4.0      36
7.0      13
3.0       8
Name: quality, dtype: int64


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
1,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
2,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
3,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
4,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
5,7.4,0.66,0.0,1.8,0.075,7.013199,9.710287,0.9978,3.51,0.56,9.4,Mediocre
6,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,Mediocre
7,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,Mediocre
8,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,Good
9,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,Good


In [121]:
# 6. Transforms
scaler = MinMaxScaler()

# Apply scaler to the processed dataset
df_scaled = pd.DataFrame(scaler.fit_transform(dffilled),
                         columns=dffilled.columns,
                         index=dffilled.index)
df_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,0.554044,0.339063,0.361122,0.555503,0.11106,0.668133,0.855143,0.499011,0.557686,0.238153,0.835863,0.577705
std,0.18109,0.14714,0.21661,0.240554,0.133314,0.257353,0.249549,0.156629,0.183697,0.152754,0.178828,0.138227
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.416667,0.219048,0.236842,0.416667,0.061189,0.444444,0.855143,0.394452,0.424658,0.162791,0.727273,0.5
50%,0.541667,0.32381,0.342105,0.5,0.06993,0.666667,1.0,0.485792,0.575342,0.20155,0.909091,0.5
75%,0.666667,0.452381,0.5,0.75,0.092657,0.888889,1.0,0.566982,0.719178,0.248062,1.0,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [119]:
#7. Craft new feature(s)
# Am not sure what new features we should look at? Adding one that tells about sulfur dioxide ratio I guess.
df_scaled['sulfur dioxide ratio'] = df_scaled['free sulfur dioxide'] / df_scaled['total sulfur dioxide']
df_scaled.head(10)
df_scaled.replace([np.inf, -np.inf], np.nan, inplace=True)
# Optionally, decide how you want to handle these new NaNs - for example, filling with a placeholder value or dropping
# Here's an example of filling NaNs with the mean of the column, which could be appropriate in some contexts
df_scaled.fillna(df_scaled.mean(), inplace=True)


In [120]:
#8. Select some features
# Define a function to calculate and return mutual information scores between features and target
def make_mi_scores(X, y):
    # Calculate mutual information scores for all features in X relative to target y
    mi_scores = mutual_info_regression(X, y)
    # Convert the mutual information scores into a pandas Series with feature names as the index
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    # Sort the Series so that features with the highest mutual information scores are at the top
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

# Copy the original dataframe to preserve the original data
X = df_scaled.copy()
# Select the target variable and remove it from the features dataset
# 'pH' is chosen as the target variable here, because we care about health and homeostasis <3
y = X.pop('density')
# Calculate mutual information scores between each feature in X and the target y
mi_scores = make_mi_scores(X, y)

# Print or display the mutual information scores for each feature
# Features with higher scores are more informative about the target
top_features_str = mi_scores.head(10).to_string()
print("Top 10 features based on mutual information scores:\n", top_features_str)


Top 10 features based on mutual information scores:
 volatile acidity        2.830685
chlorides               2.784118
citric acid             2.760701
sulphates               2.574153
fixed acidity           2.317942
sulfur dioxide ratio    1.866928
residual sugar          1.589719
free sulfur dioxide     1.395391
alcohol                 1.127484
total sulfur dioxide    0.556686


In [16]:
#9. Pick a metric 
#For a regression task, common metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), or R2 score.
#So I guess we can just pick MSE from his scripts

In [139]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Logistic Regression Task
# Prepare the data for logistic regression
X_log = df_cat.copy()  # Make a copy to avoid altering original DataFrame
y_log = X_log.pop('quality')  # Use 'quality' as the target for logistic regression
df_cat.head(10)
# Setup cross-validation for logistic regression
kfold = KFold(n_splits=5, shuffle=True, random_state=198)

# Define a logistic regression model within a pipeline that includes standard scaling
model_log = Pipeline([
    ('std', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Perform cross-validation for logistic regression and print the results
results_log = cross_val_score(model_log, X_log, y_log, cv=kfold)
print("Logistic Regression Accuracy: %.2f%% (%.2f%%)" % (results_log.mean() * 100, results_log.std() * 100))

# Regression Task with 'pH' as the target
# Prepare the data for regression task
df_encoded = pd.get_dummies(df_cat, drop_first=True)
# Split the data for regression task with 'pH' as the target
X_reg_encoded = df_encoded.drop(columns=['pH'])  # Make sure to drop 'pH' from features
y_reg_encoded = df_encoded['pH']  # Target variable for regression

# Split the data into training, validation, and test sets for regression task
seed = 8
X_train_reg_encoded, X_test_reg_encoded, y_train_reg_encoded, y_test_reg_encoded = train_test_split(X_reg_encoded, y_reg_encoded, test_size=0.2, random_state=seed)
X_train_reg_encoded, X_valid_reg_encoded, y_train_reg_encoded, y_valid_reg_encoded = train_test_split(X_train_reg_encoded, y_train_reg_encoded, test_size=0.25, random_state=seed)

# Proceed to train models as before with the encoded dataset
print('Train: ', X_train_reg_encoded.shape, y_train_reg_encoded.shape)
print('Validation: ', X_valid_reg_encoded.shape, y_valid_reg_encoded.shape)
print('Test:  ', X_test_reg_encoded.shape, y_test_reg_encoded.shape)

# Perform cross-validation and print the results (e.g., Mean Squared Error)
scores = cross_val_score(model_log, X_log, y_log, cv=kfold, scoring='neg_mean_squared_error')
#results_log = cross_val_score(model_log, X, y, cv=kfold)
mean_score = -np.mean(scores)
std_dev = np.std(scores)

#print("Linear Regression: %.2f%% (%.2f%%)" % (results_log.mean() * 100, results_log.std() * 100))

print(f"Linear Regression Cross-Validation MSE: {mean_score:.2f} ± {std_dev:.2f}")

Logistic Regression Accuracy: 68.04% (2.70%)
Train:  (959, 14) (959,)
Validation:  (320, 14) (320,)
Test:   (320, 14) (320,)


Traceback (most recent call last):
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 100, in _check_reg_targets
    y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtyp

Linear Regression Cross-Validation MSE: nan ± nan


Traceback (most recent call last):
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 100, in _check_reg_targets
    y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
  File "c:\Users\coruf\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtyp

In [107]:
import pandas as pd

model = LinearRegression()
model.fit(X_train_reg_encoded, y_train_reg_encoded)

preds = model.predict(X_test_reg_encoded)
mean_squared_error(y_test_reg_encoded, preds)

0.011041995718034327

In [109]:
model = DecisionTreeRegressor()
model.fit(X_train_reg_encoded, y_train_reg_encoded)

preds = model.predict(X_test_reg_encoded)
mean_squared_error(y_test_reg_encoded, preds)

0.0024096875000000012

In [111]:
# K-fold validation - fitted with logistic regression data though
model = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

results = cross_val_score(model, X_train_reg_encoded, y_train_reg_encoded, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

MSE: 0.010 ± 0.001


In [112]:
# 11. Train two models on the training data
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train a Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train_reg_encoded, y_train_reg_encoded)

# Predict on the test set
y_pred_linear = linear_reg.predict(X_test_reg_encoded)

# Evaluate the model
mse_linear = mean_squared_error(y_test_reg_encoded, y_pred_linear)
print(f"Linear Regression MSE: {mse_linear}")
# Train a Random Forest Regressor
random_forest_reg = RandomForestRegressor(n_estimators=100, random_state=8)
random_forest_reg.fit(X_train_reg_encoded, y_train_reg_encoded)

# Predict on the test set
y_pred_rf = random_forest_reg.predict(X_test_reg_encoded)

# Evaluate the model
mse_rf = mean_squared_error(y_test_reg_encoded, y_pred_rf)
print(f"Random Forest Regressor MSE: {mse_rf}")


Linear Regression MSE: 0.011041995718034327
Random Forest Regressor MSE: 0.0017811569062500018


In [144]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold

# Assuming df_encoded is your DataFrame prepared for regression task

# Split the data for regression task
X = df_scaled.drop(columns=['pH'])  # drop pH for it to predict
y = df_scaled['pH']

# Setup cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=198)

# Define a Linear Regression model within a pipeline that includes standard scaling
model_linreg = Pipeline([
    ('std', StandardScaler()),
    ('linreg', LinearRegression())
])

# Perform cross-validation and print the results (e.g., Mean Squared Error)
scores = cross_val_score(model_linreg, X, y, cv=kfold, scoring='neg_mean_squared_error')
#results_log = cross_val_score(model_log, X, y, cv=kfold)
mean_score = -np.mean(scores)
std_dev = np.std(scores)

#print("Linear Regression: %.2f%% (%.2f%%)" % (results_log.mean() * 100, results_log.std() * 100))

print(f"Linear Regression Cross-Validation MSE: {mean_score:.5f} ± {std_dev:.5f}")


Linear Regression Cross-Validation MSE: 0.01878 ± 0.00111


In [143]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold

# Assuming df_scaled is your DataFrame prepared for regression task

# Split the data for regression task
X = df_scaled.drop(columns=['pH'])  # drop pH for it to predict
y = df_scaled['pH']

kfold = KFold(n_splits=5, shuffle=True, random_state=198)

# Define a Decision Tree Regressor model within a pipeline that includes standard scaling
model_dtree = Pipeline([
    ('std', StandardScaler()),
    ('dtree', DecisionTreeRegressor(random_state=198))
])

# Perform cross-validation and print the results (e.g., Mean Squared Error)
scores = cross_val_score(model_dtree, X, y, cv=kfold, scoring='neg_mean_squared_error')

# Convert scores to positive values to get the actual MSE
mse_scores = -scores

# Calculate mean and standard deviation of MSE scores
mean_mse = -np.mean(mse_scores)
std_mse = np.std(mse_scores)

print(f"Decision Tree Regressor Cross-Validation MSE: {mean_mse:.5f} ± {std_mse:.5f}")


Decision Tree Regressor Cross-Validation MSE: 0.00128 ± 0.00086


In [15]:
#12. Evaluate them on the test data
# see above

In [16]:
#13. Wrap the better model into a Gradio app

In [17]:
#14. Write a conclusion, emphasizing that one thing that makes your project super cool 😎.