In [1]:

# Take-Home Exam

#To obtain the 1 ECTS, submit a `<last_name>_<first_name>_PythonML.ipynb` file to `christian.kauth@unifr.ch` by **March 10th**, featuring:

#- The names of the authors (max. 3 per group)
#- Download one dataset from the URL (
#  [iris](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data),
#  [pima](https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv),
#  [wine](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv),
#  [housing](https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv),
#  [penguin](https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv))
#- Mask a few values in the dataset
#- Impute missing values
#- Encode categorical & target variables
#- Apply a transformation
#- Craft new feature(s)
#- Select some features
#- Pick a metric
#- Train-test split the data, **do not leak data**
#- Train **two** models on the training data
#- Evaluate them on the test data
#- Wrap the better model into a Gradio app
#- Write a conclusion, **emphasizing that one thing that makes your project super cool 😎**.




In [2]:
# 1. Name of Author

# Laura Dekker
# Matrikelnr.: 22-112-346

# Xiaoyue Deng
# Matrikelnr.: 22-118-205


In [3]:
# 2. Download dataset

from requests import get
import pandas as pd

def download_save(url, filename):
  res = get(url)
  if res.status_code != 200:
    print(f"Couldn't fetch data from {url}")
  else:
    csv_file = open(filename, 'wb')
    csv_file.write(res.content)
    csv_file.close()

download_save('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', 'wine_quality_red.csv')
df = pd.read_csv('wine_quality_red.csv', header=0, low_memory=False, sep=';')

df.head(10)
#max(df['quality']) -> 8

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [4]:
# 3. Mask values in dataset

dfmasked = df.mask(df > 10)
dfmasked = dfmasked.mask(df["residual sugar"] > 2.5)
dfmasked = dfmasked.mask(df["alcohol"] > 9.5)
dfmasked.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,,,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
7,,,,,,,,,,,,
8,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0
9,,,,,,,,,,,,


In [19]:
# 4. Impute missing values

#Import module
from sklearn.impute import SimpleImputer

#Count NaNs
print(f'NaNs before filling: {dfmasked.isnull().values.any()}')

#Copy into new variable
dffilled = dfmasked.copy()

#Fill NaNs with forward filling
dffilled.fillna(method='ffill', inplace=True)
dffilled.head(10)

#Fill remaining NaNs with average imputer
#Loop over columns
#Check for Nans
#If so impute average for Nans based on values in rest of column

col_names = dffilled.columns

for i in range(len(col_names)):
    current_col = col_names[i]
    if dffilled[current_col].isnull().values.any() == True:
        my_imputer = SimpleImputer(strategy='mean')
        dffilled[[current_col]] = my_imputer.fit_transform(dffilled[[current_col]].values)

#Check for remaining NaNs
print(f'NaNs after filling: {dffilled.isnull().values.any()}')

dffilled.head(10)


NaNs before filling: True
NaNs after filling: False


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
1,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
2,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
3,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
4,7.4,0.7,0.0,1.9,0.076,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,7.013199,9.710287,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
7,7.9,0.6,0.06,1.6,0.069,7.013199,9.710287,0.9964,3.3,0.46,9.4,5.0
8,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0
9,7.8,0.58,0.02,2.0,0.073,9.0,9.710287,0.9968,3.36,0.57,9.5,7.0


In [6]:
## Library Preparation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [9]:
# 5. Encode categorical & target variables
# The wine quality dataset does not explicitly contain categorical features as it's mostly chemical properties of the wine and a numerical quality rating. However, if we were to treat 'quality' as a categorical target variable for classification, we could encode it. Since 'quality' is already numerical, we don't need to encode it for regression tasks.
# Maybe substitute the numerical categories with string ones so we show we know how to work with it.


fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [27]:
# 6. Transforms
scaler = MinMaxScaler()

# Apply scaler to the processed dataset
df_scaled = pd.DataFrame(scaler.fit_transform(dffilled),
                         columns=dffilled.columns,
                         index=dffilled.index)
df_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,0.554044,0.339063,0.361122,0.555503,0.11106,0.668133,0.855143,0.499011,0.557686,0.238153,0.835863,0.577705
std,0.18109,0.14714,0.21661,0.240554,0.133314,0.257353,0.249549,0.156629,0.183697,0.152754,0.178828,0.138227
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.416667,0.219048,0.236842,0.416667,0.061189,0.444444,0.855143,0.394452,0.424658,0.162791,0.727273,0.5
50%,0.541667,0.32381,0.342105,0.5,0.06993,0.666667,1.0,0.485792,0.575342,0.20155,0.909091,0.5
75%,0.666667,0.452381,0.5,0.75,0.092657,0.888889,1.0,0.566982,0.719178,0.248062,1.0,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
#7. Craft new feature(s)
# Am not sure what new features we should look at? Adding one that tells about sulfur dioxide ratio I guess.
df_scaled['sulfur dioxide ratio'] = df_scaled['free sulfur dioxide'] / df_scaled['total sulfur dioxide']
df_scaled.head(10)
df_scaled.replace([np.inf, -np.inf], np.nan, inplace=True)
# Optionally, decide how you want to handle these new NaNs - for example, filling with a placeholder value or dropping
# Here's an example of filling NaNs with the mean of the column, which could be appropriate in some contexts
df_scaled.fillna(df_scaled.mean(), inplace=True)


In [44]:
#8. Select some features
# Define a function to calculate and return mutual information scores between features and target
def make_mi_scores(X, y):
    # Calculate mutual information scores for all features in X relative to target y
    mi_scores = mutual_info_regression(X, y)
    # Convert the mutual information scores into a pandas Series with feature names as the index
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    # Sort the Series so that features with the highest mutual information scores are at the top
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

# Copy the original dataframe to preserve the original data
X = df_scaled.copy()
# Select the target variable and remove it from the features dataset
# 'pH' is chosen as the target variable here, because we care about health and homeostasis <3
y = X.pop('density')
# Calculate mutual information scores between each feature in X and the target y
mi_scores = make_mi_scores(X, y)

# Print or display the mutual information scores for each feature
# Features with higher scores are more informative about the target
top_features_str = mi_scores.head(10).to_string()
print("Top 10 features based on mutual information scores:\n", top_features_str)


Top 10 features based on mutual information scores:
 volatile acidity        2.824042
chlorides               2.790301
citric acid             2.766302
pH                      2.700834
sulphates               2.560948
fixed acidity           2.304258
sulfur dioxide ratio    1.850919
residual sugar          1.574642
free sulfur dioxide     1.399806
alcohol                 1.069453


In [16]:
#9. Pick a metric 
#For a regression task, common metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), or R2 score.
#So I guess we can just pick MSE from his scripts

In [13]:
#10. Train-test split the data, do not leak data --- preventing data leakage
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X = df_scaled.copy()
y = X.pop('class')

kfold = KFold(n_splits=5, shuffle=True, random_state=198)

model = Pipeline([('std', StandardScaler()),
                  ('logreg', LogisticRegression(max_iter=1000))])

results = cross_val_score(model, X, y, cv=kfold)
print("%.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


# ------------------------------
X = df.copy()
# I guess we can do pH? Or probably no?
y = X.pop('pH')

seed = 8

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

print('Train: ', X_train.shape, y_train.shape)
print('Validation: ', X_valid.shape, y_valid.shape)
print('Test:  ', X_test.shape, y_test.shape)

Train:  (959, 11) (959,)
Validation:  (320, 11) (320,)
Test:   (320, 11) (320,)


In [14]:
model = LinearRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_test, preds)

NameError: name 'X_train' is not defined

In [None]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_test, preds)

In [None]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_test, preds)

In [None]:
# K-fold validation
model = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [None]:
model = DecisionTreeRegressor()
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [None]:
model = GradientBoostingRegressor()
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [14]:
# 11. Train two models on the training data


In [15]:
#12. Evaluate them on the test data

In [16]:
#13. Wrap the better model into a Gradio app

In [17]:
#14. Write a conclusion, emphasizing that one thing that makes your project super cool 😎.