In [5]:

# Take-Home Exam

#To obtain the 1 ECTS, submit a `<last_name>_<first_name>_PythonML.ipynb` file to `christian.kauth@unifr.ch` by **March 10th**, featuring:

#- The names of the authors (max. 3 per group)
#- Download one dataset from the URL (
#  [iris](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data),
#  [pima](https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv),
#  [wine](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv),
#  [housing](https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv),
#  [penguin](https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv))
#- Mask a few values in the dataset
#- Impute missing values
#- Encode categorical & target variables
#- Apply a transformation
#- Craft new feature(s)
#- Select some features
#- Pick a metric
#- Train-test split the data, **do not leak data**
#- Train **two** models on the training data
#- Evaluate them on the test data
#- Wrap the better model into a Gradio app
#- Write a conclusion, **emphasizing that one thing that makes your project super cool 😎**.




In [2]:
# 1. Name of Author

# Laura Dekker
# Matrikelnr.: 22-112-346

# Xiaoyue Deng
# Matrikelnr.: 22-118-205


In [6]:
# 2. Download dataset

from requests import get
import pandas as pd

def download_save(url, filename):
  """Download `url` and store the raw response body in `filename`.

  Args:
    url: Address handed to `get` (imported from `requests` in this cell).
    filename: Path of the local file to create or overwrite (binary mode).

  Returns:
    None. When the response status code is not 200, a message is printed
    and no file is written.
  """
  res = get(url)
  if res.status_code != 200:
    print(f"Couldn't fetch data from {url}")
  else:
    # The context manager closes the file even if the write raises.
    with open(filename, 'wb') as csv_file:
      csv_file.write(res.content)

# Fetch the red-wine quality CSV, then load the local copy (the file is ';'-separated).
download_save(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    'wine_quality_red.csv',
)
df = pd.read_csv('wine_quality_red.csv', sep=';', header=0, low_memory=False)

# Preview the first ten rows.
df.head(10)


ModuleNotFoundError: No module named 'requests'

In [5]:
# 3. Mask values in dataset

# Three masking passes, each replacing the selected entries with NaN:
#   1. every individual cell whose value exceeds 10,
#   2. rows whose original "residual sugar" exceeds 2.5,
#   3. rows whose original "alcohol" exceeds 9.5.
# NOTE(review): passes 2 and 3 use a Series condition, which blanks the whole
# row (target included) rather than only that column -- confirm this is intended.
dfmasked = (
    df.mask(df > 10)
      .mask(df["residual sugar"] > 2.5)
      .mask(df["alcohol"] > 9.5)
)
dfmasked.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,,,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
7,,,,,,,,,,,,
8,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0
9,,,,,,,,,,,,


In [6]:
# 4. Impute missing values

# Count NaNs per column (not displayed: only the last expression of a cell is shown)
dfmasked.isnull().sum()

# Forward-fill NaNs into a new frame so dfmasked stays untouched.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill', inplace=True).
dffilled = dfmasked.ffill()
dffilled.head(10)

# TODO: fill the remaining NaNs (leading rows have no earlier value to carry forward)
# with an average imputer:
#   - loop over columns
#   - check for NaNs
#   - if so, impute the average of the remaining values in that column


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
1,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
2,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
3,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
4,7.4,0.7,0.0,1.9,0.076,,,0.9978,3.51,0.56,9.4,5.0
5,7.4,0.66,0.0,1.8,0.075,,,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
7,7.9,0.6,0.06,1.6,0.069,,,0.9964,3.3,0.46,9.4,5.0
8,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0
9,7.8,0.58,0.02,2.0,0.073,9.0,,0.9968,3.36,0.57,9.5,7.0


In [9]:
## Library Preparation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

ModuleNotFoundError: No module named 'matplotlib'

In [8]:
# 5. Encode categorical & target variables
# The wine quality dataset does not explicitly contain categorical features as it's mostly chemical properties of the wine and a numerical quality rating. However, if we were to treat 'quality' as a categorical target variable for classification, we could encode it. Since 'quality' is already numerical, we don't need to encode it for regression tasks.

In [1]:
# 6. Transforms
# Scale every column of df with MinMaxScaler and rebuild a DataFrame carrying
# the original row and column labels.
# NOTE(review): the scaler is fitted on all rows of df (every column, target included),
# before the train/test split further down -- fit on the training rows only to avoid leakage.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(data=scaled_values, index=df.index, columns=df.columns)

# Summary statistics of the scaled columns
df_scaled.describe()

NameError: name 'MinMaxScaler' is not defined

In [10]:
#7. Craft new feature(s)
# New feature: share of free sulfur dioxide in the total sulfur dioxide.
# It is computed from the raw columns of df rather than the min-max scaled ones:
# after scaling, the smallest 'total sulfur dioxide' value becomes 0, so dividing
# the scaled columns produces inf/NaN for that row.
# NOTE(review): assumes the raw 'total sulfur dioxide' is never 0 -- verify on the data.
df_scaled['sulfur_dioxide_ratio'] = df['free sulfur dioxide'] / df['total sulfur dioxide']

In [11]:
#8. Select some features
def make_mi_scores(X, y, random_state=None):
  """Rank the columns of `X` by their mutual information with `y`.

  Args:
    X: DataFrame of features; its column names label the scores.
    y: Target values handed to `mutual_info_regression`.
    random_state: Optional seed forwarded to `mutual_info_regression` so
      repeated runs return the same scores. The default (None) keeps the
      previous unseeded behaviour.

  Returns:
    A pandas Series named "MI Scores", indexed by feature name and sorted
    from the highest to the lowest score.
  """
  mi_scores = mutual_info_regression(X, y, random_state=random_state)
  mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
  return mi_scores.sort_values(ascending=False)

# pH is split off as the target; every other column of df is a candidate feature.
# Any other column (e.g. 'density') could be used as the target here instead.
y = df['pH'].copy()
X = df.drop(columns=['pH'])

mi_scores = make_mi_scores(X=X, y=y)
mi_scores

fixed acidity           0.467138
citric acid             0.386058
density                 0.315328
total sulfur dioxide    0.203344
sulphates               0.190678
volatile acidity        0.189045
chlorides               0.185399
alcohol                 0.184409
free sulfur dioxide     0.129212
residual sugar          0.112121
quality                 0.000000
Name: MI Scores, dtype: float64

In [12]:
#9. Pick a metric 
#For a regression task, common metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), or R2 score.
#So I guess we can just pick MSE from his scripts

In [13]:
#10. Train-test split the data, do not leak data
# TODO: pH is used as the target for now -- still to be confirmed.
y = df['pH'].copy()
X = df.drop(columns=['pH'])

seed = 8

# Two-stage split: hold out 20% of the rows as the test set, then 25% of the
# remainder as the validation set (0.25 * 0.8 = 0.2 of all rows).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=seed
)

# Report the shape of each split.
for label, features, target in (
    ('Train: ', X_train, y_train),
    ('Validation: ', X_valid, y_valid),
    ('Test:  ', X_test, y_test),
):
  print(label, features.shape, target.shape)

Train:  (959, 11) (959,)
Validation:  (320, 11) (320,)
Test:   (320, 11) (320,)


In [None]:
# Linear regression fitted on the training split, scored by MSE on the test split.
model = LinearRegression().fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
# Decision tree fitted on the training split, scored by MSE on the test split.
# random_state reuses the notebook-wide `seed` so re-running the cell gives the same score.
model = DecisionTreeRegressor(random_state=seed)
model.fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_test, preds)

In [None]:
# Gradient boosting fitted on the training split, scored by MSE on the test split.
# random_state reuses the notebook-wide `seed` so re-running the cell gives the same score.
model = GradientBoostingRegressor(random_state=seed)
model.fit(X_train, y_train)

preds = model.predict(X_test)
mean_squared_error(y_test, preds)

In [None]:
# K-fold validation
# 5-fold cross-validation of a linear regression on the training split only.
model = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# The scorer returns the negated MSE per fold, hence the sign flip on the mean.
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [None]:
# 5-fold cross-validation of a decision tree on the training split only.
# Both the model and the fold shuffling reuse `seed` so the reported score is reproducible.
model = DecisionTreeRegressor(random_state=seed)
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# The scorer returns the negated MSE per fold, hence the sign flip on the mean.
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [None]:
# 5-fold cross-validation of gradient boosting on the training split only.
# Both the model and the fold shuffling reuse `seed` so the reported score is reproducible.
model = GradientBoostingRegressor(random_state=seed)
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# The scorer returns the negated MSE per fold, hence the sign flip on the mean.
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print("MSE: %.3f ± %.3f" % (-results.mean(), results.std()))

In [14]:
# 11. Train two models on the training data


In [15]:
#12. Evaluate them on the test data

In [16]:
#13. Wrap the better model into a Gradio app

In [17]:
#14. Write a conclusion, emphasizing that one thing that makes your project super cool 😎.