In [1]:
# https://www.kaggle.com/datasets/ruthgn/beer-profile-and-ratings-data-set

import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    mean_squared_error, 
    r2_score,
    mean_absolute_percentage_error
)

In [2]:
# Load the CSV data into a DataFrame
data = pd.read_csv('beer_data.csv',
                   usecols=['Style', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 
                            'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 
                            'Hoppy', 'Spices', 'Malty'])

data.head()

Unnamed: 0,Style,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty
0,Altbier,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111
1,Altbier,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84
2,Altbier,5.0,25,50,14,37,6,42,43,11,0,10,54,4,62
3,Altbier,8.5,25,50,13,55,31,47,101,18,1,49,40,16,119
4,Altbier,7.2,25,50,25,51,26,44,45,9,1,11,51,20,95


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3197 entries, 0 to 3196
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Style        3197 non-null   object 
 1   ABV          3197 non-null   float64
 2   Min IBU      3197 non-null   int64  
 3   Max IBU      3197 non-null   int64  
 4   Astringency  3197 non-null   int64  
 5   Body         3197 non-null   int64  
 6   Alcohol      3197 non-null   int64  
 7   Bitter       3197 non-null   int64  
 8   Sweet        3197 non-null   int64  
 9   Sour         3197 non-null   int64  
 10  Salty        3197 non-null   int64  
 11  Fruits       3197 non-null   int64  
 12  Hoppy        3197 non-null   int64  
 13  Spices       3197 non-null   int64  
 14  Malty        3197 non-null   int64  
dtypes: float64(1), int64(13), object(1)
memory usage: 374.8+ KB


In [4]:
# Split Data into Features (X) and Target (y)
X = data.drop(columns=['Style', 'ABV'])
y = data[['Style', 'ABV']]

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
# Encode the 'Style' target labels using LabelEncoder
label_encoder = LabelEncoder()
y_style_train = label_encoder.fit_transform(y_train['Style'])
y_style_test = label_encoder.transform(y_test['Style'])

std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

In [7]:
# Create and train the Random Forest Classifier
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier.fit(X_train_scaled, y_style_train)

# Make predictions on the test set
y_pred = random_forest_classifier.predict(X_test_scaled)

# Convert the encoded predictions back to original style labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the classifier's performance
accuracy_val = accuracy_score(y_style_test, y_pred)
precision_val = precision_score(y_style_test, y_pred, average='weighted')
recall_val = recall_score(y_style_test, y_pred, average='weighted')
f1_val = f1_score(y_style_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy_val:.2f}")
print(f"Precision: {precision_val:.2f}")
print(f"Recall: {recall_val:.2f}")
print(f"F1: {f1_val:.2f}")

Accuracy: 0.82
Precision: 0.84
Recall: 0.82
F1: 0.82


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
y_abv_train = y_train['ABV']
y_abv_test = y_test['ABV']

# Create and train the Random Forest Regressor
random_forest_regressor = RandomForestRegressor(random_state=42)

# Train the regressor on the full training data
random_forest_regressor.fit(X_train_scaled, y_abv_train)

# Make predictions on the test set
y_pred = random_forest_regressor.predict(X_test_scaled)

# Evaluate the regressor's performance
mse = mean_squared_error(y_abv_test, y_pred)
r2 = r2_score(y_abv_test, y_pred)
mape = mean_absolute_percentage_error(y_abv_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 7.04
R-squared (R2): 0.44


In [9]:
joblib.dump([label_encoder, std_scaler, random_forest_classifier, random_forest_regressor], './multi_output_model.pkl')

['./multi_output_model.pkl']

In [12]:
data = {
        'Min IBU': 25,
        'Max IBU': 40,
        'Astringency': 32,
        'Body': 27,
        'Alcohol': 5,
        'Bitter': 36,
        'Sweet': 43,
        'Sour': 18,
        'Salty': 7,
        'Fruits': 18,
        'Hoppy': 58,
        'Spices': 5,
        'Malty': 70
    }
test_df = pd.DataFrame(data, index=[0])

[label_encoder_new, std_scaler_new, random_forest_classifier_new, random_forest_regressor_new] = joblib.load('./multi_output_model.pkl')
test_df_scaled = std_scaler.transform(test_df)
y_pred_style = label_encoder_new.inverse_transform(random_forest_classifier_new.predict(test_df_scaled))
y_pred_abv = random_forest_regressor_new.predict(test_df_scaled)

print(y_pred_style, y_pred_abv)
print(y_test.head(1))

['Pilsner - German'] [5.1215]
                 Style  ABV
1951  Pilsner - German  5.0
