In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.datasets import make_classification
import tensorflow as tf

2023-01-22 12:52:40.845042: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Import csv data into a dataframe
beer_df = pd.read_csv("beer_reviews.csv")
beer_df.head()

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [5]:
# Check for nulls
beer_df.isnull().sum()

index                     0
brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [6]:
# Drop nulls
beer_df.dropna(inplace=True)

In [7]:
# Drop the non-beneficial ID columns.
beer_df.drop(['index','beer_name', 'review_profilename', 'review_time', 'beer_style', 'brewery_name','beer_name','beer_beerid', 'brewery_id'], axis=1, inplace=True)
beer_df.head()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,1.5,2.0,2.5,1.5,1.5,5.0
1,3.0,2.5,3.0,3.0,3.0,6.2
2,3.0,2.5,3.0,3.0,3.0,6.5
3,3.0,3.0,3.5,2.5,3.0,5.0
4,4.0,4.5,4.0,4.0,4.5,7.7


In [8]:
# Determine the number of unique values in each column.
beer_df.nunique()

review_overall        10
review_aroma           9
review_appearance     10
review_palate          9
review_taste           9
beer_abv             530
dtype: int64

In [9]:
beer_df.dtypes

review_overall       float64
review_aroma         float64
review_appearance    float64
review_palate        float64
review_taste         float64
beer_abv             float64
dtype: object

In [10]:
beer_df.review_overall.value_counts()

4.0    559790
4.5    314303
3.5    286901
3.0    155840
5.0     88997
2.5     54660
2.0     35737
1.5     12032
1.0     10211
0.0         7
Name: review_overall, dtype: int64

In [11]:
# Convert the target column values to 0 (3 or under) and 1 (3.5 or higher) based on their values
beer_df.loc[beer_df['review_overall'] <= 3, 'review_overall'] = 0
beer_df.loc[beer_df['review_overall'] > 3, 'review_overall'] = 1
   
# Check to make sure binning was successful
beer_df.head()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,0.0,2.0,2.5,1.5,1.5,5.0
1,0.0,2.5,3.0,3.0,3.0,6.2
2,0.0,2.5,3.0,3.0,3.0,6.5
3,0.0,3.0,3.5,2.5,3.0,5.0
4,1.0,4.5,4.0,4.0,4.5,7.7


In [13]:
beer_df.review_overall.value_counts()

1.0    1249991
0.0     268487
Name: review_overall, dtype: int64

In [14]:
# Create our features
X = pd.get_dummies(beer_df.drop(columns='review_overall'))

# Create our target
y = beer_df['review_overall']

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(1138858, 5)

In [17]:
# Resample the training data with the BalancedRandomForestClassifier

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scale = X_scaler.transform(X_test)

In [18]:
# Creating the decision tree classifier instance
# Define model
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [19]:
# Calculated the balanced accuracy score

#Predict
predictions = model.predict(X_test_scale)

#accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8680417259364628

In [20]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 3.5 or over", "Actual 3 or under"], columns=["Predicted 3.5 or over", "Predicted 3 or under"]
)

cm_df

Unnamed: 0,Predicted 3.5 or over,Predicted 3 or under
Actual 3.5 or over,58084,9038
Actual 3 or under,41056,271442


In [21]:
# Print the imbalanced classification report
y_pred = model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.98      0.00      1.00      0.00      0.04      0.00     67122
        1.0       0.82      1.00      0.00      0.90      0.04      0.00    312498

avg / total       0.85      0.82      0.18      0.74      0.04      0.00    379620



In [22]:
# List the features sorted in descending order by feature importance
# Calculate feature importance in the Random Forest model. Sort by importance
importances = model.feature_importances_

sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.4925840972738148, 'review_taste'),
 (0.25787172891924, 'review_palate'),
 (0.10684626946085171, 'beer_abv'),
 (0.09685590240436028, 'review_aroma'),
 (0.04584200194173326, 'review_appearance')]

## Easy Ensemble AdaBoost Classifier

In [23]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
# Define model
model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [24]:
# Calculated the balanced accuracy score
#Predict
predictions = model.predict(X_test_scale)

#accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8798034876982246

In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 3.5 or over", "Actual 3 or under"], columns=["Predicted 3.5 or over", "Predicted 3 or under"]
)

cm_df

Unnamed: 0,Predicted 3.5 or over,Predicted 3 or under
Actual 3.5 or over,57317,9805
Actual 3 or under,35824,276674


In [27]:
# Print the imbalanced classification report
y_pred = model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.00      0.00      1.00      0.00      0.00      0.00     67122
        1.0       0.82      1.00      0.00      0.90      0.00      0.00    312498

avg / total       0.68      0.82      0.18      0.74      0.00      0.00    379620



In [None]:
## training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)