## Preprocessing Data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import our dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the beer reviews CSV; the file's first column becomes the row index.
file_path = "beer_reviews.csv"
beer_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Preview the first ten reviews.
beer_df.head(n=10)

Unnamed: 0_level_0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883
5,1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159
6,1075,Caldera Brewing Company,1318991115,3.5,3.5,3.5,Reidrover,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159
7,1075,Caldera Brewing Company,1306276018,3.0,2.5,3.5,alpinebryant,Herbed / Spiced Beer,2.0,3.5,Caldera Ginger Beer,4.7,52159
8,1075,Caldera Brewing Company,1290454503,4.0,3.0,3.5,LordAdmNelson,Herbed / Spiced Beer,3.5,4.0,Caldera Ginger Beer,4.7,52159
9,1075,Caldera Brewing Company,1285632924,4.5,3.5,5.0,augustgarage,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159


In [4]:
# Count the missing values in each column
beer_df.isna().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [5]:
# Remove every row that has a missing value in any column (modifies beer_df in place).
# NOTE(review): nulls in brewery_name / review_profilename also remove rows here,
# even though those columns are dropped later as non-beneficial — confirm this is intended.
beer_df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Distribution of the raw 'review_overall' ratings
beer_df['review_overall'].value_counts()

4.0    559790
4.5    314303
3.5    286901
3.0    155840
5.0     88997
2.5     54660
2.0     35737
1.5     12032
1.0     10211
0.0         7
Name: review_overall, dtype: int64

In [7]:
# Count the distinct values in each column (high-cardinality columns are
# candidates for removal).
beer_df.nunique(axis=0)

brewery_id               5230
brewery_name             5155
review_time           1510459
review_overall             10
review_aroma                9
review_appearance          10
review_profilename      32908
beer_style                104
review_palate               9
review_taste                9
beer_name               44075
beer_abv                  530
beer_beerid             49000
dtype: int64

In [8]:
# Drop the non-beneficial columns (identifiers, names, style and timestamp fields),
# keeping only the review scores and beer_abv.
# `drop` returns a new frame that is rebound to `beer_df`, and errors='ignore' skips
# labels that are already gone, so re-running this cell is a no-op instead of
# raising KeyError the way a second in-place drop would.
# NOTE: errors='ignore' also silences misspelled labels — keep this list in sync
# with the CSV header.
columns_to_drop = ['brewery_id', 'brewery_name', 'review_time', 'review_profilename',
                   'beer_style', 'beer_name', 'beer_beerid']
beer_df = beer_df.drop(columns=columns_to_drop, errors='ignore')
beer_df.head(10)

Unnamed: 0_level_0,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.5,2.0,2.5,1.5,1.5,5.0
1,3.0,2.5,3.0,3.0,3.0,6.2
2,3.0,2.5,3.0,3.0,3.0,6.5
3,3.0,3.0,3.5,2.5,3.0,5.0
4,4.0,4.5,4.0,4.0,4.5,7.7
5,3.0,3.5,3.5,3.0,3.5,4.7
6,3.5,3.5,3.5,4.0,4.0,4.7
7,3.0,2.5,3.5,2.0,3.5,4.7
8,4.0,3.0,3.5,3.5,4.0,4.7
9,4.5,3.5,5.0,4.0,4.0,4.7


In [9]:
# Re-check the number of distinct values per column now that the
# non-beneficial columns have been removed.
beer_df.nunique(axis=0)

review_overall        10
review_aroma           9
review_appearance     10
review_palate          9
review_taste           9
beer_abv             530
dtype: int64

In [10]:
# Check the data type of each remaining column before modelling
beer_df.dtypes

review_overall       float64
review_aroma         float64
review_appearance    float64
review_palate        float64
review_taste         float64
beer_abv             float64
dtype: object

In [11]:
# Distribution of 'review_overall' just before it is converted to a binary target
beer_df['review_overall'].value_counts()

4.0    559790
4.5    314303
3.5    286901
3.0    155840
5.0     88997
2.5     54660
2.0     35737
1.5     12032
1.0     10211
0.0         7
Name: review_overall, dtype: int64

In [12]:
# Convert the target column to a binary label:
#   0 -> overall rating of 3 or under
#   1 -> overall rating of 3.5 or higher
overall = beer_df['review_overall']

# Guard against re-running this cell. Once binned, every label (0.0 / 1.0) is <= 3,
# so applying the threshold a second time would silently collapse all rows into
# class 0. Raw ratings span 0-5, so a column made up only of 0s and 1s is treated
# as already binned and left untouched.
if not overall.isin([0, 1]).all():
    # Single vectorised comparison; cast back to float so the column keeps the
    # float dtype it had before binning.
    beer_df['review_overall'] = (overall > 3).astype(float)

# Check to make sure binning was successful
beer_df.head()

Unnamed: 0_level_0,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,2.0,2.5,1.5,1.5,5.0
1,0.0,2.5,3.0,3.0,3.0,6.2
2,0.0,2.5,3.0,3.0,3.0,6.5
3,0.0,3.0,3.5,2.5,3.0,5.0
4,1.0,4.5,4.0,4.0,4.5,7.7


In [13]:
# Confirm that 'review_overall' now only holds the two class labels, 0 and 1
beer_df['review_overall'].value_counts()

1.0    1249991
0.0     268487
Name: review_overall, dtype: int64

## Create and Test Model 1 - Logistic Regression

In [14]:
# Target: the binary 'review_overall' label
y = beer_df['review_overall']

# Features: every remaining column except the target
X = beer_df.drop(columns=['review_overall'])

In [15]:
# Hold out a test set; stratifying on y keeps the class ratio the same in both
# splits, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Class counts in the training split
Counter(y_train)

Counter({1.0: 937493, 0.0: 201365})

### Our data is imbalanced: only about 17.7% of the training samples belong to the minority class (0)

In [16]:
# Rebalance the training data with SMOTEENN, which combines over- and
# under-sampling. Only the training split is resampled; the test split is untouched.
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X=X_train, y=y_train)

# Class counts after resampling
Counter(y_resampled)

Counter({0.0: 473568, 1.0: 777888})

In [17]:
# Build a logistic regression classifier and train it on the resampled training data.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=1, solver='lbfgs').fit(X_resampled, y_resampled)

# Display the fitted estimator
model

LogisticRegression(random_state=1)

In [18]:
# Predict the class of every review in the held-out test set
y_pred = model.predict(X_test)

# Build a labelled confusion matrix (rows = actual class, columns = predicted class)
row_labels = ["Actual 3 or under", "Actual 3.5 or over"]
column_labels = ["Predicted 3 or under", "Predicted 3.5 or over"]
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted 3 or under,Predicted 3.5 or over
Actual 3 or under,53810,13312
Actual 3.5 or over,25038,287460


In [19]:
# Balanced accuracy score on the test set (not plain accuracy)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.860776224977494

In [20]:
# Build the imbalanced-learn classification report for the test set and print it;
# the report is a pre-formatted text table, so print() keeps its layout.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.68      0.80      0.92      0.74      0.86      0.73     67122
        1.0       0.96      0.92      0.80      0.94      0.86      0.75    312498

avg / total       0.91      0.90      0.82      0.90      0.86      0.74    379620

