# Beer draft modelling

In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm_notebook
import random as random

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [136]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_recall_curve, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [137]:
from my_classes import Dropper, Dummier, Vectorizer, Stemmer

In [4]:
bitters_df = pd.read_csv("../../data/reviews/english_bitters_cleaned.csv")

In [5]:
bitters_df.head()

Unnamed: 0,rating,look,smell,taste,feel,overall,review,date,id,name
0,3.62,4.25,3.5,3.5,3.75,3.75,From notes 3/31/13: Pours clear medium amber p...,2019-01-04,1157,Honker's Ale
1,4.06,4.0,4.25,4.0,4.0,4.0,It's funny that the best English Bitter that I...,2018-12-17,1157,Honker's Ale
2,3.93,3.75,3.75,4.0,4.0,4.0,12oz bottle dated 21SEP18 poured into my mouth...,2018-12-14,1157,Honker's Ale
3,4.35,4.0,4.0,4.75,4.0,4.25,Had this on tap when we went for dinner. I was...,2018-08-19,1157,Honker's Ale
4,3.27,3.5,3.25,3.25,3.25,3.25,As a fan of their IPA thought I would give Hon...,2018-08-18,1157,Honker's Ale


In [7]:
#setting my binary rating at about the median
bitters_df["rating_bin"] = bitters_df["rating"].apply(lambda x: 1 if x > 3.6 else 0)`

### Baseline

In [261]:
#checking baseline accuracy
baseline = bitters_df["rating_bin"].value_counts(normalize=True).max()
baseline

0.5584735330324169

### Logistic

In [142]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs")

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.750
CV std = 0.016

Test score: 0.753


In [143]:
log_coeffs = pd.DataFrame(zip(log_pipe.steps[1][1].feature_names, log_pipe.steps[2][1].coef_[0]), columns = ["feature", "coeff"])
log_coeffs.sort_values("coeff", ascending=False).head()

Unnamed: 0,feature,coeff
5210,great,3.71278
7538,nice,3.33064
1060,balanced,2.727196
4210,excellent,2.722292
5126,good,2.458709


### Random Forest

In [157]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
vec = Vectorizer(to_vec = "review")
model = RandomForestClassifier(n_estimators=100)

#Setting up pipeline
for_pipe = Pipeline(steps = [("dropper", dropper),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
for_pipe.fit(X_train, y_train)
scores = cross_val_score(for_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(for_pipe.score(X_test, y_test)))

CV score: 0.712
CV std = 0.012

Test score: 0.720


In [158]:
for_import = pd.DataFrame(zip(for_pipe.steps[1][1].feature_names, for_pipe.steps[2][1].feature_importances_), columns = ["feature", "importance"])

In [159]:
for_import.sort_values("importance", ascending=False).head()

Unnamed: 0,feature,importance
7538,nice,0.010418
5210,great,0.00772
5439,head,0.007465
1060,balanced,0.006001
5126,good,0.005959


### Stemming Logistic

In [115]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review", ngram_range=(1,2))
model = LogisticRegression(solver = "lbfgs")

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.635
CV std = 0.086

Test score: 0.712


In [118]:
log_coeffs = pd.DataFrame(zip(log_pipe.steps[1][1].feature_names, log_pipe.steps[2][1].coef_[0]), columns = ["feature", "coeff"])
log_coeffs.sort_values("coeff", ascending=False).head(10)

Unnamed: 0,feature,coeff
82975,nice,5.271739
56435,great,3.341485
9443,balanced,2.916788
54626,good,2.507252
104393,session,2.293725
24044,cask,2.251237
26362,citrus,2.187216
43185,excellent,2.183246
108486,smooth,2.020351
104403,session beer,1.788776


### Finding best C for logistic

In [148]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegressionCV(solver = "lbfgs", Cs=20, cv=5, max_iter=1000)

#Setting up pipeline
logcv_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
logcv_pipe.fit(X_train, y_train)

print("Train score: {0:.3f}".format(logcv_pipe.score(X_train, y_train)))

Train score: 0.864


In [153]:
logcv_pipe.steps[2][1].C_[0]

0.8286427728546842

In [248]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs", C=0.8286427728546842)

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.751
CV std = 0.014

Test score: 0.754


In [249]:
log_coeffs = pd.DataFrame(zip(log_pipe.steps[1][1].feature_names, log_pipe.steps[2][1].coef_[0]), columns = ["feature", "coeff"])
log_coeffs.sort_values("coeff", ascending=False).head(20)

Unnamed: 0,feature,coeff
5210,great,3.433925
7538,nice,3.185744
1060,balanced,2.545962
4210,excellent,2.479719
5126,good,2.304788
2966,crisp,1.691999
12471,wonderful,1.538273
9833,session,1.518206
1346,biscuity,1.506604
9354,rich,1.481384


### Trying gridsearch (no real luck)

In [191]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs", C=0.8286427728546842)

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

log_params={"vec__min_df" : [1, 0.02, 0.05],
            "vec__max_df" : [1.0, 0.9, 0.95],
            "vec__ngram_range" : [(1,1)],
            #"stem__style" : [None, "porter", "lancaster"],
            "model__C" : np.logspace(-3,0,10),
            "model__solver" : ["newton-cg", "saga", "sag"],
            "model__penalty" : ["l2"]}

log_gs=GridSearchCV(log_pipe,
                    log_params,
                    n_jobs=-1,
                    cv=3,
                    verbose=10)

#Fitting and scoring model
log_gs.fit(X_train, y_train)

scores = cross_val_score(log_gs.best_estimator_, X_train, y_train, cv=5)
print("GS score: {0:.3f}".format(log_gs.best_score_))
print("")
print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_gs.best_estimator_.score(X_test, y_test)))

Fitting 3 folds for each of 270 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:  3.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  7.4min
[Paralle

[Parallel(n_jobs=-1)]: Done 398 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 399 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 401 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 403 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 404 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 405 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 406 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 407 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed: 11.2min
[Paralle

[Parallel(n_jobs=-1)]: Done 531 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 532 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 533 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 534 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 539 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed: 14.6min
[Paralle

[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 665 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 667 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 669 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 670 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed: 18.0min
[Paralle

[Parallel(n_jobs=-1)]: Done 797 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 798 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 799 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 801 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 802 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 803 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 22.0min finished


GS score: 0.750

CV score: 0.750
CV std = 0.016

Test score: 0.753


In [192]:
log_gs.best_estimator_

Pipeline(memory=None,
         steps=[('dropper',
                 Dropper(to_drop=['look', 'smell', 'taste', 'feel', 'overall',
                                  'date', 'id', 'name', 'rating'])),
                ('vec',
                 Vectorizer(max_df=1.0, min_df=1, ngram_range=(1, 1),
                            stop_words='english', to_vec='review')),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='newton-cg', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

### Trying SVC

In [160]:
from sklearn.svm import SVC

In [165]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = SVC(gamma="scale")

#Setting up pipeline
svc_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
svc_pipe.fit(X_train, y_train)
scores = cross_val_score(svc_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(svc_pipe.score(X_test, y_test)))

CV score: 0.748
CV std = 0.016

Test score: 0.750


### Trying a higher split

In [239]:
bitters_df.describe()

Unnamed: 0,rating,look,smell,taste,feel,overall,id,rating_bin
count,4874.0,4874.0,4874.0,4874.0,4874.0,4874.0,4874.0,4874.0
mean,3.619532,3.754762,3.512302,3.61263,3.601705,3.727731,21149.727329,0.558474
std,0.540676,0.527734,0.591372,0.630672,0.606153,0.658809,29841.591009,0.49662
min,1.0,1.0,1.0,1.0,1.0,1.0,222.0,0.0
25%,3.32,3.5,3.0,3.5,3.25,3.5,2933.0,0.0
50%,3.68,4.0,3.5,3.5,3.5,4.0,10144.0,1.0
75%,3.98,4.0,4.0,4.0,4.0,4.0,27854.0,1.0
max,5.0,5.0,5.0,5.0,5.0,5.0,242866.0,1.0


In [243]:
#setting X and y, and creating split
X=bitters_df.drop(columns = ["rating_bin"]).copy()
y = X["rating"].apply(lambda x: 1 if x >= 4 else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs", C=0.8286427728546842)

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.777
CV std = 0.009

Test score: 0.802


In [246]:
X["rating"].apply(lambda x: 1 if x >= 4 else 0).value_counts(normalize=True).max()

0.7535904800984817

### Going more in depth in the best model so far

In [254]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs", C=0.8286427728546842)

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.749
CV std = 0.012

Test score: 0.743


In [257]:
lr_predict = log_pipe.predict(X_test)

print(classification_report(y_test, lr_predict))

              precision    recall  f1-score   support

           0       0.75      0.63      0.68       430
           1       0.74      0.83      0.78       545

    accuracy                           0.74       975
   macro avg       0.74      0.73      0.73       975
weighted avg       0.74      0.74      0.74       975



In [258]:
conmat = confusion_matrix(y_test, lr_predict, labels = [1,0])

pd.DataFrame(conmat,
            columns = ["p_high", "p_low"],
            index = ["high", "low"])

Unnamed: 0,p_high,p_low
high,455,90
low,161,269


In [259]:
y.mean()

0.5584735330324169

In [265]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegressionCV(solver = "lbfgs", Cs=20, cv=5, max_iter=1000, verbose=5, n_jobs=-1)

#Setting up pipeline
logcv_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
logcv_pipe.fit(X_train, y_train)

print("Train score: {0:.3f}".format(logcv_pipe.score(X_train, y_train)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 12.9min remaining: 19.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 15.3min finished


Train score: 0.894


In [271]:
logcv_pipe.steps[2][1].C_[0]

array([1.62377674])

In [272]:
logcv_pipe.score(X_test, y_test)

0.7405128205128205

In [273]:
#setting X and y, and creating split
X=bitters_df.copy()
y = X.pop("rating_bin")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

#Setting up pipeline classes
dropper = Dropper(["look", "smell", "taste", "feel",
                    "overall", "date", "id", "name", "rating"])
stem = Stemmer(["review"], style="porter")
vec = Vectorizer(to_vec = "review")
model = LogisticRegression(solver = "lbfgs", C=1.62377674)

#Setting up pipeline
log_pipe = Pipeline(steps = [("dropper", dropper),
                             #("stem", stem),
                              ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
log_pipe.fit(X_train, y_train)
scores = cross_val_score(log_pipe, X_train, y_train, cv=5)

print("CV score: {0:.3f}".format(np.mean(scores)))
print("CV std = {0:.3f}".format(np.std(scores)))
print("")
print("Test score: {0:.3f}".format(log_pipe.score(X_test, y_test)))

CV score: 0.749
CV std = 0.013

Test score: 0.741


### Trying out the model on brewers' descriptions

In [274]:
beers = pd.read_csv("../../data/beers.csv")

In [292]:
beers["review"]=beers["desc"]
clean_beers = beers.dropna()

In [301]:
#setting X and y, and creating split
X=clean_beers[["review"]].copy()
y=clean_beers["av_rating"].apply(lambda x: 1 if x > 3.6 else 0)

#Setting up pipeline
new_pipe = Pipeline(steps = [ ("vec", vec),
                              ("model", model)])

#Fitting and scoring model
y_pred = new_pipe.predict(X)

In [303]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.14      0.07      0.10        14
           1       0.71      0.84      0.77        38

    accuracy                           0.63        52
   macro avg       0.43      0.46      0.43        52
weighted avg       0.56      0.63      0.59        52



In [304]:
conmat = confusion_matrix(y, y_pred, labels = [1,0])

pd.DataFrame(conmat,
            columns = ["p_high", "p_low"],
            index = ["high", "low"])

Unnamed: 0,p_high,p_low
high,32,6
low,13,1


In [306]:
clean_beers["rating_pred"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [318]:
for i in range(0,7):
    print(clean_beers[clean_beers["rating_pred"] == 0].iloc[i,10])
    print("")

This 4.2% Kentish ale (bottle 4.5%) was first brewed in 1990 to celebrate the Battle of Britain which was fought in the skies above Kent 50 years earlier. The beer is named after the legendary Spitfire aeroplane designed by RJ Mitchell. The versatility of the aircraft and the courage of its pilots were essential to victory and were a key symbol of the spirit of that time.

The pint of pints. Belhaven Best is the main man of draught ales. Best colour, best flavour, best balance - the pint for all occasions. We Scots are often not known for blowing our own trumpets, modest souls that we are, but we've named this Best for a reason. We're sure you'll agree.

5.5% ABV 31 IBU'S

A general term for what is perhaps the highest expression of the brewer's craft. Amber not filtered and true to style. A natural with fish and chips.

Cwrw Braf is a clean-drinking, amber-coloured ale with a light bitterness and gentle hop aroma. A session bitter yet, with all the flavours and quality you would expec