In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from FE import FE
from FE import text_process

In [2]:
wine_df = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
# compiling a list of wine varieties to filter the dataframe by
keepers = list(wine_df.variety.value_counts()[wine_df.variety.value_counts() >= 1000].index)

In [4]:
# filtering the dataframe
filtered_wine = wine_df[wine_df.variety.isin(keepers)].copy(deep=True).reset_index()
filtered_wine['variety'] = filtered_wine['variety'].str.replace('(\s|-)','_').str.lower()

In [5]:
processed_wine = text_process(filtered_wine, merged=False)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

y = filtered_wine.variety
X = processed_wine

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# MultinomialNB Prediction

In [21]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
cnprint(classification_report(y_test, mnb_pred, target_names=mnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.53      0.70      0.60      2345
bordeaux_style_white_blend       0.84      0.17      0.28       339
            cabernet_franc       0.71      0.02      0.04       452
        cabernet_sauvignon       0.49      0.69      0.57      3101
           champagne_blend       0.72      0.42      0.53       437
                chardonnay       0.62      0.89      0.73      3836
                     gamay       0.83      0.14      0.24       319
            gewürztraminer       0.85      0.10      0.18       344
          grüner_veltliner       0.87      0.43      0.58       419
                    malbec       0.42      0.42      0.42       864
                    merlot       0.72      0.06      0.11       997
                  nebbiolo       0.67      0.80      0.73       942
              pinot_grigio       0.93      0.16      0.27       358
                pinot_gris       0.93      0.15

# MultinomialNB with Transformed Predictors

In [31]:
trans_mnb = MultinomialNB()
trans_mnb.fit(np.log(1+X_train), y_train)
trans_mnb_pred = trans_mnb.predict(np.log(1+X_test))
print(classification_report(y_test, trans_mnb_pred, target_names=trans_mnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.50      0.71      0.59      2345
bordeaux_style_white_blend       0.87      0.04      0.07       339
            cabernet_franc       0.67      0.00      0.01       452
        cabernet_sauvignon       0.47      0.72      0.57      3101
           champagne_blend       0.84      0.27      0.40       437
                chardonnay       0.56      0.93      0.70      3836
                     gamay       0.90      0.06      0.11       319
            gewürztraminer       0.88      0.02      0.04       344
          grüner_veltliner       0.93      0.27      0.42       419
                    malbec       0.45      0.31      0.37       864
                    merlot       0.81      0.02      0.04       997
                  nebbiolo       0.67      0.74      0.70       942
              pinot_grigio       0.96      0.07      0.13       358
                pinot_gris       0.95      0.07

# MultinomialNB with Explicitly Defined Priors

In [32]:
n = y_train.shape[0]
priors = y_train.value_counts().values/n

priors_mnb = MultinomialNB(class_prior=priors)
priors_mnb.fit(X_train, y_train)
priors_mnb_pred = priors_mnb.predict(X_test)
print(classification_report(y_test, priors_mnb_pred, target_names=priors_mnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.49      0.76      0.60      2345
bordeaux_style_white_blend       0.68      0.36      0.47       339
            cabernet_franc       0.51      0.04      0.08       452
        cabernet_sauvignon       0.44      0.76      0.55      3101
           champagne_blend       0.61      0.57      0.59       437
                chardonnay       0.63      0.89      0.73      3836
                     gamay       0.72      0.23      0.34       319
            gewürztraminer       0.87      0.18      0.29       344
          grüner_veltliner       0.75      0.54      0.63       419
                    malbec       0.37      0.47      0.41       864
                    merlot       0.60      0.08      0.14       997
                  nebbiolo       0.64      0.84      0.72       942
              pinot_grigio       0.89      0.24      0.38       358
                pinot_gris       0.93      0.20

# ComplementNB Prediction

In [23]:
from sklearn.naive_bayes import ComplementNB

cnb = ComplementNB(alpha=1.2)
cnb.fit(X_train, y_train)
cnb_pred = cnb.predict(X_test)
print(classification_report(y_test, cnb_pred, target_names=cnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.50      0.73      0.59      2345
bordeaux_style_white_blend       0.67      0.14      0.23       339
            cabernet_franc       0.41      0.02      0.03       452
        cabernet_sauvignon       0.53      0.65      0.58      3101
           champagne_blend       0.82      0.41      0.55       437
                chardonnay       0.58      0.90      0.70      3836
                     gamay       0.85      0.27      0.41       319
            gewürztraminer       0.83      0.31      0.46       344
          grüner_veltliner       0.76      0.42      0.54       419
                    malbec       0.52      0.41      0.45       864
                    merlot       0.53      0.03      0.05       997
                  nebbiolo       0.66      0.81      0.73       942
              pinot_grigio       0.90      0.36      0.51       358
                pinot_gris       0.87      0.16

# ComplementNB with Transformed Predictors

In [35]:
trans_cnb = ComplementNB(alpha=1.2)
trans_cnb.fit(np.log(1+X_train), y_train)
trans_cnb_pred = trans_cnb.predict(np.log(1+X_test))
print(classification_report(y_test, trans_cnb_pred, target_names=trans_cnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.50      0.74      0.60      2345
bordeaux_style_white_blend       0.68      0.14      0.23       339
            cabernet_franc       0.40      0.01      0.03       452
        cabernet_sauvignon       0.53      0.65      0.58      3101
           champagne_blend       0.81      0.41      0.54       437
                chardonnay       0.57      0.91      0.70      3836
                     gamay       0.85      0.27      0.41       319
            gewürztraminer       0.82      0.30      0.44       344
          grüner_veltliner       0.78      0.42      0.54       419
                    malbec       0.52      0.39      0.45       864
                    merlot       0.51      0.02      0.04       997
                  nebbiolo       0.66      0.81      0.73       942
              pinot_grigio       0.90      0.36      0.51       358
                pinot_gris       0.88      0.15