In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from FE import FE
from FE import text_process

In [2]:
wine_df = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
# compiling a list of wine varieties to filter the dataframe by
keepers = list(wine_df.variety.value_counts()[wine_df.variety.value_counts() >= 1000].index)

In [4]:
# filtering the dataframe
filtered_wine = wine_df[wine_df.variety.isin(keepers)].copy(deep=True).reset_index()
filtered_wine['variety'] = filtered_wine['variety'].str.replace('(\s|-)','_').str.lower()

In [5]:
processed_wine = text_process(filtered_wine, merged=False)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

y = filtered_wine.variety
X = processed_wine

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Multinomial NB Prediction

In [20]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
class_names = filtered_wine['variety'].unique()
print(classification_report(y_test, mnb_pred, target_names=class_names))

                            precision    recall  f1-score   support

               white_blend       0.53      0.70      0.60      2345
            portuguese_red       0.84      0.17      0.28       339
                pinot_gris       0.71      0.02      0.04       452
                  riesling       0.49      0.69      0.57      3101
                pinot_noir       0.72      0.42      0.53       437
            gewürztraminer       0.62      0.89      0.73      3836
        cabernet_sauvignon       0.83      0.14      0.24       319
                chardonnay       0.85      0.10      0.18       344
                    malbec       0.87      0.43      0.58       419
                 red_blend       0.42      0.42      0.42       864
                    merlot       0.72      0.06      0.11       997
                     gamay       0.67      0.80      0.73       942
           sauvignon_blanc       0.93      0.16      0.27       358
bordeaux_style_white_blend       0.93      0.15

# ComplementNB Prediction

In [22]:
from sklearn.naive_bayes import ComplementNB

cnb = ComplementNB(alpha=1.2)
cnb.fit(X_train, y_train)
cnb_pred = cnb.predict(X_test)
print(classification_report(y_test, cnb_pred, target_names=class_names))

                            precision    recall  f1-score   support

               white_blend       0.50      0.73      0.59      2345
            portuguese_red       0.67      0.14      0.23       339
                pinot_gris       0.41      0.02      0.03       452
                  riesling       0.53      0.65      0.58      3101
                pinot_noir       0.82      0.41      0.55       437
            gewürztraminer       0.58      0.90      0.70      3836
        cabernet_sauvignon       0.85      0.27      0.41       319
                chardonnay       0.83      0.31      0.46       344
                    malbec       0.76      0.42      0.54       419
                 red_blend       0.52      0.41      0.45       864
                    merlot       0.53      0.03      0.05       997
                     gamay       0.66      0.81      0.73       942
           sauvignon_blanc       0.90      0.36      0.51       358
bordeaux_style_white_blend       0.87      0.16