# Loading in the dataset

In [1]:
import pandas as pd
import numpy as np
import json
import requests
import os
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

from FE import FE
from FE import text_process
pd.set_option('display.max_columns', None)

# Data Processing

In [2]:
# Read the raw wine-review CSV into a DataFrame. The path is relative, so the
# file is looked up in the notebook's current working directory.
wine = pd.read_csv('winemag-data-130k-v2.csv')

In [3]:
# Tally reviews per variety once and keep the varieties with at least 1000 reviews.
variety_counts = wine['variety'].value_counts()
keepers = variety_counts[variety_counts >= 1000].index.tolist()

# Restrict the frame to those varieties. reset_index() is called without
# drop=True, so the previous row labels are kept in an 'index' column.
is_kept_variety = wine['variety'].isin(keepers)
filtered_wine = wine[is_kept_variety].copy(deep=True).reset_index()

# Project helper from the local FE module; per the original note it adds the
# wine color feature -- TODO confirm what else it adds or renames.
filtered_wine = FE(filtered_wine)

In [4]:
# Normalise variety labels: whitespace and hyphens become underscores, then
# everything is lower-cased (e.g. "Bordeaux-style Red Blend" ->
# "bordeaux_style_red_blend").
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the labels unchanged.
# The raw string avoids the invalid "\s" escape in a normal string literal.
filtered_wine['variety'] = (
    filtered_wine['variety']
    .str.replace(r'[\s-]', '_', regex=True)
    .str.lower()
)

# Project helper from the local FE module, called with one_hot=True.
# NOTE(review): presumably vectorises the description text and one-hot encodes
# the colour feature (the x0_* columns used later) -- confirm in FE.py.
processed_wine = text_process(filtered_wine, one_hot=True)

In [5]:
# Target: the normalised grape variety label.
y = processed_wine['variety']

# Columns that are not model inputs: identifiers, raw text, metadata and the
# target itself. Each label is listed exactly once (the original list named
# 'country' twice).
non_feature_cols = [
    'id', 'index', 'country', 'description', 'points', 'price', 'province',
    'region_1', 'region_2', 'variety', 'taster_name', 'taster_twitter_handle',
    'title', 'winery', 'year', 'continent', 'category', 'score_descriptive',
    'designation',
]
X = processed_wine.drop(columns=non_feature_cols)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Building Latent Dirichlet Allocation Model

In [6]:
# Settings for the topic-model section.
# NOTE(review): n_samples and n_features are not referenced anywhere in the
# visible notebook -- possibly leftovers from a template; confirm before removing.
n_samples = 2000
n_features = 1000
# Number of LDA topics; also used to name the topic_<k> columns further down.
n_components = 28
# Same name as print_top_words' third parameter; presumably the value meant to
# be passed to it (no call is visible in this notebook).
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must support ``argsort()``.
    feature_names : indexable
        Maps a feature position to its display name.
    n_top_words : int
        How many features to list per topic, heaviest first.

    One line is printed per topic ("Topic #<k>: name name ..."), followed by a
    single blank line after the last topic. Nothing is returned.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading n_top_words
        # positions to get the heaviest features first.
        heaviest = weights.argsort()[::-1][:n_top_words]
        names = [feature_names[pos] for pos in heaviest]
        print("Topic #%d: " % topic_idx + " ".join(names))
    print()

In [7]:
joblib_file = "lda_model.pkl"

# Topic-model input: every feature column except the four one-hot colour indicators.
tf_train = X_train.drop(columns=['x0_red', 'x0_rose', 'x0_unknown', 'x0_white'])

# Reuse the cached model when it exists; otherwise fit it once and cache it so
# the notebook still runs from a fresh checkout (Restart & Run All).
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model
# files that come from a trusted source.
if os.path.exists(joblib_file):
    lda = joblib.load(joblib_file)
else:
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf_train)
    joblib.dump(lda, joblib_file)

# Feature names come from the exact frame the model is given, rather than from
# a positional slice of X_train that assumes the colour columns are the last four.
tf_feature_names = tf_train.columns

# The per-document topic weights (tbl) are computed in the data-preparation
# cell that follows, so the expensive transform is not run here as well.

# Preparing Data for Classification

In [14]:
n_components = 28

# Per-document topic weights for the training rows, one column per topic.
topic_cols = ['topic_' + suffix for suffix in map(str, range(n_components))]
tbl = pd.DataFrame(lda.transform(tf_train), columns=topic_cols)

# Final training matrix: colour indicators side by side with the topic weights.
# Both halves are re-indexed from 0 so concat pairs rows by position rather
# than by the shuffled labels left over from train_test_split.
color_cols = ['x0_red', 'x0_rose', 'x0_unknown', 'x0_white']
train_colors = X_train[color_cols].reset_index(drop=True)
train_topics = tbl.reset_index(drop=True)
new_train = pd.concat([train_colors, train_topics], axis=1)

In [17]:
# Same preparation as for the training rows, applied to the held-out rows with
# the already-fitted LDA model (transform only, no refit).
color_cols = ['x0_red', 'x0_rose', 'x0_unknown', 'x0_white']
topic_cols = ['topic_' + suffix for suffix in map(str, range(n_components))]

test_topics = pd.DataFrame(lda.transform(X_test.drop(columns=color_cols)),
                           columns=topic_cols)
test_colors = X_test[color_cols].reset_index(drop=True)

# Both halves are re-indexed from 0 so concat pairs rows by position.
new_test = pd.concat([test_colors, test_topics.reset_index(drop=True)], axis=1)

# Random Forest Classification on LDA Outputs

In [16]:
forest_filename = 'forest_model.pkl'

# Reuse the cached search result when it exists; otherwise run the randomized
# hyper-parameter search once and cache it, so the notebook still runs from a
# fresh checkout instead of failing on a missing pickle.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model
# files that come from a trusted source.
if os.path.exists(forest_filename):
    clf = joblib.load(forest_filename)
else:
    # Seeds are fixed so that a retrain is reproducible.
    gridsearch_forest = RandomForestClassifier(random_state=42)

    params = {
        "n_estimators": [100, 300, 500],
        "max_depth": [5, 15, 25],
        "min_samples_leaf": [1, 2, 4],
    }

    clf = RandomizedSearchCV(gridsearch_forest, param_distributions=params,
                             cv=3, random_state=42)
    clf.fit(new_train, y_train)

    joblib.dump(clf, forest_filename, compress=True)

['forest_model.pkl']

In [18]:
# Predict a variety label for every held-out row with the loaded forest model.
forest_pred = clf.predict(new_test)

In [19]:
# Per-class precision / recall / F1 for the random forest on the held-out rows.
# labels is passed together with target_names so that each report row is tied
# to a specific class. Without it, target_names is matched positionally against
# the labels found in y_test / forest_pred, which raises (or mislabels rows)
# whenever a class is missing from both.
print(classification_report(y_test, forest_pred,
                            labels=clf.classes_, target_names=clf.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.48      0.61      0.54      2345
bordeaux_style_white_blend       0.42      0.15      0.22       339
            cabernet_franc       0.71      0.07      0.13       452
        cabernet_sauvignon       0.34      0.55      0.42      3101
           champagne_blend       0.56      0.36      0.44       437
                chardonnay       0.52      0.87      0.65      3836
                     gamay       0.38      0.13      0.20       319
            gewürztraminer       0.66      0.11      0.19       344
          grüner_veltliner       0.59      0.22      0.32       419
                    malbec       0.38      0.13      0.19       864
                    merlot       0.41      0.11      0.17       997
                  nebbiolo       0.45      0.39      0.42       942
              pinot_grigio       0.55      0.27      0.36       358
                pinot_gris       0.71      0.69

# Naive Bayes Classification on LDA Outputs

In [20]:
# Fit a Gaussian Naive Bayes classifier with default settings on the same
# training matrix (new_train) and labels (y_train) used for the random forest.
gnb = GaussianNB()
# Left as the cell's last expression, so the fitted estimator's repr is displayed.
gnb.fit(new_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [21]:
# Predict a variety label for every held-out row with the fitted Naive Bayes model.
gnb_pred = gnb.predict(new_test)

In [22]:
# Per-class precision / recall / F1 for Gaussian Naive Bayes on the held-out rows.
# labels is passed together with target_names so that each report row is tied
# to a specific class. Without it, target_names is matched positionally against
# the labels found in y_test / gnb_pred, which raises (or mislabels rows)
# whenever a class is missing from both.
print(classification_report(y_test, gnb_pred,
                            labels=gnb.classes_, target_names=gnb.classes_))

                            precision    recall  f1-score   support

  bordeaux_style_red_blend       0.21      0.19      0.20      2345
bordeaux_style_white_blend       0.19      0.25      0.21       339
            cabernet_franc       0.05      0.04      0.04       452
        cabernet_sauvignon       0.29      0.18      0.22      3101
           champagne_blend       0.36      0.34      0.35       437
                chardonnay       0.62      0.44      0.51      3836
                     gamay       0.10      0.62      0.17       319
            gewürztraminer       0.11      0.18      0.14       344
          grüner_veltliner       0.19      0.26      0.22       419
                    malbec       0.08      0.06      0.07       864
                    merlot       0.15      0.02      0.04       997
                  nebbiolo       0.19      0.79      0.31       942
              pinot_grigio       0.20      0.26      0.23       358
                pinot_gris       0.45      0.76