## 1. Package Installation

In [2]:
import pandas as pd
import numpy as np
import warnings
import ast
from gensim.models import LdaModel, TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora
from New_Data_Process import continent, process_new_data
from Similarity import J_S_distance, cosine_similarity, soft_cosine_measure_similarity
import xgboost as xgb
import pickle
from ipywidgets import widgets
from IPython.display import display

warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## 2. Preparation

In [3]:
df_wine = pd.read_csv('data/df_wine_clean_no.csv').iloc[:, 1:]
df_wine_400 = pd.read_csv('data/df_wine_ready_400.csv').iloc[:, 1:]

scaler = pickle.load(open('models/scaler_without_ohe.pkl','rb'))

lda = LdaModel.load('models/topic modeling/optimal_ldamodel')
dictionary = corpora.Dictionary.load('models/topic modeling/optimal_ldamodel.id2word')
df_wine['LDA description'] = [ast.literal_eval(text) for text in df_wine['LDA description']]
corpus = [dictionary.doc2bow(text) for text in df_wine['LDA description']]
tfidf = TfidfModel(corpus)
termsim_matrix = SparseTermSimilarityMatrix.load('data/termsim_matrix')

model_xgb = xgb.XGBRegressor()
model_xgb.load_model('models/XGBOOST/XGBRegressor_0.763715.json')

## 3. Manually Input a New Data

In [4]:
description = "There are oodles of crowd-pleasing floral and fruit aromas on this Semillon-Sauvignon blend. Honeysuckle, lime blossom, peach and lemon drops are underpinned by gingery spice. The palate is weightier than one might expect, although it's still in the light-to medium-bodied spectrum. Flavors are delicate but persistent. There's freshness and a pretty, summer-sipping vibe. Drink now."
new_data_xgb = process_new_data(df_wine=df_wine,
                                df_wine_400=df_wine_400,
                                dictionary=dictionary,
                                lda=lda,
                                scaler=scaler,
                                model_xgb=model_xgb,
                                price=18,
                                description=description,
                                country='Australia',
                                province='South Australia',
                                variety='Semillon-Sauvignon Blanc',
                                year='2021')
new_data_xgb

Unnamed: 0,normalized rating,price,positive,neutral,negative,compound,0,1,2,3,...,2006,2007,2008,2010,2011,2012,2013,2014,2015,2016
0,4.301984,-0.510076,-0.357467,0.506417,-0.449336,-0.780777,-0.171051,0.718823,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,-0.375307,-0.364692,-0.284207,-0.168551


In [5]:
J_S_distance(df_wine=df_wine,
             new_data=np.array(new_data_xgb.iloc[:, 6:146]),
             n=10,
             sort_by_points=False)

10 Most similar wines (descending order by similarity):
1. Nicosia 2013 Vulkà Bianco  (Etna) ---- 4.15
2. Bel Lago 2012 North Vineyard Pinot Noir (Leelanau Peninsula) ---- 5.5
3. Bien Nacido 2012 Grenache (Santa Maria Valley) ---- 5.5
4. Carlson 2014 Chardonnay (Santa Barbara County) ---- 5.5
5. Casa Santos Lima 2014 Quinta do Espirito Santos Red (Lisboa) ---- 5.5
6. Château de Sancerre 2014  Sancerre ---- 5.5
7. Gratien et Meyer NV Cuvée Flamme Brut Sparkling (Crémant de Loire) ---- 5.5
8. Grgich Hills 2013 Miljenko's Selection Chardonnay (Carneros) ---- 5.5
9. Havens 2013 Merlot (Napa Valley) ---- 5.5
10. La Castellada 2009 Ribolla Gialla (Collio) ---- 5.5


In [6]:
cosine_similarity(df_wine=df_wine,
                  df_wine_400=df_wine_400,
                  new_data=new_data_xgb,
                  n=10,
                  sort_by_points=False)

10 Most similar wines (descending order by similarity):
1. Kooyong 2013 Farrago Chardonnay (Mornington Peninsula) ---- 5.5
2. Giant Steps 2012 Sexton Vineyard Chardonnay (Yarra Valley) ---- 6.4
3. Streicker 2013 Bridgeland Block Sauvignon Blanc-Semillon (Margaret River) ---- 3.25
4. Vasse Felix 2013 Chardonnay (Margaret River) ---- 6.85
5. Cape Mentelle 2012 Sauvignon Blanc-Semillon (Margaret River) ---- 5.05
6. Leeuwin Estate 2014 Prelude Vineyards Chardonnay (Margaret River) ---- 6.85
7. Stella Bella 2009 Cabernet Sauvignon-Merlot (Margaret River) ---- 5.5
8. Robert Oatley 2013 Finisterre Chardonnay (Margaret River) ---- 5.5
9. Moorooduc 2013 Robinson Pinot Noir (Mornington Peninsula) ---- 6.4
10. Robert Oatley 2015 Finisterre Chardonnay (Margaret River) ---- 6.4


In [4]:
soft_cosine_measure_similarity(df_wine=df_wine,
                               new_input=description,
                               termsim_matrix=termsim_matrix,
                               tfidf=tfidf,
                               corpus=corpus,
                               dictionary=dictionary,
                               k=10,
                               sort_by_points=False)

100%|██████████| 118985/118985 [00:45<00:00, 2619.43it/s]

10 Most similar wines (descending order by similarity):
1. Quiet Resolve 2014 Project 592 The Cape Winds Chardonnay (Western Cape) ---- 4.6
2. V. Sattui 2011 Riesling (Anderson Valley) ---- 3.7
3. Bostavan 2015 Dor Traminer-Chardonnay White (Moldova) ---- 4.15
4. Jidvei 2016 Treasure of Transylvania Medium Dry Moscato (Tarnave) ---- 3.25
5. Mauro Sebaste 2011  Moscato d'Asti ---- 4.15
6. Three Rivers 2007 Chardonnay (Columbia Valley (WA)) ---- 4.15
7. Buitenverwachting 2016 Bayten Sauvignon Blanc (Constantia) ---- 5.5
8. Tildio 2007 Riesling (Columbia Valley (WA)) ---- 4.6
9. La Follette 2013 Pinot Noir (North Coast) ---- 5.5
10. Aubichon Cellars 2015 Pinot Noir (Willamette Valley) ---- 6.4





## 4. Dashboard

In [15]:
layout = widgets.Layout(width='auto', height='40px')

price_input = widgets.Text(description="Price")
display(price_input)

description_input = widgets.Text(description="Description", layout = layout)
display(description_input)

country_input = widgets.Text(description="Country")
display(country_input)

province_input = widgets.Text(description="Province")
display(province_input)

variety_input = widgets.Text(description="Variety")
display(variety_input)

year_input = widgets.Dropdown(description="Year", options=range(1927, 2023))
display(year_input)

method_input = widgets.Dropdown(description="Method", options=['Topic Prioritized', 'Numeric Prioritized', 'Description Prioritized (slow)'])
display(method_input)

top_input = widgets.IntSlider(description="Top # Wines", min=0, max=100, step=5)
display(top_input)

sort_input = widgets.Dropdown(description="Sort Points", options=['False', 'True'])
display(sort_input)

execute = widgets.Button(description='Execute!')
box_layout = widgets.Layout(display='flex',
                            flex_flow='column',
                            align_items='center',
                            width='auto',
                            height='40px')
box = widgets.HBox(children=[execute], layout=box_layout)
display(box)

def btn_eventhandler(obj):
    new_data_xgb = process_new_data(df_wine=df_wine,
                                    df_wine_400=df_wine_400,
                                    dictionary=dictionary,
                                    lda=lda,
                                    scaler=scaler,
                                    model_xgb=model_xgb,
                                    price=int(price_input.value),
                                    description=str(description_input.value),
                                    country=str(country_input.value),
                                    province=str(province_input.value),
                                    variety=str(variety_input.value),
                                    year=str(year_input.value))  
    print("Predicted Rating: {0:.2f}".format(new_data_xgb.iloc[0,0]))
    print('-----------------------------------------------------')
    if method_input.value == 'Topic Prioritized':
        J_S_distance(df_wine=df_wine,
                     new_data=np.array(new_data_xgb.iloc[:, 6:146]),
                     n=top_input.value,
                     sort_by_points=(sort_input.value == 'True'))
    elif method_input.value == 'Numeric Prioritized':
        cosine_similarity(df_wine=df_wine,
                          df_wine_400=df_wine_400,
                          new_data=new_data_xgb,
                          n=top_input.value,
                          sort_by_points=(sort_input.value == 'True'))
    elif method_input.value == 'Description Prioritized (slow)':
        soft_cosine_measure_similarity(df_wine=df_wine,
                                       new_input=str(description_input.value),
                                       termsim_matrix=termsim_matrix,
                                       tfidf=tfidf,
                                       corpus=corpus,
                                       dictionary=dictionary,
                                       k=top_input.value,
                                       sort_by_points=(sort_input.value == 'True'))
    
        
execute.on_click(btn_eventhandler)

Text(value='', description='Price')

Text(value='', description='Description', layout=Layout(height='40px', width='auto'))

Text(value='', description='Country')

Text(value='', description='Province')

Text(value='', description='Variety')

Dropdown(description='Year', options=(1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, …

Dropdown(description='Method', options=('Topic Prioritized', 'Numeric Prioritized', 'Description Prioritized (…

IntSlider(value=0, description='Top # Wines', step=5)

Dropdown(description='Sort Points', options=('False', 'True'), value='False')

HBox(children=(Button(description='Execute!', style=ButtonStyle()),), layout=Layout(align_items='center', disp…