## 1. Package Installation

In [114]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import scipy.spatial as sp
from sklearn.preprocessing import OneHotEncoder
from Preprocessing import clean, token_stop, vader_score, getPolarity
from New_Data_Process import continent
import xgboost as xgb
import pickle
import json


warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## 2. Read in Data

In [28]:
df_wine = pd.read_csv('data/df_wine_clean_no.csv').iloc[:, 1:]
df_wine.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,title,variety,...,130,131,132,133,134,135,136,137,138,139
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,39.5,Sicily & Sardinia,Etna,,Nicosia 2013 Vulkà Bianco (Etna),White Blend,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,...,0.0,0.0,0.0,0.081144,0.0,0.0,0.0,0.0,0.0,0.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,...,0.0,0.0,0.0,0.349908,0.092807,0.0,0.0,0.0,0.0,0.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,...,0.0,0.0,0.0,0.0,0.0,0.048212,0.0,0.0,0.0,0.0


In [25]:
df_wine_400 = pd.read_csv('data/df_wine_ready_400.csv').iloc[:, 1:]
df_wine_400.head()

Unnamed: 0,normalized rating,price,positive,neutral,negative,compound,0,1,2,3,...,2006,2007,2008,2010,2011,2012,2013,2014,2015,2016
0,4.15,0.128299,-0.810436,0.960002,-0.449336,-0.826123,-0.171051,-0.204186,-0.131139,3.833288,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,2.664482,-0.364692,-0.284207,-0.168551
1,4.15,-0.599151,-0.041276,0.189796,-0.449336,-0.197066,-0.171051,-0.204186,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,2.938463,-0.386675,-0.375307,-0.364692,-0.284207,-0.168551
2,4.15,-0.628843,-1.359328,1.047161,0.95041,-1.363137,-0.171051,-0.204186,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,2.664482,-0.364692,-0.284207,-0.168551
3,4.15,-0.658535,-0.810436,0.960002,-0.449336,-0.405175,-0.171051,-0.204186,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,2.664482,-0.364692,-0.284207,-0.168551
4,4.15,0.885442,0.537814,-0.390081,-0.449336,1.260598,-0.171051,-0.204186,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,2.586152,-0.375307,-0.364692,-0.284207,-0.168551


In [113]:
# 'price', 'positive', 'neutral', 'negative', 'compound', topic 0-139
scaler = pickle.load(open('models/scaler_without_ohe.pkl','rb'))

In [29]:
lda = LdaModel.load('models/topic modeling/optimal_ldamodel')
dictionary = corpora.Dictionary.load('models/topic modeling/optimal_ldamodel.id2word')
df_wine['LDA description'] = [ast.literal_eval(text) for text in df_wine['LDA description']]
corpus = [dictionary.doc2bow(text) for text in df_wine['LDA description']]

## 3. Input a New Data Point and Processed(Prototype)

In [None]:
# save the value for 0 and 1 after standardizing
'''
ohe_reference = {i: sorted(list(set(df_wine_400[i]))) for i in list(df_wine_400.columns)[146:]}

with open("data/ohe_reference.json", "w") as outfile:
    json.dump(ohe_reference, outfile)
'''

In [128]:
# input a new data point
new_data = pd.DataFrame({'price': [18],
                         'description': "There are oodles of crowd-pleasing floral and fruit aromas on this Semillon-Sauvignon blend. Honeysuckle, lime blossom, peach and lemon drops are underpinned by gingery spice. The palate is weightier than one might expect, although it's still in the light-to medium-bodied spectrum. Flavors are delicate but persistent. There's freshness and a pretty, summer-sipping vibe. Drink now.",
                         'country': 'Australia',
                         'province': 'South Australia',
                         'variety': 'Semillon-Sauvignon Blanc',
                         'year': '2021'})

# generate vader score
vader_result = vader_score(new_data['description'])
new_data['positive'] = vader_result['positive']
new_data['neutral'] = vader_result['neutral']
new_data['negative'] = vader_result['negative']
new_data['compound'] = vader_result['compound']
new_data['polarity'] = vader_result['polarity']

# generate continent
new_data['continent'] = continent(new_data['country'])

# generate topic distribution
new_doc = token_stop(new_data.iloc[0,1])
new_doc_bow = dictionary.doc2bow(new_doc)
new_doc_dist = lda.get_document_topics(new_doc_bow)
dist = np.zeros(140,)
for (i, prob) in new_doc_dist:
    dist[i] = prob
new_doc_dist = dist
new_data = pd.concat([new_data, pd.DataFrame({str(i):[value] for i, value in enumerate(pd.Series(new_doc_dist))})], axis=1)

# standardize
new_data_ready = pd.concat([new_data[['price', 'positive', 'neutral', 'negative', 'compound']], new_data.iloc[:, 12:]], axis=1)
labels = list(new_data_ready.columns)
new_data_ready = pd.DataFrame(scaler.transform(new_data_ready))
new_data_ready.columns = labels

# one hot encoder
ohe = OneHotEncoder()
transformed = ohe.fit_transform(new_data[['country', 'continent', 'polarity', 'year', 'variety']])
df_ohe = pd.DataFrame(transformed.toarray())
col_name = []
for i in ohe.categories_:
    col_name.extend(list(i))
df_ohe.columns = col_name
new_data_ready = pd.concat([new_data_ready, df_ohe], axis=1)

# prepare data for xgb model
new_data_xgb = pd.DataFrame({i:[0] for i in list(df_wine_400.columns)[1:]})
xgb_columns = list(new_data_xgb.columns)
for i in xgb_columns:
    try:
        new_data_xgb[i] = new_data_ready[i]
    except:
        continue

f = open('data/ohe_reference.json')
ohe_reference = json.load(f)
f.close()

for key, value in ohe_reference.items():
    new_data_xgb[key] = value[int(new_data_xgb[key])]

new_data_xgb

Unnamed: 0,price,positive,neutral,negative,compound,0,1,2,3,4,...,2006,2007,2008,2010,2011,2012,2013,2014,2015,2016
0,-0.510076,-0.357467,0.506417,-0.449336,-0.780777,-0.171051,0.717333,-0.131139,-0.203453,-0.234454,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,-0.375307,-0.364692,-0.284207,-0.168551


## 4. Apply XGBOOST to Predict

In [115]:
model_xgb = xgb.XGBRegressor()
model_xgb.load_model('models/XGBOOST/XGBRegressor_0.763715.json')

In [129]:
predicted_new_data = pd.DataFrame({'normalized rating': model_xgb.predict(new_data_xgb)})
new_data_xgb = pd.concat([predicted_new_data, new_data_xgb], axis=1)
new_data_xgb

Unnamed: 0,normalized rating,price,positive,neutral,negative,compound,0,1,2,3,...,2006,2007,2008,2010,2011,2012,2013,2014,2015,2016
0,4.301984,-0.510076,-0.357467,0.506417,-0.449336,-0.780777,-0.171051,0.717333,-0.131139,-0.203453,...,-0.220587,-0.244376,-0.258417,-0.332,-0.340314,-0.386675,-0.375307,-0.364692,-0.284207,-0.168551


## 5. Cosine Similarity Among All Numerical Variables

In [134]:
def print_most_similar(df, query, matrix, k=10, sort_points=False):
    cos_sims = 1 - sp.distance.cdist(matrix, query, 'cosine')
    cos_sims = cos_sims.reshape(len(cos_sims))
    most_sim_ids = sorted(range(len(cos_sims)), key=lambda i: -cos_sims[i])[:10]
    most_similar_df = df[df.index.isin(most_sim_ids)]
    most_similar_df = most_similar_df[['title', 'normalized rating']]
    if sort_points:
        most_similar_df = most_similar_df.sort_values(by=['normalized rating'], ascending=False)
        print(f'{k} Most similar wines (descending order by similarity and points):')
    else:
        print(f'{k} Most similar wines (descending order by similarity):')
    for i in range(k):
        print(f'{i+1}. {most_similar_df.iloc[i, 0]} ---- {round(most_similar_df.iloc[i, 1], 2)}')

In [136]:
print_most_similar(df_wine, new_data_xgb, df_wine_400, sort_points=False)

10 Most similar wines (descending order by similarity):
1. Kooyong 2013 Farrago Chardonnay (Mornington Peninsula) ---- 5.5
2. Giant Steps 2012 Sexton Vineyard Chardonnay (Yarra Valley) ---- 6.4
3. Streicker 2013 Bridgeland Block Sauvignon Blanc-Semillon (Margaret River) ---- 3.25
4. Vasse Felix 2013 Chardonnay (Margaret River) ---- 6.85
5. Cape Mentelle 2012 Sauvignon Blanc-Semillon (Margaret River) ---- 5.05
6. Leeuwin Estate 2014 Prelude Vineyards Chardonnay (Margaret River) ---- 6.85
7. Stella Bella 2009 Cabernet Sauvignon-Merlot (Margaret River) ---- 5.5
8. Robert Oatley 2013 Finisterre Chardonnay (Margaret River) ---- 5.5
9. Moorooduc 2013 Robinson Pinot Noir (Mornington Peninsula) ---- 6.4
10. Robert Oatley 2015 Finisterre Chardonnay (Margaret River) ---- 6.4
