## 1. Package Imports

In [1]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
from scipy.stats import entropy
from Preprocessing import clean, token_stop
import scipy.spatial as sp

# Suppress every warning for the rest of the session so library notices do not
# clutter cell output. NOTE(review): this also hides numerical RuntimeWarnings.
warnings.filterwarnings('ignore')
# Ask the inline backend to render figures in 'retina' (high-DPI) format.
%config InlineBackend.figure_format='retina'

## 2. Read in Data

In [2]:
# Read the cleaned wine table, then discard its first column by position
# (presumably a saved index column -- confirm against the CSV).
df_wine = pd.read_csv('data/df_wine_clean_no.csv')
df_wine = df_wine.iloc[:, 1:]
df_wine.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,title,variety,...,130,131,132,133,134,135,136,137,138,139
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,39.5,Sicily & Sardinia,Etna,,Nicosia 2013 Vulkà Bianco (Etna),White Blend,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,...,0.0,0.0,0.0,0.081144,0.0,0.0,0.0,0.0,0.0,0.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,...,0.0,0.0,0.0,0.349908,0.092807,0.0,0.0,0.0,0.0,0.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,...,0.0,0.0,0.0,0.0,0.0,0.048212,0.0,0.0,0.0,0.0


In [3]:
# (rows, columns) of the loaded frame.
df_wine.shape

(119459, 160)

In [4]:
# Load the trained LDA model and the dictionary it was saved with.
lda = LdaModel.load('models/topic modeling/optimal_ldamodel')
dictionary = corpora.Dictionary.load('models/topic modeling/optimal_ldamodel.id2word')
# The CSV stores each token list as its string repr; parse it back into a list.
# Values that are already parsed are left untouched so the cell can be re-run
# without `ast.literal_eval` raising on a non-string input.
df_wine['LDA description'] = [
    ast.literal_eval(text) if isinstance(text, str) else text
    for text in df_wine['LDA description']
]
# Bag-of-words form of every description (not used by the cells visible below).
corpus = [dictionary.doc2bow(text) for text in df_wine['LDA description']]

## 3. Apply Jensen-Shannon distance

In [5]:
# Take every column from position 20 onward as a NumPy matrix.
# Presumably these are the per-document LDA topic weights (one column per
# topic) -- TODO confirm the offset 20 if the CSV schema changes.
doc_topic_dist = np.array(df_wine.iloc[:,20:])
doc_topic_dist.shape

(119459, 140)

In [None]:
# Bar chart of the topic weights of the first document in the matrix.
fig, ax = plt.subplots()
# Derive the number of bars from the matrix width instead of hard-coding it,
# so the plot keeps working if the number of topic columns changes.
patches = ax.bar(np.arange(doc_topic_dist.shape[1]), doc_topic_dist[0,:])
ax.set_xlabel('Topic ID', fontsize=15)
ax.set_ylabel('Topic Contribution', fontsize=15)
ax.set_title("Topic Distribution for a Seen Article", fontsize=20)
fig.tight_layout()
plt.show()

In [6]:
def jensen_shannon(query, matrix):
    """Jensen-Shannon distance between one distribution and each row of a matrix.

    Parameters
    ----------
    query : array-like, shape (n_topics,)
        Non-negative weights of the query distribution.
    matrix : array-like, shape (n_docs, n_topics)
        Non-negative weights, one distribution per row.

    Returns
    -------
    np.ndarray, shape (n_docs,)
        Distances computed with natural logarithms, so values lie in
        ``[0, sqrt(ln 2)]``. A distribution whose weights are all zero cannot
        be normalised and yields ``nan``.
    """
    # Work column-wise: p is (n_topics, 1), q is (n_topics, n_docs).
    p = np.asarray(query, dtype=float)[:, None]
    q = np.asarray(matrix, dtype=float).T
    # Normalise BEFORE building the mixture. `entropy` rescales each argument
    # on its own, so averaging un-normalised inputs would give a mixture that
    # is not the midpoint of the two distributions.
    p = p / p.sum(axis=0, keepdims=True)
    q = q / q.sum(axis=0, keepdims=True)
    m = 0.5 * (p + q)
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding can leave a tiny negative divergence for identical inputs;
    # clamp it so the square root does not produce nan.
    return np.sqrt(np.maximum(divergence, 0.0))

In [7]:
def get_most_similar_documents(query, matrix, k=10):
    """Return the row positions of the `k` rows of `matrix` closest to `query`.

    Closeness is the value returned by `jensen_shannon`; positions are ordered
    from smallest to largest distance.
    """
    distances = jensen_shannon(query, matrix)
    ranked_positions = np.argsort(distances)
    return ranked_positions[:k]

In [8]:
def print_most_similar(df, query, matrix, k=10, sort_points=False):
    """Print the titles and ratings of the wines closest to `query`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'title' and 'normalized rating'; row `i` of
        `df` is expected to correspond to row `i` of `matrix`.
    query : np.ndarray
        Topic distribution to compare against.
    matrix : np.ndarray
        One topic distribution per row.
    k : int, default 10
        Number of wines to retrieve and print.
    sort_points : bool, default False
        If True, order the retrieved wines by 'normalized rating' (highest
        first), keeping similarity order among equal ratings. If False, keep
        pure similarity order.
    """
    # Forward `k` so the number retrieved matches the number announced.
    most_sim_ids = get_most_similar_documents(query, matrix, k=k)
    # The ids are row positions in `matrix`, so select positionally. `iloc`
    # also keeps the rows in similarity order, which `index.isin` would not.
    most_similar_df = df.iloc[most_sim_ids][['title', 'normalized rating']]
    if sort_points:
        # A stable sort keeps the similarity ranking as the tie-breaker.
        most_similar_df = most_similar_df.sort_values(
            by=['normalized rating'], ascending=False, kind='mergesort'
        )
        print(f'{k} Most similar wines (descending order by similarity and points):')
    else:
        print(f'{k} Most similar wines (descending order by similarity):')
    # Iterate over the rows actually retrieved; there may be fewer than `k`.
    rows = most_similar_df.itertuples(index=False, name=None)
    for rank, (title, rating) in enumerate(rows, start=1):
        print(f'{rank}. {title} ---- {round(rating, 2)}')

In [12]:
# Confirm the topic matrix is a NumPy array; `jensen_shannon` uses `.T` and
# `[None, :]` indexing on its inputs.
type(doc_topic_dist)

numpy.ndarray

In [10]:
# Sanity check: distance from the first document to every document.
# The first entry is the document compared with itself, so it should be ~0.
jensen_shannon(doc_topic_dist[0, :], doc_topic_dist)

array([1.19910768e-08, 8.32554695e-01, 8.32556540e-01, ...,
       8.32555234e-01, 8.32555931e-01, 8.32559987e-01])

In [9]:
# Wines closest to the first row of the topic matrix (default k=10),
# listed by normalized rating.
print_most_similar(df_wine, doc_topic_dist[0, :], doc_topic_dist, sort_points=True)

10 Most similar wines (descending order by similarity and points):
1. Tenuta Luisa 2015 Friulano (Isonzo del Friuli) ---- 4.6
2. Inama 2013 Vigneti di Foscarino  (Soave Classico) ---- 4.6
3. Nicosia 2013 Vulkà Bianco  (Etna) ---- 4.15
4. Panizzi 2014 Vigna Santa Margherita  (Vernaccia di San Gimignano) ---- 4.15
5. Tenuta Gorghi Tondi 2013 Rajàh Zibibbo (Terre Siciliane) ---- 4.15
6. Sandro de Bruno 2015 Colle di Montecchia di Crosara  (Soave Colli Scaligeri) ---- 4.15
7. Feudo Antico 2014 Grillo Parlante Grillo (Terre Siciliane) ---- 4.15
8. Capolino Perlingieri 2015 Vento Greco (Sannio) ---- 4.15
9. Il Lebbio 2015 Tropìe  (Vernaccia di San Gimignano) ---- 4.15
10. Kellerei Kaltern Caldaro 2012 Pinot Grigio (Alto Adige) ---- 3.7


In [None]:
# Use the LDA model to infer the topic distribution of a new, unseen description.
new_doc = "There are oodles of crowd-pleasing floral and fruit aromas on this Semillon-Sauvignon blend. Honeysuckle, lime blossom, peach and lemon drops are underpinned by gingery spice. The palate is weightier than one might expect, although it's still in the light-to medium-bodied spectrum. Flavors are delicate but persistent. There's freshness and a pretty, summer-sipping vibe. Drink now."
# NOTE(review): `token_stop` comes from the project's Preprocessing module;
# presumably it tokenises and removes stop words -- confirm that it matches
# the preprocessing applied to the corpus (`clean` is imported but not used here).
new_doc = token_stop(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
# Returns (topic_id, probability) pairs.
new_doc_dist = lda.get_document_topics(new_doc_bow)
# Expand the pairs into a dense vector. Its length follows the topic matrix
# width so it can be compared with the stored documents; topics not returned
# by the model stay at 0.
dist = np.zeros(doc_topic_dist.shape[1])
for (i, prob) in new_doc_dist:
    dist[i] = prob
new_doc_dist = dist
new_doc_dist

In [None]:
# Wines closest to the new description's topic distribution (default k=10),
# listed by normalized rating.
print_most_similar(df_wine, new_doc_dist, doc_topic_dist, sort_points=True)