# Word2Vec and Doc2Vec Models

In [1]:
import pandas as pd
import numpy as np
import regex as re
import nltk
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import collections
from collections import Counter

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:
df = pd.read_csv('../data/test_lower_wines.csv')
df.head()

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,franken white germany made silvaner white wine...
1,"Fledermaus Rot, 2Naturkinder",2Naturkinder,2018,750,red,0,germany,franken,https://www.astorwines.com/SearchResultsSingle...,Made from Schwartz Riesling (aka Pinot Meunier...,pinot meunier,0,made schwartz riesling pinot meunier grapes ha...,franken red germany made schwartz riesling pin...
2,"Sekt Weiss, 50o N NV",50oN,NV,750,white,14,germany,rheingau,https://www.astorwines.com/SearchResultsSingle...,Sometimes you just want to drink a wine becaus...,müller-thurgau pinot blanc sylvaner,0,sometimes want drink wine tastes good one wine...,rheingau white germany sometimes want drink wi...
3,"Fuga, Mencía, Ribeira Sacra",A Fuga,2017,750,red,0,spain,ribeira sacra,https://www.astorwines.com/SearchResultsSingle...,"FUGA MENCIAJust barely opaque, and dark garnet...",mencía,0,fuga menciajust barely opaque dark garnet colo...,ribeira sacra red spain fuga menciajust barely...
4,"Cirò Rosso Classico Superiore, A Vita",A Vita,2016,750,red,21,italy,calabria,https://www.astorwines.com/SearchResultsSingle...,"Here is a wine with a distinct, alluring chara...",gaglioppo,cirò,wine distinct alluring character dates back an...,calabriacirò red italy wine distinct alluring ...


In [3]:
df.duplicated().sum()

678

In [4]:
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [5]:
df.shape

(12156, 14)

In [6]:
df[df['color'].isnull()] = 'red'

In [7]:
# drop ciders
cider_rows = df[df['grape'] == 'apple'].index
df.drop(cider_rows, inplace=True)

In [8]:
df.shape

(12145, 14)

# Word2Vec with CBOW
Predict word in the middle, better accuracy for frequent words

For Vec models, need to the all tokenized words-split the text here

In [9]:
df['description_clean_2'] = [str(row).split() for row in df['description_clean']]
df['text_2'] = [str(row).split() for row in df['text']]

In [10]:
w2v_cbow = Word2Vec(df['description_clean_2'], 
                    min_count = 5, # frequency dropped before training occurs
                    workers = 4) # independent threads working

In [11]:
w2v_cbow.wv.most_similar_cosmul('sparkling')

[('ros', 0.9102315306663513),
 ('sparkler', 0.8830097317695618),
 ('festive', 0.8785400390625),
 ('type', 0.8728392124176025),
 ('makes', 0.8718475103378296),
 ('everyday', 0.8690309524536133),
 ('affordable', 0.8683280944824219),
 ('moscato', 0.8632428050041199),
 ('cava', 0.8581907749176025),
 ('brut', 0.8523533940315247)]

In [12]:
text_w2v_cbow = Word2Vec(df['text_2'], 
                    min_count = 5, # frequency dropped before training occurs
                    workers = 4) # independent threads working

In [13]:
text_w2v_cbow.wv.most_similar_cosmul('sparkling')

[('ros', 0.8871276378631592),
 ('italychampagne', 0.8793519735336304),
 ('piedmont,', 0.8574734926223755),
 ('languedoc-roussillon', 0.846211314201355),
 ('white', 0.8390681743621826),
 ('penedès', 0.8385623097419739),
 ('loirenantes', 0.8345789313316345),
 ('dão', 0.8276304602622986),
 ('and', 0.8275236487388611),
 ('nahe,', 0.8246423602104187)]

# Word2Vec with SkipGram
Predicts context, rare words or phrases<br>
The SkipGram brings back more similar words than CBOW

In [14]:
w2v_sg = Word2Vec(df['description_clean_2'],
                  min_count=1, 
                  sg=1, 
                  workers=3) 

In [15]:
w2v_sg.wv.most_similar_cosmul(positive=['red', 'lush', 'juicy', 'italy'])

[('abounds', 0.5515811443328857),
 ('overflows', 0.5425879955291748),
 ('endearing', 0.5424729585647583),
 ('sassy', 0.5420688986778259),
 ('scrumptious', 0.5410647392272949),
 ('cheery', 0.5407906174659729),
 ('jazzy', 0.5400341749191284),
 ('charmingly', 0.5380794405937195),
 ('plethora', 0.5352271199226379),
 ('uncomplicated', 0.5332666635513306)]

In [16]:
text_w2v_sg = Word2Vec(df['text_2'],
                  min_count=1, 
                  sg=1, 
                  workers=3) 

In [17]:
text_w2v_sg.wv.most_similar_cosmul('red')

[('nandark', 0.8667792081832886),
 ('nanrich', 0.8603235483169556),
 ('nanblack', 0.854231595993042),
 ('nana', 0.8525999784469604),
 ('nanred', 0.8523722290992737),
 ('nandeep', 0.8514415621757507),
 ('margaux,', 0.8423077464103699),
 ('nancherry', 0.8416593670845032),
 ('st-julien,', 0.8414425253868103),
 ('nanripe', 0.8412728309631348)]

# Doc2Vec
Specific document vector added to Word2Vec

In [109]:
summ_tokens = df_white['description_clean_2']

In [None]:
# Try bi grams

In [112]:
from gensim.models import Phrases
bigram = Phrases(summ_tokens, min_count=1, threshold=3, delimiter=b' ')
bigram_docs = []
for doc in summ_tokens:
    bigram_docs.append(bigram[doc])
tagged_bigrams = [TaggedDocument(bigrams, [i]) for i, bigrams in enumerate(bigram_docs)]
bigrams_model = Doc2Vec(vector_size = 200,
                        min_count = 3,
                        window = 10,
                        epochs = 20)
bigrams_model.build_vocab(tagged_bigrams)
bigrams_model.train(tagged_bigrams,
                    total_examples = d2v.corpus_count,
                    epochs = d2v.epochs)

In [18]:
df_white = df[df['color'] == 'white']

In [19]:
df_white.head()

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text,description_clean_2,text_2
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,franken white germany made silvaner white wine...,"[made, silvaner, white, wine, ferments, skins,...","[franken, white, germany, made, silvaner, whit..."
2,"Sekt Weiss, 50o N NV",50oN,NV,750,white,14,germany,rheingau,https://www.astorwines.com/SearchResultsSingle...,Sometimes you just want to drink a wine becaus...,müller-thurgau pinot blanc sylvaner,0,sometimes want drink wine tastes good one wine...,rheingau white germany sometimes want drink wi...,"[sometimes, want, drink, wine, tastes, good, o...","[rheingau, white, germany, sometimes, want, dr..."
5,"Dhron Hofberg Riesling Kabinett, A.J. Adam",A.J. Adam,2018,750,white,23,germany,mosel-saar-ruwer,https://www.astorwines.com/SearchResultsSingle...,For me wine is almost always part of a larger ...,riesling,mittel-mosel,wine almost always part larger story pour food...,mosel-saar-ruwermittel-mosel white germany win...,"[wine, almost, always, part, larger, story, po...","[mosel-saar-ruwermittel-mosel, white, germany,..."
10,"The Abarbanel, ""Les Chemins de Favarelle"" Char...",Abarbenel Wines,2016,750,white,0,france,languedoc-roussillon,https://www.astorwines.com/SearchResultsSingle...,Unoaked Chardonnay. Kosher for Passover and Me...,chardonnay,0,unoaked chardonnay kosher passover mevushal,languedoc-roussillon white france unoaked char...,"[unoaked, chardonnay, kosher, passover, mevushal]","[languedoc-roussillon, white, france, unoaked,..."
12,"Kerner, Abbazia Novacella",Abbazia Novacella,2018,750,white,21,italy,trentino-alto adige,https://www.astorwines.com/SearchResultsSingle...,There's so much history in this bottle it woul...,kerner,alto adige,much history bottle would drain ink pens write...,trentino-alto adigealto adige white italy much...,"[much, history, bottle, would, drain, ink, pen...","[trentino-alto, adigealto, adige, white, italy..."


In [20]:
df_red = df[df['color'] == 'red']

In [21]:
# Clay from NLT project 

In [22]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield TaggedDocument(list_of_words, [i])

In [23]:
data_tag = list(create_tagged_document(df_white['description_clean_2']))

In [24]:
d2v = Doc2Vec(vector_size=100, min_count=3, epochs=30)

In [25]:
d2v.build_vocab(data_tag)

In [26]:
d2v.train(data_tag, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [27]:
df_white.head(30)

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text,description_clean_2,text_2
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,franken white germany made silvaner white wine...,"[made, silvaner, white, wine, ferments, skins,...","[franken, white, germany, made, silvaner, whit..."
2,"Sekt Weiss, 50o N NV",50oN,NV,750,white,14,germany,rheingau,https://www.astorwines.com/SearchResultsSingle...,Sometimes you just want to drink a wine becaus...,müller-thurgau pinot blanc sylvaner,0,sometimes want drink wine tastes good one wine...,rheingau white germany sometimes want drink wi...,"[sometimes, want, drink, wine, tastes, good, o...","[rheingau, white, germany, sometimes, want, dr..."
5,"Dhron Hofberg Riesling Kabinett, A.J. Adam",A.J. Adam,2018,750,white,23,germany,mosel-saar-ruwer,https://www.astorwines.com/SearchResultsSingle...,For me wine is almost always part of a larger ...,riesling,mittel-mosel,wine almost always part larger story pour food...,mosel-saar-ruwermittel-mosel white germany win...,"[wine, almost, always, part, larger, story, po...","[mosel-saar-ruwermittel-mosel, white, germany,..."
10,"The Abarbanel, ""Les Chemins de Favarelle"" Char...",Abarbenel Wines,2016,750,white,0,france,languedoc-roussillon,https://www.astorwines.com/SearchResultsSingle...,Unoaked Chardonnay. Kosher for Passover and Me...,chardonnay,0,unoaked chardonnay kosher passover mevushal,languedoc-roussillon white france unoaked char...,"[unoaked, chardonnay, kosher, passover, mevushal]","[languedoc-roussillon, white, france, unoaked,..."
12,"Kerner, Abbazia Novacella",Abbazia Novacella,2018,750,white,21,italy,trentino-alto adige,https://www.astorwines.com/SearchResultsSingle...,There's so much history in this bottle it woul...,kerner,alto adige,much history bottle would drain ink pens write...,trentino-alto adigealto adige white italy much...,"[much, history, bottle, would, drain, ink, pen...","[trentino-alto, adigealto, adige, white, italy..."
13,"Pinot Grigio, Abbazia di Novacella",Abbazia Novacella,2018,750,white,24,italy,trentino-alto adige,https://www.astorwines.com/SearchResultsSingle...,The palates of many people have been jaded by ...,pinot grigio,alto adige,palates many people jaded pinot grigio often v...,trentino-alto adigealto adige white italy pala...,"[palates, many, people, jaded, pinot, grigio, ...","[trentino-alto, adigealto, adige, white, italy..."
21,"Agrapart ""Venus"" Blanc de Blancs Brut Nature",Agrapart,2013,750,white,224,france,champagne,https://www.astorwines.com/SearchResultsSingle...,"All elegance and minerality, this Champagne is...",chardonnay,côte de blancs,elegance minerality champagne stunning oysters...,champagnecôte de blancs white france elegance ...,"[elegance, minerality, champagne, stunning, oy...","[champagnecôte, de, blancs, white, france, ele..."
22,"Agrapart ""Venus"" Blanc de Blancs Brut Nature",Agrapart,2012,750,white,214,france,champagne,https://www.astorwines.com/SearchResultsSingle...,"All elegance and minerality, this Champagne is...",chardonnay,côte de blancs,elegance minerality champagne stunning oysters...,champagnecôte de blancs white france elegance ...,"[elegance, minerality, champagne, stunning, oy...","[champagnecôte, de, blancs, white, france, ele..."
23,"Agrapart ""Crus 7"" Blanc de Blancs Brut NV",Agrapart,NV,750,white,0,france,champagne,https://www.astorwines.com/SearchResultsSingle...,"Today, brothers Fabrice and Pascal Agrapart co...",chardonnay,côte de blancs,today brothers fabrice pascal agrapart continu...,champagnecôte de blancs white france today bro...,"[today, brothers, fabrice, pascal, agrapart, c...","[champagnecôte, de, blancs, white, france, tod..."
29,"Pinot Blanc, Albert Mann",Albert Mann,2018,750,white,19,france,alsace,https://www.astorwines.com/SearchResultsSingle...,Alsace will always hold a special place in my ...,pinot blanc,0,alsace always hold special place heart region ...,alsace white france alsace always hold special...,"[alsace, always, hold, special, place, heart, ...","[alsace, white, france, alsace, always, hold, ..."


In [28]:
d2v.docvecs.most_similar(23)

[(1302, 0.774288535118103),
 (457, 0.7183742523193359),
 (771, 0.7134526371955872),
 (1413, 0.7104878425598145),
 (586, 0.7024083137512207),
 (510, 0.6950732469558716),
 (1373, 0.6934767961502075),
 (798, 0.6922529935836792),
 (1326, 0.6852031946182251),
 (1327, 0.6799265146255493)]

In [29]:
df.loc[437, 'url']

'https://www.astorwines.com/SearchResultsSingle.aspx?p=1&search=17840&searchtype=Contains'

# TFIDF

In [30]:
tf = TfidfVectorizer(analyzer='word', 
                     ngram_range=(1, 2), 
                     stop_words='english')

tf_matrix = tf.fit_transform(df['description_clean'])
tf_matrix.shape

(12145, 381534)

In [31]:
cosine_sim = cosine_similarity(tf_matrix, tf_matrix)

recommender = pairwise_distances(cosine_sim, metric='cosine')

recommender.shape

(12145, 12145)

In [32]:
recommender

array([[0.        , 0.60636978, 0.72231285, ..., 0.77854966, 0.71070869,
        0.77493859],
       [0.60636978, 0.        , 0.76262784, ..., 0.82937114, 0.76180263,
        0.81035657],
       [0.72231285, 0.76262784, 0.        , ..., 0.56455272, 0.57763284,
        0.62394855],
       ...,
       [0.77854966, 0.82937114, 0.56455272, ..., 0.        , 0.62691373,
        0.66426633],
       [0.71070869, 0.76180263, 0.57763284, ..., 0.62691373, 0.        ,
        0.60203727],
       [0.77493859, 0.81035657, 0.62394855, ..., 0.66426633, 0.60203727,
        0.        ]])

In [33]:
df.head(1)

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text,description_clean_2,text_2
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,franken white germany made silvaner white wine...,"[made, silvaner, white, wine, ferments, skins,...","[franken, white, germany, made, silvaner, whit..."


In [64]:
recommender_df = pd.DataFrame(recommender, 
             index=df.index,
             columns=df['name'])

In [65]:
recommender_df.head()

name,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturkinder","Fledermaus Rot, 2Naturkinder","Sekt Weiss, 50o N NV","Fuga, Mencía, Ribeira Sacra","Cirò Rosso Classico Superiore, A Vita","Dhron Hofberg Riesling Kabinett, A.J. Adam","Areni ""L. 2017021"", Aaron Sedrick Rawlins","Bierzo Mencía, Abad Dom Bueno","The Abarbanel, ""Les Dolmens Rouges"" Cabernet Sauvignon","The Abarbanel, ""Les Chemins de Favarelle"" Chardonnay",...,Weingut Gunther Steinmetz Pinot Noir,Messmer Spatburgunder (1 liter),Muller-Catoir Haardt Spatburgunder Trocken,Weingut Ziereisen Spatburgunder Schulen Ziereisen,Wittmann Spatburgunder,Weingut Darting Pinot Meunier,Hexamer Spatburgunder Weissherbst Halbtrocken,Funf Sangria,Schmitt Sohne Ice Wine (500ML),Dr. Loosen Eiswein Riesling (375ML Half-bottle)
0,0.0,0.60637,0.722313,0.677793,0.726989,0.745429,0.688633,0.684973,0.816716,0.955439,...,0.873551,0.981488,0.8832,0.708942,0.799446,0.835188,0.803975,0.77855,0.710709,0.774939
1,0.60637,0.0,0.762628,0.748901,0.776654,0.745178,0.712811,0.745318,0.849736,0.964093,...,0.895577,0.980191,0.904712,0.72635,0.759531,0.772622,0.799007,0.829371,0.761803,0.810357
2,0.722313,0.762628,0.0,0.487461,0.609532,0.642945,0.610022,0.570958,0.686989,0.919988,...,0.769149,0.989099,0.779797,0.511896,0.62881,0.67874,0.652541,0.564553,0.577633,0.623949
3,0.677793,0.748901,0.487461,0.0,0.477965,0.61979,0.471534,0.429038,0.526843,0.928035,...,0.659638,0.986922,0.726716,0.409554,0.554667,0.605509,0.659389,0.539037,0.495625,0.584161
4,0.726989,0.776654,0.609532,0.477965,0.0,0.68074,0.535245,0.52832,0.607206,0.948577,...,0.728645,0.985973,0.799975,0.552385,0.655437,0.673611,0.712169,0.663807,0.5962,0.682985


In [66]:
tf2 = TfidfVectorizer(analyzer='word', 
                     ngram_range=(1, 2), 
                     stop_words='english')
tf_matrix2 = tf2.fit_transform(df['text'])
cosine_sim = cosine_similarity(tf_matrix2, tf_matrix2)
recommender2 = pairwise_distances(cosine_sim, metric='cosine')
recommender_df2 = pd.DataFrame(recommender2, 
             index=df.index,
             columns=df['name'])

In [79]:
df.loc[5550:5555]

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text,description_clean_2,text_2
5550,Gary Farrell Russian River Selection Pinot Noir,0,2017,750,red,47,usa,"russian river, sonoma county, california",https://www.wine.com/product/gary-farrell-russ...,[This is an outstanding showcase of appellatio...,pinot noir,0,outstanding showcase appellation sourced stell...,"russian river, sonoma county, california red u...","[outstanding, showcase, appellation, sourced, ...","[russian, river,, sonoma, county,, california,..."
5551,Lynmar Winery Russian River Pinot Noir,0,2018,750,red,49,usa,"russian river, sonoma county, california",https://www.wine.com/product/lynmar-winery-rus...,nanThe 2018 Lynmar Winery Russian River Pinot ...,pinot noir,0,nanthe lynmar winery russian river pinot noir ...,"russian river, sonoma county, california red u...","[nanthe, lynmar, winery, russian, river, pinot...","[russian, river,, sonoma, county,, california,..."
5552,Oakville Winery Estate Zinfandel,0,2017,750,red,27,usa,"oakville, napa valley, california",https://www.wine.com/product/oakville-winery-e...,[C: The 2017 Oakville Winery Zinfandel does an...,zinfandel,0,oakville winery zinfandel excellent job presen...,"oakville, napa valley, california red usa oakv...","[oakville, winery, zinfandel, excellent, job, ...","[oakville,, napa, valley,, california, red, us..."
5553,Trefethen Cabernet Sauvignon Estate (375ML hal...,0,2016,375,red,31,usa,"napa valley, california",https://www.wine.com/product/trefethen-caberne...,"[<p><span style=""color: rgb(66, 64, 64);"">With...",cabernet sauvignon,0,span style color rgb fresh herb notes lively a...,"napa valley, california red usa span style col...","[span, style, color, rgb, fresh, herb, notes, ...","[napa, valley,, california, red, usa, span, st..."
5554,Highway 12 Red Blend,0,2016,750,red,24,usa,"sonoma county, california",https://www.wine.com/product/highway-12-red-bl...,"nanA fruity, leathery nose offers a glimpse in...",other red blends,0,nana fruity leathery nose offers glimpse unfol...,"sonoma county, california red usa nana fruity ...","[nana, fruity, leathery, nose, offers, glimpse...","[sonoma, county,, california, red, usa, nana, ..."
5555,Frog's Leap Zinfandel (375ML half-bottle),0,2018,375,red,16,usa,"napa valley, california",https://www.wine.com/product/frogs-leap-zinfan...,"[<p>Close your eyes, smell this wine, and you ...",zinfandel,0,close eyes smell wine might think moved ruther...,"napa valley, california red usa close eyes sme...","[close, eyes, smell, wine, might, think, moved...","[napa, valley,, california, red, usa, close, e..."


In [81]:
recommender_df['Gary Farrell Russian River Selection Pinot Noir'].sort_values()[1:11]

5636    0.307555
5180    0.312158
5053    0.312741
4957    0.312777
5398    0.316472
8554    0.317532
4774    0.317773
5485    0.318231
4731    0.318879
4658    0.320760
Name: Gary Farrell Russian River Selection Pinot Noir, dtype: float64

In [83]:
recommender_df2['Gary Farrell Russian River Selection Pinot Noir'].sort_values()[1:11]

5082    0.209751
4658    0.213270
4859    0.216470
4735    0.217534
4774    0.227940
5863    0.231019
5361    0.232554
6317    0.232586
5422    0.234921
5672    0.235014
Name: Gary Farrell Russian River Selection Pinot Noir, dtype: float64

In [85]:
df.loc[5636, 'name']

'Joseph Phelps Freestone Vineyards Sonoma Coast Pinot Noir'

## Need Pieces - Testing on breakin up by color

Initial stages of building a recommnders. Changed the index to index corresponding to wines. Need to get words to find most similar wine in df to recommend from here.

In [45]:
w2v_sg_white = Word2Vec(df_white['description_clean_2'],
                  min_count=1, 
                  sg=1, 
                  workers=3) 

In [40]:
# recommender_df['Skin Contact Silvaner "Kleine Heimat," 2Naturkinder'].sort_values()[1:11]
recommender_df[df.loc[23]['name']].sort_values()[1:11]

22       0.377764
21       0.377764
1916     0.559868
2480     0.592201
1942     0.599560
2084     0.601497
10144    0.607226
2597     0.613014
1787     0.613121
3077     0.619425
Name: Agrapart "Crus 7" Blanc de Blancs Brut  NV, dtype: float64

In [41]:
df.loc[1916 , 'name']

'Le Brun Servenay, Extra Brut Millésime, Vieilles Vignes, Grand Cru'

In [50]:
w2v_sg_white.wv.most_similar_cosmul(['bright', 'white', 'juicy'])

[('delightfully', 0.8403853178024292),
 ('fragrant', 0.8399611115455627),
 ('grassy', 0.8391693830490112),
 ('appealing', 0.8361043334007263),
 ('bursting', 0.8358244895935059),
 ('delicate', 0.8328315615653992),
 ('alluring', 0.8316115140914917),
 ('burst', 0.8299099206924438),
 ('citric', 0.8294507265090942),
 ('exuberant', 0.8270400166511536)]

In [113]:
def search_docs(word, number_of_articles):
    new_vec = bigrams_model.infer_vector([word])
    tag_list = bigrams_model.docvecs.most_similar([new_vec])[0:number_of_articles]
    sims = []
    tags = []
    for num in tag_list:
        tags.append(num[0])
        sims.append(num[1])
    dfs = []
    for t in tags:
        dfs.append(df_white.iloc[t, :])
    new_df = pd.DataFrame(dfs)
    new_df['similarity_percentage'] = sims
    return new_df

In [121]:
df_white['description'] = [str(row).lstrip('nan') for row in df_white['description']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [122]:
search_docs('acid', 6)

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,text,description_clean_2,text_2,similarity_percentage
12795,Gunderloch Nackenheimer Rothenberg Riesling Au...,0,2017,750.0,white,65,germany,"rheinhessen, germany",https://www.wine.com/product/gunderloch-nacken...,[Super-ripe apricot character and a stunning i...,riesling,0,super ripe apricot character stunning interpla...,"rheinhessen, germany white germany super ripe ...","[super, ripe, apricot, character, stunning, in...","[rheinhessen,, germany, white, germany, super,...",0.995969
7357,Ca'Momi Chardonnay,0,2018,750.0,white,20,usa,"napa valley, california",https://www.wine.com/product/camomi-chardonnay...,"[Buttery, with plush, creamy apple pastry and ...",chardonnay,0,buttery plush creamy apple pastry dried mango ...,"napa valley, california white usa buttery plus...","[buttery, plush, creamy, apple, pastry, dried,...","[napa, valley,, california, white, usa, butter...",0.995503
12720,Von Winning Sauvignon Blanc II,0,2019,750.0,white,24,germany,"pfalz, germany",https://www.wine.com/product/von-winning-sauvi...,"An unmistakeable aroma of juicy gooseberries, ...",sauvignon blanc,0,nanan unmistakeable aroma juicy gooseberries l...,"pfalz, germany white germany nanan unmistakeab...","[nanan, unmistakeable, aroma, juicy, gooseberr...","[pfalz,, germany, white, germany, nanan, unmis...",0.995058
10679,Maison Matisco Saint-Veran,0,2016,750.0,white,23,france,"st-veran, maconnais, burgundy, france",https://www.wine.com/product/maison-matisco-sa...,"This is Maison Matisco’s star wine, with a bea...",chardonnay,burgundy,nanthis maison matisco star wine beautiful pal...,"st-veran, maconnais, burgundy, franceburgundy ...","[nanthis, maison, matisco, star, wine, beautif...","[st-veran,, maconnais,, burgundy,, franceburgu...",0.994713
7389,PlumpJack Reserve Chardonnay,0,2018,750.0,white,59,usa,"napa valley, california",https://www.wine.com/product/plumpjack-reserve...,The 2018 PlumpJack Chardonnay is packed with t...,chardonnay,0,nanthe plumpjack chardonnay packed fresh aroma...,"napa valley, california white usa nanthe plump...","[nanthe, plumpjack, chardonnay, packed, fresh,...","[napa, valley,, california, white, usa, nanthe...",0.99379
2156,"Prosecco ""Fedéra"" Extra Dry, Miotto NV",Miotto,NV,1.5,white,34,italy,veneto,https://www.astorwines.com/SearchResultsSingle...,One of the greatest Prosecco wines to hit our ...,glera,valdobbiadene,one greatest prosecco wines hit shelves quite ...,venetovaldobbiadene white italy one greatest p...,"[one, greatest, prosecco, wines, hit, shelves,...","[venetovaldobbiadene, white, italy, one, great...",0.993132


In [99]:
recommender_df2['EglyOuriet, Grand Cru Brut'].sort_values()[1:6]

10776    0.343650
2724     0.355140
439      0.357030
7870     0.363189
2044     0.375894
Name: EglyOuriet, Grand Cru Brut, dtype: float64

In [97]:
recommender_df2['Covenant "Lavan" Chardonnay, Kosher'].sort_values()[1:3]

851    0.414492
174    0.422725
Name: Covenant "Lavan" Chardonnay, Kosher, dtype: float64

In [98]:
print(df.loc[10776, 'name'])
print(df.loc[2724, 'name'])
print(df.loc[851, 'name'])
print(df.loc[174, 'name'])

Simonnet-Febvre Cremant de Bourgogne Brut
Savart Premier Cru "Accomplie," Extra Brut                                                            NV
Covenant, The Tribe Chardonnay
Baron Herzog Chenin Blanc, Kosher
