In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

%pylab inline
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from gensim import corpora, models, similarities
from nltk.corpus import stopwords
import nltk

import snowballstemmer
import gensim

from string import ascii_lowercase

import itertools

from collections import defaultdict
import collections

Populating the interactive namespace from numpy and matplotlib




In [2]:
#import data
reds = pd.read_json('red_wines.json', orient='columns', typ='frame')
whts = pd.read_json('white_wines.json', orient='columns', typ='frame')
rose = pd.read_json('rose_wines.json', orient='columns', typ='frame')
sprk = pd.read_json('spark_wines.json', orient='columns', typ='frame')

#combine data_sets (wine type is in 'Varietal.WineType.Id' and 'Varietal.WineType.Name')
#DataFrame.append was removed in pandas 2.0; one concat also avoids re-copying the frame per file
wine = pd.concat([reds, whts, rose, sprk], ignore_index=True)

#untouched copy of the combined data, taken before any filtering
wine_orig = wine.copy()
print('number of entries:',len(wine))

number of entries: 111072


In [3]:
len(reds)

72577

In [3]:
#keep only the first row seen for each wine Id
is_repeat_id = wine.duplicated(subset='Id', keep='first')
wine = wine[~is_repeat_id]
print('number of entries:',len(wine))


number of entries: 104787


In [4]:
#use the wine dataframe, column name, and column key as inputs
#also have the notes column specifiable as optional parameter
def select_wines(data, col_name='NaN', col_key='NaN', notes_col='Notes'):
    """Return the 'Id' and notes columns for wines that actually have notes.

    Parameters
    ----------
    data : pandas.DataFrame
        Wine table; must contain 'Id' and ``notes_col``.
    col_name, col_key : optional
        When either differs from the 'NaN' placeholder, only rows where
        ``data[col_name] == col_key`` are kept.
    notes_col : str
        Name of the column holding the tasting notes.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with columns ['Id', notes_col]; the
        original row index is preserved.
    """
    #a note counts as present only if it is neither a real missing value
    #nor the 'NaN' placeholder string (the old check let None/NaN through)
    has_notes = data[notes_col].notna() & (data[notes_col] != 'NaN')

    if col_name == 'NaN' and col_key == 'NaN':
        keep = has_notes
    else:
        keep = (data[col_name] == col_key) & has_notes

    return data.loc[keep, ['Id', notes_col]].copy()
    

In [5]:
#cleaning up the test_notes dataframe
#takes as input the notes dataframe
def clean_notes(input_list, input_ids):
    """Turn raw tasting notes into lists of stemmed, stopword-free tokens.

    Parameters
    ----------
    input_list : pandas.Series of str
        Raw tasting notes. The caller's Series is not modified.
    input_ids : pandas.Series
        Wine ids, in the same order as ``input_list``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Id' and 'Stemmed' (list of stemmed tokens per note) on a
        fresh 0..n-1 index, rows in input order.
    """
    w_ids_list = input_ids.tolist()

    #replace 'Winemaker's Notes' tag (replace returns a new Series, so no defensive copy is needed)
    notes_list = input_list.replace('Winemaker\'s Notes:', ' ', regex=True)
    #replace punctuation, special characters, digits, etc.
    #raw string: same pattern as before, without the invalid \[ and \] escapes of a normal literal
    notes_list = notes_list.replace(r'''[$&!:"#%'()*+,-.—/;<=>?@\[\]^_`{|}~1234567890’”“′‘\\]''', ' ', regex=True)

    #stemmer and stopwords initialization
    stemmer = snowballstemmer.EnglishStemmer()
    stop_words = stopwords.words('english')
    #extend stoplist with filler words, number words and single letters
    stop_words.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
    #filter on the full words as well as their stems
    stop = set(stop_words) | set(stemmer.stemWords(stop_words))

    #lower-case, drop stopwords, then stem what is left
    notes_stem = []
    for line in notes_list.tolist():
        kept = [word for word in line.lower().split() if word not in stop]
        notes_stem.append(stemmer.stemWords(kept))

    return pd.DataFrame({'Id':pd.Series(w_ids_list), 'Stemmed':pd.Series(notes_stem)})
    

In [6]:
def count_notes(wine_notes_stem):
    """Count how often each token occurs across all tokenised notes.

    Parameters
    ----------
    wine_notes_stem : iterable of list of str
        One token list per wine (e.g. the 'Stemmed' column).

    Returns
    -------
    collections.defaultdict
        Maps token -> number of occurrences; unseen tokens read as 0.
    """
    freq_stem = defaultdict(int)
    #iterate the argument: the old body ignored it and read the notebook-global wine_notes
    for line in wine_notes_stem:
        for token in line:
            freq_stem[token] += 1

    return freq_stem

In [223]:
def print_notes(i, wine_data):
    """Print catalogue details and stemmed notes for row label ``i`` of ``wine_data``.

    Looks the wine up in the notebook-level ``wine`` frame by the Id stored
    in ``wine_data``. Returns None.
    """
    wine_id = wine_data.Id[i]
    details = wine[wine.Id == wine_id]
    print(details[['Id','Name','Vintage','Varietal.Name']])
    print(wine_data.Stemmed[i])

In [110]:
#select every wine that has tasting notes
wine_notes = select_wines(wine)

#clean up tasting notes and attach the stemmed tokens by wine Id
stemmed_notes = clean_notes(wine_notes.Notes, wine_notes.Id)
wine_notes = pd.merge(wine_notes, stemmed_notes, how='left', on='Id')

#count stem frequencies
note_freq = count_notes(wine_notes.Stemmed)

#keep only stems that occur more than 5 times overall, then attach by wine Id
wine_notes_mult = []
for line in wine_notes.Stemmed:
    wine_notes_mult.append([token for token in line if note_freq[token] > 5])
wine_notes_mult = pd.DataFrame({'Id': pd.Series(wine_notes.Id), 'Stem>5': pd.Series(wine_notes_mult)})
wine_notes = wine_notes.merge(wine_notes_mult, how='left', on='Id')


#pd.DataFrame(pd.Series(stemmed_notes))[0].tolist()
#return to original form: wine_notes.Stemmed.tolist()
#return to original form: wine_notes['Stem>5'].tolist()

In [111]:
wine_notes[['Id','Notes']][wine_notes.Notes.str.startswith('About')].sort_values(by='Notes')

Unnamed: 0,Id,Notes
12881,74451,About Alexander Valley Vineyards: Alexander V...
19924,52880,About Amity Vineyards: Amity means friendship...
8472,74049,"About Balcom & Moe Winery: In 2001, Balcom & ..."
10454,53544,"About Balcom & Moe Winery: In 2001, Balcom & ..."
10573,61807,"About Balcom & Moe Winery: In 2001, Balcom & ..."
11323,54398,"About Balcom & Moe Winery: In 2001, Balcom & ..."
87400,61618,"About Balcom & Moe Winery: In 2001, Balcom & ..."
67191,53542,"About Balcom & Moe Winery: In 2001, Balcom & ..."
22485,53541,"About Balcom & Moe Winery: In 2001, Balcom & ..."
13109,54397,"About Balcom & Moe Winery: In 2001, Balcom & ..."


In [113]:
#keep only entries that open with the "Winemaker's Notes: " tag (drops e.g. the "About ..." blurbs)
is_winemaker_note = wine_notes.Notes.str.match('Winemaker\'s Notes: ')
wine_notes = wine_notes.loc[is_winemaker_note].copy()

#keep a copy of every entry whose note text occurs more than once
has_twin = wine_notes.duplicated(subset='Notes', keep=False)
wine_SaveDups = wine_notes.loc[has_twin].copy()

#keep the first occurrence of each note text and renumber the rows from 0
wine_notes = wine_notes.drop_duplicates(subset='Notes', keep='first').reset_index(drop=True)


In [336]:
wine_notes.to_pickle('wine_notes.pickle')

In [119]:
#keyword for saving data
data_key = 'wine'

In [120]:
#token lists, one per wine
stemmed_docs = wine_notes['Stemmed'].tolist()

#dictionary over all stemmed words, saved to disk
dictionary = corpora.Dictionary(stemmed_docs)
dictionary.save(data_key + '.dict')

#bag-of-words corpus built with that dictionary, saved to disk
corpus = list(map(dictionary.doc2bow, stemmed_docs))
corpora.MmCorpus.serialize(data_key + '.mm', corpus)

In [121]:
#1. fit a tf-idf model on the bag-of-words corpus and save it to disk
tfidf = models.TfidfModel(corpus)
tfidf.save(data_key+'.tfidf_model')

#apply the tf-idf weighting to the corpus
#NOTE(review): tfidf[corpus] presumably returns a lazy wrapper, not an in-memory copy - confirm
corpus_tfidf = tfidf[corpus]

In [122]:
#2. make lsi model (15 topics) out of the tf-idf corpus and save it to disk
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
lsi.save(data_key+'.lsi')

#apply the lsi model to the tf-idf corpus
corpus_lsi = lsi[corpus_tfidf]

#show lsi topics as (topic id, [(word, weight), ...]) pairs
lsi.show_topics(formatted=False)

[(0,
  [('red', 0.15980691532808758),
   ('cabernet', 0.13872463545244956),
   ('cherri', 0.13531397467683948),
   ('black', 0.13443607105136537),
   ('fruit', 0.12265283193294517),
   ('note', 0.12058316755186818),
   ('tannin', 0.11978117063261282),
   ('flavor', 0.11893816448986058),
   ('spice', 0.11504974245550983),
   ('palat', 0.1139041015084049)]),
 (1,
  [('cabernet', 0.3995078379955424),
   ('merlot', 0.25171123308563093),
   ('sauvignon', 0.19867655469337009),
   ('franc', 0.18969249342857464),
   ('citrus', -0.18143048860650085),
   ('appl', -0.17517935815842217),
   ('blend', 0.155367386015141),
   ('pear', -0.15100807894707582),
   ('chardonnay', -0.14735255685084808),
   ('petit', 0.14697295287253609)]),
 (2,
  [('cabernet', 0.36230807115183239),
   ('sauvignon', 0.27409364548223764),
   ('merlot', 0.23366748454589031),
   ('red', -0.21433115192360774),
   ('franc', 0.20634105778054404),
   ('blend', 0.19717010555328943),
   ('rubi', -0.15789575195242819),
   ('chardonna

In [201]:
#3. make lda model out of tfidf model
#default settings for the most part
lda1 = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics = 30, passes = 1)

#lda1.save(data_key+'.lda')

#transform with the model trained in this cell
#(was `lda`, a different model that only exists if its own cell has already been run)
corpus_lda1 = lda1[corpus_tfidf]

lda1.show_topics(num_topics=30,formatted=False)

[(0,
  [('lime', 0.041359991324368864),
   ('peach', 0.040031444866733955),
   ('apricot', 0.025819038656736289),
   ('viognier', 0.011439772081899459),
   ('crisp', 0.010796188650992767),
   ('pink', 0.010124528920519545),
   ('white', 0.010097226763012739),
   ('zesti', 0.0076796018172910719),
   ('honey', 0.0074433648244017405),
   ('roussann', 0.0066195349577925612)]),
 (1,
  [('salti', 0.034774478263978822),
   ('citric', 0.032799657676356078),
   ('chardonnay', 0.026537805713932942),
   ('anise', 0.020502750217254135),
   ('sancerr', 0.018312187002072895),
   ('numer', 0.015771834197703352),
   ('entre', 0.014170712107362069),
   ('clover', 0.011442895048758082),
   ('pink', 0.011292632475578537),
   ('flash', 0.0096468348907376884)]),
 (2,
  [('sparkl', 0.13396683372780285),
   ('lee', 0.05371652828009111),
   ('jasmin', 0.032288253221840098),
   ('zest', 0.019776584113367832),
   ('citrus', 0.017603768791347809),
   ('glint', 0.016922179600070707),
   ('lumin', 0.01396055369612

In [194]:
#don't like the results - they seem to ignore words associated with red wines, e.g. 'dark', 'jammy', 'plum'
#increase number of topics?
#increase number of passes?
#look at other parameters, e.g. decay, alpha, eta
#randomize the input corpus? right now it's organized red, white, rose, sparkling
#... the topics seem to reflect words more commonly found in sparkling than in red wines, so maybe order matters?

In [207]:
#3. make lda model out of the tfidf model - reverse corpus order

#keyword for saving data
data_key = 'wine_reverse'

#num_topics
n_top = 30

#token lists with the row order flipped (last wine first)
data_input = wine_notes['Stemmed'].sort_index(ascending=False).tolist()

#dictionary of all stemmed words, saved to disk
dict_reverse = corpora.Dictionary(data_input)
dict_reverse.save(data_key + '.dict')

#bag-of-words corpus in the reversed order, saved to disk
corp_reverse = list(map(dict_reverse.doc2bow, data_input))
corpora.MmCorpus.serialize(data_key + '.mm', corp_reverse)

#1. tf-idf model fitted on the reversed corpus, saved to disk
tfidf_reverse = models.TfidfModel(corp_reverse)
tfidf_reverse.save(data_key + '.tfidf_model')

#tf-idf weighted corpus
corp_reverse_tfidf = tfidf_reverse[corp_reverse]

#3. lda model on the tf-idf corpus, default settings for the most part
lda_reverse = models.LdaModel(
    corp_reverse_tfidf,
    id2word=dict_reverse,
    num_topics=n_top,
    passes=1,
)
lda_reverse.save(data_key + '.lda')

#per-document topic distributions
corp_reverse_lda = lda_reverse[corp_reverse_tfidf]

lda_reverse.show_topics(num_topics=n_top, formatted=False)

[(0,
  [('blue', 0.022280660591453651),
   ('bird', 0.015751931283863301),
   ('crack', 0.015319407084167088),
   ('minti', 0.014137180107482761),
   ('vegetarian', 0.012176522297420297),
   ('peppercorn', 0.011513058321745056),
   ('underscor', 0.011368282061916687),
   ('anis', 0.010401974210280126),
   ('briar', 0.0098393134910892202),
   ('damp', 0.0091229243922045632)]),
 (1,
  [('plush', 0.054316861197532991),
   ('chop', 0.0329724852085325),
   ('scarlet', 0.012086855447314282),
   ('entranc', 0.0110015534344422),
   ('neighbor', 0.010302776474770894),
   ('desir', 0.010180699361516304),
   ('quaffabl', 0.01013687852064268),
   ('undercurr', 0.0076809109985818596),
   ('ink', 0.0075794687281939728),
   ('piec', 0.0074034457652030483)]),
 (2,
  [('classico', 0.037659725825832963),
   ('plummi', 0.034984729799094567),
   ('rabbit', 0.01509193982576372),
   ('au', 0.013239178055603868),
   ('il', 0.01197043127350874),
   ('reddish', 0.01196869423823004),
   ('anim', 0.0114009164592

In [None]:
#results now are exclusively terms associated w red wines...

In [234]:
#3. make lda model out of the tfidf model - shuffled corpus order
#fixed seed so the shuffle and the LDA fit below give the same result on every run
shuffle_seed = 42

#shuffle the rows --> .sample(frac=1, replace=False), then renumber them from 0
data_shuff = wine_notes.sample(frac=1, replace=False, random_state=shuffle_seed).reset_index(drop=True)
data_input = data_shuff['Stemmed'].tolist()
#keyword for saving data
data_key = 'wine_shuffled'

#num_topics
n_top = 30

#create dictionary of all stemmed words --> don't need to repeat this step every time
dict_shuffled = corpora.Dictionary(data_input)
dict_shuffled.save(data_key+'.dict')
#create corpus from dictionary --> insert reshuffled data here - possible to reshuffle corpus?
corp_shuffled = [dict_shuffled.doc2bow(line) for line in data_input]
#save corpus
corpora.MmCorpus.serialize(data_key+'.mm', corp_shuffled)

#1. convert to tf-idf model
tfidf_shuffled = models.TfidfModel(corp_shuffled)
tfidf_shuffled.save(data_key+'.tfidf_model')

#tf-idf weighted corpus
corp_shuffled_tfidf = tfidf_shuffled[corp_shuffled]

#3. make lda model out of tfidf model
#default settings for the most part; random_state pins the model's random initialisation
lda_shuffled = models.LdaModel(corp_shuffled_tfidf, id2word=dict_shuffled, num_topics = n_top, passes = 1, random_state=shuffle_seed)

lda_shuffled.save(data_key+'.lda')

corp_shuffled_lda = lda_shuffled[corp_shuffled_tfidf]

lda_shuffled.show_topics(num_topics=n_top,formatted=False)

[(0,
  [('ether', 0.01171014026394592),
   ('decid', 0.01034717447981948),
   ('biodynam', 0.010003410668146414),
   ('casual', 0.009709029048863423),
   ('oili', 0.0095331852576981593),
   ('heaven', 0.0086857138929141257),
   ('transpar', 0.008504429136171162),
   ('hors', 0.0081572034134494622),
   ('toro', 0.0080464051725413054),
   ('share', 0.0076425958386918376)]),
 (1,
  [('butteri', 0.017780563070804979),
   ('dinner', 0.013699958272200802),
   ('curd', 0.013128416033930146),
   ('popular', 0.010196720810107203),
   ('claret', 0.010156704076519125),
   ('treat', 0.0095334307143644624),
   ('april', 0.0077227669899960849),
   ('ice', 0.0076550101494483967),
   ('sauté', 0.0076224455078210676),
   ('wildflow', 0.0075793087745906146)]),
 (2,
  [('venison', 0.019239986251989579),
   ('recal', 0.016998191797427547),
   ('allspic', 0.015551085106018049),
   ('draw', 0.014941128981160977),
   ('vegetarian', 0.012933750743480127),
   ('entre', 0.012196653166225839),
   ('filter', 0.01

In [293]:
#materialise the per-document topic distributions as a plain list
wine_lda = list(corp_shuffled_lda)
wine_lda[0]

[(6, 0.82666730180461578)]

In [317]:
#pick the first wine (w) and the first of its (topic id, probability) pairs (i)
w = 0
i=0
#topic id of that pair
wine_lda[w][i][0]

6

In [326]:
#dense topic vector for one wine: one slot per topic, filled from a (topic id, probability) pair
#start from float zeros so that storing a probability does not have to upcast an integer Series
indiv_probs = pd.Series(0.0, index=range(n_top))
indiv_probs.iloc[wine_lda[w][i][0]] = wine_lda[w][i][1]
indiv_probs

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.826667
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.000000
24    0.000000
25    0.000000
26    0.000000
27    0.000000
28    0.000000
29    0.000000
dtype: float64

In [327]:
#list of (topic id, probability) lists (wine_lda) into a dense dataframe:
#one row per wine, one column per topic, 0.0 where a topic was not reported
#fill a preallocated array instead of appending one row at a time
#(DataFrame.append re-copied the whole frame per row and was removed in pandas 2.0)
topic_matrix = np.zeros((len(wine_lda), n_top))
for row, doc_topics in enumerate(wine_lda):
    for topic_id, prob in doc_topics:
        topic_matrix[row, topic_id] = prob
wine_lda_prob = pd.DataFrame(topic_matrix, columns=range(n_top))

In [328]:
wine_lda_prob

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.826667,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.455158,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.208923
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.281241,0.556693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038114,0.000000,0.000000,0.000000,...,0.456103,0.000000,0.369489,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.289002,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.413905,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.297096,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.542113,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.041202,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.333659,0.000000,0.038967,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.052702,0.000000,0.825363,0.000000,0.000000,0.000000
8,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,...,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617,0.010617
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.820090,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [262]:
#with shuffled input data, now mess with decay
lda_shuffled_k10 = models.LdaModel(
    corp_shuffled_tfidf,
    id2word=dict_shuffled,
    num_topics=n_top,
    decay=1,
    passes=1,
)

#this variant is neither saved nor used to re-transform the shuffled corpus

lda_shuffled_k10.show_topics(num_topics=n_top, formatted=False)

[(0,
  [('vineyard', 0.0097338321089430625),
   ('pinot', 0.0059734707991251869),
   ('santa', 0.00565709794133151),
   ('noir', 0.0056110592921068819),
   ('valley', 0.004887486390270924),
   ('mountain', 0.0046159756298834358),
   ('barrel', 0.0044593544044123842),
   ('river', 0.0044431927635295845),
   ('soil', 0.0043766488215080026),
   ('russian', 0.0041451990552652286)]),
 (1,
  [('touriga', 0.003987736172723199),
   ('brick', 0.0038694537170429542),
   ('year', 0.0036465981203596472),
   ('pinot', 0.0033647464976159203),
   ('wine', 0.0033551198133302202),
   ('cherri', 0.0032096291070427413),
   ('red', 0.0032008717310825852),
   ('fruit', 0.003058661331901641),
   ('note', 0.0029146336658393275),
   ('tinta', 0.0029126142002028712)]),
 (2,
  [('sparkl', 0.0080547206876144858),
   ('semillon', 0.0076471271250711514),
   ('nutti', 0.0072189836730243585),
   ('brut', 0.0053399002658216128),
   ('hors', 0.0052597688275080896),
   ('bubbl', 0.0052129425619409269),
   ('sushi', 0.0

In [332]:
#show catalogue details and stemmed notes for row 0 of the shuffled data
wine_search = 0
print_notes(wine_search, data_shuff)

           Id                                               Name Vintage  \
42741  103364  Domaine Serene Jerusalem Hill Pinot Noir (375M...    2005   

      Varietal.Name  
42741    Pinot Noir  
['beauti', 'aroma', 'dark', 'cherri', 'red', 'raspberri', 'pipe', 'tobacco', 'milk', 'chocol', 'pomegran', 'earth', 'palat', 'rich', 'suppl', 'show', 'black', 'raspberri', 'cherri', 'pie', 'black', 'currant', 'rose', 'violet', 'oak', 'spice', 'persist', 'seamless', 'fruit']


In [334]:
#topic distribution of the selected wine under the shuffled-corpus model
wine_topics = lda_shuffled.get_document_topics(corp_shuffled_tfidf[wine_search])
#order the (topic id, probability) pairs by probability, ties by topic id, highest first
topics_wine = sorted(wine_topics, key=lambda pair: (pair[1], pair[0]), reverse=True)
pprint(topics_wine)
#top 30 words of the most probable topic
lda_shuffled.show_topic(topics_wine[0][0],topn=30)

[(6, 0.82666730180407166)]


[('black', 0.0073323658436459235),
 ('dark', 0.0068364028857192515),
 ('cherri', 0.0068112118008341656),
 ('tannin', 0.0057336211193474585),
 ('spice', 0.0051240840499943612),
 ('blackberri', 0.0047960349630945661),
 ('palat', 0.0045786611990393028),
 ('plum', 0.0044540137335985063),
 ('chocol', 0.0043951927290887018),
 ('fruit', 0.0043899021224126185),
 ('red', 0.0043487957595114415),
 ('rich', 0.004336082976918643),
 ('nose', 0.0043031170803746119),
 ('wine', 0.004287182987361921),
 ('note', 0.0042465327876498706),
 ('flavor', 0.0040257333953266751),
 ('boysenberri', 0.0040015481432190162),
 ('finish', 0.0039529249455613272),
 ('cabernet', 0.0037958289857205492),
 ('ripe', 0.0037814704657402084),
 ('long', 0.0037653868405781786),
 ('complex', 0.0036465765470249148),
 ('aroma', 0.003642377232750852),
 ('raspberri', 0.003631953759635616),
 ('sweet', 0.0036317453227137919),
 ('layer', 0.0036087698108080513),
 ('hint', 0.0035764888044848269),
 ('structur', 0.0035675555436689159),
 ('blue

In [335]:
#topic distribution of the selected wine under the decay=1 model
wine_topics = lda_shuffled_k10.get_document_topics(corp_shuffled_tfidf[wine_search])
#order the (topic id, probability) pairs by probability, ties by topic id, highest first
topics_wine = sorted(wine_topics, key=lambda pair: (pair[1], pair[0]), reverse=True)
pprint(topics_wine)
#top 30 words of the most probable topic
lda_shuffled_k10.show_topic(topics_wine[0][0],topn=30)

[(6, 0.7502997973148392), (21, 0.082344494082523159)]


[('red', 0.01399524964739398),
 ('cherri', 0.010746266010098367),
 ('black', 0.010352302786087253),
 ('tannin', 0.01000378550688533),
 ('dark', 0.008940847223870162),
 ('deep', 0.00887409352842378),
 ('plum', 0.0088738356554184054),
 ('rubi', 0.0088685224038281985),
 ('spice', 0.0085415630014319226),
 ('color', 0.0084980423716148456),
 ('intens', 0.007898442351038917),
 ('ripe', 0.0076894730713625761),
 ('blackberri', 0.0074478907936398817),
 ('nose', 0.0074463354675700271),
 ('berri', 0.007247851905600418),
 ('note', 0.0071921326376178894),
 ('fruit', 0.0070714073103144443),
 ('hint', 0.0069937247642903469),
 ('aroma', 0.0069583958019055793),
 ('palat', 0.0069570145154413546),
 ('full', 0.0069246243623451867),
 ('soft', 0.0069010519280370526),
 ('finish', 0.0068652825866745244),
 ('long', 0.0066400295694770406),
 ('bodi', 0.0066059557419647719),
 ('violet', 0.0065454881655419182),
 ('vanilla', 0.0063646154582876436),
 ('rich', 0.006268210367449698),
 ('chocol', 0.0062101074440030993),

In [123]:
#3. make lda model out of tfidf model - increase passes
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics = 30, passes = 100)

#save under the fixed 'wine' key that matches corpus_tfidf/dictionary: data_key is reassigned by the
#reverse/shuffled cells, so using it here would overwrite 'wine_shuffled.lda' when run top to bottom
lda.save('wine'+'.lda')

corpus_lda = lda[corpus_tfidf]

lda.show_topics(num_topics=30,formatted=False)

[(0,
  [('fresh', 0.012936801723876117),
   ('appl', 0.01279115186719384),
   ('lime', 0.012264979517310972),
   ('green', 0.011062888979276564),
   ('white', 0.010782773860141474),
   ('fish', 0.010522204131387118),
   ('pair', 0.010247275239228255),
   ('light', 0.0098123912730184136),
   ('dish', 0.0094384904725869252),
   ('crisp', 0.0090658317471460151)]),
 (1,
  [('pear', 0.019200973510115762),
   ('pale', 0.014011592408454437),
   ('peach', 0.011384129508450875),
   ('note', 0.0095458533750616831),
   ('palat', 0.0093376376399218336),
   ('finish', 0.0090794391302572545),
   ('fresh', 0.0089364959273550964),
   ('aroma', 0.008651189744085153),
   ('hint', 0.0083798790702353954),
   ('apricot', 0.0081788499151035053)]),
 (2,
  [('blanc', 0.14764763873629869),
   ('passion', 0.030828737145068247),
   ('tropic', 0.029631908474105975),
   ('quinc', 0.023787449964005534),
   ('watermelon', 0.021416050372967507),
   ('spectat', 0.020323858751197802),
   ('effervesc', 0.016779389671805

In [137]:
#3. make lda model out of tfidf model
#NOTE(review): decay = 0 looks to be outside the range gensim documents for this parameter - confirm
lda0 = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics = 30, decay = 0, alpha = 'auto', passes = 1)

#lda0.save(data_key+'.lda')

#transform with the model trained in this cell (was `lda`, the 100-pass model)
corpus_lda0 = lda0[corpus_tfidf]

lda0.show_topics(num_topics=30,formatted=False)

[(0,
  [('tini', 0.16500799864031579),
   ('appet', 0.16405406053185345),
   ('distinguish', 0.052040762065014221),
   ('cru', 0.041926180600536329),
   ('contact', 0.032539114226988679),
   ('premier', 0.031732436840680889),
   ('drunk', 0.028851969030847462),
   ('knit', 0.024180529784865644),
   ('authent', 0.020467751950641374),
   ('introduc', 0.019225463166996234)]),
 (1,
  [('pink', 0.43102086019365465),
   ('semi', 0.038037643133580289),
   ('simpl', 0.03362108991934723),
   ('hors', 0.02466731657693115),
   ('onion', 0.02380850842541922),
   ('transpar', 0.022354714282257264),
   ('en', 0.017922485059627936),
   ('word', 0.014287367729010136),
   ('effus', 0.0092976132932424162),
   ('lilac', 0.0084199685247887696)]),
 (2,
  [('discreet', 0.084370728148312119),
   ('di', 0.075177964214069595),
   ('direct', 0.051740630770232367),
   ('sensual', 0.051358615637056342),
   ('ginger', 0.044607788127506154),
   ('slice', 0.033795937280225272),
   ('thin', 0.022429056986938384),
   

[(0,
  [('rosé', 0.01622450637962106),
   ('ferment', 0.013770759011348336),
   ('chardonnay', 0.013731578232417701),
   ('mouss', 0.0092911282805901736),
   ('grape', 0.0085023354321357488),
   ('yeast', 0.0071249047485134054),
   ('aperitif', 0.0065903245921661363),
   ('steel', 0.0065258845823634466),
   ('vineyard', 0.0064381447744908064),
   ('stainless', 0.0064020482357216831)]),
 (1,
  [('grapefruit', 0.055952084008640594),
   ('honeysuckl', 0.030482662732476636),
   ('tangerin', 0.018556224488829924),
   ('cava', 0.016261434253163929),
   ('grenach', 0.014075710196333836),
   ('cinsault', 0.01133801145024612),
   ('sur', 0.0096773970031734293),
   ('roussann', 0.0079335388999883186),
   ('muscat', 0.0078803822223963219),
   ('viognier', 0.0074069820855342198)]),
 (2,
  [('lemon', 0.031667312423980867),
   ('yellow', 0.029449619325111143),
   ('straw', 0.026200312317385545),
   ('pale', 0.021314056473782556),
   ('fish', 0.011021913258900031),
   ('shellfish', 0.0104705061980710

In [190]:
wine_search = 5240
#print_notes takes the frame the row refers to; corpus rows are in wine_notes order
print_notes(wine_search, wine_notes)
wine_topics = lda0.get_document_topics(corpus_tfidf[wine_search])
print(wine_topics)
#order the (topic id, probability) pairs by probability, ties by topic id, highest first
topics_wine = sorted(wine_topics, key=lambda pair: (pair[1], pair[0]), reverse=True)
#top 50 words of the most probable topic
lda0.show_topic(topics_wine[0][0],topn=50)

          Id                                    Name Vintage Varietal.Name
6337  117301  Bodegas Olivares Altos de la Hoya 2010    2010     Mourvedre
['deep', 'rubi', 'color', 'ripe', 'power', 'scent', 'red', 'dark', 'fruit', 'miner', 'spice', 'finish', 'fresh', 'long', 'repeat', 'dark', 'fruit', 'note', 'except', 'combin', 'red', 'meat', 'stew', 'heat', 'love', 'monastrel', 'known', 'mourvèdr', 'franc', 'rhone', 'valley', 'jumilla', 'record', 'show', 'monastrel', 'use', 'least', 'earli', 'th', 'centuri', 'like', 'rhone', 'jumilla', 'get', 'extrem', 'hot', 'summer', 'day', 'foot', 'elev', 'night', 'cool', 'allow', 'grape', 'becom', 'physiolog', 'ripe', 'maintain', 'acid']
[(7, 0.020650972196068294), (13, 0.012156527779136799), (17, 0.021796653836788783), (22, 0.10961170368726297), (23, 0.015277123438814334), (24, 0.016717362288006691), (25, 0.011197844641213501), (26, 0.013279030599965644), (27, 0.44543442952902718), (28, 0.24033034348579826), (29, 0.024376825133051946)]


[('bubbl', 0.017628529219647922),
 ('fresh', 0.015424090812868982),
 ('fine', 0.014156573454774756),
 ('fruiti', 0.012808139001657106),
 ('light', 0.011744898769629577),
 ('delic', 0.011310062249974299),
 ('sparkl', 0.010646621939286557),
 ('aperitif', 0.0099298319291341603),
 ('yellow', 0.0092473967202246991),
 ('pinot', 0.0087995930833953927),
 ('rose', 0.0086054061805203273),
 ('persist', 0.0084690963772800863),
 ('white', 0.0080815237119593892),
 ('color', 0.0079502202848328717),
 ('pale', 0.0076692979460039543),
 ('eleg', 0.00766792614462197),
 ('crisp', 0.0075304830386870789),
 ('dessert', 0.0075227866529618669),
 ('creami', 0.0074782612826338919),
 ('perfect', 0.0071688256677290615),
 ('aroma', 0.007153125430857314),
 ('pleasant', 0.0070480186605029176),
 ('live', 0.0069589668203327007),
 ('note', 0.0069084178264622181),
 ('hint', 0.0067448419316992391),
 ('fruit', 0.00666440107209416),
 ('palat', 0.0066466419775917489),
 ('dri', 0.0065232256426533139),
 ('balanc', 0.00648807126

In [136]:
lda.get_document_topics(corpus_tfidf[wine_search], per_word_topics=True)

([(0, 0.22920071732077349),
  (1, 0.48958571392477745),
  (22, 0.065722446059508888),
  (26, 0.056193042219192607)],
 [(10, [1]),
  (122, [1, 0]),
  (176, [1, 0]),
  (187, [1, 0]),
  (198, [1, 0]),
  (203, [1, 0]),
  (242, [1, 0]),
  (272, [0, 1]),
  (330, [1, 0]),
  (379, [1]),
  (426, [1, 0]),
  (452, [1, 0]),
  (459, [0]),
  (464, [1, 0]),
  (799, [26]),
  (879, [1, 0]),
  (886, [1, 0]),
  (928, [22]),
  (1020, [1]),
  (1046, [1, 0]),
  (2644, [0]),
  (3271, [1])],
 [(10, [(1, 0.032486376324312193)]),
  (122, [(0, 0.02004309119367435), (1, 0.081790230834165617)]),
  (176, [(0, 0.023484407966476134), (1, 0.076201495739499595)]),
  (187, [(0, 0.024604332465218291), (1, 0.1485914442141068)]),
  (198, [(0, 0.039494991642498971), (1, 0.12697942946308519)]),
  (203, [(0, 0.037129704181457189), (1, 0.14448446977418358)]),
  (242, [(0, 0.035247657142874526), (1, 0.16158998828817711)]),
  (272, [(0, 0.094714225962059084), (1, 0.063365931442456322)]),
  (330, [(0, 0.064871451578245601), (1, 0

In [None]:
#the index must span the whole vocabulary: tf-idf vectors are keyed by dictionary term ids,
#so num_features is the dictionary size, not the number of LDA topics (30)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
vec = corpus[0]
print(tfidf[vec])
#similarity of document 0 against every document in the index
sims = index[tfidf[vec]]

In [None]:
#manually calculating cosine similarities between 2 vectors
def cos_sim(corpus_i, corpus_j):
    """Cosine similarity of two sparse vectors given as (term_id, weight) pairs.

    Parameters
    ----------
    corpus_i, corpus_j : iterable of (term_id, weight)
        Sparse vectors; the pairs may be in any order.

    Returns
    -------
    float
        Dot product divided by the product of the Euclidean norms, or
        ``np.nan`` when either vector is empty or has zero norm.
    """
    pairs_i = list(corpus_i)
    pairs_j = list(corpus_j)
    if not pairs_i or not pairs_j:
        return np.nan

    #look weights up by term id so each shared term is multiplied with its own partner
    #(pairing by position is wrong when the two vectors list shared terms in different orders)
    weight_j = dict(pairs_j)
    numer = sum(w * weight_j[term] for term, w in pairs_i if term in weight_j)

    #product of the two Euclidean norms
    denom_i = np.sqrt(sum(w * w for _, w in pairs_i))
    denom_j = np.sqrt(sum(w * w for _, w in pairs_j))
    denom = denom_i * denom_j
    if denom == 0:
        return np.nan

    return numer / denom

In [None]:
#calculate the most similar wines to given entry
#search_id is a row position in corpus / wine_notes, not a wine Id
search_id = 25794

#the query's tf-idf vector does not change inside the loop, so compute it once
query_vec = tfidf[corpus[search_id]]
cos_test = [cos_sim(query_vec, tfidf[doc]) for doc in corpus]
#print_notes takes the frame the row refers to; corpus rows are in wine_notes order
print_notes(search_id, wine_notes)
#ten highest similarities to the query wine
pd.DataFrame(cos_test).sort_values(by=0, ascending=False).head(10)

In [None]:
# find top 10 and print notes
top10 = pd.DataFrame(cos_test).sort_values(by=0, ascending=False).head(10)

#top10.index holds row positions in wine_notes; translate them to wine Ids
top10_ids = wine_notes.iloc[top10.index].Id

#DataFrame.append was removed in pandas 2.0; concat stacks the per-wine lookups in ranking order
selection = pd.concat([wine[wine.Id == wine_id][['Name','Vintage','Varietal.Name']] for wine_id in top10_ids])
print(selection)
#print_notes takes the frame the row refers to
for i in top10.index: print_notes(i, wine_notes)

In [None]:
def find_top_matches(id_list):
    """Collect catalogue rows for the wines at the given ``wine_notes`` row labels.

    Parameters
    ----------
    id_list : iterable
        Row labels into the notebook-level ``wine_notes`` frame.

    Returns
    -------
    pandas.DataFrame
        Rows of the notebook-level ``wine`` frame (columns 'Id', 'Name',
        'Vintage', 'Varietal.Name') in the order requested; an empty frame
        when ``id_list`` is empty.
    """
    frames = []
    for i in id_list:
        frames.append(wine[wine.Id==wine_notes.Id[i]][['Id','Name','Vintage','Varietal.Name']])

    if not frames:
        #nothing requested: pd.concat rejects an empty list and there is no last entry to print
        return pd.DataFrame()

    #as in the original layout, only the stemmed notes of the last requested entry are printed
    #NOTE(review): the print may have been meant to run once per match - confirm
    print(wine_notes.Stemmed[i])
    return pd.concat(frames)

In [None]:
#catalogue details plus raw notes for the top-10 matches, in ranking order
#(DataFrame.append was removed in pandas 2.0; concat stacks the per-wine lookups)
selection = pd.concat([wine[wine.Id == i][['Name','Vintage','Varietal.Name','Notes']] for i in wine_notes.iloc[top10.index].Id])
selection

In [None]:
n = 0
#each corpus entry is a (term id, count) pair: the id is at position 0 of the pair,
#not at position n (indexing with the document number only worked because n == 0)
test1 = [term_id for term_id, _ in corpus[n]]
for term_id in test1: print(term_id, dictionary[term_id])

In [None]:
n = 21078
#term ids present in document n
test1 = [term_id for term_id, _ in corpus[n]]
#print every term id next to its stem
for term_id in test1:
    print(term_id, dictionary[term_id])

In [None]:
#LDA with TFIDF model