In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

%pylab inline
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from gensim import corpora, models, similarities
from nltk.corpus import stopwords
import nltk

import snowballstemmer
import gensim

from string import ascii_lowercase

import itertools

from collections import defaultdict

Populating the interactive namespace from numpy and matplotlib




In [2]:
#import data: one JSON export per wine type
reds = pd.read_json('red_wines.json', orient='columns', typ='frame')
whts = pd.read_json('white_wines.json', orient='columns', typ='frame')
rose = pd.read_json('rose_wines.json', orient='columns', typ='frame')
sprk = pd.read_json('spark_wines.json', orient='columns', typ='frame')

#combine data sets (wine type is in 'Varietal.WineType.Id' and 'Varietal.WineType.Name')
#a single concat replaces the repeated DataFrame.append calls: append was removed
#in pandas 2.0 and re-copied the growing frame on every call
wine = pd.concat([reds, whts, rose, sprk], ignore_index=True)

In [3]:
#use the wine dataframe, column name, and column key as inputs
#also have the notes column specifiable as optional parameter
def select_wines(data, col_name='NaN', col_key='NaN', notes_col='Notes'):
    """Return the ids and tasting notes of the wines that have usable notes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'Id' column and the column named by ``notes_col``.
    col_name, col_key : str, optional
        When given, only rows where ``data[col_name] == col_key`` are kept.
        Leaving both at the 'NaN' sentinel disables this filter.
    notes_col : str, optional
        Name of the column holding the tasting notes.

    Returns
    -------
    pandas.DataFrame
        A copy with the columns ['Id', notes_col], keeping the row index of
        ``data`` for the selected rows.
    """
    notes = data[notes_col]

    #real missing values (NaN/None) never compare equal to the string 'NaN',
    #so they have to be excluded explicitly alongside the literal placeholder
    keep = notes.notna() & (notes != 'NaN')

    #both sentinels untouched -> no column filter requested
    if not (col_name == 'NaN' and col_key == 'NaN'):
        keep = keep & (data[col_name] == col_key)

    #selecting both columns with one mask keeps ids and notes aligned
    #(replaces pd.concat(..., join_axes=...), removed in pandas 1.0)
    return data.loc[keep, ['Id', notes_col]].copy()
    

In [4]:
#cleaning up the test_notes dataframe
#takes as input the notes series and the matching wine ids
def clean_notes(input_list, input_ids):
    """Tokenise, stop-word filter and stem a series of tasting notes.

    Parameters
    ----------
    input_list : pandas.Series
        Raw tasting notes (strings). Missing entries are treated as empty notes.
    input_ids : pandas.Series
        Wine ids, in the same order as ``input_list``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Id' and 'Stemmed' (a list of stemmed tokens per note), one row
        per input note, in input order, on a fresh 0..n-1 index.
    """
    #fillna returns a new series, so the caller's data is never modified;
    #missing notes become '' so that .lower() below cannot fail on NaN
    notes_list = input_list.fillna('')
    w_ids_list = input_ids.tolist()

    #replace 'Winemaker's Notes' tag
    notes_list = notes_list.replace("Winemaker's Notes:", ' ', regex=True)
    #replace punctuation, digits, special characters, etc. with a space
    #raw string: the regex engine sees exactly the same character class as
    #before, without relying on invalid Python escapes such as '\[' and '\]'
    junk_chars = r'''[$&!:"#%'()*+,-.—/;<=>?@\[\]^_`{|}~1234567890’”“′‘\\]'''
    notes_list = notes_list.replace(junk_chars, ' ', regex=True)

    #stemmer and stopwords initialization
    stemmer = snowballstemmer.EnglishStemmer()
    #english stop list extended with filler words, number words and single letters
    stop_words = stopwords.words('english')
    stop_words.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
    #match both the full words and their stems
    stop = set(stop_words) | set(stemmer.stemWords(stop_words))

    #lower-case, drop stopwords, then stem what is left of each note
    notes_stem = [stemmer.stemWords([word for word in line.lower().split() if word not in stop])
                  for line in notes_list.tolist()]

    #both columns are built from plain lists, so they share a fresh 0..n-1 index
    output = pd.DataFrame({'Id':pd.Series(w_ids_list), 'Stemmed':pd.Series(notes_stem)})

    return output
    

In [5]:
def count_notes(wine_notes_stem):
    """Count how often each stem occurs across all notes.

    Parameters
    ----------
    wine_notes_stem : iterable of iterables of str
        One token list per tasting note (e.g. the 'Stemmed' column).

    Returns
    -------
    collections.defaultdict
        Maps each token to its total number of occurrences; unseen tokens
        read as 0.
    """
    freq_stem = defaultdict(int)
    #iterate the argument itself, not the notebook-level `wine_notes` global
    for line in wine_notes_stem:
        for token in line:
            freq_stem[token] += 1

    return freq_stem

In [10]:
#number of wines per varietal, most common first
wine.loc[:, 'Varietal.Name'].value_counts()

Chardonnay               13619
Cabernet Sauvignon       13374
Pinot Noir               11037
Bordeaux Red Blends       7737
Other Red Blends          6949
Syrah/Shiraz              6094
Sauvignon Blanc           5321
Merlot                    4903
Sangiovese                3892
Zinfandel                 3392
Rhone Red Blends          3341
Riesling                  2904
Rosé                      2701
Pinot Gris/Grigio         2368
Other White Blends        2329
Tempranillo               2112
Nebbiolo                  1839
Other White Wine          1744
Malbec                    1732
Other Red Wine            1685
Vintage                   1303
Non-Vintage               1002
Grenache                   789
Viognier                   711
Chenin Blanc               709
Rhone White Blends         687
Gamay                      635
Barbera                    619
Bordeaux White Blends      589
Petite Sirah               562
Gewurztraminer             524
Cabernet Franc             508
Muscat  

In [11]:
#select wines
wine_notes = select_wines(wine, 'Varietal.Name', 'Cabernet Sauvignon')

#clean up tasting notes and attach them to the selected wines
stemmed_notes = clean_notes(wine_notes.Notes, wine_notes.Id)
#clean_notes returns one row per note, in input order, on a 0..n-1 index, so the
#result is attached by position; merging on 'Id' would multiply rows whenever
#an Id occurs more than once
wine_notes = wine_notes.reset_index(drop=True)
wine_notes['Stemmed'] = stemmed_notes['Stemmed']

#count stem frequencies
note_freq = count_notes(wine_notes.Stemmed)

#keep only stems that occur more than 5 times across all selected notes
frequent_stems = [[token for token in line if note_freq[token] > 5]
                  for line in wine_notes.Stemmed]
wine_notes_mult = pd.DataFrame({'Id':wine_notes['Id'],
                                'Stem>5':pd.Series(frequent_stems, index=wine_notes.index)})
wine_notes['Stem>5'] = wine_notes_mult['Stem>5']


In [None]:
#pd.DataFrame(pd.Series(stemmed_notes))[0].tolist()
#return to original form: wine_notes.Stemmed.tolist()
#return to original form: wine_notes['Stem>5'].tolist()

In [12]:
#create dictionary of all stemmed words
dictionary = corpora.Dictionary(wine_notes['Stemmed'].tolist())
#create corpus from dictionary
corpus = [dictionary.doc2bow(line) for line in wine_notes['Stemmed'].tolist()]
#save corpus
corpora.MmCorpus.serialize('cabernet.mm', corpus)

In [None]:
#not sure where to go from here

In [None]:
#stem frequencies as a Series, most frequent stem first
token_counts = pd.Series(dict(note_freq))
token_counts = token_counts.sort_values(ascending=False)
token_counts

In [None]:
#show the token -> integer id mapping held by the dictionary
print(dictionary.token2id)

In [None]:
#fit a tf-idf model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus) #initialize tf-idf model

In [None]:
#hand-written bag-of-words query as (token id, count) pairs
#NOTE(review): ids 0 and 4 are hard-coded; which stems they refer to depends on
#the ids assigned in `dictionary` - confirm this is the intended query
vec = [(0,1),(4,1)]
print(tfidf[vec]) #use model to transform vectors

In [None]:
#index every tf-idf weighted note for similarity queries
#num_features must equal the vocabulary size; the previous hard-coded 12 did not
#match the dictionary built from the tasting notes
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

In [None]:
#query the index with the tf-idf weighted `vec`
#presumably yields one similarity score per indexed note - confirm against the gensim docs
sims = index[tfidf[vec]]
