In [38]:
import pandas as pd
import string
from glove import Glove

In [64]:
# columns to use
cols = ['description', 'points']

# import data
# index_col=False stops pandas from promoting the first CSV column to the index
reviews_1 = pd.read_csv('../../data/wine-reviews/winemag-data_first150k.csv', index_col=False, usecols=cols)
reviews_2 = pd.read_csv('../../data/wine-reviews/winemag-data-130k-v2.csv', index_col=False, usecols=cols)

print("Number of entries in dataset 1: %s" % reviews_1.shape[0])
print("Number of entries in dataset 2: %s" % reviews_2.shape[0])

# review texts that occur in both files (compared on the description only)
duplicates = set(reviews_1.description).intersection(set(reviews_2.description))

print("\nNumber of duplicate entries across datasets: %s" % len(duplicates))

# concatenate and drop duplicates
# Both files number their rows from 0, so a plain concat would leave repeated
# index labels. ignore_index + reset_index give every surviving review a
# unique, gap-free row label. drop_duplicates compares description AND points.
data = (
    pd.concat([reviews_1, reviews_2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\nNumber of unique reviews: %s" % data.shape[0])

data.head()

Number of entries in dataset 1: 150930
Number of entries in dataset 2: 129971

Number of duplicate entries across datasets: 48346

Number of unique reviews: 169461


Unnamed: 0,description,points
0,This tremendous 100% varietal wine hails from ...,96
1,"Ripe aromas of fig, blackberry and cassis are ...",96
2,Mac Watson honors the memory of a wine once ma...,96
3,"This spent 20 months in 30% new French oak, an...",96
4,"This is the top wine from La Bégude, named aft...",95


In [135]:
# Translation table that deletes every ASCII punctuation character except '%';
# the percent sign is kept so it can be spelled out as a word below.
punc_remove = string.punctuation.replace('%', '')
table = str.maketrans(dict.fromkeys(punc_remove))

# Always start from the raw `description` column. The previous version read
# `description_test` before it existed, so the cell failed on a fresh kernel
# and could not be re-run safely.
data['description_test'] = (
    data.description
    .str.lower()                     # lowercase
    .str.translate(table)            # remove punctuation
    .str.replace('%', ' percent')    # replace percentage sign
    .str.split()                     # split words
)

In [137]:
# Preview the first rows of the frame after tokenisation.
data.head()

Unnamed: 0,description,points,description_test
0,This tremendous 100% varietal wine hails from ...,96,"[this, tremendous, 100, percent, varietal, win..."
1,"Ripe aromas of fig, blackberry and cassis are ...",96,"[ripe, aromas, of, fig, blackberry, and, cassi..."
2,Mac Watson honors the memory of a wine once ma...,96,"[mac, watson, honors, the, memory, of, a, wine..."
3,"This spent 20 months in 30% new French oak, an...",96,"[this, spent, 20, months, in, 30, percent, new..."
4,"This is the top wine from La Bégude, named aft...",95,"[this, is, the, top, wine, from, la, bégude, n..."


In [213]:
import tensorflow as tf
from collections import Counter
import math
import random

In [241]:
def create_vocabulary(documents):
    """Count how often each word occurs across all documents.

    `documents` is an iterable of token sequences; the result is a
    Counter mapping every distinct token to its total number of
    occurrences.
    """
    return Counter(token for tokens in documents for token in tokens)

# One token list per review, then word counts over the whole corpus
documents = data['description_test'].tolist()
vocabulary = create_vocabulary(documents)
vocabulary_size = len(vocabulary)

print("Number of unique words: %s" % vocabulary_size)

Number of unique words: 58541


In [242]:
def top_vocabulary(vocabulary, n_words=10000):
    """Limit vocabulary to highest occurring words and create IDs.

    Parameters
    ----------
    vocabulary : collections.Counter
        Word -> occurrence count.
    n_words : int
        Maximum number of entries in the returned table, including the
        'UNK' placeholder.

    Returns
    -------
    dict
        Word -> integer ID. IDs are contiguous (0 .. len(result) - 1) and
        follow decreasing frequency; 'UNK', the placeholder for rare
        words, always receives the last ID.

    Raises
    ------
    ValueError
        If ``n_words`` is smaller than 1 (no room for 'UNK').
    """
    if n_words < 1:
        raise ValueError("n_words must be at least 1 to hold the 'UNK' placeholder")

    # Skip a literal 'UNK' token so it cannot collide with the placeholder.
    words = [word for word, _ in vocabulary.most_common(n_words - 1) if word != 'UNK']
    words.append('UNK')  # placeholder for rare words

    # Rank-based IDs are reproducible and always valid row indices for a
    # table with len(words) rows. The former random draw came from
    # range(n_words + 1), so it could produce the ID n_words itself.
    return {word: word_id for word_id, word in enumerate(words)}

# Word -> ID table for the most frequent words (default cap: 10000 entries incl. 'UNK')
vocabulary_n = top_vocabulary(vocabulary)

In [243]:
def map_vocabulary(vocabulary, map_table):
    """Give every word of the full vocabulary an integer ID.

    A word that has its own entry in `map_table` keeps that ID; every
    other word is assigned the ID stored under `map_table['UNK']`.
    Returns a dict word -> ID in the iteration order of `vocabulary`.
    """
    return {
        word: map_table[word] if word in map_table else map_table['UNK']
        for word in vocabulary.keys()
    }
    
# ID for every word seen in the corpus; words missing from vocabulary_n get the 'UNK' ID
vocabulary_map_table = map_vocabulary(vocabulary, vocabulary_n)

In [250]:
def map_documents(documents, vocabulary_map_table):
    """Translate each document from a sequence of words into a list of word IDs.

    Every word is looked up in `vocabulary_map_table`; a word without an
    entry raises KeyError. Returns one list of IDs per input document.
    """
    encoded = []
    for tokens in documents:
        encoded.append([vocabulary_map_table[token] for token in tokens])

    return encoded
    
# Every review as a list of integer word IDs
documents_mapped = map_documents(documents, vocabulary_map_table)

In [177]:
# word embeddings

# latent features
embedding_size = 50
# The tables need one row per word ID, not one per raw word: documents are
# encoded with the IDs from vocabulary_n, so rows beyond the largest ID would
# never be looked up. Sizing by len(vocabulary) allocated a row for every
# distinct word in the corpus instead.
vocabulary_size = max(vocabulary_n.values()) + 1
# init values
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# initialize weights and biases for word2vec model
# each word ID gets a weight per latent feature and a single bias
nce_weights = tf.Variable(
  tf.truncated_normal([vocabulary_size, embedding_size],
                      stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [178]:
# Graph inputs: training examples are fed in fixed-size batches
batch_size = 25

# one int32 input ID per example, and one int32 label per example as a column
train_inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
train_labels = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1))

NameError: name 'vocabulary_size' is not defined