In [38]:
import pandas as pd
import string
from glove import Glove

In [64]:
# columns to use
cols=['description', 'points']

# import data
reviews_1 = pd.read_csv('../../data/wine-reviews/winemag-data_first150k.csv', index_col=False, usecols=cols)
reviews_2 = pd.read_csv('../../data/wine-reviews/winemag-data-130k-v2.csv', index_col=False, usecols=cols)

print("Number of entries in dataset 1: %s" %reviews_1.shape[0])
print("Number of entries in dataset 2: %s" %reviews_2.shape[0])

duplicates = set(reviews_1.description).intersection(set(reviews_2.description))

print("\nNumber of duplicate entries across datasets: %s" % len(duplicates))

# concatenate and drop duplicates
data = pd.concat([reviews_1,reviews_2])
data.drop_duplicates(inplace=True)

print("\nNumber of unique reviews: %s" % data.shape[0])

data.head()

Number of entries in dataset 1: 150930
Number of entries in dataset 2: 129971

Number of duplicate entries across datasets: 48346

Number of unique reviews: 169461


Unnamed: 0,description,points
0,This tremendous 100% varietal wine hails from ...,96
1,"Ripe aromas of fig, blackberry and cassis are ...",96
2,Mac Watson honors the memory of a wine once ma...,96
3,"This spent 20 months in 30% new French oak, an...",96
4,"This is the top wine from La Bégude, named aft...",95


In [135]:
punc_remove = string.punctuation
punc_remove = punc_remove.replace('%', '')
table = str.maketrans(dict.fromkeys(punc_remove))

# lowercase
data['description_test'] = data.description_test.str.lower()
# remove punctuation
data['description_test'] = data.description_test.str.translate(table)
# replace percentage sign
data['description_test'] = data.description_test.str.replace('%', ' percent')
# split words
data['description_test'] = data.description_test.str.split()

In [137]:
data.head()

Unnamed: 0,description,points,description_test
0,This tremendous 100% varietal wine hails from ...,96,"[this, tremendous, 100, percent, varietal, win..."
1,"Ripe aromas of fig, blackberry and cassis are ...",96,"[ripe, aromas, of, fig, blackberry, and, cassi..."
2,Mac Watson honors the memory of a wine once ma...,96,"[mac, watson, honors, the, memory, of, a, wine..."
3,"This spent 20 months in 30% new French oak, an...",96,"[this, spent, 20, months, in, 30, percent, new..."
4,"This is the top wine from La Bégude, named aft...",95,"[this, is, the, top, wine, from, la, bégude, n..."


In [254]:
import tensorflow as tf
import numpy as np
import collections
from collections import Counter
import math
import random

In [241]:
def create_vocabulary(documents):
    """Unique words and counts"""
    vocabulary = Counter()

    for row in documents:
        vocabulary.update(row)
        
    return vocabulary

documents = list(data.description_test)
vocabulary = create_vocabulary(documents)
vocabulary_size = len(vocabulary)

print("Number of unique words: %s" % vocabulary_size)

Number of unique words: 58541


In [242]:
def top_vocabulary(vocabulary, n_words=10000):
    """Limit vocabulary to highest occurring words and create IDs."""
    vocabulary_n = list(dict(vocabulary.most_common(n_words - 1)).keys())
    vocabulary_n.append('UNK') # placeholder for rare words
    
    vocabulary_n = dict(zip(vocabulary_n, random.sample(range(0, n_words+1), n_words)))
            
    return vocabulary_n

vocabulary_n = top_vocabulary(vocabulary)

In [243]:
def map_vocabulary(vocabulary, map_table):
    """Map vocabulary words to IDs"""
    vocabulary_map_table = dict.fromkeys(vocabulary.keys(), 0)
    for word in vocabulary:
        if word not in map_table:
            vocabulary_map_table[word] = map_table['UNK']
        else:
            vocabulary_map_table[word] = map_table[word]
            
    return vocabulary_map_table
    
vocabulary_map_table = map_vocabulary(vocabulary, vocabulary_n)

In [250]:
def map_documents(documents, vocabulary_map_table):
    """Map documents to integer word IDs"""
    documents_mapped = [[vocabulary_map_table[word] for word in doc] for doc in documents]
    
    return documents_mapped
    
documents_mapped = map_documents(documents, vocabulary_map_table)         

In [262]:
review_index = 0 # keep track of training batches
word_index = 0
total_num_words = sum([len(doc) for doc in documents_mapped])

In [263]:
total_num_words

6868514

In [300]:
def generate_batch(documents_mapped, batch_size, num_skips, skip_window):
    
    global review_index
    global word_index
#     global total_num_word
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # span considers window on both sides of target word
    # we could potentially consider a full sentence or review
    # as the context, but keeping it simpler for now
    span = 2 * skip_window + 1 
    
    # init buffer (context and target words)
    buffer = collections.deque(maxlen=span)
    
    # go back to first review if no more reviews left
    if review_index >= len(documents_mapped):
        review_index = 0
    
    # make sure there are enough words for skip-gram
    # could consider moving to next review instead of recycling words
    if word_index + span > len(documents_mapped[word_index]):
        word_index = len(documents_mapped[word_index]) - span
    
    # new skip-gram
    buffer.extend(documents_mapped[review_index][word_index:word_index + span])
        
    for i in range(batch_size // num_skips):
        # skip_window is the same as index of target word
        context_words = [w for w in range(span) if w != skip_window]
        # randomly select context word indices to use
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            # target word
            batch[i * num_skips + j] = buffer[skip_window]
            # context word
            labels[i * num_skips + j, 0] = buffer[context_word]
        
        # add next words / new review for skips
        if word_index + span == len(documents_mapped[word_index]):
            # next review
            word_index = 0
            review_index += 1
            # start at the beginning if non left
            if review_index >= len(documents_mapped):
                review_index = 0
            
            buffer.extend(documents_mapped[review_index][word_index:word_index + span])
            
            word_index += span
        else:
            # add next word in review
            buffer.append(documents_mapped[review_index][word_index])
            word_index += 1
            
    return batch, labels
            
batch_size = 50
num_skips = 5
skip_window = 3
test_batch, test_labels = generate_batch(documents_mapped, batch_size, num_skips, skip_window)       

In [301]:
assert test_batch.shape[0] == batch_size

In [302]:
print("Target sample:")
print(test_batch[:10])
print("\nContext Sample:")
print(test_labels[:10])

Target sample:
[1654 1654 1654 1654 1654 5564 5564 5564 5564 5564]

Context Sample:
[[2843]
 [6133]
 [6621]
 [9040]
 [5564]
 [1654]
 [8424]
 [6621]
 [6133]
 [2843]]


In [304]:
embedding_size = 64

batch_size = 50
num_skips = 5
skip_window = 3

num_sampled = 32 # number of negative examples to sample

In [309]:
graph = tf.Graph()

with graph.as_default():
    
    # input data
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=batch_size) # target
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) # context (what we want to predict)
        
    with tf.name_scope('embeddings'):
        # embedding weights
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    # hidden layer weights
    with tf.name_scope('weights'):
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                      stddev=1.0 / math.sqrt(embedding_size)))
    # hidden layers biases
    with tf.name_scope('biases'):
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
    # NCE because softmax is too expensive
    # is it a good or a corrupt pair (context and target)?
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights, 
                biases=nce_biases, 
                labels=train_labels, 
                inputs=embed, 
                num_sampled=num_sampled, 
                num_classes=vocabulary_size))
        
    # for viz
    tf.summary.scalar('loss', loss)
    
    # SGD - minimize loss on train data (see loss above)
    # Learning Rate = 1.0 (don't care about overfitting)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
#     valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid)

    # summary data
    merged = tf.summary.merge_all()
    
    # init variables
    init = tf.global_variables_initializer()
    
    # saver
    saver = tf.train.Saver()    

In [311]:
num_steps = 10001 # number of unique words

with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter('word2vec', session.graph)
    
    init.run()
    print('Initialized')
    
    average_loss = 0
    
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(documents_mapped, batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        run_metadata = tf.RunMetadata()
        
        _, summary, loss_val = session.run(
            [optimizer, merged, loss], 
            feed_dict=feed_dict, 
            run_metadata=run_metadata)
        
        average_loss += loss_val
        
        writer.add_summary(summary, step)
        
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)
            
        if step % 200 == 0:
            if step > 0:
                average_loss /= 200
                
    print('Average loss at step', step, ': ', average_loss)
        
    final_embeddings = normalized_embeddings.eval()

    save.save(session, 'word2vec/word2vec.ckpt')
    
writer.close()
        
        
        
        
        

Initialized
Average loss at step 0 :  159.7444305419922
Average loss at step 1 :  316.1351776123047
Average loss at step 2 :  475.43177795410156
Average loss at step 3 :  595.7382049560547
Average loss at step 4 :  748.8101654052734
Average loss at step 5 :  864.2584457397461
Average loss at step 6 :  1014.1323928833008
Average loss at step 7 :  1154.0315017700195
Average loss at step 8 :  1300.6665573120117
Average loss at step 9 :  1448.2027206420898
Average loss at step 10 :  1600.2970352172852
Average loss at step 11 :  1733.1249618530273
Average loss at step 12 :  1865.9839401245117
Average loss at step 13 :  2020.6288375854492
Average loss at step 14 :  2153.9018783569336
Average loss at step 15 :  2308.083854675293
Average loss at step 16 :  2446.849250793457
Average loss at step 17 :  2583.1024856567383
Average loss at step 18 :  2706.971076965332
Average loss at step 19 :  2857.3503189086914
Average loss at step 20 :  2991.960136413574
Average loss at step 21 :  3124.200019836

KeyboardInterrupt: 

In [177]:
# word embeddings

# latent features
embedding_size = 50
vocabulary_size = len(vocabulary)
# init values
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# initialize weights and biases for word2vect model
# each unique word gets a weight per latent feature and a single bias
nce_weights = tf.Variable(
  tf.truncated_normal([vocabulary_size, embedding_size],
                      stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [178]:
# Placeholders for inputs (reviews are read in in batches during training)
batch_size = 25

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [277]:
batch_size // num_skips

10

In [289]:
test_batch[0][test_batch[0] == 2941]

array([2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941],
      dtype=int32)