In [1]:
import math
import numpy as np
import pandas as pd

import keras
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
data = pd.read_csv('../../data/wine_data.csv')
print(data.shape)

# shuffle the data
data = data.sample(frac=1)

data.head()

(150929, 11)


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
138501,138501,Australia,"Wide, smooth and dark on the palate, this blen...",Terra Rossa First Growth,90,70.0,South Australia,Coonawarra,,Cabernet Sauvignon-Merlot,Parker
10427,10427,US,"From a vineyard above Santa Paula, this bottli...",South Mountain Vineyard,88,24.0,California,Ventura County,Central Coast,Petit Verdot,Clos des Amis
29897,29897,Italy,"This offers aromas of violet, rose, Marasca ch...",Bergeisa,90,,Piedmont,Barolo,,Nebbiolo,Le Strette
566,566,US,"This full-bodied wine combines deep, ripe frui...",Made with Organically Grown Grapes,90,22.0,California,Sierra Foothills,Sierra Foothills,Merlot,Chacewater
29089,29089,US,This full and heady wine offers marshmallow oa...,,88,36.0,California,Russian River Valley,Sonoma,Chardonnay,Ledson


In [3]:
# do some preprocessing 
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]

# drop the first column
data = data.drop(data.columns[0], axis=1)
print(data.shape)


(137229, 10)


In [4]:
# anything that occurs less than this will be removed
variety_threshold = 500
value_counts = data['variety'].value_counts()
print(value_counts)

# get variety index that need to be removed 
to_remove = value_counts[value_counts <= variety_threshold].index
print(to_remove)

# replace data with np.nan when variety equals the index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]
print(data.shape)

Chardonnay                       13775
Pinot Noir                       13625
Cabernet Sauvignon               12671
Red Blend                         9377
Sauvignon Blanc                   6054
Syrah                             5667
Riesling                          5212
Merlot                            4987
Bordeaux-style Red Blend          4545
Zinfandel                         3794
Malbec                            3085
Sangiovese                        2879
White Blend                       2554
Tempranillo                       2525
Rosé                              2461
Shiraz                            1945
Sparkling Blend                   1820
Portuguese Red                    1812
Nebbiolo                          1529
Rhône-style Red Blend             1455
Cabernet Franc                    1310
Corvina, Rondinella, Molinara     1292
Pinot Gris                        1275
Pinot Grigio                      1270
Viognier                          1254
Champagne Blend          

In [5]:
# split data into train and test dataset
train_size = int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 95646
Test size: 23912


In [6]:
# train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# train labels
labels_train = data['price'][:train_size]

# test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# test labels
labels_test = data['price'][train_size:]

In [7]:
# create a tokenizer to preprocess our text descriptions

# this is a hyperparameter
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)

# on fit on train
tokenize.fit_on_texts(description_train) 

In [8]:
# wide feature 1: sparse bag of words (bow) vocab_size vector
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

print(description_train.head())
print(description_bow_train)
print(description_bow_train.shape)

566       This full-bodied wine combines deep, ripe frui...
29089     This full and heady wine offers marshmallow oa...
122758    The nose delivers a mix of herbal berry, cola ...
144620    The most depth and weight of the lineup; black...
129541    Very concentrated and closed up, like a coil. ...
Name: description, dtype: object
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
(95646, 12000)


In [9]:
# wide feature 2: one-hot vector of variety categories
print(variety_train)

encoder = LabelEncoder()
encoder.fit(variety_train)

variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)

num_classes = np.max(variety_train) + 1
print(variety_train)
print(num_classes)

# convert labels to one-hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)
print(variety_train)


566                         Merlot
29089                   Chardonnay
122758                   Red Blend
144620                  Pinot Noir
129541                  Chardonnay
49091                     Riesling
38138                         Rosé
118798    Bordeaux-style Red Blend
36060                    Red Blend
100491                   Carmenère
43098                       Malbec
128299                 White Blend
121720                   Zinfandel
139564                    Viognier
120306                      Merlot
18794     Bordeaux-style Red Blend
82820                   Pinot Noir
119777          Cabernet Sauvignon
150030          Cabernet Sauvignon
42109           Cabernet Sauvignon
93304                   Pinot Noir
6783                    Sangiovese
5216            Cabernet Sauvignon
74338              Sauvignon Blanc
20464               Portuguese Red
135583          Cabernet Sauvignon
9434               Sparkling Blend
11954                   Chardonnay
124380              

In [10]:
# define the wide model with the functinoal API
layers = keras.layers

bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)

wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], 
                         outputs=predictions)

wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]              
__________

In [11]:
# deep model features: word enbedding of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)
print(train_embed[0])

[6, 54, 82, 9, 1138, 250, 30, 13, 8, 74, 93, 388, 1, 3, 49, 102, 123, 16, 5058, 22, 3, 690, 4, 3, 346, 1508, 887, 3778, 2, 9, 1919, 2, 4, 5365, 107, 116, 4, 2, 267, 1, 347, 12, 5, 13, 313, 308, 121, 1165, 33, 32, 80, 901]


In [12]:
max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding='post')
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding='post')

In [13]:
# define the deep model with the functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)

deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
deep_model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['accuracy'])

In [15]:
# combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)

combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [16]:
# run training
combined_model.fit([description_bow_train, variety_train] + [train_embed],
                  labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x136b10860>

In [17]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed],
                       labels_test, batch_size=128)



[539.6331252311536, 0.0674138507924478]

In [18]:
# generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])
print(predictions)

[[45.651146 ]
 [15.525355 ]
 [49.130028 ]
 ...
 [34.249504 ]
 [19.732634 ]
 [ 4.6762514]]


In [19]:
# compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

A timid wine, with cheese, strawberry and lime scents followed by a crisp palate of nectarines, petroleum and white flowers. The medium, peachy finish is slightly off-dry. Drink now or hold several years for more aromatic development.
Predicted:  45.651146 Actual:  45.0 

Opens sweet and borderline syrupy, but also dark, earthy and candied, with nothing offensive or too herbal. The palate is jammy and rich, with soft tannins, saturation, black-fruit flavors, chocolate and herbal flavors. Finishes sweet and sticky, with coffee and some mint. Very nice wine but also so ripe that the varietal character is vague.
Predicted:  15.525355 Actual:  24.0 

A blend of grapes including traditional Corvina and Rondinella undergoes air-drying for extra-rich concentration and intensity. That, with careful oak aging, has shaped a richly textured, dense wine that would pair with game meat or bean soup with smoked bacon. It shows raw, slightly chewy tannins on the close.
Predicted:  49.130028 Actual:  2

In [20]:
# compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  29.47046148777008
