In [None]:
import math
import numpy as np
import pandas as pd

import keras
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('../../data/wine_data.csv')
print(data.shape)

# shuffle the data
data = data.sample(frac=1)

data.head()

(150929, 11)


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
7940,7940,Italy,"This elegant, full-bodied red opens with a lov...",Musmeci Riserva,92,50.0,Sicily & Sardinia,Etna,,Red Blend,Tenuta di Fessina
77573,77573,Italy,"This delivers excellent value, with dark fruit...",Grifalco,88,14.0,Southern Italy,Aglianico del Vulture,,Aglianico,Lucania
16571,16571,US,"Clean, balanced and smooth despite the tannins...",Big John's Vineyard,84,24.0,California,Calaveras County,Sierra Foothills,Zinfandel,Newsome-Harlow
38578,38578,France,"Big, rich, almost velvet in texture, this is a...",,93,105.0,Bordeaux,Margaux,,Bordeaux-style Red Blend,Château Rauzan-Ségla
134070,134070,France,"Very perfumed wine, from organic grapes, the b...",L de Lagarde,86,,Bordeaux,Bordeaux,,Bordeaux-style Red Blend,Château de Lagarde


In [3]:
# do some preprocessing 
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]

# drop the first column
data = data.drop(data.columns[0], axis=1)
print(data.shape)


(137229, 10)


In [4]:
# anything that occurs less than this will be removed
variety_threshold = 500
value_counts = data['variety'].value_counts()
print(value_counts)

# get variety index that need to be removed 
to_remove = value_counts[value_counts <= variety_threshold].index
print(to_remove)

# replace data with np.nan when variety equals the index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]
print(data.shape)

Chardonnay                       13775
Pinot Noir                       13625
Cabernet Sauvignon               12671
Red Blend                         9377
Sauvignon Blanc                   6054
Syrah                             5667
Riesling                          5212
Merlot                            4987
Bordeaux-style Red Blend          4545
Zinfandel                         3794
Malbec                            3085
Sangiovese                        2879
White Blend                       2554
Tempranillo                       2525
Rosé                              2461
Shiraz                            1945
Sparkling Blend                   1820
Portuguese Red                    1812
Nebbiolo                          1529
Rhône-style Red Blend             1455
Cabernet Franc                    1310
Corvina, Rondinella, Molinara     1292
Pinot Gris                        1275
Pinot Grigio                      1270
Viognier                          1254
Champagne Blend          

In [5]:
# split data into train and test dataset
train_size = int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 95646
Test size: 23912


In [6]:
# train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# train labels
labels_train = data['price'][:train_size]

# test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# test labels
labels_test = data['price'][train_size:]

In [7]:
# create a tokenizer to preprocess our text descriptions

# this is a hyperparameter
vocab_size = 12000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)

# on fit on train
tokenize.fit_on_texts(description_train) 

In [8]:
# wide feature 1: sparse bag of words (bow) vocab_size vector
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

print(description_train.head())
print(description_bow_train)
print(description_bow_train.shape)

7940      This elegant, full-bodied red opens with a lov...
16571     Clean, balanced and smooth despite the tannins...
38578     Big, rich, almost velvet in texture, this is a...
67627     Fruity but common on the nose, and acidic and ...
113193    Like most of the wines from Neil Ellis, this S...
Name: description, dtype: object
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
(95646, 12000)


In [9]:
# wide feature 2: one-hot vector of variety categories
print(variety_train)

encoder = LabelEncoder()
encoder.fit(variety_train)

variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)

num_classes = np.max(variety_train) + 1
print(variety_train)
print(num_classes)

# convert labels to one-hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)
print(variety_train)


7940                          Red Blend
16571                         Zinfandel
38578          Bordeaux-style Red Blend
67627                            Malbec
113193                  Sauvignon Blanc
107451                          Barbera
111701                       Chardonnay
112073                        Red Blend
76840                          Riesling
97578                        Chardonnay
126955                        Red Blend
29046                   Sauvignon Blanc
93971                              Rosé
114073                            Syrah
53473                Cabernet Sauvignon
89026                        Pinot Noir
24177                   Sparkling Blend
34661             Rhône-style Red Blend
130314               Cabernet Sauvignon
141494                       Pinot Gris
55925                          Riesling
116079                     Petite Sirah
79082                            Merlot
100975                     Pinot Grigio
36743                    Portuguese Red


In [10]:
# define the wide model with the functinoal API
layers = keras.layers

bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, variety_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)

wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], 
                         outputs=predictions)

wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 256)          3082496     concatenate_1[0][0]              
__________

In [11]:
# deep model features: word enbedding of wine descriptions
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)
print(train_embed[0])

[6, 134, 54, 84, 41, 218, 5, 3, 330, 2275, 4, 41, 44, 1, 611, 2, 138, 24, 1884, 115, 415, 220, 19, 230, 345, 795, 1, 1156, 35, 426, 102, 302, 21, 1, 47, 23, 32, 6008]


In [12]:
max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(
    train_embed, maxlen=max_seq_length, padding='post')
test_embed = keras.preprocessing.sequence.pad_sequences(
    test_embed, maxlen=max_seq_length, padding='post')

In [14]:
# define the deep model with the functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)

deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 170)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 170, 8)            96000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1361      
Total params: 97,361
Trainable params: 97,361
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
deep_model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['accuracy'])

In [17]:
# combine wide and deep into one model
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)

combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

combined_model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['accuracy'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 170)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 12040)        0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________

In [20]:
# run training
combined_model.fit([description_bow_train, variety_train] + [train_embed],
                  labels_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12d062eb8>

In [21]:
combined_model.evaluate([description_bow_test, variety_test] + [test_embed],
                       labels_test, batch_size=128)



[670.4098962022533, 0.062604550031684]

In [22]:
# generate predictions
predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed])
print(predictions)

[[14.137038]
 [40.058777]
 [19.782604]
 ...
 [20.241787]
 [23.562008]
 [22.68404 ]]


In [23]:
# compare predictions with actual values for the first few items in our test dataset
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(description_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

Mocha and creamy baked-berry aromas give it a soupy, chunky start, while the palate is rich and meaty for a $10 kosher wine. Flavors run sweet and syrupy, and a touch herbal, while the finish is full and chewy. Not exact but still a pretty good wine for the price.
Predicted:  14.137038 Actual:  10.0 

A very good effort with this tricky grape. Along with the stone-fruit flavors are sweeter streaks of cotton candy and marshmallow. But the candied fruit is lively and not cloying or artificial. The finish is long and clean, with a twist of buttered nuts and creamy vanilla.
Predicted:  40.058777 Actual:  18.0 

Complex and smooth wine, keeping firm tannins well integrated with the elegant polished fruit. With refreshing acidity to go with the fruit and tannins, the wine is finely, ripely balanced.
Predicted:  19.782604 Actual:  15.0 

Greenwood Ridge does a terrific job with Scherrer's Alexander Valley Zin grapes, capturing the intensity of these century vines while maintaining balance. Th

In [24]:
# compare the average difference between actual price and the model's predicted price
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  7.0928569495677944
