In [35]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

In [36]:
# Load only the columns this notebook works with.
df = pd.read_csv(
    'data/wine-reviews.csv',
    usecols=['country', 'description', 'points', 'price', 'variety', 'winery'],
)

In [37]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,country,description,points,price,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,,White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Pinot Noir,Sweet Cheeks


In [38]:
# Drop rows missing the review text or the score: the text is the model
# input and the score is what the label is derived from further down.
df = df.dropna(subset=['description','points'])

In [None]:
# Re-inspect the frame after dropping incomplete rows.
df.head()

In [None]:
# Distribution of review scores (20 bins).
fig, ax = plt.subplots()
ax.hist(df.points, bins=20)
ax.set(title='Points Histogram', xlabel='Points', ylabel='N')
plt.show()

In [None]:
# Binary target: 1 when the review scored 90 points or more, otherwise 0.
# Only the review text and the target are kept from here on.
df = df.assign(label=(df.points >= 90).astype(int))[['description', 'label']]

In [None]:
# Check the last rows of the reduced (description, label) frame.
df.tail()

In [None]:
# Shuffle once with a fixed seed so the 80% / 10% / 10% split is the same on
# every Restart & Run All, then slice by position. Plain iloc slicing avoids
# np.split on a DataFrame, which goes through the deprecated
# DataFrame.swapaxes path in recent pandas releases.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
# Row counts of the three splits, in (train, val, test) order.
len(train),len(val),len(test)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
  """Build a batched, prefetched tf.data.Dataset of (description, label) pairs.

  Args:
    dataframe: frame with a 'description' column and a 'label' column.
    shuffle: when True, shuffle with a buffer as large as the frame.
    batch_size: number of examples per batch.

  Returns:
    A tf.data.Dataset yielding (description, label) batches.
  """
  texts = dataframe["description"]
  targets = dataframe["label"]
  dataset = tf.data.Dataset.from_tensor_slices((texts, targets))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(dataframe))
  return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Only the training set needs shuffling. Evaluation metrics do not depend on
# example order, so the validation and test sets skip the full-size shuffle
# buffer and are iterated in a stable order.
train_data = df_to_dataset(train)
val_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test, shuffle=False)

# Embedding + Model

In [None]:
# Wrap the TF-Hub text-embedding module at this URL as a Keras layer that
# takes raw strings. trainable=True lets its weights be updated during fit.
# NOTE(review): the module name suggests 50-dimensional output — confirm.
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding,dtype=tf.string,trainable=True)

In [None]:
# Smoke-test the embedding layer on a single batch of descriptions.
# next(iter(...)) pulls just one batch instead of materializing the whole
# dataset into a list only to take its first element.
hub_layer(next(iter(train_data))[0])

In [None]:
# Classifier on top of the hub embedding: two 16-unit ReLU layers, each
# followed by 40% dropout, and a single sigmoid unit for the binary label.
# The Dropout lines previously had unbalanced parentheses (a SyntaxError).
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: Adam at lr=0.001, binary cross-entropy loss,
# accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baseline on the training set before any call to fit.
model.evaluate(train_data)

In [None]:
# Baseline on the validation set before any call to fit.
model.evaluate(val_data)

In [None]:
# Train for 5 epochs, tracking validation metrics after each epoch.
history = model.fit(
    train_data,
    epochs=5,
    validation_data=val_data,
)

In [None]:
# Per-epoch accuracy on the training and validation sets.
fig, ax = plt.subplots()
for key, name in (('accuracy', 'Training acc'), ('val_accuracy', 'validation acc')):
    ax.plot(history.history[key], label=name)
ax.set(title='Accuracy of model', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Per-epoch loss on the training and validation sets.
fig, ax = plt.subplots()
for key, name in (('loss', 'Training loss'), ('val_loss', 'validation loss')):
    ax.plot(history.history[key], label=name)
ax.set(title='loss of model', xlabel='Epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# Final check of the trained model on the held-out test set.
model.evaluate(test_data)

# LSTM

In [None]:
# Text vectorizer capped at 2000 tokens. Its vocabulary is fitted on the
# training descriptions only (labels are dropped from each batch).
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
train_text = train_data.map(lambda text, label: text)
encoder.adapt(train_text)

In [None]:
# Peek at the first 20 entries of the fitted vocabulary.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

In [None]:
# LSTM classifier: vectorize text -> 32-dim embedding (index 0 masked) ->
# 32-unit LSTM -> 32-unit ReLU layer with 40% dropout -> sigmoid output.
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=32,
    mask_zero=True,
))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: Adam at lr=0.001, binary cross-entropy loss,
# accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baseline on the training and validation sets before any call to fit.
# The validation dataset is named `val_data`; `valid_data` was never defined
# and raised a NameError.
model.evaluate(train_data)
model.evaluate(val_data)

In [None]:
# Train for 5 epochs, validating on `val_data` (the previously referenced
# `valid_data` does not exist in this notebook).
history = model.fit(train_data, epochs=5, validation_data=val_data)

In [None]:
# Final check of the trained model on the held-out test set.
model.evaluate(test_data)