In [1]:
# !pip install pandas
# !pip install seaborn
# !pip install sklearn
# !pip install pyenchant
# !apt-get install libenchant1c2a

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing import sequence
# !pip install english-words 
# !pip install PyDictionary
# from PyDictionary import PyDictionary
import os 

# os.chdir('/content/drive/MyDrive/MSCA_31009/Final_Project/')
os.chdir('/content/drive/MyDrive/NeuralResearch/wine-net/')

def remove_punctuation(s):
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

  # Removing punctuations in string
  # Using loop + punctuation string
  for ele in s:
      if ele in punc:
          s = s.replace(ele, "")
  return s

def clean_description(review, swap):
  terms = review.split()
  for x in terms:
    try: x = swap[('t_' + str(x))]
    except KeyError: x = ''
  return ' '.join(terms)


In [3]:
wines = pd.read_csv('winemag-data-130k-v2.csv')
wines = wines.drop(columns = ['Unnamed: 0'])

wines2 = pd.read_csv('winemag-data_first150k.csv')
wines2.head()

# Intersect and append two review tables
columns = [value for value in wines.columns if value in wines2.columns] # intersection(wines.columns, wines2.columns)
wines = wines[columns]
wines2 = wines2[columns]
wines = pd.concat([wines, wines2]).drop(columns = ['designation', 'winery'])

wines = wines.sample(n=20000)

# Imputation steps
median_price = wines.price.median()
median_points = wines.points.median()

# Impute variables
wines.price = wines.price.fillna(median_price)
wines.points = wines.points.fillna(median_points)
wines = wines.fillna('')
wines.price = wines.price.astype(int)
wines.description = wines.description.apply(lambda x: remove_punctuation(x.lower()))

Y = wines.variety
Y0_price = wines.price
Y1_points = wines.points
wines = wines.drop(columns=['variety', 'price', 'points', 'region_1', 'region_2'])

wines.head()

Unnamed: 0,country,description,province
963,US,ripe fruit flavors and a delicious spiciness c...,California
147657,Italy,mildly minty and raisiny on the nose with a ja...,Tuscany
94901,US,a bigproduction mediumbodied effort this has n...,Oregon
111858,US,this is a good vineyard and its a decent wine ...,California
45128,France,solid and perfumed this expresses all the powe...,Southwest France


In [4]:
text = wines.description
tfidf = TfidfVectorizer() 
review_vector = tfidf.fit_transform(text)
words = tfidf.get_feature_names_out()
rev_array = review_vector.toarray()
words_df = pd.DataFrame(rev_array, columns = words)

words.shape

(19262,)

In [5]:
import enchant

enchant.list_languages()
eng = enchant.Dict("en_US")
drops = []
for column in words_df.columns:
    english = eng.check(column)
    if any(map(str.isdigit, column)): 
        drops.append(column)
    if not(english):
        drops.append(column)

words_df = words_df.drop(columns = drops)
words_df.head()

Unnamed: 0,abandon,abandoning,abate,abbey,abbreviated,aberrant,ability,able,aboard,aboriginal,...,zinfandel,zing,zinging,zings,zingy,zip,zipping,zippy,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.161706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# vectorizing the rest of the data in the table.
wines.head()

Unnamed: 0,country,description,province
963,US,ripe fruit flavors and a delicious spiciness c...,California
147657,Italy,mildly minty and raisiny on the nose with a ja...,Tuscany
94901,US,a bigproduction mediumbodied effort this has n...,Oregon
111858,US,this is a good vineyard and its a decent wine ...,California
45128,France,solid and perfumed this expresses all the powe...,Southwest France


In [7]:
from keras.preprocessing.text import Tokenizer
X = wines.description.to_numpy()
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X)
num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)
max_log_length = 20000
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
X_processed

array([[   0,    0,    0, ...,    1,  526,  564],
       [   0,    0,    0, ...,    3, 1202,  732],
       [   0,    0,    0, ...,  806,   88,  959],
       ...,
       [   0,    0,    0, ...,   15,    2,   19],
       [   0,    0,    0, ...,  365,   10,   20],
       [   0,    0,    0, ...,    7,  237, 3281]], dtype=int32)

In [8]:
from sklearn.preprocessing import LabelEncoder
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
Y_processed = encoder.transform(Y)

In [9]:
# words_df = words_df.reset_index()
# wines = wines.reset_index()
# wines = pd.concat([wines, words_df], axis=1)
# wines.shape
# wines = wines.drop(columns = ['index'])

## Variety Classifier

In [10]:
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

X_train, X_test, y_train, y_test = train_test_split(X_processed, Y_processed, test_size=0.25, random_state=0)

input_dim = X_train.shape[1]

In [11]:
Y.to_numpy()

array(['Red Blend', 'Red Blend', 'Pinot Gris', ..., 'Chardonnay',
       'Cabernet Sauvignon', 'Graciano'], dtype=object)

In [12]:
model = Sequential()

# Adding an embedding layer
model.add(tf.keras.layers.Embedding(input_dim=input_dim, output_dim=32, input_length=max_log_length))

# Adding an LSTM layer
model.add(tf.keras.layers.LSTM(units=64, recurrent_dropout=0.5))

# Dropout layer
model.add(tf.keras.layers.Dropout(0.5))

# Dense layer 
model.add(tf.keras.layers.Dense(units=64, activation='sigmoid'))

# Dense layer 
model.add(tf.keras.layers.Dense(units=32, activation='sigmoid'))

# Dense layer 
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy'])

model.fit(x=X_train, y = y_train, batch_size = 128, epochs = 10)

Epoch 1/10
  1/118 [..............................] - ETA: 1:55:08 - loss: 0.0000e+00 - accuracy: 0.0000e+00

KeyboardInterrupt: ignored

In [None]:
model.evaluate(X_test, y_test, batch_size=128)