In [1]:
# !pip install pandas
# !pip install seaborn
# !pip install sklearn
# !pip install pyenchant
# !apt-get install libenchant1c2a

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
# !pip install english-words 
# !pip install PyDictionary
# from PyDictionary import PyDictionary
import os 

os.chdir('/content/drive/MyDrive/MSCA_31009/Final_Project/')

def remove_punctuation(s):
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

  # Removing punctuations in string
  # Using loop + punctuation string
  for ele in s:
      if ele in punc:
          s = s.replace(ele, "")
  return s

def clean_description(review, swap):
  terms = review.split()
  for x in terms:
    try: x = swap[('t_' + str(x))]
    except KeyError: x = ''
  return ' '.join(terms)


In [3]:
wines = pd.read_csv('winemag-data-130k-v2.csv')
wines = wines.drop(columns = ['Unnamed: 0'])

wines2 = pd.read_csv('winemag-data_first150k.csv')
wines2.head()

# Intersect and append two review tables
columns = [value for value in wines.columns if value in wines2.columns] # intersection(wines.columns, wines2.columns)
wines = wines[columns]
wines2 = wines2[columns]
wines = pd.concat([wines, wines2]).drop(columns = ['designation', 'winery'])

wines = wines.sample(n=20000)

# Imputation steps
median_price = wines.price.median()
median_points = wines.points.median()

# Impute variables
wines.price = wines.price.fillna(median_price)
wines.points = wines.points.fillna(median_points)
wines = wines.fillna('')
wines.price = wines.price.astype(int)
wines.description = wines.description.apply(lambda x: remove_punctuation(x.lower()))

Y = wines['variety']
wines = wines.drop(columns=['variety'])

wines.head()

Unnamed: 0,country,description,points,price,province,region_1,region_2
126345,US,the frederick is spring valleys bordeaux blend...,88,50,Washington,Walla Walla Valley (WA),Columbia Valley
63037,US,winemaker eric dunham was the first to latch o...,94,75,Washington,Columbia Valley (WA),Columbia Valley
147612,US,clean dry and oaky with modest flavors of peac...,84,7,California,California,California Other
44696,South Africa,herbal and spicy heres an easygoing merlot tha...,85,11,Western Cape,,
48259,Italy,caterina zardini delivers a homerun valpolicel...,90,33,Veneto,Valpolicella Classico Superiore,


In [4]:
text = wines.description
tfidf = TfidfVectorizer() 
review_vector = tfidf.fit_transform(text)
words = tfidf.get_feature_names_out()
rev_array = review_vector.toarray()
words_df = pd.DataFrame(rev_array, columns = words)

words.shape

(19221,)

In [5]:
import enchant

enchant.list_languages()
eng = enchant.Dict("en_US")
drops = []
for column in words_df.columns:
    english = eng.check(column)
    if any(map(str.isdigit, column)): 
        drops.append(column)
    if not(english):
        drops.append(column)

words_df = words_df.drop(columns = drops)
words_df.head()

Unnamed: 0,aah,abandoned,abbey,abbreviated,ability,able,ably,abnormal,abnormally,abound,...,zinfandel,zing,zinging,zings,zingy,zip,zippy,zips,zone,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
words_df = words_df.reset_index()
wines = wines.reset_index()
wines = pd.concat([wines, words_df], axis=1)

In [7]:
wines.shape

(20000, 10561)

In [8]:
wines.head()

Unnamed: 0,index,country,description,points,price,province,region_1,region_2,index.1,aah,...,zinfandel,zing,zinging,zings,zingy,zip,zippy,zips,zone,zucchini
0,126345,US,the frederick is spring valleys bordeaux blend...,88,50,Washington,Walla Walla Valley (WA),Columbia Valley,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,63037,US,winemaker eric dunham was the first to latch o...,94,75,Washington,Columbia Valley (WA),Columbia Valley,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,147612,US,clean dry and oaky with modest flavors of peac...,84,7,California,California,California Other,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44696,South Africa,herbal and spicy heres an easygoing merlot tha...,85,11,Western Cape,,,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,48259,Italy,caterina zardini delivers a homerun valpolicel...,90,33,Veneto,Valpolicella Classico Superiore,,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Variety Classifier

In [9]:
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

X_train, X_test, y_train, y_test = train_test_split(wines, Y, test_size=0.25, random_state=0)

input_dim = X_train.shape[1]


In [None]:
# model = Sequential()
# model.add(tf.keras.layers.Embedding(input_dim=input_dim, output_dim=64))
# model.add(layers.Dense(units=64, input_dim=input_dim, activation='relu'))
# model.add(layers.Dense(units=32, activation='sigmoid'))
# model.add(layers.Dense(units=1, activation='sigmoid'))
# model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy'])
# model.fit(x=X_train, y = y_train, batch_size = 1024, epochs = 10)

In [None]:
model.evaluate(X_test, y_test, batch_size=128)

In [None]:
# 
# Y1_price = wines.price
# Y2_points = wines.points 