In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
#data = pd.read_csv('winereviews.csv')
#data.head()
column_headers = ["country", "description","points", "province", "region_1", "variety", "winery"]
data = pd.read_csv('winereviews.csv', usecols=column_headers)

In [3]:
data = data[data.winery.notnull()]
data = data[data.variety.notnull()]
data = data[data.province != "Other"]
data = data[data.province.notnull()]
data = data[data.province != "France Other"]
data = data[data.province != "Spain Other"]
data = data[data.province != "Australia Other"]
data = data[data.country != "US-France"]
data = data[data.points > 88]

In [4]:
data = data.reset_index(drop=True)

In [5]:
!pip install --upgrade gensim

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install --upgrade pip' command.[0m


In [6]:
from gensim.models import Word2Vec



In [7]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from gensim.models.phrases import Phrases, Phraser

In [8]:
reviews = list(data['description'])
reviews = [str(r) for r in reviews]
reviews_full_doc = ' '.join(reviews)
sent_tokens = sent_tokenize(reviews_full_doc)

In [9]:
#remove stop words, lowercase, and punctuation
punc = '''!()-[]{};:.'"\, <>./?@#$%^&*_~'''

def process_text(text):
    words = word_tokenize(text)
    processed_sentence = []
    for word in words:
        w = str(word)
        lowercase = str.lower(w)
        stemmed = SnowballStemmer('english').stem(lowercase)
        remove_punc = stemmed
        for c in remove_punc:
            if c in punc:
                remove_punc.replace(c, "")
        processed_sentence.append(remove_punc)
    return(processed_sentence)
        
processed_sents = []
for sent in sent_tokens:
    processed = process_text(sent)
    processed_sents.append(processed)

In [10]:
mapping = pd.read_csv('descriptor_mapping.csv').set_index('raw descriptor')

In [11]:
# add bi grams/phrases
phrases = Phrases(processed_sents)
phrases = Phrases(phrases[processed_sents])
grams = Phraser(phrases)
phrase_sents = []
for sent in processed_sents:
    phrased = grams[sent]
    phrase_sents.append(phrased)
    
allwords = [item for sublist in phrase_sents for item in sublist]

In [12]:
#map corpus words to level 3
def mapped_words(word):
    if word in list(mapping.index):
        mapped = mapping['level_3'][word]
        return mapped
    else:
        return word

processed_sents = []
for sent in phrase_sents:
    processed_sent = []
    for word in sent:
        mapped_word = mapped_words(word)
        processed_sent.append(str(mapped_word))
    processed_sents.append(processed_sent)

In [13]:
#embeddings:

In [14]:
model = Word2Vec(processed_sents, vector_size=300, min_count=1)

In [15]:
#could use for query expansion
model.wv.most_similar(positive='peach', topn=10)

[('nectarine', 0.8802266120910645),
 ('pear', 0.8734254240989685),
 ('mango', 0.8442874550819397),
 ('tangerine', 0.8379481434822083),
 ('melon', 0.8357724547386169),
 ('quince', 0.8332266807556152),
 ('honeysuckle', 0.8276798129081726),
 ('pineapple', 0.8143859505653381),
 ('baked_apple', 0.8120083808898926),
 ('honeydew', 0.8113285303115845)]

In [18]:
punc = '''!()-[]{};:.'"\, <>./?@#$%^&*_~'''
all_des_words = list(set(all_des_words))
for i in range(len(all_des_words)):
    word = all_des_words[i]
    if word[len(word) - 1] in punc:
        remv = word[:len(word) -1]
        all_des_words[i] = remv
#all_des_words[2][len(all_des_words[2]) -1] in punc
#len(all_des_words[2])

for i in range(len(all_des_words)):
    word = all_des_words[i]
    all_des_words[i] = word.lower()

all_des_words = set(all_des_words)

In [19]:
for review in reviews:
    processed = process_text(review)
    phrased = grams[processed]

In [20]:
#update review data to only show meaningful words
reviews = list(data['description'])
mapped_reviews = []
def mapping_wine_words(word):
    if word in list(mapping.index):
        mapped = mapping['level_3'][word]
        return mapped
for review in reviews:
    processed = process_text(review)
    phrased = grams[processed]
    mapped = [mapping_wine_words(word) for word in phrased]
    remove_none = [str(d) for d in mapped if d is not None]
    mapped_rev = ' '.join(remove_none)
    mapped_reviews.append(mapped_rev)

In [21]:
## create embedding vectors for each review
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit(mapped_reviews)
dict_of_tfidf_weightings = dict(zip(X.get_feature_names(), X.idf_))

In [23]:
review_vectors = []
for review in mapped_reviews:
    numwords = 0
    words = review.split(' ')
    tfidf_mapped_review = []
    for word in words:
        if word in dict_of_tfidf_weightings.keys():
            tfidf = dict_of_tfidf_weightings[word]
            vec = tfidf * model.wv.get_vector(word).reshape(1, 300)
            tfidf_mapped_review.append(vec)
            numwords = numwords + 1
    if len(tfidf_mapped_review) > 0:
       # weight_rev_vec = sum(tfidf_mapped_review)/len(tfidf_mapped_review)
        weight_rev_vec = sum(tfidf_mapped_review)
    else:
        weight_rev_vec = []
    review_vectors.append(weight_rev_vec)

In [24]:
import numpy as np
vectors_list = []
for vec in review_vectors:
    if len(vec) == 1:
        vectors_list.append(np.float32(vec[0]))
    else:
        vectors_list.append(np.float32(np.zeros(300)))

In [25]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=10, algorithm= 'brute', metric='cosine')
modelknn = knn.fit(vectors_list)

In [27]:
query = ['peach']

In [28]:
weightedqueryterms = []
for term in query:
    if term in dict_of_tfidf_weightings:
        tfidfweight = dict_of_tfidf_weightings[term]
        word_vector = tfidfweight * model.wv.get_vector(term).reshape(1,300)
        weightedqueryterms.append(word_vector)
    else:
        try:
            #print(term)
            word_vector = model.wv.get_vector(term)
            word_vector.shape(1,300)
            #print(word_vector)
            weightedqueryterms.append(word_vector)
        except:
            print("???")
query_vec = sum(weightedqueryterms)
#print(query_vec)
distance, indice = modelknn.kneighbors(query_vec, n_neighbors = 5)

In [29]:
distance_list = distance[0].tolist()[1:]
indice_list = indice[0].tolist()[1:]

In [None]:
# for i in indice_list:
#     print(i)
#     review = mapped_reviews[i]
#     location = data['province'][i]
#     wine = data['variety'][i]
#     print(location)
#     print(wine)
#     print(review)
#     print()

In [None]:
#new word embedding tfidf matrix of reviews
#np.array(vectors_list).shape

In [30]:
full = list(data['description'])
data['full_review'] = full
data['description'] = mapped_reviews

In [31]:
wine_dict = data.to_dict(orient='index')
#wine_dict

In [32]:
import pickle

In [33]:
matrix_word2vec = []
words_word2vec_dict = {}
for i, key in enumerate(list(model.wv.index_to_key)):
    words_word2vec_dict[key] = i
    embedding = model.wv.get_vector(key, norm=True)
    matrix_word2vec.append(embedding)
matrix_word2vec = np.array(matrix_word2vec)
matrix_word2vec.shape

(27596, 300)

In [34]:
from scipy import sparse

In [35]:
matrix_word2vec = sparse.csr_matrix(matrix_word2vec)

In [36]:
vectors_list = np.array(vectors_list)
vectors_list_sparse = sparse.csr_matrix(vectors_list)

In [37]:
country_to_idx = {}
for idx in wine_dict:
    c = wine_dict[idx]['country']
    if c not in country_to_idx:
        country_to_idx[c] = []
    country_to_idx[c].append(idx)

In [None]:
##### MAIN PICKLES ######

In [None]:
# with open('wine_dict02.pickle', 'wb') as handle:
#      pickle.dump(wine_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('review_tfidf_embeddings01.pickle', 'wb') as handle:
#      pickle.dump(vectors_list, handle, protocol=pickle.HIGHEST_PROTOCOL)#

In [None]:
# with open('idf_weight_dict0.pickle', 'wb') as handle:
#      pickle.dump(dict_of_tfidf_weightings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('matrix_word2vec0.pickle', 'wb') as handle:
#      pickle.dump(matrix_word2vec, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('country_to_idx0.pickle', 'wb') as handle:
#      pickle.dump(country_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('words_word2vec_dict0.pickle', 'wb') as handle:
#     pickle.dump(words_word2vec_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('wine_descript_dict.pickle', 'wb') as handle:
#     pickle.dump(wine_descript_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)