# Work on Recommender with DataFrames split by Color
- build FastText model for each color
- tfidf recommender works well but need a way to pull in text based words to find a specific wine
- use tfidf recommnder to find more wines like this of any color

In [1]:
import pandas as pd
import numpy as np
import regex as re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt

import collections
from collections import Counter

import pickle

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Phrases
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
# https://fasttext.cc/docs/en/supervised-tutorial.html
import fasttext
import gensim
#  pip install fasttext
from gensim.models import FastText

In [2]:
df = pd.read_csv('../data/test_wines.csv')
df.shape

(12888, 15)

In [3]:
df.head()

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,country_clean,text
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,Germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,germany,franken white Germany made silvaner white wine...
1,"Fledermaus Rot, 2Naturkinder",2Naturkinder,2018,750,red,0,Germany,franken,https://www.astorwines.com/SearchResultsSingle...,Made from Schwartz Riesling (aka Pinot Meunier...,pinot meunier,0,made schwartz riesling pinot meunier grapes ha...,germany,franken red Germany made schwartz riesling pin...
2,"Sekt Weiss, 50o N NV",50oN,NV,750,white,14,Germany,rheingau,https://www.astorwines.com/SearchResultsSingle...,Sometimes you just want to drink a wine becaus...,müller-thurgau pinot blanc sylvaner,0,sometimes want drink wine tastes good one wine...,germany,rheingau white Germany sometimes want drink wi...
3,"Fuga, Mencía, Ribeira Sacra",A Fuga,2017,750,red,0,Spain,ribeira sacra,https://www.astorwines.com/SearchResultsSingle...,"FUGA MENCIAJust barely opaque, and dark garnet...",mencía,0,fuga menciajust barely opaque dark garnet colo...,spain,ribeira sacra red Spain fuga menciajust barely...
4,"Cirò Rosso Classico Superiore, A Vita",A Vita,2016,750,red,21,Italy,calabria,https://www.astorwines.com/SearchResultsSingle...,"Here is a wine with a distinct, alluring chara...",gaglioppo,cirò,wine distinct alluring character dates back an...,italy,calabriacirò red Italy wine distinct alluring ...


In [4]:
# these should be NV
df['vintage'].fillna('NV', inplace=True)

In [5]:
# drop duplicates
duplicate = df[df['url'].duplicated() & df['name'].duplicated() & df['producer'].duplicated()].index
df.drop(duplicate, inplace=True)

Deal the duplicate names. Some of the wines have the same name but bottle size or vintage is different. Clean up vintage columns

In [6]:
pd.set_option('display.max_rows', None)
# pd.reset_option('max_rows')

In [7]:
# need to strip the white space bewteen name and NV, will add NV back after space removed
nv_in_name = df[df['name'].str.contains('NV')].index
df['name'].loc[nv_in_name] = [row.rstrip('NV') for row in df['name'].loc[nv_in_name].astype(str)]
df['name'].loc[nv_in_name] = [row.strip() for row in df['name'].loc[nv_in_name].astype(str)]
df['name'].loc[nv_in_name] = df['name'].loc[nv_in_name] + ' ' + 'NV'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
misslabeled_2016 = df[df['vintage'] == '20161'].index

In [9]:
df.loc[12131, 'vintage'] = 'NV'
df.loc[4479, 'vintage'] = 'NV'
df.loc[misslabeled_2016, 'vintage'] = '2016'
df.loc[3649, 'vintage'] = '2018'

In [10]:
dup_names = df[df['name'].duplicated()].index

In [11]:
# # only run once
df['name'].loc[dup_names]=df['name'].loc[dup_names].astype(str) + ' ' + df['vintage'].loc[dup_names].astype(str)

In [12]:
dup_names2 = df[df['name'].duplicated()].index

In [13]:
# only run once
df['name'].loc[dup_names2] = df['name'].loc[dup_names2].astype(str) + '*'

In [14]:
dup_names3 = df[df['name'].duplicated()].index

In [15]:
# # only run once
df['name'].loc[dup_names3] = df['name'].loc[dup_names3].astype(str) + ' ' + df['bottle_size'].loc[dup_names3].astype(str)

In [16]:
# drop ciders
cider_rows = df[df['grape'] == 'apple'].index
df.drop(cider_rows, inplace=True)

In [17]:
nullcolor = df[df['color'].isnull()].index
df.loc[nullcolor, 'color'] = 'other'

In [18]:
df['text_no_color'] = df['region'] + ' ' + df['subregion'] + ' ' + df['country'] + ' ' + df['grape'] + ' ' + df['description_clean']
df['text_no_color'] = [re.sub('[0123456789\.]','', row) for row in df['text_no_color']]

In [19]:
# strip all white space from name
df['name'] = [str(row).strip() for row in df['name']]

In [20]:
df[df['vintage'].isnull()]

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,country_clean,text,text_no_color


In [21]:
# pd.set_option('display.max_rows', None)
pd.reset_option('max_rows')

### Reassign Colors
These need to be cleaned up to fit into their appropriate category

In [22]:
hidden_spark = df[df['name'].str.contains('brut', flags=re.IGNORECASE)].index
df.loc[hidden_spark, 'color'] = 'sparkling'

In [23]:
brut = df[df['description'].str.contains('brut', flags=re.IGNORECASE)].index
df.loc[brut, 'color'] = 'sparkling'

In [24]:
sekt = df[df['description'].str.contains('sekt', flags=re.IGNORECASE)].index
df.loc[sekt, 'color'] = 'sparkling'

In [25]:
sparkling = df[df['description'].str.contains('sparkling', flags=re.IGNORECASE)].index
df.loc[sparkling, 'color'] = 'sparkling'

In [26]:
sauternes = df[df['name'].str.contains('sauternes', flags=re.IGNORECASE)].index
df.loc[sauternes, 'color'] = 'other'

In [27]:
vermouth = df[df['name'].str.contains('vermouth', flags=re.IGNORECASE)].index
df.loc[vermouth, 'color'] = 'other'

In [28]:
ros_rose = df[df['color'] == 'ros'].index
df.loc[ros_rose, 'color'] = 'rose'

In [29]:
sweet = df[df['color'] == 'sweet'].index
df.loc[sweet, 'color'] = 'other'

In [30]:
df.isnull().sum()

name                 0
producer             0
vintage              0
bottle_size          0
color                0
price                0
country              0
region               0
url                  0
description          0
grape                0
subregion            0
description_clean    0
country_clean        0
text                 0
text_no_color        0
dtype: int64

# Split up Data by Color

Split up tokens for Vec models 

In [31]:
df['description_clean_tokens'] = [str(row).split() for row in df['description_clean']]
df['text_clean_tokens'] = [str(row).split() for row in df['text']]
df['text_no_color_tokens'] = [str(row).split() for row in df['text_no_color']]

In [32]:
df.head(1)

Unnamed: 0,name,producer,vintage,bottle_size,color,price,country,region,url,description,grape,subregion,description_clean,country_clean,text,text_no_color,description_clean_tokens,text_clean_tokens,text_no_color_tokens
0,"Skin Contact Silvaner ""Kleine Heimat,"" 2Naturk...",2Naturkinder,2017,750,white,24,Germany,franken,https://www.astorwines.com/SearchResultsSingle...,"Made from 100% Silvaner, this white wine ferme...",silvaner,0,made silvaner white wine ferments skins days g...,germany,franken white Germany made silvaner white wine...,franken Germany silvaner made silvaner white ...,"[made, silvaner, white, wine, ferments, skins,...","[franken, white, Germany, made, silvaner, whit...","[franken, Germany, silvaner, made, silvaner, w..."


In [33]:
df['color'].value_counts()

red          7515
white        2930
sparkling     903
other         223
rose          196
Name: color, dtype: int64

In [34]:
order = ['name', 'producer', 'vintage', 'color', 'grape', 'price', 'country', 'region', 
                'subregion', 'description', 'description_clean', 'description_clean_tokens', 
                 'text', 'text_clean_tokens', 'text_no_color_tokens', 'bottle_size',  'url']

df = df.reindex(columns=order)

In [35]:
# drop columns not using
df = df.drop(columns=(['subregion', 'text_no_color_tokens']))

In [36]:
df.shape

(11767, 15)

In [37]:
df_red = df[df['color'] == 'red']
df_white = df[df['color'] == 'white']
df_rose = df[df['color'] == 'rose']
df_sparkling = df[df['color'] == 'sparkling']
df_other = df[df['color'] == 'other']

In [38]:
df_red['index'] = df_red.index
df_white['index'] = df_white.index
df_rose['index'] = df_rose.index
df_sparkling['index'] = df_sparkling.index
df_other['index'] = df_other.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [39]:
print('all:', df.shape)
print('red:', df_red.shape)
print('white:', df_white.shape)
print('rose:', df_rose.shape)
print('sparkling:', df_sparkling.shape)
print('other:', df_other.shape)

all: (11767, 15)
red: (7515, 16)
white: (2930, 16)
rose: (196, 16)
sparkling: (903, 16)
other: (223, 16)


# Save DF for use in Flask
After cleaning and dropping-reindex

In [40]:
# df.to_csv('../app/for_flask.csv', index=False)
# df_red.to_csv('../app/red_flask.csv', index=False)
# df_white.to_csv('../app/white_flask.csv', index=False)
# df_rose.to_csv('../app/rose_flask.csv', index=False)
# df_sparkling.to_csv('../app/sparkling_flask.csv', index=False)
# df_other.to_csv('../app/other_flask.csv', index=False)

# Fast Text Models
Save these for use in Flask

In [66]:
fast_text_all = FastText(df['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)
fast_text_red = FastText(df_red['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)
fast_text_white = FastText(df_white['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)
fast_text_rose = FastText(df_rose['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)
fast_text_sparkling = FastText(df_sparkling['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)
fast_text_other = FastText(df_other['text_clean_tokens'], size=100, window=5, min_count=5, workers=4, sg=1)

## Pickle all Models 

In [67]:
# pickle.dump(fast_text_all, open('model.all', 'wb'))
# pickle.dump(fast_text_red, open('model.red', 'wb'))
# pickle.dump(fast_text_white, open('model.white', 'wb'))
# pickle.dump(fast_text_rose, open('model.rose', 'wb'))
# pickle.dump(fast_text_sparkling, open('model.sparkling', 'wb'))
# pickle.dump(fast_text_other, open('model.other', 'wb'))

Load the models

In [68]:
fast_text_all = pickle.load(open('model.all', 'rb'))
fast_text_red = pickle.load(open('model.red', 'rb'))
fast_text_white = pickle.load(open('model.white', 'rb'))
fast_text_rose = pickle.load(open('model.rose', 'rb'))
fast_text_sparkling = pickle.load(open('model.sparkling', 'rb'))
fast_text_other = pickle.load(open('model.other', 'rb'))

Test different models on same words

In [69]:
fast_text_red.wv.most_similar(['frank cornellison', 'sicily'], topn=3)

[('friuli-venezia', 0.9493786096572876),
 ('abruzzi', 0.946995735168457),
 ('umbriamontefalco', 0.943882167339325)]

In [70]:
fast_text_all.wv.most_similar(['frank cornellison', 'sicily'], topn=3)

[('umbriamontefalco', 0.948532223701477),
 ('igt', 0.9394370317459106),
 ('giuliacarso', 0.9379663467407227)]

In [71]:
fast_text_white.wv.most_similar(['frank cornellison', 'sicily'], topn=3)

[('giuliacollio', 0.9835367202758789),
 ('robert', 0.977310836315155),
 ('hubert', 0.9770914316177368)]

In [72]:
fast_text_sparkling.wv.most_similar(['frank cornellison', 'sicily'], topn=3)

[('billecart', 0.9984743595123291),
 ('ml', 0.9980751276016235),
 ('horiot', 0.9979497194290161)]

In [73]:
fast_text_other.wv.most_similar(['frank cornellison', 'sicily'], topn=3)

[('botrytis', 0.9999287128448486),
 ('vibrant', 0.9999226927757263),
 ('version', 0.9999194145202637)]

# TFIDF Recommender 
Trained on all vocab

In [74]:
tf = TfidfVectorizer(analyzer='word', 
                     ngram_range=(1, 2), 
                     stop_words='english')

tf_matrix = tf.fit_transform(df['text'])
print('text:', tf_matrix.shape)

text: (11767, 386751)


In [75]:
cosine_sim = cosine_similarity(tf_matrix, tf_matrix)
recommender = pairwise_distances(cosine_sim, metric='cosine')
rec_df = pd.DataFrame(recommender, index=df['url'], columns=df['name'])

In [76]:
# rec_df.to_csv('../data/recommender_flask.csv', index=False)

## Use FastText to extract wines containing Text
Use a lemmatizer to get a better variety of top words in top word selections

In [77]:
low = df[df['price'] < 25]
mid = df[(df['price'] >= 25) & (df['price'] <= 75)]
high = df[(df['price'] > 75) & (df['price'] <= 150)]
collector = df[df['price'] > 150]

In [78]:
pd.set_option('display.max_rows', None)
# pd.reset_option('max_rows')

In [79]:
lemmatizer = WordNetLemmatizer()

In [80]:
fast_input = fast_text_white.wv.most_similar(['user'], topn=3)
user_input = [i[0] for i in fast_input]
check = [lemmatizer.lemmatize(w) for w in user_input]
unique = set(check)
print('unique:', unique)
print('top_words:', user_input)

unique: {'daughter', 'walter', 'order'}
top_words: ['order', 'daughter', 'walter']


# Recommender HERE

In [81]:
fast_input = fast_text_white.wv.most_similar(['sweet rielsing'], topn=4)
user_input = [i[0] for i in fast_input]
check_for_similar = [lemmatizer.lemmatize(w) for w in user_input]
unique = set(check_for_similar)
unique

{'quenching', 'rising', 'serving', 'smacking'}

In [83]:
for i in get_wines(unique):
                    
    show_info = {}
    show_info['name'] = get_wines(unique)['name']
    show_info['price'] = get_wines(unique)['price']
    show_info['country'] = get_wines(unique)['country']
    show_info['desciption'] = get_wines(unique)['description']
    show_info['link'] = get_wines(unique)['url'] 

In [87]:
fast_input = fast_text_red.wv.most_similar(['user_input'], topn=4)
user_input = [i[0] for i in fast_input]
check_for_similar = [lemmatizer.lemmatize(w) for w in user_input]
unique = set(check_for_similar)

def get_wines(unique): 
    for word in unique: 
        contains_word = df_red[df_red['description_clean'].str.contains(word)]['index']

        wines_with = []
        for wine in contains_word:
            wines_with.append(wine)
            location = df.loc[wines_with]
            
    return location