In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the wine table from clean_wine_df.csv, using the first CSV column as
# the DataFrame index rather than as a data column.
wine_df = pd.read_csv('clean_wine_df.csv', index_col=0)
# Show the first rows as a quick check of the loaded columns.
wine_df.head()

Unnamed: 0,country,description,points,price,province,title,variety,winery,quality,price_range
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,ok,1-30
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,ok,1-30
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,ok,1-30
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,ok,61-100
15,Germany,Zesty orange peels and apple notes abound in t...,87,24.0,Mosel,Richard Böcking 2013 Devon Riesling (Mosel),Riesling,Richard Böcking,ok,1-30


In [6]:
# Bag-of-words encoding of the review text: lower-cased tokens, English stop
# words removed, vocabulary capped at the 1000 most frequent terms.
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words='english',
    lowercase=True,
)
# Learn the vocabulary and encode every description in a single pass.
vector = vectorizer.fit_transform(wine_df['description'].to_numpy())

In [7]:
# Keep the term counts as a SciPy sparse matrix instead of calling
# `.todense()`: that method returns a `numpy.matrix`, which current
# scikit-learn estimators refuse as input, and it materialises the whole
# documents-by-terms table in memory. `train_test_split` and
# `LogisticRegression` both accept sparse input directly; CSR gives the
# efficient row slicing the split needs.
features = vector.tocsr()

In [36]:
# Target for the first classifier: the `quality` column of the wine table.
label = wine_df['quality']

In [37]:
# Pair the encoded descriptions with the current target and hold out 20% of
# the rows for evaluation; the fixed seed keeps the split repeatable.
X, y = features, label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Fit a logistic-regression classifier with default settings on the training
# split. `fit` returns the estimator itself, so `m` and `log_reg` refer to
# the same fitted model.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
m = log_reg



In [39]:
# Score the fitted model on the held-out rows and report the value.
accuracy = m.score(X=X_test, y=y_test)
print(accuracy)

0.7057466585250017


In [40]:
# Sanity check on a single example: encode the first wine's description with
# the already-fitted vectorizer and rank the classes by predicted probability.
sample = wine_df.iloc[0]
x = vectorizer.transform(np.array([sample['description']]))
proba = m.predict_proba(x)
classes = m.classes_
# One row of probabilities with one column per class; transpose so that each
# class becomes a row and sort on the single probability column (named 0).
pred_df = pd.DataFrame(proba, columns=classes)
pred = pred_df.transpose().sort_values(by=0, ascending=False)
print(sample)
pred.head()

country                                                 Portugal
description    This is ripe and fruity, a wine that is smooth...
points                                                        87
price                                                         15
province                                                   Douro
title              Quinta dos Avidagos 2011 Avidagos Red (Douro)
variety                                           Portuguese Red
winery                                       Quinta dos Avidagos
quality                                                       ok
price_range                                                 1-30
Name: 1, dtype: object


Unnamed: 0,0
ok,0.73314
good,0.205695
bad,0.060781
great,0.000384


In [41]:
# Repeat the same experiment with the `variety` column as the target:
# identical features, identical 80/20 split seed, a fresh default model.
label = wine_df['variety']
X, y = features, label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# `fit` returns the estimator itself, so `m` is the same object as `log_reg`.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
m = log_reg
accuracy = m.score(X=X_test, y=y_test)
print(accuracy)



0.6825429133591153


In [42]:
# Same single-example check as before, now against whichever model `m`
# currently refers to: encode the first description and rank the classes.
sample = wine_df.iloc[0]
x = vectorizer.transform(np.array([sample['description']]))
proba = m.predict_proba(x)
classes = m.classes_
# Transpose the one-row probability table so classes are rows, then sort on
# the single probability column (named 0), highest first.
pred_df = pd.DataFrame(proba, columns=classes)
pred = pred_df.transpose().sort_values(by=0, ascending=False)
print(sample)
pred.head()

country                                                 Portugal
description    This is ripe and fruity, a wine that is smooth...
points                                                        87
price                                                         15
province                                                   Douro
title              Quinta dos Avidagos 2011 Avidagos Red (Douro)
variety                                           Portuguese Red
winery                                       Quinta dos Avidagos
quality                                                       ok
price_range                                                 1-30
Name: 1, dtype: object


Unnamed: 0,0
Portuguese Red,0.486436
Bordeaux-style Red Blend,0.348712
Pinot Noir,0.044834
Red Blend,0.042183
Malbec,0.02538


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [55]:
# Derive an integer id per wine from its title: the category code of each
# title, so identical titles share the same id.
title_categories = wine_df['title'].astype('category')
wine_df['wine_id'] = title_categories.cat.codes
# Compare the row count with the number of distinct ids.
print(wine_df.shape)
wine_df['wine_id'].nunique()

(73691, 11)


73691

In [61]:
# Work on a 5% sample of the wines. A fixed `random_state` (the same seed the
# earlier splits use) makes the sample, and everything derived from it,
# reproducible across kernel restarts.
train, test = train_test_split(wine_df, train_size=0.05, random_state=42)
# Renumber the sampled rows 0..n-1 so positional indices into matrices built
# from `train` line up with its index labels. Rebinding instead of
# `inplace=True` avoids mutating a frame derived from `wine_df`.
train = train.reset_index(drop=True)

In [62]:
# TF-IDF over word uni-, bi- and tri-grams with English stop words removed.
# `min_df` is an absolute document count when given as an integer and must be
# at least 1; the previous value 0 is rejected by scikit-learn's parameter
# validation. 1 keeps every term that occurs in any document, which is what
# 0 was meant to express.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=1,
    stop_words='english',
)
tf_matrix = tf.fit_transform(train['description'])

In [63]:
# Pairwise dot products between all rows of the TF-IDF matrix; when the
# second argument is omitted, linear_kernel compares the matrix with itself.
cos_sim = linear_kernel(tf_matrix)

In [64]:
# Build, for every sampled wine, its list of most similar wines as
# (similarity score, wine_id) tuples sorted from most to least similar.
#
# Fixes relative to the previous version:
#   * the slice `[:100:-1]` kept every index except the 101 lowest-scoring
#     ones (almost the whole sample per wine) rather than the top 100;
#   * the wine itself is now removed explicitly instead of assuming it is
#     always the first hit, which is not guaranteed on ties or for an
#     all-zero TF-IDF row;
#   * rows are addressed by position, so the result no longer depends on the
#     index labels of `train` matching matrix row numbers.
N_SIMILAR = 100  # number of neighbours stored per wine

wine_ids = train['wine_id'].to_numpy()
pred_dict = {}
for pos, wine_id in enumerate(wine_ids):
    scores = cos_sim[pos]
    # Indices ordered by descending similarity.
    ranked = scores.argsort()[::-1]
    # Drop the wine's own row, then keep the best N_SIMILAR matches.
    ranked = ranked[ranked != pos][:N_SIMILAR]
    pred_dict[wine_id] = [(scores[i], wine_ids[i]) for i in ranked]

In [69]:
def get_title(id):
    """Return the title of the wine in `train` whose `wine_id` equals `id`.

    Only the part of the title before the first ' - ' separator is returned.
    When several rows match, the first one is used; when none match, an
    IndexError is raised. The parameter name shadows the built-in `id` and is
    kept unchanged for existing callers.
    """
    matching_titles = train.loc[train['wine_id'] == id, 'title']
    first_title = matching_titles.iloc[0]
    return first_title.split(' - ')[0]

def get_recommendation(title_id, num):
    """Print the first `num` entries stored in `pred_dict` for `title_id`.

    A header line naming the wine is printed first, followed by one line per
    recommendation with the recommended wine's title and its similarity score
    formatted to two decimals. Nothing is returned.
    """
    header = f'Top {str(num)} recommendations for {get_title(title_id)}:\n'
    print(header)
    for score, wine_id in pred_dict[title_id][:num]:
        print(f'{get_title(wine_id)}: score = {score:.2f}')

In [92]:
# Pick the wine stored at position 23 of the sample as the demo query and
# look up its title (first matching row).
test_id = train['wine_id'].iloc[23]
test_title = train.loc[train['wine_id'] == test_id, 'title'].iloc[0]

# Print the eight most similar wines for that query.
get_recommendation(test_id, 8)

Top 8 recommendations for H. Abrantes Douro Wines 2011 Vargosa Red (Douro):

Château Beaulieu 2011 Château Beaulieu Rosé (Coteaux d'Aix-en-Provence): score = 0.05
Maison des 3 Ponts 2014 Lepontis Sauvignon Blanc (Charentais): score = 0.04
Sineann 2012 Red (Oregon): score = 0.04
Château Gauthier 2015  Blaye Côtes de Bordeaux: score = 0.04
Olivier Leflaive 2012 Abbaye de Morgeot Premier Cru  (Chassagne-Montrachet): score = 0.04
Kastania 2012 Jaden and Keira's Cuvée Pinot Noir (Sonoma Coast): score = 0.04
Louis Sipp NV Brut Sparkling (Crémant d'Alsace): score = 0.04
Château Troplong Mondot 2007  Saint-Émilion: score = 0.04


In [94]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [114]:
# Build a variety-by-province table of points and price for the kNN model.
col = ['province', 'variety', 'points', 'price']

# Keep one row (the first encountered) per (province, variety) pair so the
# pivot below has unique index/column combinations. Chaining
# `drop_duplicates` returns a new frame; calling it with `inplace=True` on a
# column selection of `wine_df` is what raised SettingWithCopyWarning.
temp_df = wine_df[col].drop_duplicates(['province', 'variety'])

# Rows are varieties; columns are (points|price, province) pairs. Missing
# combinations are filled with 0.
wine_pivot = temp_df.pivot(
    index='variety',
    columns='province',
    values=['points', 'price'],
).fillna(0)

# Sparse copy of the pivot table used to fit the neighbour model.
pivot_matrix = csr_matrix(wine_pivot)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [115]:
# Brute-force nearest-neighbour index over the variety vectors using cosine
# distance. `fit` returns the estimator itself, so `m_knn` is the same object
# as `knn`.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
knn.fit(pivot_matrix)
m_knn = knn

In [121]:
# Choose a random variety (row of the pivot table) and query its ten nearest
# neighbours from the fitted model.
rand_idx = np.random.choice(wine_pivot.shape[0])
query_row = wine_pivot.iloc[rand_idx, :].values.reshape(1, -1)
dist, idx = m_knn.kneighbors(query_row, n_neighbors = 10)

distances = dist.flatten()
neighbours = idx.flatten()

# The first returned neighbour is treated as the query itself and is replaced
# by a header line; the remaining ones are listed with their rank and distance.
print('Top recommendations for ' + wine_pivot.index[rand_idx])
for rank in range(1, len(distances)):
    neighbour_name = wine_pivot.index[neighbours[rank]]
    print(str(rank) + ' ' + neighbour_name + ' with distance ' + str(distances[rank]))

Top recommendations for Nebbiolo
1 Sangiovese with distance 0.5203008843908599
2 Tempranillo with distance 0.6931693010858284
3 Sparkling Blend with distance 0.709833663795427
4 Red Blend with distance 0.7737772438890955
5 Zinfandel with distance 0.7767075492995374
6 White Blend with distance 0.7806166873152927
7 Chardonnay with distance 0.7929535435480267
8 Malbec with distance 0.806316613795247
9 Pinot Gris with distance 0.8148184280321505
