### 1. DATA PREPROCESSING

In [4]:
# Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors 
import re
import nltk
import csv
import time
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import itertools
from collections import Counter
import operator

# User-input dataset
input_data = pd.read_csv('user_input_set.csv')

# Master Dataset
df = pd.read_csv('df_mapped_final.csv')
df.reset_index(inplace = True)
df.drop(['index', 'Unnamed: 0'], axis = 1, inplace = True)

whiskeys = pd.read_csv('whiskey_data_new_final_for_KNN.csv')
whiskeys.drop('Index', axis = 1, inplace = True)

whiskey_set1 = set(df['whiskey_name'])
whiskey_set2 = set(whiskeys['whiskey_name'])
whiskey_set = whiskey_set1 & whiskey_set2
df = df.loc[df.whiskey_name.isin(whiskey_set)].reset_index()
df.drop('index', axis = 1, inplace = True)
whiskeys = whiskeys.loc[whiskeys.whiskey_name.isin(whiskey_set)].reset_index()
whiskeys.drop('index', axis = 1, inplace = True)

whiskey_dict = dict(set([i for i in zip(df['whiskey_name'], df['whiskey_id'])]))
user_dict = dict(set([i for i in zip(df['name'], df['user_id'])]))
whiskeys = whiskeys.assign(whiskey_id=whiskeys['whiskey_name'].map(whiskey_dict))
whiskeys.whiskey_id = whiskeys.whiskey_id.astype(int)

# Inverse whiskey dictionary is in the {'id':'whiskey_name'} format. 
# There exists another whiskey dict above to match whiskey_id across dataset
inverse_whiskey_dict = dict(set([i for i in zip(df['whiskey_id'], df['whiskey_name'])]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/han2114/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# whiskey_cb is the dataset containing the numerical features used for content-based recommendation system
# Choose the numerical values
whiskeys_cb = whiskeys[['whiskey_id', 'whiskey_name', 'type', 'abv_score', 'age(Year)',
       'smoky', 'peaty', 'spicy', 'herbal', 'oily',
       'full_bodied', 'rich', 'sweet', 'briny', 'salty', 'vanilla', 'tart',
       'fruity', 'floral', 'cost_count']]

# Fill NAs values in age(Year) with mean age.
whiskeys_cb['age(Year)'] = whiskeys_cb['age(Year)'].fillna(whiskeys_cb['age(Year)'].mean())

# Create dummy variables for whiskey types
whiskeys_cb = pd.concat([whiskeys_cb, whiskeys_cb.type.str.get_dummies()], axis=1)
whiskeys_cb.drop('type', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
whiskeys_cb.head(10)

Unnamed: 0,whiskey_id,whiskey_name,abv_score,age(Year),smoky,peaty,spicy,herbal,oily,full_bodied,...,Peated Blended Malt,Peated Single Malt,Rye,Single Grain,Single Malt,Single Pot Still,Spirit,Tennessee,Wheat Whiskey,White
0,974,Hibiki 21 Year,43.0,21.0,30,85,50,30,20,80,...,0,0,0,0,0,0,0,0,0,0
1,996,Highland Park 18 Year,43.0,18.0,40,30,40,20,40,70,...,0,1,0,0,0,0,0,0,0,0
2,325,Bowmore Mizunara Cask Finish,53.9,14.177221,30,20,40,30,10,75,...,0,1,0,0,0,0,0,0,0,0
3,2052,The Macallan M,47.7,14.177221,50,10,60,0,15,90,...,0,0,0,0,1,0,0,0,0,0
4,1663,Port Ellen 32 Year 1979 (2012 Special Release),52.5,32.0,70,50,20,10,20,80,...,0,1,0,0,0,0,0,0,0,0
5,2070,Thomas H. Handy Sazerac Rye (Fall 2015),63.45,6.0,10,0,60,0,10,80,...,0,0,1,0,0,0,0,0,0,0
6,2242,William Larue Weller Bourbon (Fall 2015),67.3,12.0,10,0,30,0,30,80,...,0,0,0,0,0,0,0,0,0,0
7,1971,The Balvenie Tun 1509 Batch 1,47.1,14.177221,10,0,25,10,15,80,...,0,0,0,0,1,0,0,0,0,0
8,289,"Booker's Bourbon Batch 2015-01 ""Big Man, Small...",64.35,7.0,15,0,35,5,20,80,...,0,0,0,0,0,0,0,0,0,0
9,748,Four Roses Limited Edition Single Barrel Bourb...,59.8,13.0,30,0,80,40,40,90,...,0,0,0,0,0,0,0,0,0,0


### 2. COSINE SIMILARITY

In [16]:
def recommend_cosine(input_data, n):
    
    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    whiskey_df = whiskeys_cb.drop('whiskey_name', axis = 1)
    whiskey_df.set_index('whiskey_id', inplace=True)
    whiskey_mat = whiskey_df.values

    whiskey_vec = 0
    for i in range(len(chosen_index)):
        whiskey_vec += whiskey_df.loc[chosen_index[i]] * chosen_rating[i]
    whiskey_vec = np.array(whiskey_vec/len(whiskey_vec)).reshape(1, -1)
    
    cs = cosine_similarity(whiskey_vec, whiskey_mat)
    rec_index = np.argsort(cs)[0][::-1]
    
    recommendations = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations.append(rec)
        i += 1
    
    return [inverse_whiskey_dict[rec] for rec in recommendations]

In [17]:
recommend_cosine(input_data, 10)

['Bowmore Mizunara Cask Finish',
 'Sonoma County Distilling Co. West of Kentucky Bourbon No. 1',
 'Nikka All Malt Whisky',
 'Sonoma County Distilling Co. Cherrywood Rye Whiskey (Batch #1)',
 'Fettercairn Fior',
 'Catskill The One and Only Buckwheat',
 'Kurayoshi 12 Year Pure Malt Whisky',
 "Michter's US*1 Toasted Barrel Finish Bourbon (2015 Release)",
 'Clynelish Select Reserve (2015 Special Release)',
 'Benromach Imperial Proof',
 'Ardbeg Dark Cove (2016 Committee Release)']

### 3. JACCARD ITEM SIMILARITY

In [40]:
def recommend_jaccard(input_data, n):

    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    whiskey_df = whiskeys_cb.drop('whiskey_name', axis = 1)
    whiskey_df.set_index('whiskey_id', inplace=True)
    whiskey_mat = whiskey_df.values

    whiskey_vec = 0
    for i in range(len(chosen_index)):
        whiskey_vec += whiskey_df.loc[chosen_index[i]] * chosen_rating[i]
    whiskey_vec = np.array(whiskey_vec/len(whiskey_vec)).reshape(1, -1)
    
    j_dist = pairwise_distances(whiskey_vec, whiskey_mat, metric='jaccard')
    rec_index = np.argsort(j_dist)[0]
    
    recommendations = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations.append(rec)
        i += 1
    
    return [inverse_whiskey_dict[rec] for rec in recommendations]

In [87]:
recommend_jaccard(input_data, 10)



['Amrut Bengal Tiger Single Cask Strength',
 'Connemara 12 Year',
 'Kavalan Distillery Reserve Peaty Cask',
 'Caol Ila 12 Year ',
 'Tomintoul Peaty Tang',
 'Highland Park Thor',
 "Bunnahabhain 8 Year The MacPhail's Collection (Gordon & MacPhail)",
 'Longrow Red 11 Year Cabernet Sauvignon Finish',
 'Springbank 21 Year',
 'Amrut Spectrum',
 'Laphroaig Quarter Cask']

### 4. KNN (With Ball Tree algorithm)

In [82]:
def recommend_knn(whiskey_id, n):
    
    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    whiskey_df = whiskeys_cb.drop('whiskey_name', axis = 1)
    whiskey_np = np.array(whiskey_df)
    knn = NearestNeighbors(n_neighbors = n, algorithm='ball_tree').fit(whiskey_np)

    whiskey_test = 0
    for i in range(len(chosen_index)):
        whiskey_test += whiskey_df.loc[chosen_index[i]] * chosen_rating[i]
    whiskey_test = np.array(whiskey_test/len(whiskey_test)).reshape(1, -1)
    
    distances, indices = knn.kneighbors(whiskey_test)  
    recommendations = list()
    for index in indices[0]:
        if (index not in chosen_index):
            rec = inverse_whiskey_dict[index]
            recommendations.append(rec)
    return recommendations

In [83]:
recommend_knn(input_data, 10)

['Johnny Drum Private Stock Bourbon',
 'Westland Single Cask Release #281',
 'Sazerac 18 Year Rye (Fall 2014)',
 'Double Barrel Highland Park and Bowmore',
 "Jack Daniel's Sinatra Select",
 'William Larue Weller Bourbon (Fall 2014) ',
 'Wild Turkey Forgiven',
 'Lagavulin 12 Year (2008 Special Release)',
 'Four Roses Single Barrel Bourbon',
 'Whisky Jewbilee Festival Bottling 2014 (Single Cask Nation)']

### 5. Pearson Correlation

In [84]:
def recommend_pearson(input_data, n):

    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    whiskey_df = whiskeys_cb.drop('whiskey_name', axis = 1)
    whiskey_df.set_index('whiskey_id', inplace=True)
    whiskey_mat = whiskey_df.values

    whiskey_vec = 0
    for i in range(len(chosen_index)):
        whiskey_vec += whiskey_df.loc[chosen_index[i]] * chosen_rating[i]
    whiskey_vec = np.array(whiskey_vec/len(whiskey_vec)).reshape(1, -1)
    
    p_dist = pairwise_distances(whiskey_vec, whiskey_mat, metric='correlation')
    rec_index = np.argsort(p_dist)[0]
    
    recommendations = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations.append(rec)
        i += 1
    
    return [inverse_whiskey_dict[rec] for rec in recommendations]

In [85]:
recommend_pearson(input_data, 10)

['Bowmore Mizunara Cask Finish',
 'Sonoma County Distilling Co. West of Kentucky Bourbon No. 1',
 'Sonoma County Distilling Co. Cherrywood Rye Whiskey (Batch #1)',
 "Michter's US*1 Toasted Barrel Finish Bourbon (2015 Release)",
 'Nikka All Malt Whisky',
 'Compass Box The Circus',
 'Sonoma County Distilling Co. West of Kentucky Bourbon No. 2',
 'Catskill The One and Only Buckwheat',
 'Clynelish Select Reserve (2015 Special Release)',
 'Fettercairn Fior',
 'Ardbeg Dark Cove (2016 Committee Release)']

### 6. TF-IDF

In [44]:
def item(id_):
    return new_d.loc[id_-1]['whiskey_name']

def recommend(id_, num):
    recs = results[id_-1][:num]
    return [inverse_whiskey_dict[i[1]] for i in recs]

def recommend_tfidf(input_data, num=1):
    
    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    new_data = whiskeys[['whiskey_id', 'type', 'origin', 'flavor_headlines', 'tastes_note', 'description']]
    
    ps = PorterStemmer()
    size = 2474
    corpus=[]
    def Data_Processing(file):
        for i in range(len(file)):
            type_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["type"].iloc[i]).split()
            flavor_headlines_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["flavor_headlines"].iloc[i]).lower().split()
            tastes_note_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["tastes_note"].iloc[i]).lower().split()
            description_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["description"].iloc[i]).lower().split()
            origin_temp = new_data["origin"].iloc[i].lower()
            type_temp = [ps.stem(word) for word in type_temp if not word in set(stopwords.words('english')) and len(word) != 1] 
            flavor_headlines_temp = [ps.stem(word) for word in flavor_headlines_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            tastes_note_temp = [ps.stem(word) for word in tastes_note_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            description_temp = [ps.stem(word) for word in description_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            origin_temp = [ps.stem(word) for word in origin_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            type_temp = ' '.join(type_temp)
            flavor_headlines_temp = ' '.join(flavor_headlines_temp)
            tastes_note_temp = ' '.join(tastes_note_temp)
            description_temp = ' '.join(description_temp)
            origin_temp = ' '.join(origin_temp)
            paragraph = type_temp+flavor_headlines_temp+tastes_note_temp+description_temp+origin_temp
            corpus.append(paragraph)
    corpus_1 = Data_Processing(new_data)
    new_data["corpus"] = corpus
    
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(new_data["corpus"])
    cosine_similarities = cosine_similarity(tfidf_matrix,Y=None,dense_output=False)
    results = {}
    for idx, row in new_data.iterrows(): 
        x = cosine_similarities[idx].toarray()
        similar_indices = np.argsort(-x)
        similar_items = [(cosine_similarities[idx,j], (j)) for i in similar_indices for j in i ]
        results[idx] = similar_items[1:]
        
    new_d = pd.DataFrame({'Index': whiskeys['whiskey_id'], 'whiskey_name': whiskeys['whiskey_name']})
    
    rt = list()
    for i in chosen_index:
        rt.append(recommend(i, num))
    
    rt = list(itertools.chain(*rt))
    
    return rt

In [112]:
recommend_tfidf(input_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


['Eagle Rare 17 Year Bourbon (Fall 2018)',
 'Wayne Gretzky No. 99 Ninety Nine Proof Canadian Whisky',
 'Aberlour 15 Year Select Cask Reserve',
 'St. George Single Malt Lot 16',
 'Royal Canadian Small Batch',
 'Twelve Barrels Canadian Whisky',
 'Wayne Gretzky No. 99 Ice Cask',
 'Bowmore 12 Year',
 'Port Charlotte 2007 CC:01 ',
 'Nikka Super "Revival" Limited Edition Blended Whisky']

### 7. Combine all similarity metrics

In [53]:
def top_k(numbers, k):
    c = Counter(numbers)
    most_common = [(key, val) for key, val in c.most_common(k)]
    return most_common

def recommend_content_based(input_data, n):
    rc = recommend_cosine(input_data, n)
    rp = recommend_pearson(input_data, n)
    rk = recommend_knn(input_data, n)
    rj = recommend_jaccard(input_data, n)
    rt = recommend_tfidf(input_data)
    combined_list = rc + rp + rk + rj + rt
    return top_k(combined_list, n)

In [116]:
recommend_content_based(input_data, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[('Bowmore Mizunara Cask Finish', 2),
 ('Sonoma County Distilling Co. West of Kentucky Bourbon No. 1', 2),
 ('Nikka All Malt Whisky', 2),
 ('Sonoma County Distilling Co. Cherrywood Rye Whiskey (Batch #1)', 2),
 ('Fettercairn Fior', 2),
 ('Catskill The One and Only Buckwheat', 2),
 ("Michter's US*1 Toasted Barrel Finish Bourbon (2015 Release)", 2),
 ('Clynelish Select Reserve (2015 Special Release)', 2),
 ('Ardbeg Dark Cove (2016 Committee Release)', 2),
 ('Kurayoshi 12 Year Pure Malt Whisky', 1)]

### 7. FUNCTIONALIZE THE MODEL

In [5]:
def recommend_all(input_file_name, n):
    
    def top_k(numbers, k):
        c = Counter(numbers)
        most_common = [(key, val) for key, val in c.most_common(k)]
        return most_common
    
    def item(id_):
        return new_d.loc[id_-1]['whiskey_name']

    def recommend(id_, num):
        recs = results[id_-1][:num]
        return [inverse_whiskey_dict[i[1]] for i in recs]
    
    input_data = pd.read_csv(input_file_name)
    chosen_index = np.array(input_data['whiskey_id'])
    chosen_rating = np.array(input_data['rating'])
    
    whiskey_df = whiskeys_cb.drop('whiskey_name', axis = 1)
    whiskey_df.set_index('whiskey_id', inplace=True)
    whiskey_mat = whiskey_df.values
    
    whiskey_vec = 0
    for i in range(len(chosen_index)):
        whiskey_vec += whiskey_df.loc[chosen_index[i]] * chosen_rating[i]
    whiskey_vec = np.array(whiskey_vec/len(whiskey_vec)).reshape(1, -1)
    
    # COSINE SIMILARITY
    cs = cosine_similarity(whiskey_vec, whiskey_mat)
    rec_index = np.argsort(cs)[0][::-1]
    
    recommendations_cosine = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations_cosine.append(rec)
        i += 1
    
    # JACCARD SIMILARITY
    j_dist = pairwise_distances(whiskey_vec, whiskey_mat, metric='jaccard')
    rec_index = np.argsort(j_dist)[0]
    
    recommendations_jaccard = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations_jaccard.append(rec)
        i += 1
        
    # PEARSON CORRELATION
    p_dist = pairwise_distances(whiskey_vec, whiskey_mat, metric='correlation')
    rec_index = np.argsort(p_dist)[0]
    
    recommendations_pearson = list()
    
    i = 0
    while i <= n:
        rec = whiskey_df.index[rec_index[i]]
        if (rec not in chosen_index):
            recommendations_pearson.append(rec)
        i += 1
    
    # KNN
    from sklearn.neighbors import NearestNeighbors 
    whiskey_np = np.array(whiskey_df)
    knn = NearestNeighbors(n_neighbors = n, algorithm='ball_tree').fit(whiskey_np)
    
    distances, indices = knn.kneighbors(whiskey_vec)  
    recommendations_knn = list()
    for index in indices[0]:
        if (index not in chosen_index):
            rec = inverse_whiskey_dict[index]
            recommendations_knn.append(rec)
    
    # TF-IDF
    new_data = whiskeys[['whiskey_id', 'type', 'origin', 'flavor_headlines', 'tastes_note', 'description']]    
    ps = PorterStemmer()
    size = 2474
    corpus=[]
    def Data_Processing(file):
        for i in range(len(file)):
            type_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["type"].iloc[i]).split()
            flavor_headlines_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["flavor_headlines"].iloc[i]).lower().split()
            tastes_note_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["tastes_note"].iloc[i]).lower().split()
            description_temp = re.sub('[^A-Za-z0-9 ]+', '', new_data["description"].iloc[i]).lower().split()
            origin_temp = new_data["origin"].iloc[i].lower()
            type_temp = [ps.stem(word) for word in type_temp if not word in set(stopwords.words('english')) and len(word) != 1] 
            flavor_headlines_temp = [ps.stem(word) for word in flavor_headlines_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            tastes_note_temp = [ps.stem(word) for word in tastes_note_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            description_temp = [ps.stem(word) for word in description_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            origin_temp = [ps.stem(word) for word in origin_temp if not word in set(stopwords.words('english')) and len(word) != 1]
            type_temp = ' '.join(type_temp)
            flavor_headlines_temp = ' '.join(flavor_headlines_temp)
            tastes_note_temp = ' '.join(tastes_note_temp)
            description_temp = ' '.join(description_temp)
            origin_temp = ' '.join(origin_temp)
            paragraph = type_temp+flavor_headlines_temp+tastes_note_temp+description_temp+origin_temp
            corpus.append(paragraph)
    corpus_1 = Data_Processing(new_data)
    new_data["corpus"] = corpus
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(new_data["corpus"])
    cosine_similarities = cosine_similarity(tfidf_matrix,Y=None,dense_output=False)
    results = {}
    for idx, row in new_data.iterrows(): 
        x = cosine_similarities[idx].toarray()
        similar_indices = np.argsort(-x)
        similar_items = [(cosine_similarities[idx,j], (j)) for i in similar_indices for j in i ]
        results[idx] = similar_items[1:]
    new_d = pd.DataFrame({'Index': whiskeys['whiskey_id'], 'whiskey_name': whiskeys['whiskey_name']})
    recommendations_tfidf = list()
    for i in chosen_index:
        recommendations_tfidf.append(recommend(i, 1))
    import itertools
    recommendations_tfidf = list(itertools.chain(*recommendations_tfidf))
    
    combined_list = recommendations_cosine + recommendations_pearson + recommendations_jaccard + recommendations_knn + recommendations_tfidf
    return [inverse_whiskey_dict[i[0]] for i in top_k(combined_list, n)]

In [6]:
recommend_all('user_input_set.csv', 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


['Glenfarclas 21 Year',
 'A Drop of the Irish Sherry Cask Finish (Blackadder)',
 'Fettercairn Fior',
 'Nikka Taketsuru Pure Malt 21 Year',
 'Nikka Yoichi 15 Year Single Malt',
 'Jura Superstition',
 'Kurayoshi 12 Year Pure Malt Whisky',
 'Barrell Bourbon Batch 004',
 'Jim Beam Single Barrel Bourbon',
 'Bunnahabhain 18 Year']