# Building a Crowdsourced Recommendation System
#### The objective of this project is to create the building blocks of a crowdsourced recommender system. It accepts user inputs in the form of desired attributes of a product and come up with 3 recommendations.

## Step 1: Creating a Web Scrapper to Extract Reviews

#### Getting page info

In [38]:
import sys, re, time, string, random, csv, argparse
import requests
import numpy as np
import pandas as pd
from time import sleep
from random import randint

# web scraping libraries
from bs4 import BeautifulSoup

# Url
url_forum = 'https://www.beeradvocate.com/beer/top-rated/'
# Downloading source code of the url
html_data = requests.get(url_forum).text
soup = BeautifulSoup(html_data, 'html.parser')
#soup

In [39]:
# Saving Top 250 Beers
order_soup = [i.td.text if i.td is not None else None for i in soup.find_all('tr')]
beer_soup = [i.b.text if i.b is not None else None for i in soup.find_all('tr')]
company_soup = [i.find('a').text if i.find('a') is not None else None  for i in soup.find_all('span', attrs = {'class':'muted'})]
type_soup = [i.text.split('|')[0] if len(i.text.split('|'))>0 else None for i in soup.find_all('span', attrs = {'class':'muted'} )]
pctage_soup = [i.text.split('|')[1] if len(i.text.split('|'))>1 else None for i in soup.find_all('span', attrs = {'class':'muted'} )]
ratings_soup = [ str(i.find_all('td')[2:3])[58:str(i.find_all('td')[2:3]).find('</b')].replace(',','') if i is not None else None for i in soup.find_all('tr')]
average_soup = [ str(i.find_all('td')[3:4])[58:str(i.find_all('td')[3:4]).find('</b')] if i is not None else None for i in soup.find_all('tr')]
profile_soup = [str(i.a)[9:str(i.a).find('><b')-1] if i is not None else None for i in soup.find_all('tr')]

# Creating a DataFrame
df_beer_ranking = pd.DataFrame({'order':order_soup,
                                'beer':beer_soup,
                                'company':company_soup,
                                'type':type_soup,
                                'percentage':pctage_soup,
                                'ratings':ratings_soup,
                                'average':average_soup,
                                'profile':profile_soup})
df_beer_ranking = df_beer_ranking[1:].copy().reset_index(drop=True)

# Savings
df_beer_ranking.to_csv('df_beer_ranking.csv', index=False)

#### Scraping Reviews

In [40]:
df_reviews = pd.DataFrame() # empty df

for p in range(len(df_beer_ranking)):
  # Random waiting time
  sleep(randint(2,8))

  # Starting
  page = df_beer_ranking.loc[p,'profile']
  # Url
  url_forum = f'https://www.beeradvocate.com{page}'
  # Downloading source code of the url
  html_data = requests.get(url_forum).text
  soup = BeautifulSoup(html_data, 'html.parser')

  # Reviews
  user_soup = [i.find('a', class_='username').text for i in soup.findAll('div', attrs = {'id':'rating_fullview_content_2'})]
  comment_soup = [i.find('span', class_='muted') for i in soup.findAll('div', attrs = {'id':'rating_fullview_content_2'})]
  review_soup = [str(i.find('div'))[str(i.find('div')).find('>')+1:-6].replace('<br>','').replace('</br>','').replace('\n','') 
                  for i in soup.findAll('div', attrs = {'id':'rating_fullview_content_2'})]
  beer_lst = [df_beer_ranking.loc[p,'beer']]*len(user_soup)
  order_lst = [df_beer_ranking.loc[p,'order']]*len(user_soup)

  # Creating a DataFrame
  df_reviews_tmp = pd.DataFrame({'order':order_lst,
                                  'beer':beer_lst,
                                  'user':user_soup,
                                  'comment':comment_soup,
                                  'review':review_soup})
  df_reviews = df_reviews.append(df_reviews_tmp)
  #ncom = len(df_reviews)

  # Saving Reviews
  df_reviews[df_reviews['review']!=''].to_csv('df_reviews_2.csv', index=False)


  df_reviews = df_reviews.append(df_reviews_tmp)


KeyboardInterrupt: 

## Step 2: Picking Three Features from Reviews

In [40]:
import pandas as pd
df = pd.read_csv('df_reviews_2.csv')
df.groupby(['beer'])['beer'].count()

beer
4th Anniversary                             5
A Deal With The Devil - Double Oak-Aged     9
A Deal With The Devil - Triple Oak-Aged     6
Abner                                      13
Abrasive Ale                               11
                                           ..
XTRA DUBL Benthic                           7
Yellow Bus                                  7
Zenne Y Frontera                            6
Zombie Dust                                21
§ucaba                                     15
Name: beer, Length: 248, dtype: int64

In [6]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

[nltk_data] Downloading package punkt to /Users/kenzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kenzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+') #removes punctuation, keeps only words
def counts(x): #count frequencies
  return Counter(tokenizer.tokenize((x.lower())))
c = df['review'].astype(str).apply(counts)
c

0       {'i': 3, 'didnt': 1, 'think': 1, 'was': 2, 'go...
1       {'so': 4, 'i': 16, 'just': 3, 'read': 1, 'a': ...
2       {'2021': 1, 'vintage': 1, 'bottle': 2, '79holy...
3       {'celebrating': 1, 'my': 2, 'buddy': 1, 'rug':...
4       {'thick': 1, 'and': 3, 'syrupy': 2, 'pour': 1,...
                              ...                        
2220    {'pours': 2, 'a': 3, 'thick': 2, 'oily': 1, 'b...
2221    {'2021': 1, '11': 1, '2712oz': 1, 'bottle': 1,...
2222    {'2018': 1, 'vintage12oz': 1, 'bottle': 1, 'po...
2223    {'annual': 1, 'beer': 1, 'grab': 1, 'and': 1, ...
2224    {'vintage': 1, '2020': 2, '5': 1, '18this': 1,...
Name: review, Length: 2225, dtype: object

In [5]:
all = c.sum()

In [7]:
from nltk.corpus import stopwords #removing stopwords from attributes
nostop = [w for w in all if not w.lower() in set(stopwords.words('english'))]
sorted(nostop, key = lambda word:all[word], reverse=True)[:20]

### We can see that some popular attributes are chocolate, dark, bourbon, sweet, aroma. But we want to make sure the attributes we pick actually co-exist in real life (For example, a beer can't be strong and weak). To prevent that from happening, we will use a lift chart.

In [9]:
attrs = sorted(nostop, key = lambda word:all[word], reverse=True)[:20] #finding lifts for some example attributes
#presence mapping
cols = sorted(set(attrs))
import numpy as np
row = np.zeros(len(cols))
def modelcounts(x):
  r = row.copy() #init new row
  for i,b in enumerate(cols):
    if b in x:
      r[i]=1
  return r
counts = pd.DataFrame(c.apply(modelcounts).tolist(),columns = cols)
counts

Unnamed: 0,aroma,beer,body,bourbon,chocolate,coffee,dark,good,head,light,like,medium,nice,nose,notes,one,sweet,taste,vanilla,well
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
2,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2221,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2222,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
2223,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
attrlifts = pd.DataFrame()
N = len(df)
for w1 in counts.columns:
  for w2 in counts.columns:
    nw1 = counts[w1].sum()
    nw2 = counts[w2].sum()
    mask1 = (counts[w1] == 1)
    mask2 = (counts[w2] == 1)
    n12 = len(counts[mask1 & mask2])
    lift = (N*n12)/(nw1*nw2)
    attrlifts.loc[w1,w2] = lift
attrlifts 

Unnamed: 0,aroma,beer,body,bourbon,chocolate,coffee,dark,good,head,light,like,medium,nice,nose,notes,one,sweet,taste,vanilla,well
aroma,4.597107,1.242746,1.62306,1.427092,1.308195,1.302514,1.608512,1.276035,1.599374,1.528949,1.304338,1.596485,1.125389,0.803378,1.707203,1.295217,1.442955,1.484193,1.345785,1.424317
beer,1.242746,2.548683,1.139105,1.264614,1.184959,1.281421,1.29281,1.279026,1.206506,1.189006,1.410878,1.077058,1.212537,1.331254,1.362718,1.355252,1.246137,1.303619,1.228122,1.486732
body,1.62306,1.139105,4.540816,1.340292,1.474173,1.299178,1.551211,1.393964,1.653168,1.946064,1.387472,2.393903,1.436702,1.296117,1.828517,1.387472,1.306513,1.405621,1.576346,1.377769
bourbon,1.427092,1.264614,1.340292,5.661578,2.881845,2.484804,2.719433,1.134397,1.404631,1.377482,1.303062,1.042299,1.176771,1.319203,1.583215,1.314295,1.503679,1.300725,3.168137,1.524271
chocolate,1.308195,1.184959,1.474173,2.881845,4.458918,3.121242,2.631038,1.254071,1.470765,1.363553,1.477459,1.08208,1.307812,1.385295,1.45638,1.282824,1.596956,1.353311,2.95721,1.495834
coffee,1.302514,1.281421,1.299178,2.484804,3.121242,6.180556,2.559236,1.29519,1.345387,1.117467,1.361194,0.995613,1.298916,1.368123,1.355021,1.336668,1.529594,1.397538,2.593912,1.465901
dark,1.608512,1.29281,1.551211,2.719433,2.631038,2.559236,4.606625,1.270209,1.554517,1.460136,1.471561,1.252848,1.329857,1.449074,1.597376,1.517261,1.705471,1.437133,2.518129,1.535542
good,1.276035,1.279026,1.393964,1.134397,1.254071,1.29519,1.270209,4.090074,1.341326,1.214241,1.395819,1.420402,1.294088,1.286586,1.381658,1.339012,1.333183,1.266093,1.250333,1.214787
head,1.599374,1.206506,1.653168,1.404631,1.470765,1.345387,1.554517,1.341326,2.115019,1.666522,1.250547,1.774315,1.523986,1.482567,1.542497,1.229565,1.442639,1.506344,1.430104,1.468763
light,1.528949,1.189006,1.946064,1.377482,1.363553,1.117467,1.460136,1.214241,1.666522,4.966518,1.152942,2.0157,1.594333,1.542996,1.633284,1.251484,1.309082,1.519382,1.350996,1.48571


In [11]:
N

2225

In [13]:
#at a glance, it seems vanilla, bourbon, and dark have decent lifts >2.5. chocolate and coffee could work too
input = set(['vanilla','bourbon','dark']) #change values here if no input file
#input = ['vanilla', 'bourbon', 'dark']
print(input)

{'bourbon', 'dark', 'vanilla'}


## Step 3: Recommendation 1 - Perform a similarity analysis using cosine similarity (without word embeddings) with the 3 attributes specified by the customer and the reviews

In [41]:
#calculating cosine similarity
import numpy as np
def cosinesim(x):
  counter_row = Counter(tokenizer.tokenize((x.lower())))
  Y = [counter_row[i] for i in input]
  if sum(Y) == 0: return 0
  X = np.array([1,1,1]) # vector input

  num = X[0]*Y[0] + X[1]*Y[1] + X[2]*Y[2]
  dem = (X[0]*X[0] + X[1]*X[1] + X[2]*X[2])**.5 * (Y[0]*Y[0] + Y[1]*Y[1] + Y[2]*Y[2])**.5
  cos_sim = num/dem
  return cos_sim
cosim = df['review'].astype(str).apply(cosinesim)

In [19]:
cosinesim("vanilla bourbon dark") #double check that perfect match is 1

1.0000000000000002

In [42]:
out = pd.DataFrame([df['beer'],df['review'],cosim],index=['product_name','product_review','similarity_score']).T
#out.sort_values(by="similarity_score",ascending=False)
out.groupby('product_name')['similarity_score'].mean().sort_values(ascending=False)

product_name
Black Tuesday - Reserve                   0.869032
Expedition Stout - Bourbon Barrel-Aged    0.788349
BDCS                                      0.748466
Fundamental Observation                   0.729515
Truth                                     0.717687
                                            ...   
Impermanence                              0.000000
Cable Car                                 0.000000
Cable Car Kriek                           0.000000
Hommage                                   0.000000
4th Anniversary                           0.000000
Name: similarity_score, Length: 248, dtype: float64

#### Given vanilla bourbon dark as selected attributes, we recommend the three brands as follow:           
*   Black Tuesday - Reserve
*   Expedition Stout - Bourbon Barrel-Aged
*   LBDCS 








## Step 4: For every review, perform a sentiment analysis using VADER

In [23]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
    return sentiment_dict['compound']



In [27]:
sentiment_scores_list = []
for review in out["product_review"]:
  sentiment_score = sentiment_scores(review)
  sentiment_scores_list.append(sentiment_score)
len(sentiment_scores_list) # check number of rows

2225

In [28]:
sorted(sentiment_scores_list)[:5] #from lowest to highest sentiment score

[-0.9742, -0.9493, -0.9447, -0.8989, -0.8772]

In [29]:
positive = filter(lambda score: score > 0, sentiment_scores_list)
len(list(positive))/len(sentiment_scores_list) 
# 2007/90.2% reviews are positive, the rest are neutral or negative

0.9020224719101123

In [53]:
out["sentiment_score"] = sentiment_scores_list
out

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score
0,Kentucky Brunch Brand Stout,I didnt think i was going to give it a perfect...,0.577350,0.9650
1,Kentucky Brunch Brand Stout,So I just read a review that called the legend...,0.774597,0.9972
2,Kentucky Brunch Brand Stout,"2021 vintage, bottle #79Holy. Fucking. Shit. K...",0.816497,0.9921
3,Kentucky Brunch Brand Stout,"Celebrating my buddy @Rug with his 1,000th bee...",0.000000,0.9476
4,Kentucky Brunch Brand Stout,"Thick and syrupy pour, mocha head. Aroma is bo...",0.000000,0.7264
...,...,...,...,...
2220,Expedition Stout - Bourbon Barrel-Aged,"Pours a thick oily black, vigorous pours doesn...",0.577350,0.9212
2221,Expedition Stout - Bourbon Barrel-Aged,2021-11-2712oz bottle served in a pair of Geor...,0.942809,0.7425
2222,Expedition Stout - Bourbon Barrel-Aged,2018 Vintage12oz bottle poured into a tulipA- ...,0.925820,0.9652
2223,Expedition Stout - Bourbon Barrel-Aged,Annual beer grab and generally availed in the ...,0.577350,0.5563


### Step 5: Create an evaluation score for each beer that uses both similarity and sentiment scores.
    total score  = average of (similarity score + sentiment score) or a multiplicative model

In [54]:
# total score is the average of similarity and sentiment scores
out["total_score"] = (out["similarity_score"]+out["sentiment_score"])/2
#out.sort_values(by="total_score",ascending=False)[:10]
out.groupby('product_name')['total_score'].mean().sort_values(ascending=False)

product_name
Black Tuesday - Reserve                    0.854537
BDCS                                       0.792975
Fundamental Observation                    0.785061
Bourbon Paradise                           0.785042
Expedition Stout - Bourbon Barrel-Aged     0.773224
                                             ...   
Art                                        0.162625
Impermanence                               0.160542
Double Dry Hopped Double Citra Daydream    0.110175
Double Nelson                              0.110075
JJJuiceee Machine                          0.099296
Name: total_score, Length: 248, dtype: float64

####    We would recommend **"Black Tuesday - Reserve"**, **"BDCS"**, and **"Fundamental Observation"** to the customer, because these three brands has the highest total score, calculated as the average of similarity score and sentiment score. These three brands have reviews that are the most relevant to the attributes specified by the customer, and those reviews express relatively positive emotions.

## Step 6: Recommendation 3 - Using SpaCy Word Embedding


In [33]:
! pip install spacy
! python -m spacy download en_core_web_md
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_md")

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 5.3 MB/s eta 0:00:01


Installing collected packages: en-core-web-md
  Attempting uninstall: en-core-web-md
    Found existing installation: en-core-web-md 3.4.0
    Uninstalling en-core-web-md-3.4.0:
      Successfully uninstalled en-core-web-md-3.4.0
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [34]:
df2 = pd.DataFrame([df['beer'],df['review']],index=['product_name','product_review']).T
with nlp.disable_pipes():
    spacy_reviews = [nlp(review).vector for review in df2['product_review'] ] #word embedding for reviews
df2['review_vector'] = spacy_reviews

In [35]:
with nlp.disable_pipes():  # word embedding for features
    feature1 = nlp('vanilla').vector
    feature2 = nlp('dark').vector
    feature3 = nlp('bourbon').vector
df2['Feature1 Similarity'] = df2['review_vector'].apply(lambda x: cosine_similarity([feature1],[x])) # cosine sim btw review & feature
df2['Feature2 Similarity'] = df2['review_vector'].apply(lambda x: cosine_similarity([feature2],[x]))
df2['Feature3 Similarity'] = df2['review_vector'].apply(lambda x: cosine_similarity([feature3],[x]))
df2['SpaCy Score'] = (df2['Feature1 Similarity']+df2['Feature2 Similarity']+df2['Feature3 Similarity'])/3 #calculate average
df2.sort_values(by = 'SpaCy Score',ascending=False)
df2.groupby('product_name')['SpaCy Score'].mean().sort_values(ascending=False) #aggregate sim score and rank

product_name
All That Is And All That Ever Will Be                      [[0.3767705]]
Somewhere, Something Incredible Is Waiting To Be Known    [[0.34305763]]
The Peach                                                  [[0.3273111]]
Sunday Brunch                                             [[0.32269025]]
Genealogy Of Morals - Bourbon Barrel-Aged                 [[0.32221702]]
                                                               ...      
Assassin                                                  [[0.18391205]]
Flora                                                     [[0.17236246]]
Chemtrailmix - Rye Barrel                                 [[0.16794708]]
Oude Geuze Vintage                                        [[0.13548075]]
Lou Pepe - Gueuze                                         [[0.11752429]]
Name: SpaCy Score, Length: 248, dtype: object

In [36]:
#with nlp.disable_pipes():  # word embedding for features
feature1 = nlp('vanilla dark bourbon')
    
df2['Feature1 Similarity'] = df2['review_vector'].apply(lambda x: feature1.similarity(nlp(str(x)))) # cosine sim btw review & feature
#df2['SpaCy Score'] = (df2['Feature1 Similarity']+df2['Feature2 Similarity']+df2['Feature3 Similarity'])/3 #calculate average
df2.sort_values(by = 'Feature1 Similarity',ascending=False)
#df2.groupby('product_name')['Feature1 Similarity'].mean().sort_values(ascending=False) #aggregate sim score and rank

Unnamed: 0,product_name,product_review,review_vector,Feature1 Similarity,Feature2 Similarity,Feature3 Similarity,SpaCy Score
971,Beatification,3/25/15,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.013892,[[0.0]],[[0.0]],[[0.0]]
1783,Truth - Vanilla Bean,8/20/2022,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.013892,[[0.0]],[[0.0]],[[0.0]]
1784,Truth - Vanilla Bean,2/6/21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.013892,[[0.0]],[[0.0]],[[0.0]]
702,Oude Geuze Vintage,2016,"[0.46224, -6.2749, 4.1947, 2.7563, 8.3759, -5....",-0.014847,[[-0.111383274]],[[-0.16449346]],[[-0.104845814]]
1892,Bad Boy,6-28-19,"[-2.9978187, -6.4110003, -0.810454, 6.705826, ...",-0.042440,[[-0.06961019]],[[-0.13597737]],[[-0.09225232]]
...,...,...,...,...,...,...,...
1915,Cellarman Barrel Aged Saison,Bottle picked up from brewery today. Golden de...,"[-0.68508285, -1.7408154, -1.8804542, 0.618368...",-0.068094,[[0.34850982]],[[0.19004977]],[[0.29275325]]
499,Doubleganger,"Fresh cans from Sandwich, enjoyed on Cape Cod ...","[-2.0624087, -0.24333702, -2.2288241, 1.030628...",-0.068094,[[0.37675303]],[[0.15755668]],[[0.2757649]]
1304,King Sue - Double Dry-Hopped,Pours a hazed straw/gold with 3 fingers onf eg...,"[-1.3239363, -2.3851368, -1.2308985, 0.8977245...",-0.068094,[[0.2881712]],[[0.09969126]],[[0.24486847]]
682,Lou Pepe - Framboise,Today our fence is being repaired after a Torn...,"[-1.9826367, -1.0744156, -0.9610734, 0.6756927...",-0.068094,[[0.39452243]],[[0.1919223]],[[0.31008202]]


#### Previously, basic Bag of Words cosine similarity system recommended three beers: 
1.   Black Tuesday - Reserve
2.   BDCS 
3.   Fundamental Observation

#### Using spaCy word2vec, we recommend the following beers: 
1.   The Peach
2.   Sunday Brunch
3.   Genealogy Of Morals - Bourbon Barrel-Aged




In [37]:
def contain_vanilla(x):
    words = x.split()
    for word in words:
      if word == "vanilla":
        return 1
    return 0

def contain_dark(x):
    words = x.split()
    for word in words:
      if word == "dark":
        return 1
    return 0

def contain_bourbon(x):
    words = x.split()
    for word in words:
      if word == "bourbon":
        return 1
    return 0

df2['Review Contain vanilla'] = df2['product_review'].apply(contain_vanilla)
df2['Review Contain dark'] = df2['product_review'].apply(contain_dark)
df2['Review Contain bourbon'] = df2['product_review'].apply(contain_bourbon)  # check if the review contain the feature(exact match)

w2v_rec = df2[df2['product_name'].isin(('Sunday Brunch','The Peach','Genealogy Of Morals - Bourbon Barrel-Aged'))][['product_name','Review Contain vanilla','Review Contain dark','Review Contain bourbon']]
bow_rec = df2[df2['product_name'].isin(('Black Tuesday - Reserve','BDCS','Fundamental Observation'))][['product_name','Review Contain vanilla','Review Contain dark','Review Contain bourbon']]


### Comparing results:
#### Bag of word recommendation - % of its reviews mentioning the specified features

In [38]:
def get_ratio(s):
   return ((s.sum()/s.count())*100).round(2).astype('str')+'%'

bow_rec.groupby('product_name',as_index=False)[['Review Contain vanilla','Review Contain dark','Review Contain bourbon']].agg(get_ratio)

Unnamed: 0,product_name,Review Contain vanilla,Review Contain dark,Review Contain bourbon
0,BDCS,38.46%,69.23%,61.54%
1,Black Tuesday - Reserve,14.29%,57.14%,71.43%
2,Fundamental Observation,52.94%,52.94%,35.29%


#### W2Vec recommendation - % of its reviews mentioning the specified features

In [39]:
w2v_rec.groupby('product_name',as_index=False)[['Review Contain vanilla','Review Contain dark','Review Contain bourbon']].agg(get_ratio)


Unnamed: 0,product_name,Review Contain vanilla,Review Contain dark,Review Contain bourbon
0,Genealogy Of Morals - Bourbon Barrel-Aged,12.5%,12.5%,50.0%
1,Sunday Brunch,16.67%,50.0%,0.0%
2,The Peach,0.0%,0.0%,0.0%


#### SpaCy word embedding approach will always yield recommendations with higher similarity scores becuase it take into consideration of similar words through spacy library. When we do comparison of '% of review containing a spaeific feature' the bow apporach will always yield a higher percentage, because it looks for an exact match in calculating cosine similarity

## Step 7: Comparing recommendation result with simply chosing the 3 highest rated products from  entire dataset

#### List of 3 Highest rated beers

In [43]:
#df.groupby("order")#[:3]
df[['beer','order']].drop_duplicates()[:3]

Unnamed: 0,beer,order
0,Kentucky Brunch Brand Stout,1
16,Vanilla Bean Assassin,2
20,Marshmallow Handjee,3


#### We can observe that the 3 highest rated products are: 
(The order catch the rank of the beers according to the web-page)
+ Kentucky Brunch Brand Stout 
+ Vanilla Bean Assassin
+ Marshmallow Handjee

In [56]:
# Formatting variables
out['similarity_score'] = out['similarity_score'].astype('float')
out['total_score'] = out['total_score'].astype('float')


Unnamed: 0,product_name,product_review,similarity_score,sentiment_score,total_score
0,Kentucky Brunch Brand Stout,I didnt think i was going to give it a perfect...,0.577350,0.9650,0.771175
1,Kentucky Brunch Brand Stout,So I just read a review that called the legend...,0.774597,0.9972,0.885898
2,Kentucky Brunch Brand Stout,"2021 vintage, bottle #79Holy. Fucking. Shit. K...",0.816497,0.9921,0.904298
3,Kentucky Brunch Brand Stout,"Celebrating my buddy @Rug with his 1,000th bee...",0.000000,0.9476,0.473800
4,Kentucky Brunch Brand Stout,"Thick and syrupy pour, mocha head. Aroma is bo...",0.000000,0.7264,0.363200
...,...,...,...,...,...
2220,Expedition Stout - Bourbon Barrel-Aged,"Pours a thick oily black, vigorous pours doesn...",0.577350,0.9212,0.749275
2221,Expedition Stout - Bourbon Barrel-Aged,2021-11-2712oz bottle served in a pair of Geor...,0.942809,0.7425,0.842655
2222,Expedition Stout - Bourbon Barrel-Aged,2018 Vintage12oz bottle poured into a tulipA- ...,0.925820,0.9652,0.945510
2223,Expedition Stout - Bourbon Barrel-Aged,Annual beer grab and generally availed in the ...,0.577350,0.5563,0.566825


#### Analysis

In [57]:
# 3 highest rated Analysis
out[out['product_name'].isin(list(df['beer'].unique()[:3]))].groupby('product_name').mean()

Unnamed: 0_level_0,similarity_score,sentiment_score,total_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kentucky Brunch Brand Stout,0.22956,0.723875,0.476718
Marshmallow Handjee,0.48137,0.564033,0.522702
Vanilla Bean Assassin,0.531636,0.72175,0.626693


In [58]:
# 1st Recommendation: Total Score
out[out['product_name'].isin(list(['Black Tuesday - Reserve', 'BDCS', 'Fundamental Observation']))].groupby('product_name').mean()

Unnamed: 0_level_0,similarity_score,sentiment_score,total_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BDCS,0.748466,0.837485,0.792975
Black Tuesday - Reserve,0.869032,0.840043,0.854537
Fundamental Observation,0.729515,0.840606,0.785061


In [59]:
# 2nd Recommendation: W2Vec
out[out['product_name'].isin(list(['Sunday Brunch','The Peach','Genealogy Of Morals - Bourbon Barrel-Aged']))].groupby('product_name').mean()

Unnamed: 0_level_0,similarity_score,sentiment_score,total_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Genealogy Of Morals - Bourbon Barrel-Aged,0.413675,0.723313,0.568494
Sunday Brunch,0.327165,0.798633,0.562899
The Peach,0.0,0.889375,0.444688


#### Observation:
+ If we choose the 3 highest rated beers, they would not meet the requirements which the user are looking for (vanilla, dark, bourbon) because their scores are not as higher as the specialized recommendation.
+ As we can observe, although the 3 highest rated have good and positive sentiment analysis, it's so clear that they are not related to much with the attributes the user is interested in, so it means they are not direct competitors. For that reason, the total score is lower than our recommendations.
+ Lastly, we can noted that our recommendations show highest similarity score and so total score.

## Step 8:  Using the top four attributes of beer (from word frequency analysis), calculate the lifts between these attributes and any 10 beers in your data. Choose one beer, and find the most similar beer (among the remaining 9) using the lift values.

The top four attributes of beer are taste, chocolate,dark and sweet

In [60]:
features = ['taste', 'chocolate','dark','sweet'] #The top four attributes of beer are taste, chocolate, dark and sweet
import numpy as np
row = np.zeros(len(features))
def modelcounts(x):
  r = row.copy() #init new row
  for i,b in enumerate(features):
    if b in x:
      r[i]=1
  return r

counts_df = pd.DataFrame(c.apply(modelcounts).tolist(),columns = features,index=df.beer).reset_index()
counts_df 

Unnamed: 0,beer,taste,chocolate,dark,sweet
0,Kentucky Brunch Brand Stout,1.0,1.0,0.0,0.0
1,Kentucky Brunch Brand Stout,0.0,0.0,1.0,0.0
2,Kentucky Brunch Brand Stout,1.0,1.0,0.0,0.0
3,Kentucky Brunch Brand Stout,0.0,1.0,0.0,0.0
4,Kentucky Brunch Brand Stout,0.0,1.0,0.0,0.0
...,...,...,...,...,...
2220,Expedition Stout - Bourbon Barrel-Aged,1.0,1.0,0.0,0.0
2221,Expedition Stout - Bourbon Barrel-Aged,1.0,1.0,1.0,0.0
2222,Expedition Stout - Bourbon Barrel-Aged,0.0,1.0,1.0,0.0
2223,Expedition Stout - Bourbon Barrel-Aged,0.0,0.0,1.0,0.0


In [61]:
import random
beers = df['beer'].unique()
sample_list = random.choices(beers, k=10) 

In [62]:
import numpy as np
data=np.zeros(10*4).reshape(10,4) 
lifts_df=pd.DataFrame(data,columns=features,index=sample_list) #initial lifts_df between sample_list and features

In [63]:
sample_list

['Imperial German Chocolate Cupcake Stout',
 'Crusher',
 'King Sue - Double Dry-Hopped',
 'Expedition Stout - Bourbon Barrel-Aged',
 'I Will Not Be Afraid',
 'Last Buffalo In The Park',
 'Ghost In The Machine - Double Dry-Hopped',
 'Darkness',
 'Keene Idea',
 'Expedition Stout - Bourbon Barrel-Aged']

In [64]:
#calculate the lift between beer and feature
n_total=counts_df.shape[0] # totall number of reviews
for i in range(len(sample_list)):
    n_p=counts_df[counts_df.beer==sample_list[i]].shape[0] #number of reviews that mention sample_list[i]
    for j in range(len(features)):
        n_pf=counts_df[counts_df.beer==sample_list[i]][features[j]].sum() #number of reviews that mention beer sample_list[j] and features[j]
        n_f=counts_df[features[j]].sum() #number of reviews that mention features[j]
        lift=n_total*n_pf/n_p/n_f #calculate the lift between beer and feature
        lifts_df.iloc[i][j]=lift
lifts_df 

Unnamed: 0,taste,chocolate,dark,sweet
Imperial German Chocolate Cupcake Stout,0.0,2.972612,0.0,0.0
Crusher,0.597877,0.0,0.0,0.497429
King Sue - Double Dry-Hopped,0.896816,0.0,0.0,0.746144
Expedition Stout - Bourbon Barrel-Aged,1.729573,2.866447,3.290447,0.319776
I Will Not Be Afraid,1.076179,2.675351,1.84265,1.790744
Last Buffalo In The Park,1.537399,1.910965,0.658089,0.0
Ghost In The Machine - Double Dry-Hopped,1.345224,0.0,0.0,0.0
Darkness,1.076179,3.121242,2.763975,1.119215
Keene Idea,0.733758,0.0,0.0,0.813975
Expedition Stout - Bourbon Barrel-Aged,1.729573,2.866447,3.290447,0.319776


In [66]:
from scipy import spatial

feature_vector_1 = lifts_df.iloc[0] #Choose the vector of one beer
for i in range(9):
    feature_vector_2 = lifts_df.iloc[i+1] #choose the vector of another beer
    cosine_similarity = 1 - spatial.distance.cosine(feature_vector_1, feature_vector_2) #calculates the cosine similarity between the two vectors
    print(f'cosine similarity of {lifts_df.iloc[i+1].name:50} is {cosine_similarity:10f}')

cosine similarity of Crusher                                            is   0.000000
cosine similarity of King Sue - Double Dry-Hopped                       is   0.000000
cosine similarity of Expedition Stout - Bourbon Barrel-Aged             is   0.609231
cosine similarity of I Will Not Be Afraid                               is   0.692673
cosine similarity of Last Buffalo In The Park                           is   0.752531
cosine similarity of Ghost In The Machine - Double Dry-Hopped           is   0.000000
cosine similarity of Darkness                                           is   0.701580
cosine similarity of Keene Idea                                         is   0.000000
cosine similarity of Expedition Stout - Bourbon Barrel-Aged             is   0.609231
