In [1]:
''' Installing neccesary libraries'''
import pandas as pd
import numpy as np
import nltk #for natural language processing
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords #to remove stopwords
stop = set(stopwords.words('english')) #remove common english stopwords
stop.remove('not') #can convey different sentiment, we want to keep not
import re #importing regular expression to clean reviews


import string
from scipy import spatial #calculating bag-of-words cosine similarity

import spacy #for calculating word vector cosine similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer #using Vader tool for sentiment analysis 

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\india\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\india\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Importing Dataset Containing WebScraped Reviews

In [2]:
beer_reviews = pd.read_csv('REALbeer_reviews.csv') 
beer_reviews['product_review'] = beer_reviews['product_review'].astype(str)

In [61]:

class Recommender:

    def __init__(self,attributes): #must be 3 attributes
        self.attr = attributes

    def CosSim(self, review):  # calculates cosine similary
        reviews = review
        feature_vector = [1.0, 1.0, 1.0]
        cosine_sim = []  # empty list to store cos score
        for rev in reviews:
            review_vector = self.generate_bow(rev)  # returns array containing number of times each attr occurs in rev
            cosine_similarity = 1 - spatial.distance.cosine(feature_vector, review_vector)
            # calculates cosine distance of each review and our list of attributes
            cosine_sim.append(cosine_similarity)  # adds cosine score for each review to our list

        # cosine sim will return nan if attributes were not present in the review
        # we will convert these to zeros and transform it to a pandas series
        cosine_sim = pd.Series(cosine_sim).fillna(0)
        return cosine_sim

    def generate_bow(self, review):
        ''' returns an array with the number of times each attribute occurs in each review '''
        words = self.word_extraction(review)  # returns cleaned list of words in review
        bag_vector = np.zeros(len(self.attr))  # creates a vector of zeros for the number of attributes

        for w in words:  # for each word in the review
            for i, word in enumerate(self.attr):  # for each attribute
                if word == w:  # if the word is an attribute
                    bag_vector[i] = 1  # increment index of attribute
        return np.array(bag_vector)

    def word_extraction(self, sentence):
        ''' returns list of words in review after removing specific stop words, punctuation, and digits'''
        # ignore = ['a', "the", "is"] #remove these words
        words = re.sub("[^\w]", " ", sentence).split()  # remove punctuation and digits
        cleaned_text = [w.lower() for w in words if w not in stop]
        return cleaned_text
    
        
    def word_window(self,string,n,keyword):
    
        '''Searches for keyword in text and returns n words on either side of it as a tuple'''
    
    
        '''splitting words seperated by spaces, want to contain all capitalization and 
        punctuation as it contains sentiment, also removing all stopwords'''

        string_tokens = string.split() 
        string_no_sw = [word for word in string_tokens if not word in stop] 
    
        #empty list to store words before and after keyword
        window = ''
    
        #identifying if keyword in string and then grabbing 3 words before and 3 words after
        for i in range(len(string_no_sw)):
            word = string_no_sw[i]
            if word == keyword:
            
                #words that appear before keyword
                while (i-n) >= 0 and n >= 1:
                    window += string_no_sw[i-n] + ' '
                    n = n - 1
            
                #words that appear after keyword
                while (i+n) <= (len(string_no_sw)-1) and n <= 3:
                    window += string_no_sw[i+n] + ' '
                    n = n + 1
                
        return window

    def sentiment_score(self,review_column,attribute):

        '''  this function finds the sentiment for a window of words surrounding each attribute in each review'''
        
       
        analyzer = SentimentIntensityAnalyzer() #creating sentiment-intensity-analyzer object
        review_index = 0 #keeps track of index of review
        attribute_score = []
        
        for rev in review_column:
        
            
            has_attributes = False #variable to check if review contains attributes
    
            for word in rev.split(): #for each word in the review
            
                word2 = word.lower() #only grab lower case to identify presence of attribute
     
           
                if word2 == attribute: #if word is an attribute
                    has_attributes = True #we know this review contains an attribute 
                    a = word2
                    window = self.word_window(rev,3,word) #grabbing window of 3 surrounding words in each direction 
                    score = analyzer.polarity_scores(window) #grab the sentiment score of this window of words
                    score = score['compound'] #grab the total compound score
                    #this grabs multiple mentions of same attribute in one review
                    attribute_score.append(score)
                    
            if has_attributes == False: # if rev contained no attributes.. update as np.nan for all attr to keep review in df 
                attribute_score.append(np.nan)
            review_index += 1 #increments review index     
        
        attribute_score = pd.Series(attribute_score)
        return attribute_score
    
    def eval_scores(self,df,sim_score_col):
        attr1 =str(self.attr[0])
        attr2 = str(self.attr[1])
        attr3 = str(self.attr[2])
    
        '''evaluation scores for regular cosine similarity'''
        df[attr1 + ' cosine eval score'] = (df[sim_score_col] + df[attr1 + ' sentiment score']) / 2
        df[attr2 + ' cosine eval score'] = (df[sim_score_col] + df[attr2 + ' sentiment score']) / 2
        df[attr3 + ' cosine eval score'] = (df[sim_score_col] + df[attr3 + ' sentiment score']) / 2
    
        df['Overall_eval_score'] = (df[sim_score_col] + df[attr1 + ' cosine eval score'] + \
                                        df[attr2 + ' cosine eval score'] + df[attr3 + ' cosine eval score']) / 4
    
        df = df.fillna(0)
        df = df.sort_values(by='Overall_eval_score', ascending=False)
        return df
    


    def Run(self,review_col,df,prod_name_col): 
        df['Similarity_Score'] = self.CosSim(review_col) #returns column w similarity score
        for i in range(len(self.attr)):
            attr =str(self.attr[i])
            df[attr + ' sentiment score'] = self.sentiment_score(review_col,attr)
        #grouping by mean scores
        df = df.groupby([prod_name_col])[[self.attr[0]+ ' sentiment score',self.attr[1]+ ' sentiment score',self.attr[2]+ ' sentiment score','Similarity_Score']].mean()
       
        #calculating overal evaluation scores
        df = self.eval_scores(df,'Similarity_Score')
        df = df.sort_values(by='Overall_eval_score', ascending=False)
        prod1 = df.index[0]
        prod2 = df.index[1]
        prod3 = df.index[2]
        rec = "Based upon your preferences, we recommend the following 3 beers: " + prod1 + ', ' + prod2 + ', and ' + prod3 + '!'
        print(rec)
        return df
 



## Finding the recommendations based upon input attributes

In [75]:
a = Recommender(['fruit','citrus','smooth'])
new = a.Run(beer_reviews['product_review'], beer_reviews,'product_name') 

Based upon your preferences, we recommend the following 3 beers: Emerald Grouper, 3rd Anniversary Imperial IPA, and Swish!


Their overall scores can be observed below:

In [78]:
new[0:3]

Unnamed: 0_level_0,fruit sentiment score,citrus sentiment score,smooth sentiment score,Similarity_Score,fruit cosine eval score,citrus cosine eval score,smooth cosine eval score,Overall_eval_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Emerald Grouper,0.551067,0.61705,0.384533,0.418486,0.484776,0.517768,0.401509,0.455635
3rd Anniversary Imperial IPA,0.7351,0.5106,0.1591,0.40158,0.56834,0.45609,0.28034,0.426587
Swish,0.18872,0.41808,0.7269,0.365542,0.277131,0.391811,0.546221,0.395176
