In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Input File Names
# `inputfile` supplies the 'Product Name' / 'Product Review' columns and
# `inputfile2` supplies the 'rating' column that later cells read.
inputfile, inputfile2 = 'CDE.bagofwords.csv', 'translated_messages.csv'

# Import Data
# Read both CSVs into DataFrames, in the same order as the file names above.
data, data2 = (pd.read_csv(csv_path) for csv_path in (inputfile, inputfile2))

In [6]:
# Find Top 10 Beers
# Attach each review's rating to its row.
# NOTE(review): this assumes `data2` is row-aligned with `data` (same index) — confirm.
data['Ratings'] = data2['rating']

# Mean rating per product, highest first; keep the names of the ten best.
# The column is selected explicitly before calling .mean(): GroupBy.mean's first
# positional parameter is `numeric_only`, not a column name.
highest_rated = (
    data.groupby('Product Name')['Ratings']
    .mean()
    .sort_values(ascending=False)
    .index[0:10]
    .tolist()
)

In [7]:
# Represent Each Beer As a Single Document
# For every top-rated beer, join all of its reviews into one space-separated string.
fullmessages = [
    data.loc[data['Product Name'] == beer_name, 'Product Review'].str.cat(sep=' ')
    for beer_name in highest_rated
]

# One row per beer: its name next to the concatenated review text.
beer_messages = pd.DataFrame({
    'Beer': highest_rated,
    'Combined Reviews': fullmessages,
})

In [8]:
# Find Cosine Similarity Between Highest Rated Beer and Rest of Top 10 Using Bag of Words Model

def _pair_similarity(vectorizer_cls, documents):
    """Return the cosine similarity between the two texts in ``documents``.

    A fresh ``vectorizer_cls(stop_words='english')`` is fit on this pair only,
    so the vocabulary (and, for TF-IDF, the IDF weights) come from these two
    documents alone.

    Parameters
    ----------
    vectorizer_cls : CountVectorizer or TfidfVectorizer class
        Vectorizer used to turn the texts into term vectors.
    documents : list of str
        Exactly two texts: the target document and the one compared to it.

    Returns
    -------
    float
        Cosine similarity between the first and the second document.
    """
    term_matrix = vectorizer_cls(stop_words='english').fit_transform(documents)
    # cosine_similarity accepts the sparse matrix directly, so no dense copy is
    # needed; entry [0, 1] of the 2x2 result is document 0 vs document 1.
    return cosine_similarity(term_matrix)[0, 1]


# Setting Target Beer
text1 = beer_messages['Combined Reviews'].iloc[0]

# The remaining beers of the top 10, in ranking order.
other_texts = beer_messages['Combined Reviews'].iloc[1:10]

# Calculating Similarity
# Non-Normalized (raw term counts)
similarity_scores = [
    _pair_similarity(CountVectorizer, [text1, text2]) for text2 in other_texts
]

# Normalized (TF-IDF weights)
similarity_scorestfidf = [
    _pair_similarity(TfidfVectorizer, [text1, text2]) for text2 in other_texts
]


In [22]:
# Saving to Dataframe
# Pair the top-rated beer with each of the other beers and attach both scores.
beersimilarities = pd.DataFrame({
    # The target beer's name, repeated once per comparison.
    'Beer 1': highest_rated[:1] * (len(highest_rated) - 1),
    'Beer 2': highest_rated[1:10],
    'Cosine Similarity': similarity_scores,
    'Normalized Similarity': similarity_scorestfidf,
})

# Display the comparisons from most to least similar (by raw-count cosine similarity).
beersimilarities.sort_values(by='Cosine Similarity', ascending=False)

Unnamed: 0,Beer 1,Beer 2,Cosine Similarity,Normalized Similarity
7,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Goose Island Bourbon County Stout - Rare 2010🇺...,0.842742,0.835625
1,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Side Project Beer : Barrel : Time - 2018🇺🇸Stou...,0.774314,0.715586
5,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Cycle / 3 Sons Rare Scooop🇺🇸Stout - Imperial F...,0.652199,0.567901
8,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Superstition Grand Cru Berry - F.O. Barrel Age...,0.485763,0.440068
4,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Sahtipaja MeadMe Batch #2 - Bourbon Vanilla🇸🇪M...,0.47852,0.408192
3,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,B. Nektar Ken Schramm Signature Series - The H...,0.474992,0.380317
6,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Schramm's The Heart of Darkness🇺🇸Mead - Melome...,0.471103,0.375422
2,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Cigar City Pilot Series Dragonfruit Passion Fr...,0.318535,0.231672
0,Toppling Goliath Kentucky Brunch🇺🇸Stout - Impe...,Cigar City Pilot Series Miami Madness🇺🇸Berline...,0.276189,0.187509


# Method & Logic
I chose the ten highest-rated beers for this analysis. I used the single highest-rated beer as the target and computed its cosine similarity with each of the other nine.

For the similarity calculations, I took every review of each beer and joined it into 1 long review. This made it easy to compare reviews using bag of words cosine similarity. 

I chose bag-of-words cosine similarity because, as we saw before, the spaCy similarity was not appropriate for this scenario: it greatly inflated the similarities.

I used both the count vectorizer and the TF-IDF vectorizer to find cosine similarities, to account for the frequency of word usage among reviews. Both vectorizers remove English stop words (`stop_words='english'`); TF-IDF additionally down-weights the remaining words that appear in both documents.

Whether you use the count vectorizer or the TF-IDF vectorizer, you end up with similar results.