In [103]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk



In [104]:
# Load the scraped, translated reviews.
data = pd.read_csv('translated_messages.csv')

# Replace missing reviews with an empty string *before* casting to str.
# Casting NaN directly would yield the literal text "nan", which the
# vectorizer would treat as a real word; the sentiment step further down
# already treats missing reviews as "", so this keeps both stages consistent.
text = data['translated_messages'].fillna("").astype(str)

In [105]:
# Prompt the user, in order, for the three attributes to score reviews against.
attribute1, attribute2, attribute3 = (
    input(f"Enter the {ordinal} attribute: ")
    for ordinal in ("first", "second", "third")
)

# Space-separated query string that is vectorized alongside the reviews later.
attributes_combined = f"{attribute1} {attribute2} {attribute3}"

In [106]:
# Echo the three attributes, one per line, to confirm what was entered.
print(attribute1, attribute2, attribute3, sep="\n")

thick
rich
bodied


### Convert Each Message to a Vector

In [107]:
# Number of rows (messages) loaded from the CSV.
print(len(data.index))

8231


In [108]:
# Bag-of-words vectorizer that drops English stop words.
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the messages and count term occurrences per message.
count_matrix = count_vectorizer.fit_transform(raw_documents=text)

In [109]:

# Vocabulary terms, in the same order as the columns of the count matrix.
vocabulary_terms = count_vectorizer.get_feature_names_out()

# Dense copy of the counts: one row per message, one column per term.
count_array = count_matrix.toarray()
df = pd.DataFrame(count_array, columns=vocabulary_terms)
print(df)

      00  000  000th  001  00am  00th  01  010515  011  02  ...  ølbutik  \
0      0    0      0    0     0     0   0       0    0   0  ...        0   
1      0    0      0    0     0     0   0       0    0   0  ...        0   
2      0    0      0    0     0     0   0       0    0   0  ...        0   
3      0    0      0    0     0     0   0       0    0   0  ...        0   
4      0    0      0    0     0     0   0       0    0   0  ...        0   
...   ..  ...    ...  ...   ...   ...  ..     ...  ...  ..  ...      ...   
8226   0    0      0    0     0     0   0       0    0   0  ...        0   
8227   0    0      0    0     0     0   0       0    0   0  ...        0   
8228   0    0      0    0     0     0   0       0    0   0  ...        0   
8229   0    0      0    0     0     0   0       0    0   0  ...        0   
8230   0    0      0    0     0     0   0       0    0   0  ...        0   

      øldage  ølfestival  ølklubben  øllets  østerfælled  über  überragend  \
0        

### Normalize Review Vectors and Attribute Vectors

In [110]:
# --- Review vectors ---------------------------------------------------------
# Euclidean (L2) length of each review's count vector, one value per row.
magnitude = np.linalg.norm(df, axis=1)

# Swap zero lengths for 1 so all-zero rows divide cleanly and stay all-zero.
magnitude = np.where(magnitude == 0, 1, magnitude)

# Scale every row to unit length.
normalized_reviews = df.divide(magnitude, axis="index")

# --- Attribute vector -------------------------------------------------------
# Encode the combined attribute string with the vocabulary learned from the reviews.
attribute_vector = count_vectorizer.transform([attributes_combined]).toarray()

# L2 length of the attribute vector; fall back to 1 when it is zero
# (none of the attribute words produced a count).
attribute_magnitude = np.linalg.norm(attribute_vector) or 1

# Scale the attribute vector to unit length.
normalized_attribute_vector = attribute_vector / attribute_magnitude



### Calculate Cosine Similarity Between Each Review and the Attribute Vector

In [111]:
# One similarity value per review against the single attribute vector;
# the (n_reviews, 1) result is flattened to a 1-D array.
similarity_scores = cosine_similarity(
    X=normalized_reviews,
    Y=normalized_attribute_vector,
).flatten()


In [112]:
# Assemble the results table: beer name and review text from the original
# CSV, plus the similarity score computed for each review.
output_df = (
    data[['Beer Name', 'translated_messages']]
    .rename(columns={
        'Beer Name': "product_name",
        'translated_messages': "product_review",
    })
    .assign(similarity_score=similarity_scores)
)

In [113]:
# Persist the similarity results without the row index column.
output_df.to_csv(path_or_buf="review_similarity_scores.csv", index=False)

### Sentiment Analysis Using VADER

In [114]:
# Fetch the VADER lexicon only when it is not already installed, so that
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Sentiment analyzer used to score each review below.
sid = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jennamferguson/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [115]:
def _compound_sentiment(review):
    """Return the VADER 'compound' score for a single review string."""
    return sid.polarity_scores(review)['compound']


# Missing reviews become empty strings and every value is cast to str,
# so the analyzer always receives text.
output_df['product_review'] = output_df['product_review'].fillna("").astype(str)

# Score every review and store the result in a new column.
output_df['sentiment_score'] = output_df['product_review'].map(_compound_sentiment)


In [116]:
# Write the table, now including sentiment scores, to its own CSV (no index column).
output_df.to_csv(path_or_buf="review_similarity_scores_sentiment.csv", index=False)
