In [1]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Read in scraped data file
data = pd.read_csv('beer_comments_final.csv')

# Missing comments are replaced with an empty string before the cast:
# astype(str) alone turns NaN into the literal text "nan", which the
# vectorizer would then count as a real word.
text = data['translated_comment'].fillna('').astype(str)

# Read in user attributes: every line of the file is stripped and the pieces
# are joined into a single space-separated string. The context manager makes
# sure the file handle is closed once the lines have been consumed.
with open("Refreshing", "r") as customer_attributes:
    attributes_combined = " ".join([attr.strip() for attr in customer_attributes])

### Convert Each Message to Vector

In [3]:
# Sanity check: report how many rows were loaded from the CSV
print(data.shape[0])

8696


In [4]:
# Bag-of-words vectorizer that ignores English stop words
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the review text and build the sparse
# document-term count matrix in the same pass
count_matrix = count_vectorizer.fit_transform(raw_documents=text)

In [5]:

# Expand the sparse count matrix into a dense array, then wrap it in a
# DataFrame whose columns are labelled with the vectorizer's feature names
count_array = count_matrix.toarray()
df = pd.DataFrame(
    count_array,
    columns=count_vectorizer.get_feature_names_out(),
)
print(df)

      00  000  001  007074  007286  01  011827  016322  017  02  ...  zn  \
0      0    0    0       0       0   0       0       0    0   0  ...   0   
1      0    0    0       0       0   0       0       0    0   0  ...   0   
2      0    0    0       0       0   0       0       0    0   0  ...   0   
3      0    0    0       0       0   0       0       0    0   0  ...   0   
4      0    0    0       0       0   0       0       0    0   0  ...   0   
...   ..  ...  ...     ...     ...  ..     ...     ...  ...  ..  ...  ..   
8691   0    0    0       0       0   0       0       0    0   0  ...   0   
8692   0    0    0       0       0   0       0       0    0   0  ...   0   
8693   0    0    0       0       0   0       0       0    0   0  ...   0   
8694   0    0    0       0       0   0       0       0    0   0  ...   0   
8695   0    0    0       0       0   0       0       0    0   0  ...   0   

      zoltan  zombier  zone  zoo  zum  zwanze  zwei  zwil  zzt  
0          0        0 

### Normalize Review Vectors and Attribute Vectors

In [6]:
# --- Review vectors ---------------------------------------------------------
# L2 norm of every row of the count DataFrame
magnitude = np.linalg.norm(df, axis=1)

# Rows with zero length get a divisor of 1 so the division below is safe
magnitude = np.where(magnitude == 0, 1, magnitude)

# Scale each row by its own norm
normalized_reviews = df.divide(magnitude, axis="index")

# --- Attribute vector -------------------------------------------------------
# Encode the combined attribute string with the already-fitted vectorizer
attribute_vector = count_vectorizer.transform([attributes_combined]).toarray()

# Norm of the attribute vector; a zero norm falls back to 1 to avoid
# dividing by zero
attribute_magnitude = np.linalg.norm(attribute_vector) or 1

# Scale the attribute vector by its norm
normalized_attribute_vector = np.divide(attribute_vector, attribute_magnitude)



### Calculate Cosine Similarity Between Each Review and the Attribute Vector

In [7]:
# Compare every normalized review row against the single normalized attribute
# vector, then collapse the resulting 2-D array into a flat 1-D array
similarity_scores = cosine_similarity(
    X=normalized_reviews,
    Y=normalized_attribute_vector,
).flatten()


In [8]:
# Assemble the output table: take the beer name and review text from the
# original CSV data, give them their output column names, and append the
# similarity score calculated for each review
output_df = (
    data[['beer_name', 'translated_comment']]
    .rename(columns={
        'beer_name': "product_name",
        'translated_comment': "product_review",
    })
    .assign(similarity_score=similarity_scores)
)

In [9]:
# Save the scored reviews to disk, leaving out the DataFrame index column
output_df.to_csv(
    path_or_buf="review_similarity_scores.csv",
    index=False,
)