# Beer Recommender System
This notebook implements data preprocessing and modeling techniques to create a beer recommender system. I

In [1]:
#Imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
import pickle




### Preprocessing Functions
These functions clean the dataset by handling duplicates, missing values, and incorrect formats. They prepare the data for splitting and analysis.

In [2]:
def preprocess_data(df):
    df_filtered = df.drop_duplicates(["name", "reviewer", "review_text"]) # Remove duplicate entries
    print("Size after drop_duplicates: ", len(df_filtered))
    
    df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN
    df_filtered = df_filtered.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN
    print("Size after drop rating NA: ", len(df_filtered))
    
    df_filtered['abv'] = pd.to_numeric(df_filtered['abv'].str.rstrip('%'), errors='coerce') 
    df_filtered = df_filtered.dropna(subset=['abv'])
    print("Size after drop abv NA: ", len(df_filtered))
    
    return df_filtered

def create_test_train(df, reviewer_col="reviewer", random_state=7, test_size=100, mask_percentage=0.10):
    """
    Splits a dataset into training and test sets, masking a portion of test set entries.
    
    Parameters:
    - df (pd.DataFrame): The dataset to split.
    - reviewer_col (str): The column name containing reviewer IDs.
    - random_state (int): The random state for reproducibility.
    - test_size (int): The number of reviewers to sample for the test set.
    - mask_percentage (float): The percentage of beers to mask for each reviewer in the test set.

    Returns:
    - df_train (pd.DataFrame): The training set.
    - df_test_masked (pd.DataFrame): The test set with masked entries.
    """
    # Randomly sample reviewers
    sampled_reviewers = df[reviewer_col].sample(n=test_size, random_state=random_state)
    
    # Get reviews from the sampled reviewers
    df_test = df[df[reviewer_col].isin(sampled_reviewers)]
    
    # Group by reviewer to get each user's beers
    df_test_grouped = df_test.groupby(reviewer_col)
    
    # Randomly mask a percentage of beers for each reviewer
    test_set_masked = []
    for reviewer, group in df_test_grouped:
        # Calculate how many beers to mask
        num_to_mask = max(int(len(group) * mask_percentage), 1)
        
        # Sample the calculated number of beers
        masked_group = group.sample(n=num_to_mask, random_state=random_state)
        test_set_masked.append(masked_group)
    
    # Combine masked reviews into a single DataFrame
    df_test_masked = pd.concat(test_set_masked)
    
    # Remove masked reviews from the training data
    df_train = df.drop(df_test_masked.index)
    
    # Display dataset summaries
    print("\n### Dataset Summary ###")
    print(f"Total reviewers sampled: {len(sampled_reviewers)}")
    print(f"Training set size: {df_train.shape}")
    print(f"Test set size: {df_test_masked.shape}")
    
    return df_train, df_test_masked

In [3]:
# Load data and preprocess
df = pd.read_pickle('encoded_beers_SBERT.pkl')

df_filtered = preprocess_data(df)
del df

Size after drop_duplicates:  1157819


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN


Size after drop rating NA:  1157807
Size after drop abv NA:  1154739


In [4]:
df_filtered.head()

Unnamed: 0,id,name,brewery,subgenre,abv,location,rating,average_rating,reviewer,review_date,review_text,algorithm_rating,total_reviews,sbert_embedding
0,1,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,🇯🇪Jersey,3.5,2.99,Jerseyislandbeer,"December 14, 2023",330ml can from Shoprite in Livingstone. At hom...,28.0,11,"[0.037878353, 0.00593541, 0.0062317043, -0.011..."
1,2,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇬🇧Ipswich, England",3.2,2.99,Grumbo,"February 28, 2022","18/2/2022. Can sample courtesy of fonefan, che...",28.0,11,"[-0.037820198, -0.044825517, 0.07764052, 0.065..."
2,3,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇸🇪Tyresö, Sweden",3.5,2.99,omhper,"February 19, 2022","--Sample, thanks fonefan! -- Hazy deep golden,...",28.0,11,"[0.056960188, -0.00059301173, 0.11057871, 0.02..."
3,4,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇫🇮Vasa, Finland",2.8,2.99,oh6gdx,"January 31, 2022","Panda from a can, thanks fonefan!. Golden colo...",28.0,11,"[0.003549767, -0.010705345, 0.02083684, 0.0106..."
4,6,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇩🇰Haderslev, Denmark",2.6,2.99,martin00sr,"January 8, 2022","Can @Ulfborg. Cloudy amber, white head. Malty ...",28.0,11,"[-0.01005388, -0.02942978, 0.0016338513, 0.017..."


In [5]:
df_train, df_test_masked = create_test_train(df_filtered)



### Dataset Summary ###
Total reviewers sampled: 100
Training set size: (1149910, 14)
Test set size: (4829, 14)


In [6]:
# Get count of each unique beer in the training set
beer_counts = df_test_masked['name'].value_counts()
print(beer_counts)

name
Maredsous 8 Brune / Bruin              11
New Glarus Wisconsin Belgian Red       10
AleSmith Speedway Stout                10
Moinette Biologique                    10
Russian River Pliny the Elder          10
                                       ..
Real Ale Devils Backbone                1
Bockor Pils                             1
Des Vignes Vent d'Ange                  1
Birrificio del Ducato Wedding Rauch     1
Harvest Moon Full Moon Pale Ale         1
Name: count, Length: 2864, dtype: int64


In [7]:
# Create dataframe for retrieving beer information
beer_info = df_train[['name', 'abv', 'subgenre']]

# Drop duplicate rows based on the 'name' column (i.e. beers)
beer_info = beer_info.drop_duplicates(subset='name')

beer_info.set_index('name', inplace=True)

### Create locality-sensitive hashing (LSH)

In [8]:
# Initialize a variable to store the model
sbert_model = None

def encode_sbert(query, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Encodes a query using SBERT. Loads the model if not already loaded.
    
    Parameters:
        query (str or list of str): The query or list of queries to encode.
        model_name (str): The name of the SBERT model to load (default is 'all-MiniLM-L6-v2').
    
    Returns:
        numpy.ndarray: The embedding(s) for the input query/queries.
    """
    global sbert_model  # Use the global variable to store the model
    
    # Load the model if it's not already loaded
    if sbert_model is None:
        sbert_model = SentenceTransformer(model_name)
    
    # Encode the query and return the embeddings
    return sbert_model.encode(query)

def generate_hyperplanes(dim, num_hash_functions):
    """
    Generate random hyperplanes for hash functions.
    
    Parameters:
    - dim: Dimensionality of the embeddings.
    - num_hash_functions: Number of hash functions per table.
    
    Returns:
    - A matrix of shape (num_hash_functions, dim) where each row is a hyperplane.
    """
    return np.random.randn(num_hash_functions, dim)

def hash_vectors(vectors, hyperplanes):
    """
    Hash a batch of vectors using a set of hyperplanes.

    Parameters:
    - vectors: Input vectors (2D array of shape [n_samples, d]).
    - hyperplanes: Matrix of hyperplanes (2D array of shape [k, d]).

    Returns:
    - A matrix of binary hash values (shape [n_samples, k]).
    """
    # Compute dot products and return binary hash values
    return (np.dot(vectors, hyperplanes.T) > 0).astype(int)

class LSHVectorized:
    def __init__(self, d, k, L):
        """
        Initialize the LSH scheme with vectorized support.

        Parameters:
        - d: Dimensionality of the input vectors.
        - k: Number of hash functions per table.
        - L: Number of hash tables.
        """
        self.L = L
        self.tables = [defaultdict(list) for _ in range(L)]
        self.hyperplanes = [generate_hyperplanes(d, k) for _ in range(L)]

    def add_vectors(self, vectors, identifiers):
        """
        Add a batch of vectors to the LSH index.

        Parameters:
        - vectors: Input vectors (2D array of shape [n_samples, d]).
        - identifiers: A list of unique identifiers for the vectors.
        """
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all vectors at once
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Add vectors to their corresponding buckets
            for identifier, key in zip(identifiers, hash_keys):
                table[key].append(identifier)

    def query(self, vectors):
        """
        Query the LSH index to find similar items for a batch of vectors.

        Parameters:
        - vectors: Query vectors (2D array of shape [n_samples, d]).

        Returns:
        - A list of sets, where each set contains the candidates for a query vector.
        """
        candidates = [set() for _ in range(len(vectors))]
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all query vectors
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Retrieve candidates for each query
            for i, key in enumerate(hash_keys):
                candidates[i].update(table.get(key, []))
        return candidates

In [9]:
def evaluate_lsh(lsh, data, ground_truth, top_k=10):
    retrieved_neighbors = []
    for query in data:
        neighbors = lsh.query(query, top_k=top_k)
        retrieved_neighbors.append(neighbors)
    
    # Compute recall and precision
    recall = sum(len(set(ground_truth[i]) & set(retrieved_neighbors[i])) / len(ground_truth[i])
                 for i in range(len(data))) / len(data)
    precision = sum(len(set(ground_truth[i]) & set(retrieved_neighbors[i])) / len(retrieved_neighbors[i])
                    for i in range(len(data))) / len(data)
    
    return {"recall": recall, "precision": precision}


In [10]:
vectors = np.vstack(df_train["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers = df_train.index.tolist()  # Use review IDs as identifiers

vectors_test = np.vstack(df_test_masked["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers_test = df_test_masked.index.tolist()  # Use review

In [141]:
## Run LSH ##
d = 384
k = 7
L = 50

lsh = LSHVectorized(d, k, L)

In [None]:
# Add vectors to the LSH index
lsh.add_vectors(vectors, identifiers)

### Create Collaborative Filtering (CF)

In [None]:

def predict_ratings_user_based(user_item_matrix, similarity_matrix):
   
    """
    this function predicts the ratings for the user_item_matrix using the similarity_matrix
    

    Parameters: 
    
    - user_item_matrix (DataFrame): User-item matrix with ratings centered around the user mean.
    - similarity_matrix (DataFrame): User-user similarity matrix.
    
    Returns:
        - pred (DataFrame): Predicted ratings for all user-item pairs.
    
    """


    # Compute predictions
    similarity_sum = np.abs(similarity_matrix).sum(axis=1)[:, None]
    pred = np.dot(similarity_matrix, user_item_matrix) / (similarity_sum + 1e-8)



    return pred


def collaborative_filtering(df):
    
    """
    Predicts user ratings for items using user-based collaborative filtering with cosine similarity. 
    Preprocesses the input data to create a centered user-item matrix, computes user similarities, 
    and generates predicted ratings.

    Parameters:
    - df (DataFrame): Input data with 'reviewer', 'name', and 'rating' columns.

    Returns:
    - pr_df (DataFrame): Predicted ratings for all user-item pairs.

    """


    user_item_matrix = df.pivot_table(
    index="reviewer",     # Rows: Reviewers
    columns="name",       # Columns: Beer names
    values="rating",      # Values: Ratings
    fill_value=0          # Fill missing ratings with 0
    )
    

    user_item_np = np.where(user_item_matrix != 0, (user_item_matrix - 3) / 2, 0)
    user_item_matrix = pd.DataFrame(user_item_np, index=user_item_matrix.index, columns=user_item_matrix.columns)


    # Compute cosine similarity
    cosine_similarity_matrix = cosine_similarity(user_item_matrix)
    
    # Predict ratings
    predicted_ratings = predict_ratings_user_based(user_item_matrix, cosine_similarity_matrix)

    df_out = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)
   
    return df_out


pickle_file = 'collab_df.pkl'

# Check if the pickle file exists
if os.path.exists(pickle_file):
    # Load the DataFrame from the pickle file
    with open(pickle_file, 'rb') as f:
        collab_df = pickle.load(f)
    print("Loaded collab_df from pickle.")
else:
    # Generate the DataFrame
    collab_df = collaborative_filtering(df_train)
    # Save it to a pickle file
    with open(pickle_file, 'wb') as f:
        pickle.dump(collab_df, f)
    print("Generated collab_df and saved to pickle.")



Loaded collab_df from pickle.


In [None]:
# Caculate fraction of beers per user which is zero
zero_percentage = (collab_df == 0).mean(axis=1) * 100
zero_percentage.describe()

count    33531.000000
mean         4.749092
std         13.940267
min          0.034176
25%          0.128161
50%          0.375940
75%          1.768626
max        100.000000
dtype: float64

### Review Term Explanations:

In [None]:

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.feature_extraction.text import CountVectorizer

# Define reference context for flavor-related words. The first 20 is from a aromatic kit used for sommeliers, the rest is ai-generated.
context_words = [
    "bitter", "sweet", "salt", "sour", "umami",
    "lemon", "grapefruit", "apple", "pear", "blackcurrant", "prune", "melon", 
    "banana", "acacia", "rose", "cut grass", "hay", "bay leaf", "thyme", 
    "tomato", "pepper", "nutmeg", "clove", "bread", "butter", "vanilla", 
    "hazelnut", "toast", "malt", "caramel", "honey", "coffee", "licorice",
    "pine", "grass", "resin", "floral", "perfume", "incense", "cinnamon",
    "ginger", "anise", "nut", "almond", "walnut", "chestnut", "peanut",
    "soy", "mushroom", "earth", "dust", "wood", "barnyard", "horse",
    "wet", "dry", "metallic", "sulfur", "fish", "cheese", "butter",
    "cream", "leather", "silk", "rubber", "barnyard", "ammonia",
    "rotten", "acid"
]

custom_stop_words = ["beer", "beers", "bottle", "taste", "nice", "aroma", "like", "good", "great", "head", "flavor", "flavors", "flavour", "flavours", "brew", "can"]
context_embeddings = encode_sbert(context_words)

# Function to filter terms dynamically
def is_flavor_related(term, context_embeddings, threshold=0.35):
    term_embedding = sbert_model.encode([term])[0]
    cosine_similarity = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    max_similarity = max(cosine_similarity(term_embedding, context) for context in context_embeddings)
    return max_similarity > threshold

def plot_bucket(bucket_vectors, cluster_labels, perplexity=30, n_iter=5000, learning_rate=200):
    """
    Visualizes differences within an LSH bucket using t-SNE with configurable parameters.
    
    Args:
        bucket_vectors (np.ndarray): High-dimensional vectors of beers in the bucket.
        cluster_labels (np.ndarray): Cluster labels assigned to each vector.
        subgenres (np.ndarray): Subgenre or categorical labels for each beer.
        perplexity (int): The t-SNE perplexity parameter, balancing local/global data views.
        n_iter (int): Number of iterations for t-SNE optimization.
        learning_rate (float): Learning rate for t-SNE optimization.
    """
    # t-SNE reducer with tuned parameters
    reducer = TSNE(
        n_components=2, 
        random_state=42, 
        perplexity=perplexity, 
        n_iter=n_iter, 
        learning_rate=learning_rate
    )
    reduced_vectors = reducer.fit_transform(bucket_vectors)
    
    # Plotting
    plt.figure(figsize=(16, 10))
    scatter = plt.scatter(
        reduced_vectors[:, 0],
        reduced_vectors[:, 1],
        c=cluster_labels,
        cmap='plasma',
        alpha=0.7
    )
    plt.colorbar(scatter, label='Cluster Label')
    plt.title(f"t-SNE Visualization (Perplexity={perplexity}, n_iter={n_iter}, LR={learning_rate})")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.grid(alpha=0.3)
    plt.show()

def getThemes(df_filtered, beer_name, query_embedding, limit=100):
    # Initialize CountVectorizer with custom stopwords
    default_stop_words = CountVectorizer(stop_words='english').get_stop_words()
    all_stop_words = list(set(default_stop_words).union(custom_stop_words))

    # Pass the combined stop words to CountVectorizer
    vectorizer = CountVectorizer(max_features=100, stop_words=all_stop_words, token_pattern=r'\b[a-zA-Z]{2,}\b')
    df_beer = df_filtered[df_filtered["name"] == beer_name]
    # Extract top terms from cluster reviews
    term_matrix = vectorizer.fit_transform(df_beer["review_text"])
    terms = vectorizer.get_feature_names_out()
    term_counts = np.array(term_matrix.sum(axis=0)).flatten()
    top_terms = [terms[i] for i in term_counts.argsort()[-limit:]]  # Top 5 terms
    filtered_top_terms = [term for term in top_terms if is_flavor_related(term, context_embeddings)]
    
    # Return the terms most similar to the query
    term_embeddings = np.vstack([encode_sbert(term) for term in filtered_top_terms])
    
    # Calculate cosine similarity between query and terms
    similarities = cosine_similarity(query_embedding, term_embeddings)[0]
    
    # Create a DataFrame to store terms and their similarities
    term_similarity_df = pd.DataFrame({
        'term': filtered_top_terms,
        'similarity': similarities
    })
    
    # Sort terms by similarity to the query
    term_similarity_df = term_similarity_df.sort_values(by='similarity', ascending=False)
    
    # Return the top similar themes
    return term_similarity_df['term'].head(10).tolist()

def compute_bucket_score(bucket_data, query_embedding, user_name, beta, abv_desired, beer_info, style_desired):
    bucket_data = bucket_data.drop_duplicates(subset="name").reset_index(drop=True)
    # Calculate LSH scores for each beer in the bucket
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    sims = cosine_similarity(query_embedding, bucket_vectors)[0]
    
    # Add LSH scores to the bucket data
    bucket_data["LSH_score"] = sims
    
    # Collaborative filtering scores for each beer
    collab_scores = collab_df.loc[user_name, bucket_data["name"].values]
    bucket_data["collab_score"] = collab_scores.values

    # ABV weight calculation for each beer
    if abv_desired:
        abv_values = beer_info.loc[bucket_data["name"], "abv"]
        alpha = 0.05
        if abv_desired == 0:
            abv_weights = -2 * abs(abv_values - abv_desired)
        else:
            abv_weights = -alpha * ((abv_values - abv_desired)**2) / (abv_desired**1.5 + 1)
        bucket_data["ABV_weight"] = abv_weights.values
    else:
        bucket_data["ABV_weight"] = 0

    # Style bonus for each beer
    if style_desired:
        style_match = beer_info.loc[bucket_data["name"], "subgenre"] == style_desired
        bucket_data["style_bonus"] = style_match.astype(float) * 0.05
    else:
        bucket_data["style_bonus"] = 0

    # Weighted score calculation for each beer
    bucket_data["score"] = (
        beta * bucket_data["LSH_score"] +
        (1 - beta) * bucket_data["collab_score"] +
        bucket_data["ABV_weight"] +
        bucket_data["style_bonus"]
    )

    # Return the final bucket score DataFrame
    bucket_score = bucket_data[[
        "name", "LSH_score", "collab_score", "ABV_weight", "style_bonus", "score"
    ]].rename(columns={"name": "beer"})

    return bucket_score

## Make reccomendations

In [None]:
def recommend_beer_B(query_embedding, df_train, user_name, beer_info, abv_desired=None, style_desired=None, n_clusters=15):
    # Query the LSH index
    candidates = lsh.query(query_embedding)

    # Filter bucket vectors and metadata
    bucket_data = df_train[df_train["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values  # Adjust column name as necessary

    # Perform clustering on bucket vectors
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    
    cluster_labels = kmeans.fit_predict(bucket_vectors)
    
    # Assign query to the nearest cluster
    query_cluster = kmeans.predict(query_embedding)[0]

    # Filter beers in the same cluster as the query
    cluster_indices = np.where(cluster_labels == query_cluster)[0]
    cluster_vectors = bucket_vectors[cluster_indices]
    cluster_beers = bucket_data.iloc[cluster_indices]
    
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, cluster_vectors)[0]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    
    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()] # Get CF_score for user for the beers in cluster

    if abv_desired:
        # Penalise difference in abv by using a nonlinear function penalising greater differences more
        abv = beer_info.loc[LSH_score.index.tolist()]["abv"]

        alpha = 0.05
        if abv_desired == 0:
            abv_weight = -2 * abs(abv - abv_desired)
        else:
            abv_weight = -alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
    else:
        abv_weight = 0

    beta = 0.6 + 0.004*zero_percentage.loc[user_name] # Linear function to scale beta such that persons with few reviews rely more on LSH and vice versa
    
    # Add bonus for match in style
    style_bonus = np.zeros(len(LSH_score))
    
    if style_desired:
        relevant_styles = beer_info.loc[LSH_score.index.tolist()]["subgenre"]
        style_mask = relevant_styles == style_desired
        style_bonus[style_mask] = 0.05
    
    
    # Combine weights and scores
    weighted_score = (
        beta * LSH_score +
        (1-beta) * collab_filtering_scores +
        abv_weight +
        style_bonus
    )

    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': LSH_score.index,
        'score': weighted_score,
        'abv': abv.values,
        'LSH_score': LSH_score.values,
        'LSH_score_weighted': LSH_score.values * beta,
        'collab_score': collab_filtering_scores.values,
        'collab_score_weighted': collab_filtering_scores.values * (1-beta),
        'abv_diff': abs(abv.values - abv_desired),
        'abv_weight': abv_weight.values,
        'Style bonus': style_bonus,
        'Weight beta': beta
    })
    
    bucket_weighted_score = compute_bucket_score(bucket_data, query_embedding, user_name, beta, abv_desired, beer_info, style_desired)
    
    # Get the 10 beers with the highest weighted scores
    beer_weighted_score = beer_weighted_score.sort_values(by='score', ascending=False)
    
    # Apply getThemes to each beer in the DataFrame 
    #beer_weighted_score['notes'] = beer_weighted_score['beer'].apply(lambda x: getThemes(df_train, x, query_embedding))
    beer_weighted_score['notes'] = ""
    
    return beer_weighted_score.sort_values(by='score', ascending=False), bucket_weighted_score.sort_values(by='score', ascending=False)


In [None]:
# Create a query
test_query = "Light, refreshing bitter beer with a orange taste"
user_name = "Jerseyislandbeer"
query_embedding = encode_sbert(test_query).reshape(1, -1)
beer_recommendations, bucket_recommendations = recommend_beer_B(query_embedding, df_train, user_name, beer_info, abv_desired=7)

print("Top 5 recommended beers:")
# Set max column width to display full array
pd.set_option('display.max_colwidth', None)

# Display the DataFrame
display(beer_recommendations.head(10))
display(bucket_recommendations.head(10))


Top 5 recommended beers:


In [None]:
# Hide warnings
import warnings
## Run LSH ##
d = 384
k_values = [3, 5, 7, 10]
L_values = [10, 20, 30, 40, 50]
average_ratings = []
num_true_rec_result = []
num_true_bucket_result = []

for L in L_values:
    for k in k_values:
        average_ranking_rec = 0
        average_ranking_bucket = 0
        
        lsh = LSHVectorized(d, k, L)
        lsh.add_vectors(vectors, identifiers)
        warnings.filterwarnings('ignore')

        num_true_bucket = 0
        num_true_recommendations = 0

        test_length = 100

        for i in range(test_length):
            review_row = df_test_masked.iloc[i]
            
            real_beer = review_row["name"]
            query = review_row["sbert_embedding"].reshape(1, -1)
            user = review_row["reviewer"]
            abv_desired = review_row["abv"]
            style_desired = review_row["subgenre"]

            beer_recommendations, bucket = recommend_beer_B(query_embedding=query, df_train=df_train, user_name=user, beer_info=beer_info, abv_desired=abv_desired, style_desired=style_desired, n_clusters=5)
            
            # Reset the index of beer_recommendations
            beer_recommendations.reset_index(drop=True, inplace=True)
            bucket.reset_index(drop=True, inplace=True)
            
            if real_beer in bucket["beer"].to_list():
                num_true_bucket += 1
                average_ranking_bucket += bucket[bucket["beer"] == real_beer].index[0] / len(bucket)
                #print("Ranking in bucket", bucket[bucket["beer"] == real_beer].index[0], "/", len(bucket))
            
            if real_beer in beer_recommendations["beer"].to_list():
                num_true_recommendations += 1
                average_ranking_rec += beer_recommendations[beer_recommendations["beer"] == real_beer].index[0] / len(beer_recommendations)
                #print("Ranking in recommended", beer_recommendations[beer_recommendations["beer"] == real_beer].index[0], "/", len(beer_recommendations))
                
        print("Number of true bucket: ", num_true_bucket, "/", test_length)
        print("Number of true recommendations: ", num_true_recommendations, "/", test_length)
        print("Average ranking in bucket: ", average_ranking_bucket / num_true_bucket)
        print("Average ranking in recommendations: ", average_ranking_rec / num_true_recommendations)
        
        average_ratings.append((k, L, average_ranking_bucket / num_true_bucket, average_ranking_rec / num_true_recommendations))
        num_true_rec_result.append((k, L, num_true_recommendations))
        num_true_bucket_result.append((k, L, num_true_bucket))
        
        
        
        


Ranking in bucket 5261 / 7728
Ranking in recommended 2 / 3538
Ranking in bucket 1698 / 7682
Ranking in recommended 81 / 3514
Ranking in bucket 5545 / 7652
Ranking in recommended 37 / 5708
Ranking in bucket 5998 / 7747
Ranking in recommended 16 / 6209
Ranking in bucket 5620 / 7685
Ranking in recommended 39 / 3597
Ranking in bucket 1018 / 7694
Ranking in recommended 4 / 3650
Ranking in bucket 5554 / 7709
Ranking in recommended 59 / 5720
Ranking in bucket 7726 / 7737
Ranking in recommended 37 / 3638
Ranking in bucket 1566 / 7708
Ranking in recommended 129 / 5831
Ranking in bucket 1041 / 7731
Ranking in recommended 133 / 3727
Ranking in bucket 5567 / 7644
Ranking in recommended 0 / 3752
Ranking in bucket 7318 / 7656
Ranking in recommended 14 / 5693
Ranking in bucket 681 / 7705
Ranking in recommended 69 / 5876
Ranking in bucket 7425 / 7670
Ranking in recommended 45 / 3594
Ranking in bucket 6170 / 7564
Ranking in recommended 27 / 3312
Ranking in bucket 2849 / 7711
Ranking in recommended 309 

KeyboardInterrupt: 

# TESTTEST

In [None]:
def recommend_beer(query_embedding, df_train, user_name, abv_desired, n_clusters=15):    
    # Query the LSH index
    candidates = lsh.query(query_embedding)

    # Filter bucket vectors and metadata
    bucket_data = df_train[df_train["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values  # Adjust column name as necessary
    
    # Perform clustering on bucket vectors
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
    cluster_labels = kmeans.fit_predict(bucket_vectors)
    
    # Assign query to the nearest cluster
    query_cluster = kmeans.predict(query_embedding)[0]
    
    perplexities = [50]
    n_iters = [10000]
    learning_rates = [100, 200]
    
    param_combinations = [(p, n, lr) for p in perplexities for n in n_iters for lr in learning_rates]
    
    #for perplexity, n_iter, learning_rate in param_combinations:
    #    plot_bucket(bucket_vectors, cluster_labels, subgenres, perplexity, n_iter, learning_rate)
        
    # Filter beers in the same cluster as the query
    cluster_indices = np.where(cluster_labels == query_cluster)[0]
    cluster_vectors = bucket_vectors[cluster_indices]
    cluster_beers = bucket_data.iloc[cluster_indices]
    
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, cluster_vectors)[0]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    
    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()]
    
    # Penalise difference in abv by using a nonlinear function penalising greater differences more
    abv = beer_info.loc[LSH_score.index.tolist()]["abv"]

    alpha = 0.05
    if abv_desired == 0:
        abv_weight = -2 * abs(abv - abv_desired)
    else:
        abv_weight = -alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
    

    # Combine weights and scores
    weighted_score = (
        0.85 * LSH_score +
        0.15 * collab_filtering_scores +
        abv_weight
    )
    
    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': LSH_score.index,
        'score': weighted_score,
        'abv': abv.values,
        'LSH_score': LSH_score.values,
        'LSH_score_weighted': LSH_score.values * 0.85,
        'collab_score': collab_filtering_scores.values,
        'collab_score_weighted': collab_filtering_scores.values * 0.15,
        'abv_diff': abs(abv.values - abv_desired),
        'abv_weight': abv_weight.values
    })
    
    # Remove index of weighted score and keep the beer name as a column
    beer_weighted_score.reset_index(drop=True, inplace=True)
    
    # Get the 10 beers with the highest weighted scores
    beer_weighted_score = beer_weighted_score.sort_values(by='score', ascending=False).head(10)
    
    # Apply getThemes to each beer in the DataFrame
    beer_weighted_score['notes'] = beer_weighted_score['beer'].apply(lambda x: getThemes(df_train, x, query_embedding))


    return beer_weighted_score.sort_values(by='score', ascending=False)
    

# Create a query
test_query = "Light, refreshing bitter beer with a orange taste"
user_name = "Jerseyislandbeer"
query_embedding = encode_sbert(test_query).reshape(1, -1)
beer_recommendations= recommend_beer(query_embedding, df_train, user_name, 7)

print("Top 5 recommended beers:")
# Set max column width to display full array
pd.set_option('display.max_colwidth', None)

# Display the DataFrame
display(beer_recommendations.head(10))




Top 5 recommended beers:


Unnamed: 0,beer,score,abv,LSH_score,LSH_score_weighted,collab_score,collab_score_weighted,abv_diff,abv_weight,notes
129,Wacken Beer Walkürenschluck,0.60122,7.8,0.709152,0.60278,0.000534,8e-05,0.8,-0.001639,"[orange, bitter, citrus, sweetness, alcohol, bitterness, alcoholic, fruit, spicy, fruity]"
55,Jabeerwocky / Profesja Basista,0.59633,5.9,0.705066,0.599306,0.000823,0.000123,1.1,-0.003099,"[orange, bitter, citrus, sweetness, bitterness, sour, drinkable, grapes, fruit, spicy]"
34,De Ranke Cuvée De Ranke,0.595826,7.0,0.695959,0.591565,0.028405,0.004261,0.0,-0.0,"[orange, bitter, citrus, sweetness, ale, wine, bitterness, grapefruit, aromas, sour]"
45,Gypsy Inc Gyp Wit,0.589277,4.7,0.708873,0.602542,0.001899,0.000285,2.3,-0.01355,"[orange, oranges, bitter, citrus, citrusy, sweetness, brewdog, bitterness, grapefruit, drinking]"
42,Galway Bay / Begyle Brewing Goodbye Blue Monday,0.585319,6.6,0.688692,0.585388,0.002269,0.00034,0.4,-0.00041,"[orange, bitter, citrus, ipa, bitterness, grapefruit, oatmeal, tasty, fruit, fresh]"
125,Trolden Railroad Rye,0.575663,5.1,0.688169,0.584944,-0.000227,-3.4e-05,1.9,-0.009247,"[orange, oranges, bitter, citrus, sweetness, sour, fruit, bottled, fresh, fruity]"
15,Bayerischer Bahnhof Original Leipziger Gose,0.569178,4.6,0.68401,0.581408,0.016825,0.002524,2.4,-0.014754,"[orange, bitter, citrus, drink, sour, drinkable, sourness, fruit, spicy, acidic]"
67,Le Trou du Diable La Buteuse Brassin Spécial (Calvados),0.565809,10.0,0.691271,0.587581,0.008543,0.001281,3.0,-0.023053,"[orange, tasting, bitter, citrus, sweetness, alcohol, ale, sour, fruit, spicy]"
43,Giesinger Märzen Festbier,0.555965,5.7,0.658988,0.56014,0.001023,0.000153,1.3,-0.004329,"[orange, bitter, brewery, sweetness, bitterness, drinkable, fruit, mellow, bottled, fruity]"
100,Schneeeule Kennedy,0.555827,3.0,0.701773,0.596507,0.002021,0.000303,4.0,-0.040983,"[orange, citrus, citrusy, bitterness, grapefruit, fruitiness, sour, sourness, sourish, fruit]"


### Notes for improvement
Add better stop-words, flavor, flavour, flavors etc.


### Example:

### Evaluation settup