# Beer Recommender System
This notebook implements data preprocessing and modeling techniques to create a beer recommender system. I

In [1]:
#Imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


### Preprocessing Functions
These functions clean the dataset by handling duplicates, missing values, and incorrect formats. They prepare the data for splitting and analysis.

In [2]:
def preprocess_data(df):
    df_filtered = df.drop_duplicates(["name", "reviewer", "review_text"]) # Remove duplicate entries
    print("Size after drop_duplicates: ", len(df_filtered))
    
    df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN
    df_filtered = df_filtered.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN
    print("Size after drop rating NA: ", len(df_filtered))
    
    df_filtered['abv'] = pd.to_numeric(df_filtered['abv'].str.rstrip('%'), errors='coerce') 
    df_filtered = df_filtered.dropna(subset=['abv'])
    print("Size after drop abv NA: ", len(df_filtered))
    
    return df_filtered

def create_test_train(df, reviewer_col="reviewer", random_state=7, test_size=100, mask_percentage=0.10):
    """
    Splits a dataset into training and test sets, masking a portion of test set entries.
    
    Parameters:
    - df (pd.DataFrame): The dataset to split.
    - reviewer_col (str): The column name containing reviewer IDs.
    - random_state (int): The random state for reproducibility.
    - test_size (int): The number of reviewers to sample for the test set.
    - mask_percentage (float): The percentage of beers to mask for each reviewer in the test set.

    Returns:
    - df_train (pd.DataFrame): The training set.
    - df_test_masked (pd.DataFrame): The test set with masked entries.
    """
    # Randomly sample reviewers
    sampled_reviewers = df[reviewer_col].sample(n=test_size, random_state=random_state)
    
    # Get reviews from the sampled reviewers
    df_test = df[df[reviewer_col].isin(sampled_reviewers)]
    
    # Group by reviewer to get each user's beers
    df_test_grouped = df_test.groupby(reviewer_col)
    
    # Randomly mask a percentage of beers for each reviewer
    test_set_masked = []
    for reviewer, group in df_test_grouped:
        # Calculate how many beers to mask
        num_to_mask = max(int(len(group) * mask_percentage), 1)
        
        # Sample the calculated number of beers
        masked_group = group.sample(n=num_to_mask, random_state=random_state)
        test_set_masked.append(masked_group)
    
    # Combine masked reviews into a single DataFrame
    df_test_masked = pd.concat(test_set_masked)
    
    # Remove masked reviews from the training data
    df_train = df.drop(df_test_masked.index)
    
    # Display dataset summaries
    print("\n### Dataset Summary ###")
    print(f"Total reviewers sampled: {len(sampled_reviewers)}")
    print(f"Training set size: {df_train.shape}")
    print(f"Test set size: {df_test_masked.shape}")
    
    return df_train, df_test_masked

In [3]:
# Load data and preprocess
df = pd.read_pickle('encoded_beers_SBERT.pkl')

df_filtered = preprocess_data(df)
del df

Size after drop_duplicates:  1157819


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN


Size after drop rating NA:  1157807
Size after drop abv NA:  1154739


In [5]:
df_filtered.head()

Unnamed: 0,id,name,brewery,subgenre,abv,location,rating,average_rating,reviewer,review_date,review_text,algorithm_rating,total_reviews,sbert_embedding
0,1,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,🇯🇪Jersey,3.5,2.99,Jerseyislandbeer,"December 14, 2023",330ml can from Shoprite in Livingstone. At hom...,28.0,11,"[0.037878353, 0.00593541, 0.0062317043, -0.011..."
1,2,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇬🇧Ipswich, England",3.2,2.99,Grumbo,"February 28, 2022","18/2/2022. Can sample courtesy of fonefan, che...",28.0,11,"[-0.037820198, -0.044825517, 0.07764052, 0.065..."
2,3,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇸🇪Tyresö, Sweden",3.5,2.99,omhper,"February 19, 2022","--Sample, thanks fonefan! -- Hazy deep golden,...",28.0,11,"[0.056960188, -0.00059301173, 0.11057871, 0.02..."
3,4,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇫🇮Vasa, Finland",2.8,2.99,oh6gdx,"January 31, 2022","Panda from a can, thanks fonefan!. Golden colo...",28.0,11,"[0.003549767, -0.010705345, 0.02083684, 0.0106..."
4,6,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇩🇰Haderslev, Denmark",2.6,2.99,martin00sr,"January 8, 2022","Can @Ulfborg. Cloudy amber, white head. Malty ...",28.0,11,"[-0.01005388, -0.02942978, 0.0016338513, 0.017..."


In [4]:
df_train, df_test_masked = create_test_train(df_filtered)


### Dataset Summary ###
Total reviewers sampled: 100
Training set size: (1149910, 14)
Test set size: (4829, 14)


In [5]:
# Create dataframe for retrieving beer information
beer_info = df_filtered[['name', 'abv', 'subgenre']]

# Drop duplicate rows based on the 'name' column (i.e. beers)
beer_info = beer_info.drop_duplicates(subset='name')

beer_info.set_index('name', inplace=True)

### Create locality-sensitive hashing (LSH)

In [6]:
# Initialize a variable to store the model
sbert_model = None

def encode_sbert(query, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Encodes a query using SBERT. Loads the model if not already loaded.
    
    Parameters:
        query (str or list of str): The query or list of queries to encode.
        model_name (str): The name of the SBERT model to load (default is 'all-MiniLM-L6-v2').
    
    Returns:
        numpy.ndarray: The embedding(s) for the input query/queries.
    """
    global sbert_model  # Use the global variable to store the model
    
    # Load the model if it's not already loaded
    if sbert_model is None:
        sbert_model = SentenceTransformer(model_name)
    
    # Encode the query and return the embeddings
    return sbert_model.encode(query)

def generate_hyperplanes(dim, num_hash_functions):
    """
    Generate random hyperplanes for hash functions.
    
    Parameters:
    - dim: Dimensionality of the embeddings.
    - num_hash_functions: Number of hash functions per table.
    
    Returns:
    - A matrix of shape (num_hash_functions, dim) where each row is a hyperplane.
    """
    return np.random.randn(num_hash_functions, dim)

def hash_vectors(vectors, hyperplanes):
    """
    Hash a batch of vectors using a set of hyperplanes.

    Parameters:
    - vectors: Input vectors (2D array of shape [n_samples, d]).
    - hyperplanes: Matrix of hyperplanes (2D array of shape [k, d]).

    Returns:
    - A matrix of binary hash values (shape [n_samples, k]).
    """
    # Compute dot products and return binary hash values
    return (np.dot(vectors, hyperplanes.T) > 0).astype(int)

class LSHVectorized:
    def __init__(self, d, k, L):
        """
        Initialize the LSH scheme with vectorized support.

        Parameters:
        - d: Dimensionality of the input vectors.
        - k: Number of hash functions per table.
        - L: Number of hash tables.
        """
        self.L = L
        self.tables = [defaultdict(list) for _ in range(L)]
        self.hyperplanes = [generate_hyperplanes(d, k) for _ in range(L)]

    def add_vectors(self, vectors, identifiers):
        """
        Add a batch of vectors to the LSH index.

        Parameters:
        - vectors: Input vectors (2D array of shape [n_samples, d]).
        - identifiers: A list of unique identifiers for the vectors.
        """
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all vectors at once
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Add vectors to their corresponding buckets
            for identifier, key in zip(identifiers, hash_keys):
                table[key].append(identifier)

    def query(self, vectors):
        """
        Query the LSH index to find similar items for a batch of vectors.

        Parameters:
        - vectors: Query vectors (2D array of shape [n_samples, d]).

        Returns:
        - A list of sets, where each set contains the candidates for a query vector.
        """
        candidates = [set() for _ in range(len(vectors))]
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all query vectors
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Retrieve candidates for each query
            for i, key in enumerate(hash_keys):
                candidates[i].update(table.get(key, []))
        return candidates

In [7]:
vectors = np.vstack(df_train["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers = df_train.index.tolist()  # Use review IDs as identifiers

In [361]:
## Run LSH ##
# Initialize LSH scheme
d = 384
k = 13
L = 30

lsh = LSHVectorized(d, k, L)

In [362]:
# Add vectors to the LSH index
lsh.add_vectors(vectors, identifiers)

### Create Collaborative Filtering (CF)

In [185]:

def predict_ratings_user_based(user_item_matrix, similarity_matrix):
   
    """
    this function predicts the ratings for the user_item_matrix using the similarity_matrix
    

    Parameters: 
    
    - user_item_matrix (DataFrame): User-item matrix with ratings centered around the user mean.
    - similarity_matrix (DataFrame): User-user similarity matrix.
    
    Returns:
        - pred (DataFrame): Predicted ratings for all user-item pairs.
    
    """


    # Compute predictions
    similarity_sum = np.abs(similarity_matrix).sum(axis=1)[:, None]
    pred = np.dot(similarity_matrix, user_item_matrix) / (similarity_sum + 1e-8)



    return pred


def collaborative_filtering(df):
    
    """
    Predicts user ratings for items using user-based collaborative filtering with cosine similarity. 
    Preprocesses the input data to create a centered user-item matrix, computes user similarities, 
    and generates predicted ratings.

    Parameters:
    - df (DataFrame): Input data with 'reviewer', 'name', and 'rating' columns.

    Returns:
    - pr_df (DataFrame): Predicted ratings for all user-item pairs.

    """


    user_item_matrix = df.pivot_table(
    index="reviewer",     # Rows: Reviewers
    columns="name",       # Columns: Beer names
    values="rating",      # Values: Ratings
    fill_value=0          # Fill missing ratings with 0
    )
    
    user_item_np = np.where(user_item_matrix != 0, (user_item_matrix - 3) / 2, 0) # scale to [-1,1]
    user_item_matrix = pd.DataFrame(user_item_np, index=user_item_matrix.index, columns=user_item_matrix.columns)


    # Compute cosine similarity
    cosine_similarity_matrix = cosine_similarity(user_item_matrix)
    
    # Predict ratings
    predicted_ratings = predict_ratings_user_based(user_item_matrix, cosine_similarity_matrix)

    df_out = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)
   
    return df_out


collab_df = collaborative_filtering(df_train)

In [301]:
user_beer_matrix = df_train.pivot_table(
    index="reviewer",     # Rows: Reviewers
    columns="name",       # Columns: Beer names
    values="rating",      # Values: Ratings
    fill_value=0          # Fill missing ratings with 0
    )

In [338]:
n_reviews = np.sum(user_beer_matrix > 0, axis=1)
beta = 0.55 + ((0.45) / (1+np.log(n_reviews))) # Describe how much weight to put on collaborative filtering (1-beta), so more weight is on users with many reviews

### Review Term Explanations:

In [12]:

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.feature_extraction.text import CountVectorizer

# Define reference context for flavor-related words. The first 20 is from a aromatic kit used for sommeliers, the rest is ai-generated.
context_words = [
    "bitter", "sweet", "salt", "sour", "umami",
    "lemon", "grapefruit", "apple", "pear", "blackcurrant", "prune", "melon", 
    "banana", "acacia", "rose", "cut grass", "hay", "bay leaf", "thyme", 
    "tomato", "pepper", "nutmeg", "clove", "bread", "butter", "vanilla", 
    "hazelnut", "toast", "malt", "caramel", "honey", "coffee", "licorice",
    "pine", "grass", "resin", "floral", "perfume", "incense", "cinnamon",
    "ginger", "anise", "nut", "almond", "walnut", "chestnut", "peanut",
    "soy", "mushroom", "earth", "dust", "wood", "barnyard", "horse",
    "wet", "dry", "metallic", "sulfur", "fish", "cheese", "butter",
    "cream", "leather", "silk", "rubber", "barnyard", "ammonia",
    "rotten", "acid"
]
custom_stop_words = ["beer", "beers", "bottle", "taste", "nice", "aroma", "like", "good", "great", "head", "flavor", "flavors", "flavour", "flavours", "brew", "can"]
context_embeddings = encode_sbert(context_words)

# Function to filter terms dynamically
def is_flavor_related(term, context_embeddings, threshold=0.35):
    term_embedding = sbert_model.encode([term])[0]
    cosine_similarity = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    max_similarity = max(cosine_similarity(term_embedding, context) for context in context_embeddings)
    return max_similarity > threshold

def plot_bucket(bucket_vectors, cluster_labels, perplexity=30, n_iter=5000, learning_rate=200):
    """
    Visualizes differences within an LSH bucket using t-SNE with configurable parameters.
    
    Args:
        bucket_vectors (np.ndarray): High-dimensional vectors of beers in the bucket.
        cluster_labels (np.ndarray): Cluster labels assigned to each vector.
        subgenres (np.ndarray): Subgenre or categorical labels for each beer.
        perplexity (int): The t-SNE perplexity parameter, balancing local/global data views.
        n_iter (int): Number of iterations for t-SNE optimization.
        learning_rate (float): Learning rate for t-SNE optimization.
    """
    # t-SNE reducer with tuned parameters
    reducer = TSNE(
        n_components=2, 
        random_state=42, 
        perplexity=perplexity, 
        n_iter=n_iter, 
        learning_rate=learning_rate
    )
    reduced_vectors = reducer.fit_transform(bucket_vectors)
    
    # Plotting
    plt.figure(figsize=(16, 10))
    scatter = plt.scatter(
        reduced_vectors[:, 0],
        reduced_vectors[:, 1],
        c=cluster_labels,
        cmap='plasma',
        alpha=0.7
    )
    plt.colorbar(scatter, label='Cluster Label')
    plt.title(f"t-SNE Visualization (Perplexity={perplexity}, n_iter={n_iter}, LR={learning_rate})")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.grid(alpha=0.3)
    plt.show()

def getThemes(df_filtered, beer_name, query_embedding, limit=100):
    # Initialize CountVectorizer with custom stopwords
    default_stop_words = CountVectorizer(stop_words='english').get_stop_words()
    all_stop_words = list(set(default_stop_words).union(custom_stop_words))

    # Pass the combined stop words to CountVectorizer
    vectorizer = CountVectorizer(max_features=100, stop_words=all_stop_words, token_pattern=r'\b[a-zA-Z]{2,}\b')
    df_beer = df_filtered[df_filtered["name"] == beer_name]
    # Extract top terms from cluster reviews
    term_matrix = vectorizer.fit_transform(df_beer["review_text"])
    terms = vectorizer.get_feature_names_out()
    term_counts = np.array(term_matrix.sum(axis=0)).flatten()
    top_terms = [terms[i] for i in term_counts.argsort()[-limit:]]  # Top 5 terms
    filtered_top_terms = [term for term in top_terms if is_flavor_related(term, context_embeddings)]
    
    # Return the terms most similar to the query
    term_embeddings = np.vstack([encode_sbert(term) for term in filtered_top_terms])
    
    # Calculate cosine similarity between query and terms
    similarities = cosine_similarity(query_embedding, term_embeddings)[0]
    
    # Create a DataFrame to store terms and their similarities
    term_similarity_df = pd.DataFrame({
        'term': filtered_top_terms,
        'similarity': similarities
    })
    
    # Sort terms by similarity to the query
    term_similarity_df = term_similarity_df.sort_values(by='similarity', ascending=False)
    
    # Return the top similar themes
    return term_similarity_df['term'].head(10).tolist()


## Make reccomendations

In [367]:
def recommend_beer_A(query_embedding, df_train, user_name, beer_info, beta, abv_desired=None, style_desired=None, n_clusters=5):
    # Query the LSH index
    candidates = lsh.query(query_embedding)
    print(len(candidates[0]))
    # Filter bucket vectors and metadata
    bucket_data = df_train[df_train["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values  # Adjust column name as necessary

    # Perform clustering on bucket vectors
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=7)
    cluster_labels = kmeans.fit_predict(bucket_vectors)
    
    # Assign query to the nearest cluster
    query_cluster = kmeans.predict(query_embedding)[0]

    # Filter beers in the same cluster as the query
    cluster_indices = np.where(cluster_labels == query_cluster)[0]
    cluster_vectors = bucket_vectors[cluster_indices]
    cluster_beers = bucket_data.iloc[cluster_indices]
    print(len(cluster_beers))
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, cluster_vectors)[0]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    print(len(LSH_score))

    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()] # Get CF_score for user for the beers in cluster


    # Penalise difference in abv by using a nonlinear function penalising greater differences more
    abv = beer_info.loc[LSH_score.index.tolist()]["abv"]

    alpha = 0.05
    if abv_desired:
        if abv_desired == 0:
            abv_weight = -2 * abs(abv - abv_desired)
        else:
            abv_weight = -alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
        abv_diff = abs(abv.values - abv_desired)
    else:
        abv_weight = abv - abv
        abv_diff = 0
    
    # Add bonus for match in style
    style_bonus = np.zeros(len(LSH_score))
    if style_desired:
        relevant_styles = beer_info.loc[LSH_score.index.tolist()]["subgenre"]
        style_mask = relevant_styles == style_desired
        style_bonus[style_mask] = 0.05
    
    beta = beta[user_name]

    # Combine weights and scores
    weighted_score = (
        beta * LSH_score +
        (1-beta) * collab_filtering_scores +
        abv_weight +
        style_bonus
    )

    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': LSH_score.index,
        'score': weighted_score,
        'abv': abv.values,
        'LSH_score': LSH_score.values,
        'LSH_score_weighted': LSH_score.values * beta,
        'collab_score': collab_filtering_scores.values,
        'collab_score_weighted': collab_filtering_scores.values * (1-beta),
        'abv_diff': abv_diff,
        'abv_weight': abv_weight.values,
        'Style bonus': style_bonus,
        'Weight beta': beta
    })
    
    # Remove index of weighted score and keep the beer name as a column
    beer_weighted_score.reset_index(drop=True, inplace=True)
    
    # Get the 10 beers with the highest weighted scores
    #beer_weighted_score = beer_weighted_score.sort_values(by='score', ascending=False)
    #beer_weighted_score['notes'] = ""
    # Apply getThemes to each beer in the DataFrame
    #beer_weighted_score['notes'] = beer_weighted_score['beer'].apply(lambda x: getThemes(df_train, x, query_embedding))
    #beer_weighted_score.loc[:9, 'notes'] = beer_weighted_score.loc[:9, 'beer'].apply(
    #    lambda x: getThemes(df_train, x, query_embedding)) # TODO Fix!

    return beer_weighted_score.sort_values(by='score', ascending=False)

    

In [369]:
# Create a query
test_query = "Light, refreshing bitter beer with a orange taste"
user_name = "100Beier"
query_embedding = encode_sbert(test_query).reshape(1, -1)
beer_recommendations= recommend_beer_A(query_embedding, df_train, user_name, beer_info, beta, abv_desired=7)

print("Top 5 recommended beers:")
# Set max column width to display full array
pd.set_option('display.max_colwidth', None)

# Display the DataFrame
display(beer_recommendations.head(10))

122106
10879
2939
Top 5 recommended beers:


Unnamed: 0,beer,score,abv,LSH_score,LSH_score_weighted,collab_score,collab_score_weighted,abv_diff,abv_weight,Style bonus,Weight beta
643,Cyclic Beer Farm Saison,0.467894,5.8,0.751576,0.471238,0.000924,0.000345,1.2,-0.003688,0.0,0.627
2511,Theodor Schiøtz Anarkist Bloody Weizen,0.460542,5.2,0.747391,0.468614,0.000607,0.000226,1.8,-0.008299,0.0,0.627
15,3 Fonteinen Zenne y Frontera - Oloroso & Pedro Ximénez Blend (Season 17|18 Blend No. 50),0.460244,6.9,0.731065,0.458378,0.005071,0.001892,0.1,-2.6e-05,0.0,0.627
2899,Zwettler Zwickl,0.456061,5.5,0.736462,0.461761,0.000169,6.3e-05,1.5,-0.005763,0.0,0.627
2261,Schützen Bräu,0.45511,4.4,0.754124,0.472836,-0.001099,-0.00041,2.6,-0.017315,0.0,0.627
1272,Insel-Brauerei Seepferd,0.454555,5.5,0.733217,0.459727,0.001586,0.000592,1.5,-0.005763,0.0,0.627
2424,Stupavar Herbal Ale 13°,0.448392,5.2,0.728242,0.456607,0.000225,8.4e-05,1.8,-0.008299,0.0,0.627
660,De Boei Noorderzon,0.445241,6.0,0.714707,0.448121,-0.000852,-0.000318,1.0,-0.002561,0.0,0.627
695,De Proefbrouwerij / Trillium Bouket Farmhouse Ale,0.438676,6.5,0.69895,0.438241,0.002883,0.001075,0.5,-0.00064,0.0,0.627
2913,Ølfabrikken Hvid Jul,0.437744,8.5,0.706354,0.442883,0.001671,0.000623,1.5,-0.005763,0.0,0.627


### Evaluation of experiment B short pipeline without KMEANS

In [370]:
def recommend_beer_B(query_embedding, df_train, user_name, beer_info, beta, abv_desired=None, style_desired=None, n_clusters=5):
    # Query the LSH index
    candidates = lsh.query(query_embedding)
    # Filter bucket vectors and metadata
    bucket_data = df_train[df_train["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values 
    
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, bucket_vectors)[0]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': df_train[df_train["id"].isin(list(candidates[0]))]["name"].values,
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    
    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()] # Get CF_score for user for the beers in cluster

    # Penalise difference in abv by using a nonlinear function penalising greater differences more
    abv = beer_info.loc[LSH_score.index.tolist()]["abv"]

    alpha = 0.05
    if abv_desired:
        if abv_desired == 0:
            abv_weight = -2 * abs(abv - abv_desired)
        else:
            abv_weight = -alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
        abv_diff = abs(abv.values - abv_desired)
    else:
        abv_weight = abv-abv
        abv_diff = 0

    beta = beta[user_name]    
    
    # Add bonus for match in style
    style_bonus = np.zeros(len(LSH_score))
    if style_desired:
        relevant_styles = beer_info.loc[LSH_score.index.tolist()]["subgenre"]
        style_mask = relevant_styles == style_desired
        style_bonus[style_mask] = 0.05
    
    
    # Combine weights and scores
    weighted_score = (
        beta * LSH_score +
        (1-beta) * collab_filtering_scores +
        abv_weight +
        style_bonus
    )

    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': LSH_score.index,
        'score': weighted_score,
        'abv': abv.values,
        'LSH_score': LSH_score.values,
        'LSH_score_weighted': LSH_score.values * beta,
        'collab_score': collab_filtering_scores.values,
        'collab_score_weighted': collab_filtering_scores.values * (1-beta),
        'abv_diff': abv_diff,
        'abv_weight': abv_weight.values,
        'Style bonus': style_bonus,
        'Weight beta': beta
    })
    
    # Remove index of weighted score and keep the beer name as a column
    beer_weighted_score.reset_index(drop=True, inplace=True)

    return beer_weighted_score.sort_values(by='score', ascending=False)

    

In [371]:
# Reccomendation using only cosine similarity
def recommend_beer_C(query_embedding, df_train, vectors):

    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, vectors)[0]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': df_train["name"].values,
    })
    
    cosine_score = beer_LSH.groupby('beer')['similarity'].mean()

    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': cosine_score.index,
        'cosine_score': cosine_score.values,
    })

    return beer_weighted_score.sort_values(by='cosine_score', ascending=False)

In [372]:
# Reccomend beer just using collaborative filtering
def recommend_beer_D(user_name):
    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': predcicted_rating_user.index,
        'collab_score': predcicted_rating_user.values,
    })

    return beer_weighted_score.sort_values(by='collab_score', ascending=False)

In [373]:
import time
num_true_A = 0
num_false_A = 0
total_time_A = 0.0
times_A = []
for i in range(50):
    review_row = df_test_masked.iloc[i]
    real_beer = review_row["name"]
    query = review_row["sbert_embedding"].reshape(1, -1)
    user = review_row["reviewer"]
    abv_desired = review_row["abv"]
    style_desired = review_row["subgenre"]
    start_time = time.perf_counter()
    beer_recommendations= recommend_beer_A(query_embedding=query, df_train=df_train, user_name=user, beer_info=beer_info, beta=beta, abv_desired=None, style_desired=None)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    total_time_A += elapsed_time
    times_A.append(elapsed_time)
    
    if real_beer in beer_recommendations["beer"].head(20).tolist():
        num_true_A += 1
    else:
        num_false_A +=1

145506
18805
4051
239032
21141
2950
258455
19303
4389
160762
16006
2663
218136
23909
4214
105088
9396
2039
239705
25372
4388
261242
34400
5012
174295
18106
3883
161519
16106
2434
233264
16947
4118
289542
27054
3018
158319
16798
3708
217848
22993
4289
123013
13445
3333
229720
20713
2930
77491
5502
2219
160749
14809
2577
157869
11913
2433
102375
12508
3399
118350
12443
2271
214860
23455
4266
204064
20380
4261
68693
5731
1744
126946
9048
2140
116326
11870
2254
73457
9093
2857
84156
7663
1835
191463
18430
2655
142628
14111
3625
63430
7425
2530
123980
13704
2758
104989
7002
2889
221552
28445
4797
281545
22703
3135
77034
9653
3013
120633
12153
3270
119239
15823
3708
108859
13766
3572
122269
13898
2818
133101
17138
3830
78406
7130
2152
151801
19231
4148
161697
15818
2429
285839
28977
4849
169567
14886
2656
285463
27127
2907
163486
20695
4269
67589
5045
1337
173862
21970
4361


In [379]:
num_true_B = 0
num_false_B = 0
total_time_B = 0.0
times_B = []
for i in range(2000):
    review_row = df_test_masked.iloc[i]
    real_beer = review_row["name"]
    query = review_row["sbert_embedding"].reshape(1, -1)
    user = review_row["reviewer"]
    abv_desired = review_row["abv"]
    style_desired = review_row["subgenre"]
    start_time = time.perf_counter()
    beer_recommendations= recommend_beer_B(query_embedding=query, df_train=df_train, user_name=user, beer_info=beer_info, beta = beta, abv_desired=None, style_desired=None)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    total_time_B += elapsed_time
    times_B.append(elapsed_time)
    
    if real_beer in beer_recommendations["beer"].head(20).tolist():
        num_true_B += 1
    else:
        num_false_B +=1

In [375]:
import time
num_true_C = 0
num_false_C = 0
total_time_C = 0.0
times_C = []
for i in range(2000):
    review_row = df_test_masked.iloc[i]
    real_beer = review_row["name"]
    query = review_row["sbert_embedding"].reshape(1, -1)
    user = review_row["reviewer"]
    abv_desired = review_row["abv"]
    style_desired = review_row["subgenre"]
    start_time = time.perf_counter()
    beer_recommendations= recommend_beer_C(query_embedding=query, df_train=df_train, vectors=vectors)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    total_time_C += elapsed_time
    times_C.append(elapsed_time)
    
    if real_beer in beer_recommendations["beer"].head(20).tolist():
        num_true_C += 1
    else:
        num_false_C +=1

In [376]:
num_true_D = 0
num_false_D = 0
total_time_D = 0.0
times_D = []
errors = []
for i in range(2000):
    review_row = df_test_masked.iloc[i]
    real_beer = review_row["name"]
    query = review_row["sbert_embedding"].reshape(1, -1)
    user = review_row["reviewer"]
    abv_desired = review_row["abv"]
    style_desired = review_row["subgenre"]
    start_time = time.perf_counter()
    beer_recommendations= recommend_beer_D(user_name=user)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    total_time_D += elapsed_time
    times_D.append(elapsed_time)
    
    if real_beer in beer_recommendations["beer"].head(20).tolist():
        num_true_D += 1
    else:
        num_false_D +=1
    


In [378]:
def calculate_precision_recall(tp, total_cases, recommendations_per_case=20):
    """
    Calculate Precision and Recall.

    Parameters:
    - tp: True Positives
    - total_cases: Total number of test cases
    - recommendations_per_case: Number of recommendations per test case (default=20)

    Returns:
    - precision: TP / (TP + FP)
    - recall: TP / Total Relevant Items
    """
    fp = tp * (recommendations_per_case - 1)  # Each TP has (recommendations_per_case -1) FPs
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / total_cases if total_cases > 0 else 0
    return precision, recall
total_test_cases = 50
# Calculate Precision and Recall for each experiment
precision_A, recall_A = calculate_precision_recall(num_true_A, num_true_A+num_false_A)
precision_B, recall_B = calculate_precision_recall(num_true_B, num_true_A+num_false_B)
precision_C, recall_C = calculate_precision_recall(num_true_C, num_true_A+num_false_C)
precision_D, recall_D = calculate_precision_recall(num_true_D, num_true_A+num_false_D)

summary_data = {
    'Experiment': ['A', 'B', 'C', 'D'],
    'Type': ["LSH/KMEANS/CF", "LSH/CF", "COSINE", "CF"],
    'Queries in top 20': [num_true_A, num_true_B, num_true_C, num_true_D],
    'Queries not in top 20': [num_false_A, num_false_B, num_false_C, num_false_D],
    'Precision': [precision_A, precision_B, precision_C, precision_D],
    'Recall': [recall_A, recall_B, recall_C, recall_D],
    'Accuracy (%)': [
        (num_true_A / (num_true_A + num_false_A)) * 100,
        (num_true_B / (num_true_B + num_false_B)) * 100,
        (num_true_C / (num_true_C + num_false_C)) * 100,
        (num_true_D / (num_true_D + num_false_D)) * 100
    ],
    'Average Time (s)': [
        total_time_A / 50,
        total_time_B / 50,
        total_time_C / 50,
        total_time_D / 50
    ]
}

summary_df = pd.DataFrame(summary_data)

# Display the summary table
print("Experiment Summary:")
print(summary_df)



Experiment Summary:
  Experiment           Type  Queries in top 20  Queries not in top 20  \
0          A  LSH/KMEANS/CF                  3                     47   
1          B         LSH/CF                  1                     49   
2          C         COSINE                  4                     46   
3          D             CF                  3                     47   

   Precision  Recall  Accuracy (%)  Average Time (s)  
0       0.05    0.06           6.0          1.316225  
1       0.05    0.02           2.0          0.356019  
2       0.05    0.08           8.0          0.971998  
3       0.05    0.06           6.0          0.001234  


In [115]:
real_beer

'BIIR Country - Belgian Farmhouse Ale'

In [117]:
np.sum(beer_recommendations["beer"] == 'BIIR Country - Belgian Farmhouse Ale')

np.int64(1)

# TESTTEST

In [20]:
def recommend_beer(query_embedding, df_train, user_name, abv_desired, n_clusters=15):    
    # Query the LSH index
    candidates = lsh.query(query_embedding)

    # Filter bucket vectors and metadata
    bucket_data = df_train[df_train["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values  # Adjust column name as necessary
    
    # Perform clustering on bucket vectors
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
    cluster_labels = kmeans.fit_predict(bucket_vectors)
    
    # Assign query to the nearest cluster
    query_cluster = kmeans.predict(query_embedding)[0]
    
    perplexities = [50]
    n_iters = [10000]
    learning_rates = [100, 200]
    
    param_combinations = [(p, n, lr) for p in perplexities for n in n_iters for lr in learning_rates]
    
    #for perplexity, n_iter, learning_rate in param_combinations:
    #    plot_bucket(bucket_vectors, cluster_labels, subgenres, perplexity, n_iter, learning_rate)
        
    # Filter beers in the same cluster as the query
    cluster_indices = np.where(cluster_labels == query_cluster)[0]
    cluster_vectors = bucket_vectors[cluster_indices]
    cluster_beers = bucket_data.iloc[cluster_indices]
    
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, cluster_vectors)[0]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    
    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()]
    
    # Penalise difference in abv by using a nonlinear function penalising greater differences more
    abv = beer_info.loc[LSH_score.index.tolist()]["abv"]

    alpha = 0.05
    if abv_desired == 0:
        abv_weight = -2 * abs(abv - abv_desired)
    else:
        abv_weight = -alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
    

    # Combine weights and scores
    weighted_score = (
        0.85 * LSH_score +
        0.15 * collab_filtering_scores +
        abv_weight
    )
    
    # Create final DataFrame
    beer_weighted_score = pd.DataFrame({
        'beer': LSH_score.index,
        'score': weighted_score,
        'abv': abv.values,
        'LSH_score': LSH_score.values,
        'LSH_score_weighted': LSH_score.values * 0.85,
        'collab_score': collab_filtering_scores.values,
        'collab_score_weighted': collab_filtering_scores.values * 0.15,
        'abv_diff': abs(abv.values - abv_desired),
        'abv_weight': abv_weight.values
    })
    
    # Remove index of weighted score and keep the beer name as a column
    beer_weighted_score.reset_index(drop=True, inplace=True)
    
    # Get the 10 beers with the highest weighted scores
    beer_weighted_score = beer_weighted_score.sort_values(by='score', ascending=False).head(10)
    
    # Apply getThemes to each beer in the DataFrame
    beer_weighted_score['notes'] = beer_weighted_score['beer'].apply(lambda x: getThemes(df_train, x, query_embedding))


    return beer_weighted_score.sort_values(by='score', ascending=False)
    

# Create a query
test_query = "Light, refreshing bitter beer with a orange taste"
user_name = "Jerseyislandbeer"
query_embedding = encode_sbert(test_query).reshape(1, -1)
beer_recommendations= recommend_beer(query_embedding, df_train, user_name, 7)

print("Top 5 recommended beers:")
# Set max column width to display full array
pd.set_option('display.max_colwidth', None)

# Display the DataFrame
display(beer_recommendations.head(10))


Top 5 recommended beers:


Unnamed: 0,beer,score,abv,LSH_score,LSH_score_weighted,collab_score,collab_score_weighted,abv_diff,abv_weight,notes
639,Nørrebro Pacific Summer Ale (Økologisk),0.655155,5.6,0.777055,0.660497,-0.002144,-0.000322,1.4,-0.00502,"[orange, oranges, bitter, citrus, citrusy, sweetness, ale, bitterness, grapefruit, fruitiness]"
424,Insel-Brauerei Insel Saison,0.650174,5.5,0.772663,0.656764,-0.005514,-0.000827,1.5,-0.005763,"[orange, tasting, bitter, citrus, sweetness, ale, wine, bitterness, sour, fruit]"
508,Legenda Brutal Bitter IPA,0.639414,8.2,0.756718,0.64321,-0.000722,-0.000108,1.2,-0.003688,"[orange, bitter, citrus, citrusy, sweetness, ipa, alcohol, bitterness, grape, grapefruit]"
82,Baden Baden Witbier,0.635766,4.9,0.761292,0.647098,-0.00024,-3.6e-05,2.1,-0.011296,"[orange, oranges, bitter, citrus, sweetness, ale, bitterness, ales, drinkability, sour]"
529,MONYO American Beauty APA,0.621311,5.6,0.737444,0.626828,-0.003306,-0.000496,1.4,-0.00502,"[orange, tasting, bitter, citrus, citrusy, sweetness, ale, bitterness, grapefruit, sour]"
161,Browar Brodacz Mózg,0.621038,5.7,0.735744,0.625382,-0.000103,-1.5e-05,1.3,-0.004329,"[orange, bitternes, bitter, citrus, citrusy, ale, bitterness, ales, sour, drinkable]"
92,Basqueland Saison (Lasai'son),0.616947,5.5,0.73264,0.622744,-0.000227,-3.4e-05,1.5,-0.005763,"[orange, bitter, citrus, sweetness, bitterness, beery, aromas, sour, fruit, spicy]"
292,Fanø Vestkyst,0.616771,5.7,0.730639,0.621043,0.000377,5.6e-05,1.3,-0.004329,"[orange, bitter, citrus, sweetness, ipa, ale, bitterness, grapefruit, fruit, spicy]"
451,Kaapse Tess,0.607172,5.0,0.726712,0.617705,-0.001917,-0.000288,2.0,-0.010246,"[orange, bitter, citrus, sweetness, wine, bitterness, grape, grapefruit, fruitiness, sour]"
653,Orca / Heidenpeters / Ale Mania Head in the Clouds,0.606479,3.9,0.742507,0.631131,-0.000244,-3.7e-05,3.1,-0.024615,"[orange, bitter, citrus, bitterness, grapefruit, sour, lager, phenolic, coffee, freshness]"


### Notes for improvement
Add better stop-words, flavor, flavour, flavors etc.


### Example:

### Evaluation settup