In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

#Preprocess the Dataset
def preprocess_data(file_path):
    """Load the Coffee Reviews CSV and derive the columns used by the queries.

    Processing steps:
      1. Parse ``review_date`` (expected as "<full month name> <year>") and
         derive ``review_year``; dates that fail to parse become NaT.
      2. Drop rows missing the year, the price (``100g_USD``), the rating or
         ``loc_country``.
      3. Keep untouched copies of price and rating in ``original_price`` and
         ``original_rating``.
      4. Min-max scale those two columns into ``normalized_price`` and
         ``normalized_rating``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(data, scaler)``: the cleaned DataFrame and the ``MinMaxScaler``
        fitted on ``original_price`` / ``original_rating``.

    Raises
    ------
    ValueError
        If no rows remain after the missing-value filter (nothing to scale).
    """
    data = pd.read_csv(file_path)

    # Convert review_date to datetime and extract year; errors='coerce' turns
    # unparseable dates into NaT so they are removed by the dropna below.
    data['review_date'] = pd.to_datetime(data['review_date'], format='%B %Y', errors='coerce')
    data['review_year'] = data['review_date'].dt.year

    # Drop rows with missing values. The explicit copy makes the result an
    # independent frame, so the column assignments below cannot trigger
    # pandas' SettingWithCopyWarning.
    data = data.dropna(subset=['review_year', '100g_USD', 'rating', 'loc_country']).copy()

    if data.empty:
        raise ValueError(
            f"No usable rows left in {file_path!r} after dropping rows with missing values."
        )

    # Preserve original values
    data['original_price'] = data['100g_USD']
    data['original_rating'] = data['rating']

    # Normalize numerical attributes for multidimensional queries
    scaler = MinMaxScaler()
    data[['normalized_price', 'normalized_rating']] = scaler.fit_transform(
        data[['original_price', 'original_rating']]
    )

    return data, scaler


# Load and clean the review data once; later cells reuse `processed_data`.
file_path = "simplified_coffee.csv"
processed_data, scaler = preprocess_data(file_path)

# Preview the first rows of the cleaned frame under a heading.
print("Processed Data:", processed_data.head(), sep="\n")

Processed Data:
                               name                     roaster         roast  \
0          Ethiopia Shakiso Mormora                Revel Coffee  Medium-Light   
1                Ethiopia Suke Quto                 Roast House  Medium-Light   
2        Ethiopia Gedeb Halo Beriti   Big Creek Coffee Roasters        Medium   
3           Ethiopia Kayon Mountain  Red Rooster Coffee Roaster         Light   
4  Ethiopia Gelgelu Natural Organic   Willoughby's Coffee & Tea  Medium-Light   

     loc_country    origin  100g_USD  rating review_date  \
0  United States  Ethiopia      4.70      92  2017-11-01   
1  United States  Ethiopia      4.19      92  2017-11-01   
2  United States  Ethiopia      4.85      94  2017-11-01   
3  United States  Ethiopia      5.14      93  2017-11-01   
4  United States  Ethiopia      3.97      93  2017-11-01   

                                              review  review_year  \
0  Crisply sweet, cocoa-toned. Lemon blossom, roa...         2017  

In [2]:
# Multidimensional Queries 
def filter_and_rank_reviews(data, start_year, end_year, min_rating, min_price, max_price, country, n_results):
    """Filter reviews by the given constraints and return the top-rated ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``review_year``, ``original_rating``, ``original_price``
        and ``loc_country`` columns.
    start_year, end_year : int
        Inclusive bounds on ``review_year``.
    min_rating : float
        Inclusive lower bound on ``original_rating``.
    min_price, max_price : float
        Inclusive bounds on ``original_price``.
    country : str
        Exact value ``loc_country`` must equal.
    n_results : int
        Maximum number of rows to return; values below zero are treated as 0.

    Returns
    -------
    pandas.DataFrame
        At most ``n_results`` matching rows, highest ``original_rating``
        first. Rows with equal ratings keep their relative order from
        ``data``. The index holds each row's position within the filtered
        (pre-sort) frame.
    """
    mask = (
        (data['review_year'] >= start_year)
        & (data['review_year'] <= end_year)
        & (data['original_rating'] >= min_rating)
        & (data['original_price'] >= min_price)
        & (data['original_price'] <= max_price)
        & (data['loc_country'] == country)
    )
    filtered = data[mask].reset_index(drop=True)

    # Ratings tie frequently, so use a stable sort: the default quicksort
    # leaves the order of tied rows (and thus the "top n") unspecified.
    ranked = filtered.sort_values(by='original_rating', ascending=False, kind='mergesort')

    # head() with a negative count means "all but the last |n| rows", which is
    # never what a result limit intends; clamp to zero instead.
    return ranked.head(max(n_results, 0))

In [3]:
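# Build the nearest-neighbour index
# (called "LSH" throughout this notebook, but backed by scikit-learn's exact NearestNeighbors search, not a locality-sensitive hash)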
def build_lsh(data, features, n_neighbors=5):
    """Fit a nearest-neighbour index over the selected feature columns.

    Despite the name, the model is scikit-learn's ``NearestNeighbors`` with a
    Euclidean metric, not a locality-sensitive hash.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns.
    features : list of str
        Column names used as the coordinates of each point.
    n_neighbors : int, optional
        Default neighbour count stored on the model (5, as before). Callers
        may still override it per query via ``kneighbors(n_neighbors=...)``.

    Returns
    -------
    NearestNeighbors
        The fitted model; row ``i`` of the index corresponds to positional
        row ``i`` of ``data``.
    """
    feature_matrix = data[features].values
    lsh_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='euclidean')
    lsh_model.fit(feature_matrix)
    return lsh_model


def query_lsh(lsh_model, data, query_point, features, n_results):
    """Return the rows of ``data`` nearest to ``query_point``.

    Parameters
    ----------
    lsh_model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)``; it is
        expected to have been fitted on the rows of ``data`` in the same
        order, because the returned indices are applied positionally.
    data : pandas.DataFrame
        Frame the model was fitted on.
    query_point : sequence of float
        Coordinates of the query, one value per feature.
    features : list of str
        Feature column names. Accepted for interface compatibility; not used
        by the lookup itself.
    n_results : int
        Number of neighbours wanted; capped at ``len(data)``.

    Returns
    -------
    pandas.DataFrame
        The neighbouring rows in the order returned by the model, or an empty
        DataFrame when there is no data, ``n_results`` is not positive, or the
        model query fails (a message is printed in each of those cases).
    """
    available_rows = len(data)
    if available_rows == 0:
        print("No data available for LSH query.")
        return pd.DataFrame()

    # Reject a non-positive request up front instead of relying on the model
    # to raise for it.
    if n_results <= 0:
        print(f"n_results must be positive for LSH query (got {n_results}).")
        return pd.DataFrame()

    # Never ask for more neighbours than there are rows.
    n_results = min(n_results, available_rows)

    try:
        _, indices = lsh_model.kneighbors([query_point], n_neighbors=n_results)
        return data.iloc[indices[0]]
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back an empty
        # frame so the interactive session can continue.
        print(f"ERROR during LSH query: {e}")
        return pd.DataFrame()


# Columns that define the similarity space.
features = [
    'normalized_price',
    'normalized_rating',
]
# Index fitted over the complete cleaned dataset.
lsh_model = build_lsh(data=processed_data, features=features)

# Partition the Dataset: one sub-frame per distinct loc_country, keyed by that value.
partitions = dict(iter(processed_data.groupby('loc_country')))


In [5]:
def interactive_query_system():
    """Run one interactive filter-then-similarity session on the coffee reviews.

    Prompts for a year range, minimum rating, price range, country
    (matched exactly against ``loc_country``) and a result count, then:

      1. looks up the country's rows in the module-level ``partitions``;
      2. filters and ranks them with ``filter_and_rank_reviews`` and prints
         the top ``n_results``;
      3. fits a nearest-neighbour model on ALL rows that passed the filter
         and prints the ``n_results`` rows closest to a user-supplied
         (normalized price, normalized rating) point.

    Invalid numeric input, inverted ranges and a non-positive result count
    are reported with a message and end the session; nothing is returned.
    """
    # Get user input
    print("\nWelcome to the Coffee Reviews Query System!")
    try:
        start_year = int(input("Enter the start year (e.g., 2019): "))
        end_year = int(input("Enter the end year (e.g., 2021): "))
        min_rating = float(input("Enter the minimum rating (e.g., 94): "))
        min_price = float(input("Enter the minimum price (100g_USD, e.g., 4): "))
        max_price = float(input("Enter the maximum price (100g_USD, e.g., 10): "))
        # The lookup below is an exact match on loc_country, so drop any
        # surrounding whitespace or quotes the user may have typed.
        country = input("Enter the country (loc_country, e.g., United States): ").strip().strip("'\"").strip()
        n_results = int(input("Enter the number of top results to display: "))
    except ValueError as err:
        print(f"Invalid input: {err}")
        return

    if start_year > end_year:
        print("The start year must not be later than the end year.")
        return
    if min_price > max_price:
        print("The minimum price must not exceed the maximum price.")
        return
    if n_results <= 0:
        print("The number of results must be a positive integer.")
        return

    # Checks if country exists
    if country not in partitions:
        print(f"No data available for {country}. Please try another country.")
        return

    country_data = partitions[country]
    if country_data.empty:
        print(f"No data available for {country} after filtering.")
        return

    # Filtering and ranking. Ask for every match (a limit equal to the
    # partition size can never truncate) so the similarity search below sees
    # the whole filtered set rather than only the rows that get displayed.
    matches = filter_and_rank_reviews(
        country_data, start_year, end_year, min_rating, min_price, max_price, country, len(country_data)
    )

    # case when no results match the filter criteria
    if matches.empty:
        print("No data matches the given filter criteria.")
        return

    results = matches.head(n_results)

    print("\nFiltered and Ranked Results:")
    print(results[['name', 'original_price', 'original_rating', 'review']].to_string(index=False))

    # A new model on the filtered dataset. Fitting it on the truncated top-n
    # rows and then asking for n neighbours would just return those same
    # rows in a different order.
    print("\nRebuilding LSH model on filtered data...")
    local_lsh_model = build_lsh(matches, features)

    # Perform LSH query
    print("\nQuerying LSH...")
    try:
        query_point = [
            float(input("Enter normalized price (e.g., 0.5): ")),
            float(input("Enter normalized rating (e.g., 0.9): "))
        ]
    except ValueError as err:
        print(f"Invalid input: {err}")
        return

    # Perform LSH query with validation
    lsh_results = query_lsh(local_lsh_model, matches, query_point, features, n_results)
    if lsh_results.empty:
        print("No results found for the LSH query.")
    else:
        print("\nSimilar Reviews Based on Multidimensional Query:")
        print(lsh_results[['name', 'original_price', 'original_rating', 'loc_country']].to_string(index=False))



# Start one interactive session; this call waits on input() prompts.
interactive_query_system()


Welcome to the Coffee Reviews Query System!

Filtered and Ranked Results:
                                 name  original_price  original_rating                                                                                                                                                                                                                                                                                                                                                                                                                     review
                      Rukera Espresso            6.23               97                                                              Evaluated as espresso. Exquisitely aromatic: high-toned, savory and impossibly floral. Toffee, dried black currant, myrrh, lavender, fine musk in aroma and small cup. Plush, syrupy mouthfeel; big, complex finish that is equal parts sweet and savory. In cappuccino-scaled milk, a whole new mid-palate is created,