In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import defaultdict

# Paths to the three CSV datasets read by OfferSearchTool.load_datasets().
# They are bare file names, so they are resolved relative to the current
# working directory of the process/kernel that runs this notebook.
offer_retailer_csv = 'offer_retailer.csv'   # offers; merged on the BRAND column
brand_category_csv = 'brand_category.csv'   # brand rows; supplies BRAND_BELONGS_TO_CATEGORY
categories_csv = 'categories.csv'           # category rows; joined on PRODUCT_CATEGORY

In [None]:
class OfferSearchTool:
    """Fuzzy search over offers joined with their brand and category data.

    The three CSV files named by the module-level path constants are loaded
    once at construction time and merged into ``self.merged_df``. ``search``
    scores every merged row against a free-text query and ``run_cli`` wraps
    that in an interactive prompt loop.
    """

    # Text columns compared against the query with fuzzy partial matching.
    _SEARCH_COLUMNS = ('PRODUCT_CATEGORY', 'OFFER', 'BRAND', 'RETAILER')
    # Column holding the broader (parent) category used by the fallback pass.
    _PARENT_COLUMN = 'IS_CHILD_CATEGORY_TO'
    # Name of the score column appended to search results.
    _SCORE_COLUMN = 'Search Score'

    def __init__(self):
        """Load and merge the datasets; reads the CSV files from disk."""
        self.merged_df = self.load_datasets()

    def load_datasets(self):
        """Read the three CSV files and return them merged into one DataFrame.

        Offers are left-joined to the brand table on ``BRAND`` and then to the
        category table on ``BRAND_BELONGS_TO_CATEGORY == PRODUCT_CATEGORY``.
        Because a brand can have several category rows, one offer may appear
        on several rows of the result.

        Returns:
            pandas.DataFrame: the merged frame with every missing value
            replaced by the string ``'Unknown'``.
        """
        offers_df = pd.read_csv(offer_retailer_csv)
        brands_df = pd.read_csv(brand_category_csv)
        categories_df = pd.read_csv(categories_csv)

        merged_df = pd.merge(offers_df, brands_df, on='BRAND', how='left')
        merged_df = pd.merge(
            merged_df,
            categories_df,
            left_on='BRAND_BELONGS_TO_CATEGORY',
            right_on='PRODUCT_CATEGORY',
            how='left',
        )

        # Rows without a brand/category match get a placeholder so the text
        # columns can be lowercased safely in search(). Note that this also
        # replaces missing values in non-text columns with the same string.
        merged_df = merged_df.fillna('Unknown')
        print(merged_df.head())
        return merged_df

    def search(self, query, limit=50, strong_match_threshold=80, parent_match_score=75):
        """Return the best-matching offers for ``query``.

        Each row is scored with ``fuzz.partial_ratio`` against its category,
        offer, brand and retailer text (case-insensitively); the row's score
        is the maximum of the four. Rows are de-duplicated by ``OFFER`` text,
        keeping the first row that reaches the highest score for that offer.

        If no offer scores above ``strong_match_threshold``, a second pass
        looks for the query as a substring of the parent category and gives
        such rows the fixed score ``parent_match_score`` when that beats the
        fuzzy score already recorded for the offer.

        Args:
            query: free-text search string. ``None``, empty and
                whitespace-only queries yield an empty result instead of
                matching every row through the substring fallback.
            limit: maximum number of rows to return.
            strong_match_threshold: fuzzy score that must be exceeded by at
                least one offer to skip the parent-category fallback.
            parent_match_score: score assigned to parent-category matches.

        Returns:
            pandas.DataFrame: the matched rows of ``self.merged_df`` (original
            index labels kept) plus a ``'Search Score'`` column, sorted by
            score in descending order; ties keep the order of
            ``self.merged_df``. Offers whose best score is 0 are omitted.

        Raises:
            KeyError: if one of the searched columns is missing from
                ``self.merged_df``.
        """
        frame = self.merged_df
        needle = '' if query is None else str(query).lower()
        if not needle.strip() or frame.empty:
            return pd.DataFrame(columns=list(frame.columns) + [self._SCORE_COLUMN])

        offers = frame['OFFER'].tolist()
        # Lowercase every searched column once instead of once per comparison.
        lowered_columns = [
            frame[column].astype(str).str.lower().tolist()
            for column in self._SEARCH_COLUMNS
        ]

        # offer text -> (best score so far, positional index of the best row).
        # Every offer is registered on first sight so that result order
        # follows the first appearance of each offer in merged_df.
        best = {}
        for position, (offer, texts) in enumerate(zip(offers, zip(*lowered_columns))):
            score = max(fuzz.partial_ratio(needle, text) for text in texts)
            if offer not in best:
                best[offer] = (0, None)
            if score > best[offer][0]:
                best[offer] = (score, position)

        # Fallback: only when nothing matched strongly, accept rows whose
        # parent category contains the query verbatim.
        no_strong_match = all(score <= strong_match_threshold for score, _ in best.values())
        if no_strong_match and self._PARENT_COLUMN in frame.columns:
            parents = frame[self._PARENT_COLUMN].astype(str).str.lower().tolist()
            for position, (offer, parent) in enumerate(zip(offers, parents)):
                if needle in parent and parent_match_score > best[offer][0]:
                    best[offer] = (parent_match_score, position)

        hits = [(score, position) for score, position in best.values() if position is not None]
        results_df = frame.iloc[[position for _, position in hits]].copy()
        results_df[self._SCORE_COLUMN] = [score for score, _ in hits]

        # mergesort is stable, so equal scores come back in a deterministic order.
        results_df = results_df.sort_values(by=self._SCORE_COLUMN, ascending=False, kind='mergesort')
        return results_df.head(limit)

    def run_cli(self):
        """Prompt for queries in a loop and print the matches for each one.

        The loop ends when the user types ``exit`` (case-insensitive,
        surrounding whitespace ignored) or when standard input is closed.
        """
        while True:
            try:
                query = input("Enter your search query (type 'exit' to stop): ").strip().lower()
            except EOFError:
                break
            if query == 'exit':
                break

            results_df = self.search(query)
            if results_df.empty:
                print("No results found.")
                continue

            # Print fields from the matched row itself: an offer can occur on
            # several merged rows with different categories, so looking the
            # offer up again in merged_df could show a row that did not match.
            for _, result in results_df.iterrows():
                print(f"Search Score: {result['Search Score']}")
                print(f"Offer: {result['OFFER']}")
                print(f"Retailer: {result['RETAILER']}")
                print(f"Category: {result['PRODUCT_CATEGORY']}")
                print(f"Brand: {result['BRAND']}")
                print("\n---\n\n")

# Entry point: build the search tool (which loads and merges the CSV files)
# and start the interactive prompt loop. run_cli() only returns once the user
# types 'exit'. The guard keeps this from running when the code is imported
# as a module rather than executed directly.
if __name__ == "__main__":
    tool = OfferSearchTool()
    tool.run_cli()


                                            OFFER   RETAILER        BRAND  \
0  Spend $50 on a Full-Priced new Club Membership  SAMS CLUB    SAMS CLUB   
1    Beyond Meat® Plant-Based products, spend $25    Unknown  BEYOND MEAT   
2    Beyond Meat® Plant-Based products, spend $25    Unknown  BEYOND MEAT   
3    Beyond Meat® Plant-Based products, spend $25    Unknown  BEYOND MEAT   
4        Good Humor Viennetta Frozen Vanilla Cake    Unknown   GOOD HUMOR   

  BRAND_BELONGS_TO_CATEGORY RECEIPTS                           CATEGORY_ID  \
0                   Unknown  Unknown                               Unknown   
1          Plant-Based Meat   1584.0  d8cb60e5-b0c6-478a-971d-c6c55b17831f   
2   Frozen Plant-Based Meat    313.0  8e0a9431-5462-4810-9f65-68fe36adf454   
3             Packaged Meat     30.0  e73f7957-0e65-4466-9588-795bdc5f67ac   
4           Frozen Desserts   1052.0  38160828-1029-4505-9849-673773c5fad3   

          PRODUCT_CATEGORY IS_CHILD_CATEGORY_TO  
0                 