In [55]:
import pandas as pd
import json
import requests
import base64
import re
import os
import csv
import time

Define Classes

In [56]:
import base64
import requests

class Product:
    """One catalogue item to be associated with categories, text and images.

    The constructor stores every argument unchanged on the instance under the
    attribute of the same name; no validation or conversion is performed.

    Attributes:
        id: Identifier of the product.
        name: Product name.
        brand: Brand name.
        mrp: Listed price of the product.
        barcode: Barcode value.
        has_image: Flag telling whether the product already has an image.
    """

    def __init__(self, id, name, brand, mrp, barcode, has_image):
        # Identifying fields.
        self.id, self.barcode = id, barcode
        # Descriptive fields that are later interpolated into prompts/queries.
        self.name, self.brand, self.mrp = name, brand, mrp
        self.has_image = has_image

class Association:
    """Result of associating a product with categories, descriptions and images.

    Attributes:
        id: Identifier echoed back in the JSON payload.
        categories: Category ids taken from the payload's ``categories`` key.
        short_desc: Text taken from the payload's ``shortDesc`` key.
        long_desc: Text taken from the payload's ``longDesc`` key.
        images: The entries of the candidate image list that were selected.
        confidence: Value taken from the payload's ``confidence`` key.
    """

    def __init__(self, id, categories, short_desc, long_desc, images, confidence):
        self.id = id
        self.categories = categories
        self.short_desc = short_desc
        self.long_desc = long_desc
        self.images = images
        self.confidence = confidence

    @staticmethod
    def from_json(json_str, imagelist):
        """Build an Association from a JSON string and the candidate image list.

        Args:
            json_str: JSON object with the keys ``id``, ``categories``,
                ``shortDesc``, ``longDesc``, ``images`` and ``confidence``.
                ``images`` holds zero-based indices into ``imagelist``.
            imagelist: Sequence of candidate images the indices refer to.

        Returns:
            A new Association whose ``images`` are the referenced entries of
            ``imagelist``, in the order the indices were given.

        Raises:
            json.JSONDecodeError: If ``json_str`` is not valid JSON.
            KeyError: If a required key other than ``images`` is missing.
        """
        data = json.loads(json_str)

        # The indices come from generated text, so they cannot be trusted:
        # skip anything that is not a real in-range integer instead of raising
        # IndexError (too large) or silently wrapping around (negative).
        selected = []
        for index in data.get('images') or []:
            if isinstance(index, bool) or not isinstance(index, int):
                continue
            if 0 <= index < len(imagelist):
                selected.append(imagelist[index])

        return Association(
            data['id'],
            data['categories'],
            data['shortDesc'],
            data['longDesc'],
            selected,
            data['confidence'],
        )
    
class Image:
    """A candidate image described by a title, an original URL and a thumbnail URL."""

    def __init__(self, title, original, thumbnail):
        self.title = title
        self.original = original
        self.thumbnail = thumbnail

    def base64Thumbnail(self, timeout=30):
        """Download the thumbnail and return its bytes as a base64 string.

        Args:
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block indefinitely.

        Returns:
            The base64 encoding of the response body, decoded as UTF-8 text.

        Raises:
            requests.RequestException: On connection problems, timeouts, or a
                non-success HTTP status (so an error page is never encoded
                and passed on as if it were image data).
        """
        response = requests.get(self.thumbnail, timeout=timeout)
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')

    def toGenerativePart(self, mimeType="image/jpeg"):
        """Return the thumbnail wrapped as an ``inline_data`` request part.

        Args:
            mimeType: MIME type reported for the encoded thumbnail.

        Returns:
            ``{"inline_data": {"data": <base64 text>, "mime_type": mimeType}}``.
        """
        return {
            "inline_data": {
                "data": self.base64Thumbnail(),
                "mime_type": mimeType,
            },
        }

    def get_cleaned_image_link(self):
        """Return the original URL cut off right after its image extension.

        Everything following the first ``.jpeg``, ``.jpg`` or ``.png`` (query
        strings, size suffixes, ...) is dropped. The extension is matched
        case-insensitively so links ending in e.g. ``.JPG`` are kept too.

        Returns:
            The truncated URL, or ``''`` when the URL contains none of the
            recognised extensions.
        """
        cleaned = re.sub(r'(\.jpeg|\.jpg|\.png).*', r'\1', self.original, flags=re.IGNORECASE)
        return cleaned if re.search(r'\.(jpeg|jpg|png)$', cleaned, re.IGNORECASE) else ''

Define AI and Image Services


In [57]:
class AIServices:
    """Wrapper around the Gemini ``generateContent`` REST call used for product association."""

    # System prompt sent with every request. The text is runtime data and is
    # kept verbatim (including its informal wording and spelling); it is only
    # split into adjacent literals for readability.
    _SYSTEM_INSTRUCTION = (
        "You are a product associator for Jesh Supermarket's online webstore.\n"
        "We wanna set up a online web store, but dont wanna go through the hassle of manually setting up all the images and details and categories... So we want u to smartly do it for us.\n"
        "You will be given a product with some information like its mrp (in rupees) and the name along with some images.\n"
        "Your job is to smartly analyse the given data and use circumstantial evidence, intuition and guesswork to respond in json with the following data (u can directly start with the {} and dont need to put the blockquotes or mention lang)-\n"
        "{\n"
        "    \"id\": integer (provided in the input),\n"
        "    \"categories\": int array (starts from index = 0) (okay, so this requires a lot of thought kay... add all the right 'ids' of categories (categories is provided below). Like suppose product is milk, then the response should be `[28, 84]`. try to match as many categories as possible, ACCURATELY!!),\n"
        "\t\"shortDesc\": string (around 50-200 characters, describing the product in a confident, straightforward manner (this is gonna be displayed to customer in search page.)),\n"
        "\t\"longDesc\": string (around 300-700 characters, describing the product confidently, straightforward and generic in a well formatted manner (only pure text, no markdown or html)),\n"
        "\t\"images\": int array (starts from index = 0) (you will be provided some images that were scraped from google... Whether this is the right image or not is unknown, so what u are gonna have to do is study all the images and decide whether they are the right fit for this product, considering the product details such as name, price (in rupees), quantity, flavor, etc. like `[1, 4]`. For example, if the product is a blue geometry box from Natraj, then from the images, compare and select whichever is the closes match. "
        "In case all the images are weird, then at least try to match with whichever ones are the closest (even if not natraj or blue. But this applies only to situation where all other images are completely irrelevant.). In case no good images, then just return empty array like `[]`. For abstract products like \"bangles\", since the image is important, but u dont kknow what it looks like, always leave blank.),\n"
        "\t\"confidence\": float (0 to 1) (how sure are you of the images and ur knowledge of this specific product? for example, for a biscuit like parle-g, u might have high confidence since its common knowledge, but for a product like a snacks box, u might have low confidence since u dont know what the ACTUAL physical product looks like.)\n"
        "}\n"
        "\n"
        "categories:= {\"Breakfast&Dairy\":{ \"MilkProduct\":28, \"Bread\":29, \"Paneer\":30, \"Butter&Cheese\":31, \"James,Honey\":32 },"
        "\"Beverages\":{ \"ColdDrinks\":23, \"Juices&Drinks\":24, \"Tea&Coffee\":25, \"Health&EnergyDrinks\":26, \"Water&Soda\":27, \"MilkDrinks\":52 },"
        "\"HouseholdNeeds\":{ \"Detergents&Dishwash\":40, \"Cleaners\":41, \"Freshener&Repellents\":42, \"Mops,Brushes&Scrubs\":43 },"
        "\"Biscuiits,Snacks&Chocolate\":{ \"Biscuits&Cookies\":33, \"Namkeen&Snacks\":34, \"Chips&Crisps\":35, \"Chocolates&Candies\":36, \"Sweets\":37 },"
        "\"PersonalCare\":{\"Bath&Body\":49,\"HairCare\":50,\"SkinCare\":53,\"OralCare\":54,\"Deos&Perfumes\":55,\"FaceCare\":56,\"FeminineHygiene\":57,\"Cosmetics\":{ \"Sachets\":62 },\"Toothpastes\":81,\"BathSoap\":{ \"Facewash\":100 },\"Oils\":90,\"Sanitaries\":94,\"FacePowder\":102,\"Dishwashingitems\":112,\"Soappowders\":113},"
        "\"Home&Kitchen\":{ \"Cookware\":45, \"Storage&Containers\":46, \"KitchenTools&Accessories\":47, \"Bags&TravelAccessories\":48 },"
        "\"ChocalateItems\":60,\"HouseholdEssentials\":63,"
        "\"Stationaries\":{ \"Tools\":65, \"TailaringItems\":66, \"Fancyitems\":95, \"Coveringitems\":103, \"NoteBooks\":108, \"HomeDecor\":110 },"
        "\"Groceries\":{\"Teapowder\":70,\"MilkProducts\":84,\"Spices&Seasonings\":85,\"FastFood\":88,\"CookingOil\":93,\"Oils\":96,\"Condiments\":97,\"Flours\":98, \"Rice&Salt\":99,\"Products\":101,\"Egg\":111},"
        "\"Consumables\":{ \"Sweets\":69, \"HealthSupplements\":73, \"Drinks\":74, \"Biscuits\":77, \"Bakery\":78 },"
        "\"PoojaItems\":71,\"Xeroxcharges\":72,\"BirthdayItems\":80,\"ElactricalItems\":83,\"KicthenEssentials\":87,\"AnimalsFoods\":89, \"BabyProducts\":91,\"PlasticItems\":92,\"SteelItems\":105,\"Glassproducts\":106,\"Footwear\":107,\"Offeritems\":109}"
    )

    @staticmethod
    def generate_association(product, imagelist):
        """Ask the model to associate ``product`` with categories, text and images.

        Builds a prompt from the product's id, name, brand and mrp, attaches
        every image of ``imagelist`` as an inline part, posts the request and
        parses the first JSON object found in the model's text reply.

        Args:
            product: Object exposing ``id``, ``name``, ``brand`` and ``mrp``.
            imagelist: Candidate images; each must provide ``toGenerativePart()``.

        Returns:
            The ``Association`` built from the model's JSON reply.

        Raises:
            RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is unset.
            requests.RequestException: On network failure, timeout or HTTP error.
            ValueError: If the reply text contains no JSON object.
        """
        api_key = os.environ.get('GOOGLE_API_KEY')
        if not api_key:
            # Fail early instead of sending the literal text "None" as the key.
            raise RuntimeError("GOOGLE_API_KEY environment variable is not set")

        # The key travels in a header rather than the query string so that it
        # does not end up in exception messages (which embed the URL) or logs.
        url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?alt=sse"
        headers = {'Content-Type': 'application/json', 'x-goog-api-key': api_key}

        prompt = f"Id: {product.id}\nProduct: {product.name}\nBrand: {product.brand}\nMRP: {product.mrp}\nImages:\n"
        parts = [{"text": prompt}] + [image.toGenerativePart() for image in imagelist]

        request_body = {
            "system_instruction": {"parts": {"text": AIServices._SYSTEM_INSTRUCTION}},
            "contents": [{"parts": parts}],
            "safety_settings": [{"category": "HARM_CATEGORY_HARASSMENT","threshold": "BLOCK_NONE"},{"category": "HARM_CATEGORY_HATE_SPEECH","threshold": "BLOCK_NONE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","threshold": "BLOCK_NONE"}]
        }

        response = requests.post(url, json=request_body, headers=headers, timeout=120)
        response.raise_for_status()

        # The body is read as a single payload carrying a "data: " prefix.
        data = json.loads(response.text.removeprefix("data: "))
        result = data['candidates'][0]['content']['parts'][0]['text']

        # Grab everything from the first "{" to the last "}" so that any text
        # the model wrapped around the object is ignored.
        match = re.search(r'\{.*\}', result, re.DOTALL)
        if match is None:
            raise ValueError(f"Model response contains no JSON object: {result!r}")
        return Association.from_json(match.group(0), imagelist)

class ImageServices:
    """Image lookup helpers: a live Custom Search query and a canned offline sample."""

    @staticmethod
    def get_images(query):
        """Search for images matching ``query`` and return them as ``Image`` objects.

        The API key is read from the file ``google_api.key`` in the working
        directory.

        Args:
            query: Free-text search query.

        Returns:
            A list of ``Image`` built from each result's ``title``, ``link``
            and ``image.thumbnailLink``; empty when the response has no
            ``items``.

        Raises:
            OSError: If the key file cannot be read.
            requests.RequestException: On network failure, timeout or HTTP error.
        """
        # Close the key file deterministically and drop the trailing newline
        # that text editors usually append (it would corrupt the request).
        with open('google_api.key', 'r') as key_file:
            api_key = key_file.read().strip()

        # Passing the values via ``params`` lets requests percent-encode them,
        # so queries containing "&", "#" or "+" reach the API intact.
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "cx": "364fea58938af485a",
                "searchType": "image",
                "key": api_key,
                "q": query,
            },
            timeout=30,
        )
        response.raise_for_status()
        results = response.json().get('items', [])

        return [Image(result["title"], result["link"], result["image"]["thumbnailLink"]) for result in results]

    @staticmethod
    def get_images_test(query=""):
        """Return a fixed list of ten sample images without any network access.

        Args:
            query: Ignored; accepted so the signature mirrors ``get_images``.
        """
        return [
            Image("150g protein in a day","https://i.pinimg.com/originals/06/92/2c/06922c965121901b9ad3ea565a1c9e0d.jpg","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRasub-YnSZhWXmaS5sr9rwThTojbKDyTsGWSC6VEIRCEeKjDZ4rqT5uw&s"),
            Image("Cadbury Dairy Milk Milk Chocolate 150g","https://www.cadbury.com.au/media/catalog/product/m/o/mond-9300617063872-1.jpg?quality=80&bg-color=255,255,255&fit=bounds&height=519&width=712&canvas=712:519","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSYrU2XDUEAS4huRJRj0dnKbJxJqZTDg0NCb2cLIR9W68J1_3pdgOQ6Ig&s"),
            Image("Argentina Corned Beef 150g — .","https://primomart.ph/cdn/shop/products/3f904b3ca7f0e7ccbca241d3297e9330_700x700.jpg?v=1597314096","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQjP9DVMdpIIWZ1-cwU6-dKT3imIZWuoaowsZSOty_cyaj1rrMDF-ELgzLy&s"),
            Image("150g chicken breast (raw weight), 2 boiled eggs and a medium ...","https://i.redd.it/xpzekgyixoma1.jpg","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRe8dPaZ539HOobsYNAFOiscy8grAgxCxv81fBAIK0Rq2o9YqkIxf6hAS0&s"),
            Image("Arla Gouda Slices 150g | Arla","https://www.arla.com.mt/4998e0/globalassets/arla-global/products---overview/all-products-a-z/cheese/mt/arla-gouda-slices-150g.png?preset=product-mobile","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT5KNcQXpzH_H7m5n46gVBiqi66-DdqF37hieMjz7OTe9NlK8VsIvdhg2k&s"),
            Image("Lindt Les Grandes Milk Chocolate Hazelnut Bar 150g","https://www.chocolate.lindt.com/media/catalog/product/6/3/63701f83bdab661e4f87f538d00708546b411ac594d6e3150dff7d76208f8677.jpeg?quality=80&fit=bounds&height=700&width=700&canvas=700:700","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQtp7wfhComTTWYElEP2YJfDAIZTuDl3PbV1fYOpdUZIddLDXkJHZ_PjWZ9&s"),
            Image("Desert Salt 150g Box | Savoursmiths","https://savoursmiths.com/wp-content/uploads/2020/03/Savoursmiths_Products_Dessert_Salt.jpg","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSN0iTXHJ4YH3ISFzZLqRuCNXE-of4OKoVXbYz8ZSiLN_mhNpL20V7cnxI&s"),
            Image("Cessna 150G — Above All Aviation","https://images.squarespace-cdn.com/content/v1/544f2a03e4b03cb72fecc09b/1503010958440-C3SR1DLC458OT984BE1W/image-asset.jpeg","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS8TNFhOOjqMrAmvi-1YUUjVg7tkHl7HWGVdDhECc4uq5EcHiIELMsP1FM&s"),
            Image("Marie Biscuits - 150g – Surati Snacks - Buy Indian Snacks & Sweets","https://www.suratiworld.com/cdn/shop/products/Marie144gm_final_800x.jpg?v=1632939103","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQvzuqquPr2Q5V6kWwJ2CC2um9e6iQeJdmRsCRhq83sxKvbS06mqwdnww&s"),
            Image("Argentina Corned Beef 150g - SDC Global Choice","https://sdcglobalchoice.com/wp-content/uploads/2021/07/150g-argentina-corned-beef.jpg","https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT49fV_H1OAq5Ks1THAO2noPXbwPNcVALN_JbtcRzKveOvdiD7K3c6zjgeJ&s"),
        ]

Read Excel File and Extract Products


In [58]:
# Load the inventory sheet; every row describes one product.
df = pd.read_excel('Combination_Upd_Sheet240907074747.xlsx')

# Turn each spreadsheet row into a Product. None of the rows is marked as
# already having an image, so has_image is always False here.
products = [
    Product(
        row['Comid'],
        row['Item'],
        row['Brand'],
        row['Mrp'],
        row['Barcode'],
        False,
    )
    for _index, row in df.iterrows()
]

Generate Associations

In [None]:
fieldnames = ['id', 'brand', 'name', 'mrp', 'association_json', 'assoc_id', 'categories', 'shortDesc', 'longDesc', 'images', 'confidence']


def _association_record(association):
    """Flatten an association into built-in types that json/csv can serialise."""
    return {
        "id": association.id,
        "categories": association.categories,
        "shortDesc": association.short_desc,
        "longDesc": association.long_desc,
        "images": [img.get_cleaned_image_link() for img in association.images],
        "confidence": association.confidence,
    }


def _save_progress(collected):
    """Dump every association gathered so far to the on-error JSON backup."""
    with open('associations_backup_on_error.json', 'w', encoding='utf-8') as f:
        json.dump([_association_record(assoc) for assoc in collected], f)
    print("Progress saved.")


# Start a fresh backup file that contains only the header row; one row per
# successfully processed product is appended below.
with open('associations_backup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

associations = []
try:
    for product in products:
        try:
            imagelist = ImageServices.get_images(f"{product.brand} {product.name} Rs {product.mrp} JPG OR PNG")
            display(imagelist)

            association = AIServices.generate_association(product, imagelist)
            display(association)

            # Keep the result in memory first so it survives a failed CSV write.
            associations.append(association)

            # The association object itself is not JSON-serialisable, so the
            # flattened record is what gets dumped and written.
            record = _association_record(association)

            # Save the association to CSV (re-opened per row so every finished
            # product is on disk even if a later one fails).
            with open('associations_backup.csv', 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writerow({
                    'id': product.id,
                    'brand': product.brand,
                    'name': product.name,
                    'mrp': product.mrp,
                    'association_json': json.dumps(record),

                    "assoc_id": record["id"],
                    "categories": record["categories"],
                    "shortDesc": record["shortDesc"],
                    "longDesc": record["longDesc"],
                    "images": record["images"],
                    "confidence": record["confidence"]
                })

        except Exception as e:
            # Best effort: report the failure and move on to the next product.
            print(f"An error occurred while processing product {product.name}: {e}")

        # Rate limit of 5 seconds; applied after failures too, because the
        # remote services were still called.
        time.sleep(5)

except KeyboardInterrupt:
    # A manual interrupt is not an ``Exception`` subclass, so it is handled
    # explicitly: persist what has been collected, then let it propagate.
    print("Interrupted by user.")
    _save_progress(associations)
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Save the progress to a file or database
    _save_progress(associations)

Save Associations to CSV

In [None]:
# Flatten every collected association into one plain dict per output row.
data = []
for assoc in associations:
    # Only the cleaned original links are exported, not the Image objects.
    cleaned_links = [img.get_cleaned_image_link() for img in assoc.images]
    data.append(dict(
        id=assoc.id,
        categories=assoc.categories,
        shortDesc=assoc.short_desc,
        longDesc=assoc.long_desc,
        images=cleaned_links,
        confidence=assoc.confidence,
    ))

# Write the rows to disk without the DataFrame's positional index column.
df = pd.DataFrame(data)
df.to_csv('associations.csv', index=False)