In [1]:
import json
from collections import defaultdict
from google import genai
from dotenv import load_dotenv
import re

load_dotenv()

True

In [2]:
# Relative path to the advertiser export; the same path is later handed to
# AdvertiserClassifier.
advertiser_file_path = './data/advertisers_using_your_activity_or_information.json'

# Read the whole file as UTF-8 text and parse it into `data`.
with open(advertiser_file_path, mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

In [3]:
class AdvertiserClassifier:
    """Classify advertiser names into a fixed set of categories via Gemini.

    The advertiser names are read from a JSON export, normalised to
    lower-case alphanumeric text, sent to the model in batches, and the
    model's ``Advertiser: Category`` lines are parsed into a dictionary.
    """

    def __init__(self, advertiser_file_path: str):
        """Load the export, clean the advertiser names and create the client.

        Args:
            advertiser_file_path: Path to a UTF-8 JSON file. The code reads
                ``label_values[0]['vec']`` from it and expects each entry of
                that list to carry a ``'value'`` key holding the name.
        """
        with open(advertiser_file_path, 'r', encoding='utf-8') as file:
            self.data = json.load(file)

        self.all_advertisers = self._get_advertisers_cleaned()
        # NOTE(review): no credentials are passed here; presumably the client
        # reads them from the environment populated by load_dotenv() — confirm.
        self.gemini_client = genai.Client()
        self.categories = ["Retail", "Fashion", "Technology", "Financial Services", "Entertainment", "Health", "Food", "Travel", "Education", "Miscellaneous"]

    def _get_advertisers_cleaned(self) -> list[str]:
        """Return the advertiser names from ``self.data`` in normalised form.

        Every character other than ASCII letters, digits and spaces is
        removed, whitespace is trimmed and collapsed, and the result is
        lower-cased. Names that end up empty are dropped. Order and
        duplicates are preserved.
        """
        raw_names = [ad_map['value'] for ad_map in self.data['label_values'][0]['vec']]

        cleaned_advertisers = []
        for raw_name in raw_names:
            without_punctuation = re.sub(r'[^A-Za-z0-9 ]+', '', raw_name)
            # Trim and collapse *after* removing punctuation: deleting a
            # character such as '-' or '&' can expose leading, trailing or
            # doubled spaces (e.g. "- Foo" -> " Foo", "A & B" -> "A  B").
            advertiser = ' '.join(without_punctuation.split()).lower()
            if advertiser:
                cleaned_advertisers.append(advertiser)
        return cleaned_advertisers

    def _get_advertiser_batches(self, advertisers: list[str], batch_size: int = 30) -> list[list[str]]:
        """Split ``advertisers`` into consecutive chunks of ``batch_size``.

        The final chunk may be shorter. An empty input yields an empty list.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would otherwise silently produce no batches.
            raise ValueError("batch_size must be at least 1")
        return [
            advertisers[start:start + batch_size]
            for start in range(0, len(advertisers), batch_size)
        ]

    def _get_gemini_prompt(self, batch_advertisers: list[str], categories: list[str]) -> str:
        """Build the classification prompt for one batch.

        The prompt lists the allowed categories, then one ``- name`` bullet
        per advertiser, then the output-format instructions.
        """
        prompt = (
            "For each of the following advertisers, classify them into one of these categories: "
            + ", ".join(categories)
            + ".\n\n"
        )
        prompt += "".join(f"- {adv}\n" for adv in batch_advertisers)
        prompt += "\nProvide your answer in the format 'Advertiser: Category' with a newline after each one.\nDo not include any other text in your response.\nDo not use miscellaneous unless absolutely necessary.\n"
        # NOTE(review): the generate_content call in _classify_batch passes
        # only `model` and `contents`; whether the model can actually search
        # online depends on configuration not visible here — confirm.
        prompt += "Feel free to use online search to help with classification.\n"
        return prompt

    def _classify_batch(self, batch_advertisers: list[str], model_id: str) -> dict[str, str]:
        """Classify one batch and return ``{advertiser: category}``.

        Lines of the response that do not contain ``': '`` are ignored.
        Returned names are matched case-insensitively back to the names in
        ``batch_advertisers`` and categories to ``self.categories``, so the
        keys and values use the caller's spelling whenever a match exists;
        unmatched names or categories are kept as returned by the model.

        Raises:
            ValueError: If the response carries no text.
        """
        prompt = self._get_gemini_prompt(batch_advertisers, self.categories)
        response = self.gemini_client.models.generate_content(
            model=model_id, contents=prompt
        )
        if not response.text:
            raise ValueError("No response from Gemini API")

        names_by_lower = {name.lower(): name for name in batch_advertisers}
        categories_by_lower = {cat.lower(): cat for cat in self.categories}

        classifications = {}
        for line in response.text.strip().split('\n'):
            if ': ' not in line:
                continue
            adv, category = line.split(': ', 1)
            # The prompt lists names as "- name"; drop a bullet marker if the
            # model echoes it so the key matches the submitted name.
            adv = adv.strip().lstrip('-*\u2022').strip()
            category = category.strip()
            if not adv or not category:
                continue
            adv = names_by_lower.get(adv.lower(), adv)
            category = categories_by_lower.get(category.lower(), category)
            classifications[adv] = category
        return classifications

    def classify_all_advertisers_with_rate_limiting(self, qpm: int = 30, batch_size: int = 30, model_id: str = "gemini-2.0-flash-lite") -> dict[str, str]:
        """Classify every cleaned advertiser, pausing between requests.

        Args:
            qpm: Target number of requests per minute; the pause between two
                requests is ``60 / qpm`` seconds. The pause is added on top
                of the request duration, so the effective rate is lower.
            batch_size: Number of advertiser names sent per request.
            model_id: Gemini model identifier passed to the API.

        Returns:
            Mapping of advertiser name to category for every batch that
            succeeded. A batch that raises is reported with ``print`` and
            skipped, so its advertisers are absent from the result.

        Raises:
            ValueError: If ``qpm`` is not positive or ``batch_size`` is
                smaller than 1.
        """
        if qpm <= 0:
            raise ValueError("qpm must be a positive number")

        import time
        from tqdm import tqdm

        # Results are keyed by name, so sending a duplicate name again only
        # costs quota; drop repeats while keeping first-seen order.
        unique_advertisers = list(dict.fromkeys(self.all_advertisers))
        advertiser_batches = self._get_advertiser_batches(unique_advertisers, batch_size=batch_size)
        all_classifications = {}
        interval = 60 / qpm
        last_index = len(advertiser_batches) - 1

        for index, batch in enumerate(tqdm(advertiser_batches, desc="Classifying advertisers")):
            try:
                batch_classifications = self._classify_batch(batch, model_id=model_id)
                all_classifications.update(batch_classifications)
            except Exception as e:
                # Best effort: one failing batch must not abort the whole run.
                print(f"Error classifying batch: {e}")
            if index < last_index:
                time.sleep(interval)  # Rate limiting; nothing to wait for after the last batch

        return all_classifications

In [5]:
# Build the classifier from the export file; the constructor loads the JSON,
# cleans the advertiser names and creates the Gemini client.
advertiser_classifier = AdvertiserClassifier(advertiser_file_path)

# Classify every advertiser: 50 names per request, paced for 10 requests per minute.
classifications = advertiser_classifier.classify_all_advertisers_with_rate_limiting(
    qpm=10,
    batch_size=50,
    model_id="gemini-2.5-flash",
)

Classifying advertisers: 100%|██████████| 32/32 [15:34<00:00, 29.20s/it]


In [6]:
# Display the advertiser -> category mapping returned by the classification run.
classifications

{'experian marketing services audiences': 'Technology',
 'zorbas coney island': 'Food',
 'oracle data cloud': 'Technology',
 'cross screen media': 'Technology',
 'doordash': 'Food',
 'mediacom usa': 'Technology',
 'omd usa': 'Technology',
 'walmartcom': 'Retail',
 'hearts science': 'Technology',
 'alaska airlines': 'Travel',
 'uniagency': 'Technology',
 'bluefocus': 'Technology',
 'jackpot party casino community': 'Entertainment',
 'webpals mobile': 'Technology',
 'yellowhead': 'Technology',
 'xtreme xperience': 'Entertainment',
 'meetsocial hk digital marketing coltd1': 'Technology',
 'stripe': 'Financial Services',
 'shopify': 'Technology',
 'shein metaall': 'Fashion',
 'jw pei int': 'Fashion',
 'wp brandstudio': 'Technology',
 'washington post': 'Entertainment',
 'usa today': 'Entertainment',
 'staypineapple hotels': 'Travel',
 'vrbo': 'Travel',
 'expedia': 'Travel',
 'chicago loop alliance': 'Entertainment',
 'clemenger bbdo': 'Technology',
 'linkedin emea latam': 'Technology',
 'l

In [7]:
# Reshape the {name: category} mapping into a list of records under a single
# 'advertisers' key, keeping the mapping's iteration order.
advertisers_labeled = {'advertisers': []}
for adv, category in classifications.items():
    advertisers_labeled['advertisers'].append(dict(name=adv, category=category))

# Serialise with 4-space indentation and write the result as UTF-8 text.
with open('./data/targeted_advertisers_labeled.json', mode='w', encoding='utf-8') as f:
    f.write(json.dumps(advertisers_labeled, indent=4))
