In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import re
import json
import os
from urllib.parse import urljoin
from typing import List, Dict, Any, Optional

class IncideCoderScraper:
    """
    Scrape product listings and product detail pages from IncideCoder.

    Scraped products are accumulated as plain dictionaries in ``self.products``
    and can be exported with ``save_to_csv`` / ``save_to_json`` or restored
    with ``load_from_json``.
    """

    # Column order of the CSV export. 'ingredients' holds the flattened,
    # comma-separated ingredient names produced by save_to_csv.
    _CSV_FIELDS = ['id', 'name', 'brand', 'type', 'image', 'safety',
                   'oily', 'dry', 'sensitive', 'comedogenic',
                   'acne_fighting', 'anti_aging', 'brightening', 'uv',
                   'url', 'raw_ingredients_text', 'ingredients']

    # (attribute name, substrings that switch the attribute on) pairs used by
    # extract_product_attributes; matching is done on lower-cased page text.
    # NOTE(review): 'comedogenic' is also set by "non-comedogenic" claims, so
    # the flag means "comedogenicity is mentioned", not "is comedogenic".
    # Confirm the intended meaning before relying on this column.
    _KEYWORD_FLAGS = (
        ('oily', ('oily skin', 'for oily', 'oily complexion')),
        ('dry', ('dry skin', 'for dry', 'dry complexion')),
        ('sensitive', ('sensitive skin', 'for sensitive')),
        ('comedogenic', ('comedogenic', 'non-comedogenic', 'noncomedogenic')),
        ('acne_fighting', ('acne', 'pimple', 'breakout', 'blemish')),
        ('anti_aging', ('anti-aging', 'anti aging', 'wrinkle', 'fine lines', 'aging')),
        ('brightening', ('brighten', 'brightening', 'illuminating', 'radiance', 'glow')),
        ('uv', ('spf', 'sunscreen', 'uv protection', 'sun protection')),
    )

    def __init__(self, base_url: str = "https://incidecoder.com", delay_range: tuple = (1, 3),
                 timeout: float = 10):
        """
        Initialize the IncideCoder scraper.

        Args:
            base_url: The base URL of the IncideCoder website
            delay_range: Tuple of (min, max) seconds to wait between requests
            timeout: Seconds to wait for the server before a request is abandoned
        """
        self.base_url = base_url
        self.delay_range = delay_range
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.products = []

    @staticmethod
    def _product_id(product_url: str) -> str:
        """Return the last path segment of a product URL, ignoring a trailing slash."""
        return product_url.rstrip('/').split('/')[-1]

    def get_soup(self, url: str) -> Optional["BeautifulSoup"]:
        """
        Make a request to the URL and return a BeautifulSoup object.

        Args:
            url: The URL to request

        Returns:
            BeautifulSoup object or None if request failed
        """
        try:
            # Without an explicit timeout requests waits indefinitely on a
            # stalled connection; timeouts surface as RequestException.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error retrieving {url}: {e}")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def scrape_product_links(self, category_url: str, max_pages: int = 5) -> List[str]:
        """
        Scrape product links from category pages.

        Stops early when a page cannot be fetched, contains no product
        elements, or has no enabled "next" pagination control.

        Args:
            category_url: URL of the category to scrape
            max_pages: Maximum number of pages to scrape

        Returns:
            List of product URLs
        """
        product_links = []
        page = 1

        while page <= max_pages:
            # The first page is the bare category URL; later pages append the page number
            url = f"{category_url}/{page}" if page > 1 else category_url
            print(f"Scraping page {page}: {url}")

            soup = self.get_soup(url)
            if not soup:
                break

            # Find product links on the page
            product_elements = soup.select('.detailPageLinkOnly')
            if not product_elements:
                print(f"No product elements found on page {page}")
                break

            for element in product_elements:
                link = element.get('href')
                if link:
                    # Links may be relative; resolve them against the site root
                    product_links.append(urljoin(self.base_url, link))

            print(f"Found {len(product_elements)} products on page {page}")

            # Check if there's a next page
            next_page = soup.select_one('.pagination .next')
            if not next_page or 'disabled' in next_page.get('class', []):
                print("No more pages available")
                break

            page += 1
            # Add delay to avoid overloading the server
            time.sleep(random.uniform(*self.delay_range))

        return product_links

    def extract_ingredients(self, soup: "BeautifulSoup") -> List[Dict[str, Any]]:
        """
        Extract ingredients from the product page.

        Args:
            soup: BeautifulSoup object of the product page

        Returns:
            List of ingredient dictionaries with 'name' and 'function' keys;
            empty if the page has no ingredient list section
        """
        ingredients_section = soup.select_one('.detailpage-content-ingredientlist')
        if not ingredients_section:
            return []

        ingredients = []
        for item in ingredients_section.select('.ingred-cell'):
            name_elem = item.select_one('.ingred-link')
            if not name_elem:
                continue  # Skip cells without an ingredient name

            # The function is optional; an empty string marks "not given"
            func_elem = item.select_one('.itemprop-func')
            ingredients.append({
                'name': name_elem.text.strip(),
                'function': func_elem.text.strip() if func_elem else '',
            })

        return ingredients

    def extract_product_attributes(self, soup: "BeautifulSoup") -> Dict[str, Any]:
        """
        Extract product attributes like safety, suitability for skin types, and functions.

        The boolean attributes are keyword heuristics: each one is True when
        any of its terms occurs in the lower-cased details/description text.

        Args:
            soup: BeautifulSoup object of the product page

        Returns:
            Dictionary with 'safety' ('N/A' or the numerator of an "x/y"
            rating, as a string) followed by one boolean per keyword flag
        """
        attributes = {'safety': 'N/A'}

        # Safety rating
        safety_element = soup.select_one('.product-safety-rating')
        if safety_element:
            safety_match = re.search(r'(\d+)/\d+', safety_element.text.strip())
            if safety_match:
                attributes['safety'] = safety_match.group(1)

        # Product details and description, merged into one searchable string
        product_details = soup.select('.product-details-text')
        details_text = ' '.join([detail.text.lower().strip() for detail in product_details])

        product_description = soup.select_one('.product-description')
        if product_description:
            details_text += ' ' + product_description.text.lower().strip()

        # Skin types and functions
        for key, terms in self._KEYWORD_FLAGS:
            attributes[key] = any(term in details_text for term in terms)

        return attributes

    def scrape_product_details(self, product_url: str) -> Optional[Dict[str, Any]]:
        """
        Scrape details for a single product.

        Sleeps for a random delay within ``delay_range`` after a successful fetch.

        Args:
            product_url: URL of the product to scrape

        Returns:
            Dictionary of product details or None if scraping failed
        """
        print(f"Scraping product: {product_url}")
        soup = self.get_soup(product_url)
        if not soup:
            return None

        # Extract basic product information
        product_info = {
            'id': self._product_id(product_url),
            'url': product_url,
            'name': '',
            'brand': '',
            'type': '',
            'image': '',
            'ingredients': [],
            'raw_ingredients_text': ''
        }

        # Plain-text fields: (result key, CSS selector); missing elements keep ''
        text_fields = (
            ('name', 'h1.ingredientHeading'),
            ('brand', '.detailpage-brand a'),
            ('type', '.product-details-category'),
            ('raw_ingredients_text', '.show-more-text'),
        )
        for key, selector in text_fields:
            element = soup.select_one(selector)
            if element:
                product_info[key] = element.text.strip()

        # Product image
        image_element = soup.select_one('.detailpage-img img')
        if image_element and image_element.has_attr('src'):
            product_info['image'] = urljoin(self.base_url, image_element['src'])

        # Ingredients
        product_info['ingredients'] = self.extract_ingredients(soup)

        # Other attributes
        product_info.update(self.extract_product_attributes(soup))

        # Add delay to avoid overloading the server
        time.sleep(random.uniform(*self.delay_range))

        return product_info

    def scrape_category(self, category_url: str, max_pages: int = 5, max_products: int = 100) -> None:
        """
        Scrape products from a category.

        Successfully scraped products are appended to ``self.products``; a
        progress JSON file is written after every 10th product.

        Args:
            category_url: URL of the category to scrape
            max_pages: Maximum number of pages to scrape
            max_products: Maximum number of products to scrape
        """
        product_links = self.scrape_product_links(category_url, max_pages)
        print(f"Found {len(product_links)} product links")

        count = 0
        for link in product_links:
            if count >= max_products:
                break

            product_info = self.scrape_product_details(link)
            if not product_info:
                continue  # Failed fetches do not count towards max_products

            self.products.append(product_info)
            count += 1

            # Save progress every 10 products
            if count % 10 == 0:
                print(f"Scraped {count} products so far")
                self.save_to_json(f"incidecoder_products_progress_{len(self.products)}.json")

    def save_to_csv(self, filename: str = 'incidecoder_products.csv') -> None:
        """
        Save scraped products to CSV file.

        The nested ingredient list is flattened into a single 'ingredients'
        column of comma-separated names. ``self.products`` is not modified.

        Args:
            filename: Name of the CSV file to save
        """
        if not self.products:
            print("No products to save")
            return

        rows = []
        for product in self.products:
            ingredient_names = [
                ing['name'] for ing in (product.get('ingredients') or []) if 'name' in ing
            ]
            # Build a new dict so the structured list in self.products survives
            rows.append({**product, 'ingredients': ', '.join(ingredient_names)})

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            # 'ingredients' must be in the field list: DictWriter rejects rows
            # carrying keys it does not know about.
            writer = csv.DictWriter(f, fieldnames=self._CSV_FIELDS)
            writer.writeheader()
            writer.writerows(rows)

        print(f"Saved {len(self.products)} products to {filename}")

    def save_to_json(self, filename: str = 'incidecoder_products.json') -> None:
        """
        Save scraped products to JSON file.

        Args:
            filename: Name of the JSON file to save
        """
        if not self.products:
            print("No products to save")
            return

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(self.products)} products to {filename}")

    def load_from_json(self, filename: str) -> None:
        """
        Load products from a JSON file, replacing the current ``self.products``.

        Args:
            filename: Name of the JSON file to load
        """
        if not os.path.exists(filename):
            print(f"File {filename} does not exist")
            return

        with open(filename, 'r', encoding='utf-8') as f:
            self.products = json.load(f)

        print(f"Loaded {len(self.products)} products from {filename}")


def main():
    """Scrape a fixed set of product-type categories and export the results.

    A JSON checkpoint is written after each category, then the full result
    set is saved as both CSV and JSON.
    """
    category_urls = (
        'https://incidecoder.com/products/product-type/moisturizer',
        'https://incidecoder.com/products/product-type/serum',
        'https://incidecoder.com/products/product-type/cleanser',
        'https://incidecoder.com/products/product-type/mask',
    )
    scraper = IncideCoderScraper()

    for category_url in category_urls:
        # The last URL segment doubles as log label and checkpoint file name
        slug = category_url.rsplit('/', 1)[-1]
        print(f"\nScraping category: {slug}")

        scraper.scrape_category(category_url, max_pages=2, max_products=25)

        # scraper.products is cumulative, so each checkpoint holds everything
        # scraped so far, not only this category
        scraper.save_to_json(f"incidecoder_{slug}.json")

    # Final export in both formats
    exports = (
        (scraper.save_to_csv, "incidecoder_products.csv"),
        (scraper.save_to_json, "incidecoder_products.json"),
    )
    for save, target in exports:
        save(target)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()