In [6]:
"""
Diviser en chunks et traiter par batches
→ Mieux contrôler la charge
→ Pause entre batches (éviter ban)
"""

import csv
import requests
from bs4 import BeautifulSoup
import time
import random
from concurrent.futures import ThreadPoolExecutor

# --- CONFIGURATION ---
# CSV opened by main(); the first column of each row after the header is used as the ASIN.
input_file = '../Amazon_Best_Seller_2021_June 2.csv'
# CSV written by main() with the fetched titles.
output_file = 'resultats_titres2.csv'
CHUNK_SIZE = 50       # ASINs per chunk
THREADS_PER_CHUNK = 10  # Parallel threads per chunk
PAUSE_BETWEEN_CHUNKS = 30  # Pause between chunks (seconds)


def get_amazon_title(asin):
    """Fetch the product title from the Amazon page of a single ASIN.

    Args:
        asin: Identifier inserted into ``https://www.amazon.com/dp/{asin}``.

    Returns:
        tuple: ``(asin, title)``. ``title`` is the stripped text of the
        ``<span id="productTitle">`` element when the response is HTTP 200
        and the element exists, ``"Titre non trouv√©"`` when it is not, and
        ``"Erreur"`` when the request or the parsing raised an exception.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("span", id="productTitle")

            if title_tag:
                return asin, title_tag.get_text().strip()

        # Non-200 responses and pages without the title element end up here.
        return asin, "Titre non trouv√©"

    except Exception:
        # Best-effort by design: one failing ASIN must not abort the run.
        # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate instead of being reported as "Erreur".
        return asin, "Erreur"

    finally:
        # Executed on every exit path: random 0.5-1.5 s pause after each
        # request, whatever its outcome.
        time.sleep(random.uniform(0.5, 1.5))


def process_chunk(asins_chunk, chunk_num, total_chunks):
    """Fetch the titles of one chunk of ASINs with a thread pool.

    Args:
        asins_chunk: List of ASINs to fetch.
        chunk_num: 1-based position of this chunk (used in the log output).
        total_chunks: Total number of chunks (used in the log output).

    Returns:
        list: One ``(asin, title)`` tuple per submitted ASIN, in submission
        order. An ASIN whose task raised is recorded as ``(asin, "Erreur")``
        so that it is not silently missing from the output.
    """
    print(f"\nüì¶ CHUNK {chunk_num}/{total_chunks} ({len(asins_chunk)} ASINs)")
    print("-" * 60)

    results = []

    with ThreadPoolExecutor(max_workers=THREADS_PER_CHUNK) as executor:
        # Map each future back to the ASIN it was submitted for, so a failure
        # can still be attributed to a specific ASIN.
        futures = {executor.submit(get_amazon_title, asin): asin for asin in asins_chunk}

        # Dicts preserve insertion order, so results come back in submission
        # order (each result() call blocks until that task is done).
        for i, (future, submitted_asin) in enumerate(futures.items(), 1):
            try:
                asin, titre = future.result()
            except Exception as e:
                print(f"  ‚ùå Erreur {submitted_asin}: {e}")
                results.append((submitted_asin, "Erreur"))
                continue

            results.append((asin, titre))
            print(f"  [{i}/{len(asins_chunk)}] ‚úÖ {asin}:  {titre[:40]}...")

    return results


def main():
    """Read the ASINs, fetch their titles chunk by chunk, write them to a CSV.

    Steps:
        1. Read ``input_file`` and keep the first column of every non-empty
           row after the header line.
        2. Split the ASINs into chunks of ``CHUNK_SIZE`` and hand each chunk
           to ``process_chunk``, sleeping ``PAUSE_BETWEEN_CHUNKS`` seconds
           between chunks (not after the last one).
        3. Append the rows of each chunk to ``output_file`` as soon as the
           chunk is done, so an interruption keeps the chunks already fetched.

    Note:
        ``output_file`` is truncated when processing starts; an interrupted
        run therefore leaves a partial file rather than the previous one.
    """
    print("="*60)
    print("üöÄ EXTRACTION TITRES (MODE CHUNKING)")
    print("="*60)

    # Load ASINs: first column of each non-empty row, header skipped.
    with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        next(reader, None)
        asins = [row[0] for row in reader if row and row[0]]

    total = len(asins)
    total_chunks = (total + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division

    print(f"\nüìä Configuration:")
    print(f"   Total ASINs: {total}")
    print(f"   Chunk size: {CHUNK_SIZE}")
    print(f"   Total chunks: {total_chunks}")
    print(f"   Threads/chunk: {THREADS_PER_CHUNK}")
    print(f"   Pause entre chunks: {PAUSE_BETWEEN_CHUNKS}s")

    # Rough estimate: 1.5 s per ASIN spread over the threads, plus the pause.
    time_per_chunk = (CHUNK_SIZE * 1.5 / THREADS_PER_CHUNK) + PAUSE_BETWEEN_CHUNKS
    estimated_minutes = (time_per_chunk * total_chunks) / 60
    print(f"\n‚è±Ô∏è Estimation: {estimated_minutes:.1f} minutes\n")

    start_time = time.time()
    processed = 0

    # The output file stays open for the whole run and is flushed after each
    # chunk, so completed chunks survive a KeyboardInterrupt or a crash.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['ASIN', 'Titre R√©cup√©r√©'])

        for chunk_num, start in enumerate(range(0, total, CHUNK_SIZE), 1):
            asins_chunk = asins[start:start + CHUNK_SIZE]

            results = process_chunk(asins_chunk, chunk_num, total_chunks)
            writer.writerows(results)
            f.flush()
            processed += len(results)

            # Pause between chunks (skipped after the last one).
            if chunk_num < total_chunks:
                print(f"\n‚è∏Ô∏è Pause {PAUSE_BETWEEN_CHUNKS}s avant prochain chunk...")
                time.sleep(PAUSE_BETWEEN_CHUNKS)

    elapsed = time.time() - start_time

    print("\n" + "="*60)
    print(f"‚úÖ TERMIN√â en {elapsed/60:.1f} minutes")
    print(f"üìÅ R√©sultats: {output_file}")
    print(f"üìä ASINs trait√©s: {processed}/{total}")
    print("="*60)
# Run the extraction immediately when this notebook cell is executed.
main()



üöÄ EXTRACTION TITRES (MODE CHUNKING)

üìä Configuration:
   Total ASINs: 707
   Chunk size: 50
   Total chunks: 15
   Threads/chunk: 10
   Pause entre chunks: 30s

‚è±Ô∏è Estimation: 9.4 minutes


üì¶ CHUNK 1/15 (50 ASINs)
------------------------------------------------------------
  [1/50] ‚úÖ B079QHML21:  Fire TV Stick 4K streaming device with A...
  [2/50] ‚úÖ B07FZ8S74R:  Echo Dot (3rd Gen, 2018 release) - Smart...
  [3/50] ‚úÖ B07XJ8C8F5:  Titre non trouv√©...
  [4/50] ‚úÖ B07WVFCVJN:  Roku Express | HD Roku Streaming Device ...
  [5/50] ‚úÖ B08YT2N5SX:  Echo Dot (4th Gen) | Charcoal with Sengl...
  [6/50] ‚úÖ B07TMJ1R3X:  Titre non trouv√©...
  [7/50] ‚úÖ B07X6C9RMF:  Titre non trouv√©...
  [8/50] ‚úÖ B08BX7FV5L:  Amazon Fire HD 10 tablet, 10.1", 1080p F...
  [9/50] ‚úÖ B075XLWML4:  Roku Streaming Stick+ | HD/4K/HDR Stream...
  [10/50] ‚úÖ B089DR29T6:  Amazon Smart Plug | Works with Alexa | S...
  [11/50] ‚úÖ B07WDDT3G5:  Fire HD 8 Kids tablet, 8" HD display, ag...
  [12/50]

KeyboardInterrupt: 

In [None]:
# amazon_extractor_chunked.py

"""
Extracteur de titres Amazon avec architecture par batches
Chaque thread gère un batch complet indépendamment
"""

import csv
import requests
from bs4 import BeautifulSoup
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from datetime import datetime
import os

# ==========================================
# CONFIGURATION
# ==========================================

# Files
# NOTE(review): the run recorded below this cell fails with FileNotFoundError
# on this path; the earlier cell reads the same file with a '../' prefix.
INPUT_FILE = 'Amazon_Best_Seller_2021_June 2.csv'
OUTPUT_FILE = 'resultats_titres2.csv'
# NOTE(review): CACHE_FILE is not referenced by the code visible in this cell.
CACHE_FILE = 'cache_progress.txt'

# Batch parameters
BATCH_SIZE = 50              # ASINs per batch
MAX_PARALLEL_BATCHES = 3     # Number of batches processed in parallel
PAUSE_BETWEEN_ROUNDS = 30    # Pause between rounds (seconds)

# Request parameters
REQUEST_TIMEOUT = 10  # Passed as `timeout` to requests.get
DELAY_BETWEEN_REQUESTS = (1, 2)  # Random delay bounds (min, max), unpacked into random.uniform

# User agents (one is chosen at random for every request)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
]

# Locks (the original note reads "lock for writing").
# NOTE(review): neither lock is acquired by the code visible in this cell.
write_lock = Lock()
progress_lock = Lock()


# ==========================================
# CLASSE BATCH
# ==========================================

class Batch:
    """A group of ASINs processed together, plus the bookkeeping of that run.

    Attributes are plain and public; they are filled in by whoever processes
    the batch.
    """

    def __init__(self, batch_id, asins):
        """Create an unprocessed batch.

        Args:
            batch_id: Identifier of the batch (shown in log output).
            asins: Sequence of ASINs belonging to this batch.
        """
        self.batch_id = batch_id
        self.asins = asins
        self.results = []        # result tuples appended during processing
        self.success_count = 0
        self.error_count = 0
        self.start_time = None   # timestamps stay None until set by the processor
        self.end_time = None

    def __len__(self):
        """Return the number of ASINs in the batch."""
        return len(self.asins)

    def duration(self):
        """Return ``end_time - start_time``, or 0 while either one is unset.

        The check is ``is None`` rather than truthiness so that a legitimate
        timestamp of ``0.0`` is not mistaken for "not set".
        """
        if self.start_time is None or self.end_time is None:
            return 0
        return self.end_time - self.start_time


# ==========================================
# EXTRACTION TITRE
# ==========================================

def get_amazon_title(asin):
    """Fetch the Amazon product title for one ASIN.

    Args:
        asin: Identifier inserted into ``https://www.amazon.com/dp/{asin}``.

    Returns:
        tuple: ``(asin, title, status)`` where ``status`` is one of

        * ``'success'``   - ``title`` is the stripped ``productTitle`` text;
        * ``'not_found'`` - HTTP 200 but no ``<span id="productTitle">``;
        * ``'blocked'``   - HTTP 503;
        * ``'error'``     - any other HTTP status, a timeout, or any other
          exception (``title`` then holds a short description).
    """
    url = f"https://www.amazon.com/dp/{asin}"
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        # "br" is deliberately not advertised: requests only decodes Brotli
        # when an optional brotli package is installed; without it a Brotli
        # body would reach the parser undecoded and every page would be
        # reported as "not_found".
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

        # HTTP 503 is reported separately from other HTTP errors.
        if response.status_code == 503:
            return asin, "BLOQU√â PAR AMAZON", "blocked"

        if response.status_code != 200:
            return asin, f"HTTP {response.status_code}", "error"

        soup = BeautifulSoup(response.content, "html.parser")
        title_tag = soup.find("span", id="productTitle")

        if title_tag:
            return asin, title_tag.get_text().strip(), "success"
        return asin, "Titre non trouv√©", "not_found"

    except requests.Timeout:
        return asin, "Timeout", "error"

    except Exception as e:
        # Best-effort: keep the run going and record a truncated message.
        return asin, f"Erreur:  {str(e)[:50]}", "error"


# ==========================================
# TRAITEMENT D'UN BATCH (PAR THREAD)
# ==========================================

def process_batch(batch, round_num, total_rounds):
    """Process every ASIN of one batch sequentially, in the calling thread.

    For each ASIN the title is fetched with ``get_amazon_title``, the
    ``(asin, title, status)`` tuple is appended to ``batch.results`` and the
    batch counters are updated. A random delay drawn from
    ``DELAY_BETWEEN_REQUESTS`` separates consecutive requests (none after the
    last one).

    Args:
        batch: Batch instance to process; it is mutated in place.
        round_num: Number of the current round (log output only).
        total_rounds: Total number of rounds (log output only).

    Returns:
        Batch: The same ``batch`` object, with results, counters and
        start/end timestamps filled in.
    """
    # Local import: the file header only imports `Lock` from threading.
    import threading

    batch.start_time = time.time()

    print(f"\n{'='*60}")
    print(f"üîµ BATCH #{batch.batch_id} (Round {round_num}/{total_rounds})")
    # Show the identifier of the thread actually running this batch (the
    # previous value was derived from id(batch), which is an object id).
    print(f"   ASINs: {len(batch)} | Thread: {threading.get_ident() % 10000}")
    print(f"{'='*60}")

    for i, asin in enumerate(batch.asins, 1):
        asin_id, title, status = get_amazon_title(asin)
        batch.results.append((asin_id, title, status))

        # Everything that is not a success counts as an error; "blocked"
        # only gets a different icon.
        if status == "success":
            batch.success_count += 1
            icon = "‚úÖ"
        elif status == "blocked":
            batch.error_count += 1
            icon = "üö´"
        else:
            batch.error_count += 1
            icon = "‚ö†Ô∏è"

        print(f"  [{i: 2d}/{len(batch)}] {icon} {asin}: {title[:60]}")

        # Random delay between requests; skipped after the last ASIN.
        if i < len(batch):
            time.sleep(random.uniform(*DELAY_BETWEEN_REQUESTS))

    batch.end_time = time.time()

    # Batch summary
    print(f"\n{'‚îÄ'*60}")
    print(f"‚úÖ BATCH #{batch.batch_id} TERMIN√â")
    print(f"   Dur√©e: {batch.duration():.1f}s")
    print(f"   Succ√®s: {batch.success_count}/{len(batch)}")
    print(f"   Erreurs: {batch.error_count}/{len(batch)}")
    print(f"{'‚îÄ'*60}\n")

    return batch


# ==========================================
# GESTION ROUNDS
# ==========================================

def process_round(batches, round_num, total_rounds):
    """Process one round: every batch of the round runs in its own thread.

    Args:
        batches: List of Batch objects to process concurrently.
        round_num: Number of the current round (log output only).
        total_rounds: Total number of rounds (log output only).

    Returns:
        list: The batches whose processing finished without raising, in
        completion order. A batch whose task raised is logged and left out.
        An empty ``batches`` list returns ``[]`` immediately.
    """
    # ThreadPoolExecutor rejects max_workers=0, so an empty round is a no-op.
    if not batches:
        return []

    print(f"\n{'#'*60}")
    print(f"üöÄ ROUND {round_num}/{total_rounds}")
    print(f"   Batches en parall√®le: {len(batches)}")
    print(f"   Total ASINs ce round: {sum(len(b) for b in batches)}")
    print(f"{'#'*60}")

    round_start = time.time()
    completed_batches = []

    # One worker per batch, so all batches of the round start together.
    with ThreadPoolExecutor(max_workers=len(batches)) as executor:
        future_to_batch = {
            executor.submit(process_batch, batch, round_num, total_rounds): batch
            for batch in batches
        }

        for future in as_completed(future_to_batch):
            batch = future_to_batch[future]
            try:
                completed_batches.append(future.result())
            except Exception as e:
                print(f"‚ùå Erreur batch #{batch.batch_id}: {e}")

    round_duration = time.time() - round_start

    # Round summary
    total_success = sum(b.success_count for b in completed_batches)
    total_errors = sum(b.error_count for b in completed_batches)
    total_processed = sum(len(b) for b in completed_batches)

    def percent(part):
        # Guard: when every batch failed nothing was processed, and the
        # summary must not crash with ZeroDivisionError.
        return part / total_processed * 100 if total_processed else 0.0

    print(f"\n{'#'*60}")
    print(f"‚úÖ ROUND {round_num} TERMIN√â")
    print(f"   Dur√©e: {round_duration:.1f}s")
    print(f"   ASINs trait√©s: {total_processed}")
    print(f"   Succ√®s: {total_success} ({percent(total_success):.1f}%)")
    print(f"   Erreurs: {total_errors} ({percent(total_errors):.1f}%)")
    print(f"{'#'*60}\n")

    return completed_batches


# ==========================================
# SAUVEGARDE RÉSULTATS
# ==========================================

def save_results(all_batches, output_file):
    """Write the results of every batch to a CSV file.

    The file is (re)created with the header ``ASIN, Titre, Statut`` followed
    by one row per ``(asin, title, status)`` tuple found in ``batch.results``,
    batch after batch. The count printed at the end is the sum of the batch
    lengths.

    Args:
        all_batches: Iterable of batch objects exposing ``results`` and ``len()``.
        output_file: Path of the CSV file to write.
    """
    print(f"\nüíæ Sauvegarde r√©sultats dans {output_file}...")

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(['ASIN', 'Titre', 'Statut'])
        out.writerows(
            [asin, title, status]
            for current in all_batches
            for asin, title, status in current.results
        )

    saved = sum(len(current) for current in all_batches)
    print(f"‚úÖ {saved} lignes sauvegard√©es")


# ==========================================
# FONCTION PRINCIPALE
# ==========================================

def main():
    """Run the whole pipeline: load ASINs, batch them, process, save, report.

    Steps:
        1. Read ``INPUT_FILE`` and keep the stripped first column of every
           non-empty row after the header.
        2. Split the ASINs into batches of ``BATCH_SIZE``.
        3. Group the batches into rounds of ``MAX_PARALLEL_BATCHES``.
        4. After a confirmation prompt, process the rounds one after another,
           sleeping ``PAUSE_BETWEEN_ROUNDS`` seconds between rounds.
        5. Save the completed batches to ``OUTPUT_FILE`` and print statistics.

    If the run is interrupted (exception or Ctrl+C) after at least one round
    completed, the batches completed so far are saved before the exception
    propagates. With no ASIN in the input the function returns early.
    """
    print("\n" + "="*60)
    print("üöÄ EXTRACTEUR TITRES AMAZON - MODE CHUNKING")
    print("="*60)
    print(f"‚öôÔ∏è  Configuration:")
    print(f"   - Taille batch: {BATCH_SIZE} ASINs")
    print(f"   - Batches parall√®les: {MAX_PARALLEL_BATCHES}")
    print(f"   - Pause entre rounds: {PAUSE_BETWEEN_ROUNDS}s")
    print(f"   - D√©lai entre requ√™tes: {DELAY_BETWEEN_REQUESTS}s")
    print("="*60)

    # STEP 1: load ASINs
    print(f"\nüìÇ Chargement {INPUT_FILE}...")

    with open(INPUT_FILE, 'r', encoding='utf-8', errors='replace') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header
        asins = [row[0].strip() for row in reader if row and row[0].strip()]

    total_asins = len(asins)
    print(f"‚úÖ {total_asins} ASINs charg√©s")

    # Nothing to do: stop here instead of failing later with a
    # ZeroDivisionError in the final statistics.
    if total_asins == 0:
        print(f"Aucun ASIN dans {INPUT_FILE} - fin du traitement")
        return

    # STEP 2: create batches
    print(f"\nüì¶ Cr√©ation des batches...")

    all_batches = [
        Batch(batch_id, asins[start:start + BATCH_SIZE])
        for batch_id, start in enumerate(range(0, total_asins, BATCH_SIZE), 1)
    ]

    total_batches = len(all_batches)
    print(f"‚úÖ {total_batches} batches cr√©√©s")

    # STEP 3: create rounds
    rounds = [
        all_batches[start:start + MAX_PARALLEL_BATCHES]
        for start in range(0, total_batches, MAX_PARALLEL_BATCHES)
    ]

    total_rounds = len(rounds)
    print(f"‚úÖ {total_rounds} rounds planifi√©s")

    # Time estimate: 1.5 s per ASIN, batches spread over the parallel slots,
    # plus the pauses between rounds.
    avg_time_per_batch = (BATCH_SIZE * 1.5)
    estimated_minutes = (avg_time_per_batch * total_batches / MAX_PARALLEL_BATCHES +
                        PAUSE_BETWEEN_ROUNDS * (total_rounds - 1)) / 60
    print(f"\n‚è±Ô∏è  Estimation: {estimated_minutes:.1f} minutes")

    input(f"\n‚ñ∂Ô∏è  Appuyez sur Entr√©e pour d√©marrer...")

    # STEP 4: process rounds
    start_time = time.time()
    completed_batches = []
    total_duration = 0.0
    finished = False

    try:
        for round_num, round_batches in enumerate(rounds, 1):
            round_results = process_round(round_batches, round_num, total_rounds)
            completed_batches.extend(round_results)

            # Pause between rounds (skipped after the last one)
            if round_num < total_rounds:
                print(f"\n‚è∏Ô∏è  PAUSE {PAUSE_BETWEEN_ROUNDS}s avant prochain round...")
                print(f"   Progression: {len(completed_batches)}/{total_batches} batches")
                print(f"   ASINs trait√©s: {sum(len(b) for b in completed_batches)}/{total_asins}")
                time.sleep(PAUSE_BETWEEN_ROUNDS)

        total_duration = time.time() - start_time
        finished = True
    finally:
        # STEP 5: save. On an interrupted run only save when there is
        # something to keep, so an existing output file is not replaced by
        # an empty one.
        if finished or completed_batches:
            save_results(completed_batches, OUTPUT_FILE)

    # FINAL STATISTICS
    total_success = sum(b.success_count for b in completed_batches)
    total_errors = sum(b.error_count for b in completed_batches)
    # Guard against a zero duration before computing the throughput.
    speed = total_asins / (total_duration / 60) if total_duration > 0 else 0.0

    print("\n" + "="*60)
    print("üéâ EXTRACTION TERMIN√âE")
    print("="*60)
    print(f"‚è±Ô∏è  Dur√©e totale: {total_duration/60:.1f} minutes")
    print(f"üìä R√©sultats:")
    print(f"   - Total trait√©: {total_asins}")
    print(f"   - Succ√®s: {total_success} ({total_success/total_asins*100:.1f}%)")
    print(f"   - Erreurs:  {total_errors} ({total_errors/total_asins*100:.1f}%)")
    print(f"   - Vitesse: {speed:.1f} ASINs/minute")
    print(f"\nüìÅ Fichier r√©sultat: {OUTPUT_FILE}")
    print("="*60)


# ==========================================
# LANCEMENT
# ==========================================

# Entry point: run the pipeline only when this code is executed directly.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # User pressed Ctrl+C: report it and warn that partial results may be lost.
        print("\n\n‚ö†Ô∏è  Interruption utilisateur")
        print("üíæ R√©sultats partiels peuvent √™tre perdus")
    except Exception as e:
        # Any other failure: print a short message followed by the full traceback.
        print(f"\n‚ùå Erreur fatale: {e}")
        import traceback
        traceback.print_exc()


üöÄ EXTRACTEUR TITRES AMAZON - MODE CHUNKING
‚öôÔ∏è  Configuration:
   - Taille batch: 50 ASINs
   - Batches parall√®les: 3
   - Pause entre rounds: 30s
   - D√©lai entre requ√™tes: (1, 2)s

üìÇ Chargement Amazon_Best_Seller_2021_June 2.csv...

‚ùå Erreur fatale: [Errno 2] No such file or directory: 'Amazon_Best_Seller_2021_June 2.csv'


Traceback (most recent call last):
  File "C:\Users\khalid\AppData\Local\Temp\ipykernel_19356\4172197904.py", line 380, in <module>
    main()
  File "C:\Users\khalid\AppData\Local\Temp\ipykernel_19356\4172197904.py", line 295, in main
    with open(INPUT_FILE, 'r', encoding='utf-8', errors='replace') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\khalid\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 324, in _modified_open
    return io_open(file, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'Amazon_Best_Seller_2021_June 2.csv'
