In [11]:
import pandas as pd
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import nest_asyncio

# Apply this to allow asyncio in Jupyter notebooks
# (the kernel already runs an event loop; nest_asyncio lets asyncio.run nest inside it)
nest_asyncio.apply()

# Load Excel and URLs
df = pd.read_excel("076 - Scrape Amazon Images.xlsx")
# Scrape results are collapsed into a dict keyed by URL further down, so
# requesting the same page more than once is pure wasted traffic. Keep only the
# first occurrence of each URL (drop_duplicates preserves the original order).
urls = df['Page URL'].dropna().drop_duplicates().tolist()

# Limit concurrent connections
semaphore = asyncio.Semaphore(50)

async def fetch(session, url):
    """Fetch one page and return ``(url, image_links)``.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        url: Address of the page to download.

    Returns:
        A ``(url, links)`` tuple. ``links`` is whatever ``extract_images``
        returns for the page, or ``["NA"]`` when the response status is not
        200 or any exception is raised while fetching/parsing.

    This is deliberately best-effort: a single failing page must not abort the
    whole batch, so every ``Exception`` is mapped to the ``["NA"]`` marker.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # aiohttp deprecates bare numeric timeouts; ClientTimeout(total=...) is the
    # supported spelling of the same 10-second overall limit.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        # The module-level semaphore caps how many requests are in flight.
        async with semaphore:
            async with session.get(url, headers=headers, timeout=timeout) as response:
                if response.status != 200:
                    return url, ["NA"]
                # Replace undecodable bytes instead of failing the whole page
                # on a single bad character.
                html = await response.text(errors="replace")
                return url, extract_images(html, url)
    except Exception:
        return url, ["NA"]

def extract_images(html, base_url):
    """Collect up to six non-thumbnail ``.jpg`` image URLs from a page.

    Args:
        html: Markup of the page to scan.
        base_url: URL the page was loaded from; used to resolve links that do
            not start with ``http``.

    Returns:
        A list of at most six image URLs in document order, or ``["NA"]`` when
        no matching image is found.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    collected = []
    for tag in parsed.find_all('img'):
        # Prefer the regular src; fall back to data-src when src is missing/empty.
        candidate = tag.get('src') or tag.get('data-src')
        if not candidate:
            continue
        lowered = candidate.lower()
        # Keep only links mentioning ".jpg" and skip anything labelled "thumb".
        if '.jpg' not in lowered or 'thumb' in lowered:
            continue
        if not candidate.startswith("http"):
            candidate = urljoin(base_url, candidate)
        collected.append(candidate)
    if not collected:
        return ["NA"]
    return collected[:6]

async def scrape_all(urls):
    """Fetch every URL concurrently over one shared client session.

    Args:
        urls: Iterable of page URLs to scrape.

    Returns:
        A list of the values produced by ``fetch``, one per URL, in the order
        the requests finish (not the order of ``urls``).
    """
    async with aiohttp.ClientSession() as session:
        pending = [fetch(session, page_url) for page_url in urls]
        # as_completed hands back awaitables in completion order.
        return [await finished for finished in asyncio.as_completed(pending)]

# Scrape every page, then index the (url, images) pairs by URL for lookup
url_to_images = dict(results := asyncio.run(scrape_all(urls)))

# Six output columns: "Image 1" ... "Image 6", initially empty
image_cols = [f'Image {i}' for i in range(1, 7)]
df[image_cols] = None

# Write each row's image links, padding with "NA" up to six entries.
# Rows whose URL has no scrape result get "NA" in every image column.
for idx, page_url in df['Page URL'].items():
    found = url_to_images.get(page_url, ["NA"])
    padded = list(found[:6]) + ["NA"] * (6 - len(found))
    for col, value in zip(image_cols, padded):
        df.at[idx, col] = value

# Persist the enriched table (without the index column)
df.to_excel("output_async_with_images.xlsx", index=False)


In [12]:
import os

# Folder (relative to the current working directory) that receives the sample images.
download_dir = "sample_images"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(download_dir, exist_ok=True)


In [13]:
import requests

def download_image(url, save_path):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address of the image to download.
        save_path: Destination file path; it is overwritten if it exists.

    Returns:
        True when the server answered with status 200 and the body was written,
        False on any other status, on a network/request error, or when the
        file could not be written.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            return True
    except (requests.RequestException, OSError):
        # Best-effort download: request failures and file-system errors are
        # reported as False. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt/SystemExit through so a long loop can be stopped.
        pass
    return False


In [14]:
showcase_count = 50  # Adjust this number as needed
downloaded = 0

# Walk the first-image column and save images until the quota is reached.
for idx, image_url in df["Image 1"].items():
    if downloaded >= showcase_count:
        break
    if image_url == "NA":
        # No usable image was scraped for this row.
        continue
    filename = f"product_{idx}_img1.jpg"
    filepath = os.path.join(download_dir, filename)
    if not download_image(image_url, filepath):
        continue
    # Only successful downloads count toward the quota.
    downloaded += 1
    print(f"Downloaded: {filename}")


Downloaded: product_44_img1.jpg
Downloaded: product_71_img1.jpg
Downloaded: product_81_img1.jpg
Downloaded: product_120_img1.jpg
Downloaded: product_183_img1.jpg
Downloaded: product_204_img1.jpg
Downloaded: product_237_img1.jpg
Downloaded: product_239_img1.jpg
Downloaded: product_287_img1.jpg
Downloaded: product_328_img1.jpg
Downloaded: product_346_img1.jpg
Downloaded: product_363_img1.jpg
Downloaded: product_368_img1.jpg
Downloaded: product_372_img1.jpg
Downloaded: product_379_img1.jpg
Downloaded: product_442_img1.jpg
Downloaded: product_502_img1.jpg
Downloaded: product_505_img1.jpg
Downloaded: product_541_img1.jpg
Downloaded: product_568_img1.jpg
Downloaded: product_586_img1.jpg
Downloaded: product_588_img1.jpg
Downloaded: product_591_img1.jpg
Downloaded: product_605_img1.jpg
Downloaded: product_629_img1.jpg
Downloaded: product_645_img1.jpg
Downloaded: product_662_img1.jpg
Downloaded: product_699_img1.jpg
Downloaded: product_741_img1.jpg
Downloaded: product_763_img1.jpg
Downloaded: p