# ✅ Minar Internship – Web Scraping Assignment

In [1]:
# STEP 1: Import Libraries
!pip install requests lxml  
import requests
from lxml import html
from pprint import pprint
import json

Defaulting to user installation because normal site-packages is not writeable


In [52]:
!pip install selenium


Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.5.0 (from urllib3[socks]~=2.5.0->selenium)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.7.9-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (

In [34]:
# STEP 2: HTML Fetch Utility
def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return html.fromstring(response.content)
    else:
        print(f"Failed to fetch {url} - Status Code:", response.status_code)
        return None

In [58]:
# STEP 3: Scraper for NewMe
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from lxml import html

# 🧠 HTML fetch utility using Selenium for JS-rendered sites
def fetch_html_selenium(url):
    options = Options()
    options.add_argument("--headless")  # Run browser silently
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")

    # Initialize driver
    driver = webdriver.Chrome(options=options)

    # Load page and wait for JS
    driver.get(url)
    time.sleep(4)  # Adjust if needed for page to fully load

    # Get the full HTML and parse with lxml
    page_source = driver.page_source
    tree = html.fromstring(page_source)

    driver.quit()
    return tree

# 🛒 Scraper for NewMe using Selenium
def scrape_newme(url):
    tree = fetch_html_selenium(url)
    if tree is None:
        print("❌ Tree is None")
        return []

    products = []
    product_blocks = tree.xpath('//div[div/h3 and .//a[contains(@href, "/products/")]]')
    print("✅ Total Product Blocks Found:", len(product_blocks))

    for product in product_blocks:
        try:
            title = product.xpath('.//h3/text()')[0].strip()
        except:
            title = None
        try:
            product_link = product.xpath('.//a/@href')[0]
            if not product_link.startswith('http'):
                product_link = 'https://newme.asia' + product_link
        except:
            product_link = None
        try:
            image_url = product.xpath('.//img/@src')[0]
        except:
            image_url = None
        try:
            price = product.xpath('.//span[contains(@class, "price")]/text()')[0]
        except:
            price = None
        try:
            discount = product.xpath('.//span[contains(@class, "discount")]/text()')[0]
        except:
            discount = None

        full_text = ' '.join(product.xpath('.//text()')).strip()
        products.append({
            "title": title,
            "url": product_link,
            "image_url": image_url,
            "price": price,
            "discount": discount,
            "rating": None,
            "text": full_text
        })

    return products





In [43]:
# STEP 4: Scraper for JioMart
def scrape_jiomart(url):
    tree = fetch_html(url)
    if tree is None:
        return []
    
    products = []
    product_blocks = tree.xpath('//li[contains(@class, "ais-InfiniteHits-item")]')

    
    for product in product_blocks:
        try:
            title = product.xpath('.//span[@class="clsgetname"]/text()')[0].strip()
        except:
            title = None

        try:
            price = product.xpath('.//span[@id="final_price"]/text()')[0].strip()
        except:
            price = None

        try:
            discount = product.xpath('.//span[contains(@class,"discount")]/text()')[0].strip()
        except:
            discount = None

        try:
            product_link = product.xpath('.//a/@href')[0]
            if not product_link.startswith('http'):
                product_link = 'https://www.jiomart.com' + product_link
        except:
            product_link = None

        try:
            image_url = product.xpath('.//img/@data-src')[0]
        except:
            image_url = None

        full_text = ' '.join(product.xpath('.//text()')).strip()

        products.append({
            "title": title,
            "url": product_link,
            "image_url": image_url,
            "price": price,
            "discount": discount,
            "rating": None,
            "text": full_text
        })
    
    return products


In [44]:
# STEP 5: Scraper for Croma
def scrape_croma(url):
    tree = fetch_html(url)
    if tree is None:
        return []
    products = []
    product_blocks = tree.xpath('//li[contains(@class, "product-item")]')
    for product in product_blocks:
        try: title = product.xpath('.//h3[contains(@class, "product-title")]/text()')[0].strip()
        except: title = None
        try: price = product.xpath('.//span[contains(@class, "amount")]/text()')[0].strip()
        except: price = None
        try:
            product_link = product.xpath('.//a/@href')[0]
            if not product_link.startswith('http'):
                product_link = 'https://www.croma.com' + product_link
        except: product_link = None
        try: image_url = product.xpath('.//img/@src')[0]
        except: image_url = None
        full_text = ' '.join(product.xpath('.//text()')).strip()
        products.append({
            "title": title, "url": product_link, "image_url": image_url,
            "price": price, "discount": None, "rating": None, "text": full_text
        })
    return products

In [45]:
# STEP 6: Scraper for Nike
def scrape_nike(url):
    tree = fetch_html(url)
    if tree is None:
        return []
    products = []
    product_blocks = tree.xpath('//div[contains(@class, "product-card")]')
    for product in product_blocks:
        try: title = product.xpath('.//div[contains(@class, "product-card__title")]/text()')[0].strip()
        except: title = None
        try: price = product.xpath('.//div[contains(@class, "product-price")]/text()')[0].strip()
        except: price = None
        try:
            product_link = product.xpath('.//a/@href')[0]
            if not product_link.startswith('http'):
                product_link = 'https://www.nike.com' + product_link
        except: product_link = None
        try: image_url = product.xpath('.//img/@src')[0]
        except: image_url = None
        full_text = ' '.join(product.xpath('.//text()')).strip()
        products.append({
            "title": title, "url": product_link, "image_url": image_url,
            "price": price, "discount": None, "rating": None, "text": full_text
        })
    return products

In [46]:
# STEP 7: Run All Scrapers
newme_url = "https://newme.asia/collection/cannes-2025?product_cat=&orderby=menu_order&p=1&subCategory="
jiomart_url = "https://www.jiomart.com/c/homeandkitchen/home-furnishing/bedding/31421"

croma_url = "https://www.croma.com/computers-tablets/laptops/gaming-laptops/c/806?q=%3Arelevance&srsltid=AfmBOorNua7Lm5pY1gIccoWx_DjcorgjhEhEjsEgItVLHnXE_lHwnwMT"
nike_url = "https://www.nike.com/in/w/mens-nik1"

newme_data = scrape_newme(newme_url)
jiomart_data = scrape_jiomart(jiomart_url)
croma_data = scrape_croma(croma_url)
nike_data = scrape_nike(nike_url)

all_products = {
    "NewMe": newme_data,
    "JioMart": jiomart_data,
    "Croma": croma_data,
    "Nike": nike_data
}

In [47]:
# STEP 8: Save JSON (Optional)
with open("minar_scraped_products.json", "w", encoding="utf-8") as f:
    json.dump(all_products, f, indent=2, ensure_ascii=False)

In [50]:
from pprint import pprint

# ✅ STEP 10: Preview Some Results
def safe_preview(data, site_name):
    print(f"\n✅ Sample Products from {site_name}:")
    if data and isinstance(data, list):
        pprint(data[:2])
    else:
        print("⚠️ No data extracted.")

safe_preview(newme_data, "NewMe")
safe_preview(jiomart_data, "JioMart")
safe_preview(croma_data, "Croma")
safe_preview(nike_data, "Nike")




✅ Sample Products from NewMe:
⚠️ No data extracted.

✅ Sample Products from JioMart:
⚠️ No data extracted.

✅ Sample Products from Croma:
[{'discount': None,
  'image_url': None,
  'price': '',
  'rating': None,
  'text': 'Compare HP Omen 16 xd0020AX AMD Ryzen 7 Gaming Laptop (16GB, 1TB '
          'SSD, Windows 11 Home, 8GB Graphics, 16.1 inch 165 Hz FHD Display, '
          'NVIDIA GeForce RTX 4060, MS Office 2021, Shadow Black, 2.37 KG) Rs '
          '6000 Instant Discount   ₹1,07,990   (Incl. all Taxes) ₹1,32,644 '
          '(Save ₹ 24,654 ) 19%   Off Rs 6000 Instant Discount',
  'title': None,
  'url': 'https://www.croma.com/hp-omen-16-xd0020ax-amd-ryzen-7-gaming-laptop-16gb-1tb-ssd-windows-11-home-8gb-graphics-16-1-inch-165-hz-fhd-display-nvidia-geforce-rtx-4060-ms-office-2021-shadow-black-2-37-kg-/p/307123'},
 {'discount': None,
  'image_url': None,
  'price': '',
  'rating': None,
  'text': 'Compare ASUS TUF Gaming F15 Intel Core i7 13th Gen Gaming Laptop '
          '(16GB,

In [60]:
newme_url = "https://newme.asia/collection/cannes-2025?subCategory=&product_cat=&orderby=menu_orders"
newme_data = scrape_newme(newme_url)
pprint(newme_data[:2])


✅ Total Product Blocks Found: 0
[]
