# <Center> Web Scraping using Selenium

***Summary of this lecture:*** We are gonna scrap the website named `Smartprix` using Selenium

The thing with this website is that it only shows 20 mobile at a time and you have to click load more to load more mobile specs<br>
So more html code comes from the server after you click load more, but request library only load html one at a time hence we cannot load more html code after that that why we need other tools like selenium.

> First download the `chromedriver`

<center> <span style="font-size: 44px;"> <b> Basic level automation

<center> Below code are not from Nitish sir lecture, they are from chatgpt(hence self learning)

In [None]:
# Future zain view the selenium notes for revising Selenium
# testing selenium

from selenium import webdriver

driver = webdriver.Chrome()   # Selenium auto-downloads driver
driver.get("https://www.google.com")

In [None]:
# opening wikipedia

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.get("https://www.wikipedia.com")

In [None]:
# reading page title

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.get("https://www.wikipedia.com")

print(driver.title)

input("Press enter to quit...")
driver.quit()

Wikipedia


In [None]:
# reading current url

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.get("https://www.wikipedia.com")

print(driver.title)

driver.get("https://www.google.com")

print(driver.current_url)

input("Press enter to quit...")
driver.quit()


Wikipedia
https://www.google.com/


In [None]:
# Page Source (HTML)

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.get("https://www.wikipedia.com")

html = driver.page_source
print(html[:500])

input("Press enter to quit...")
driver.quit()

<html lang="en" class="js-enabled"><head>
<meta charset="utf-8">
<title>Wikipedia</title>
<meta name="description" content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.">
<script>
document.documentElement.className = document.documentElement.className.replace( /(^|\s)no-js(\s|$)/, "$1js-enabled$2" );
</script>
<meta name="viewport" content="initial-scale=1,user-scalable=yes">
<link rel="apple-touch-icon" href="


In [None]:
# going back, forward and refresh

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.get("https://www.wikipedia.com")

driver.back()
driver.forward()
driver.refresh()

input("Press enter to quit...")
driver.quit()

In [None]:
# maximizing browser window and setting it size

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.maximize_window()

driver.get("https://www.wikipedia.com")
driver.set_window_size(1200, 800)

input("Press enter to quit...")
driver.quit()

In [None]:
# Close vs Quit 
'''
Quit (entire browser session)
Close (current tab only)
'''

In [None]:
# Add a Delay (Temporary, Not Best Practice)
'''
This is NOT recommended long-term, but okay for now.

We’ll replace this with waits in Module 3.
'''

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.maximize_window()

driver.get("https://www.wikipedia.com")
time.sleep(2)
driver.set_window_size(1200, 800)

input("Press enter to quit...")
driver.quit()

<span style="font-size: 18px;"> <b>Question: Open wikipedia.com and then print its title and url

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service()

driver = webdriver.Chrome(service = service)
driver.maximize_window()

driver.get("https://www.wikipedia.com")
time.sleep(2)
print(driver.title)
print(driver.current_url)

input("Press enter to quit...")
driver.quit()

Wikipedia
https://www.wikipedia.org/


In [None]:
# Locator to select elements in the html page
'''
Locator	        Use When
ID	            Unique & stable
Name	        Form fields
Class Name	    Simple cases
Tag Name	    Bulk elements
Link Text	    <a> tags
XPath	        Complex/dynamic
CSS Selector	Fast & clean
'''

In [None]:
# opening google and typing Selenium Web Scraping
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.google.com")

# search_box = driver.find_element(By.NAME, "q")
search_box = driver.find_element(By.ID, "APjFqb")

search_box.send_keys("Selenium Web Scraping")

input("Press enter to quit...")
driver.quit()


In [None]:
# The Core Interaction Methods
'''
element.click()
element.send_keys()
element.clear()
'''

# Keyboard Actions
'''
Key	            Use
---------------------
ENTER	        Submit
TAB	            Move to next field
ESCAPE	        Close popups
CONTROL + A	    Select all
DELETE	        Clear
'''

In [None]:
# opening google and typing Selenium Web Scraping pressing enter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://www.google.com")

# search_box = driver.find_element(By.NAME, "q")
search_box = driver.find_element(By.ID, "APjFqb")

search_box.send_keys("Selenium Web Scraping" + Keys.ENTER)

input("Press enter to quit...")
driver.quit()

In [None]:
# Selenium provides two types of waits:

'''
Type	        Use
---------------------
Implicit Wait	Global, simple
Explicit Wait	Targeted, professional
'''

In [None]:
# scrolling one time ()
# Scrolling to Page Height:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://www.google.com")

search_box = driver.find_element(By.NAME, "q")
search_box.send_keys("Selenium Web Scraping" + Keys.ENTER)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

input("Press enter to quit...")
driver.quit()

In [None]:
# open google and search Ali Zain and then scroll down and click next

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

service = Service()
driver = webdriver.Chrome(service = service)
driver.maximize_window()

wait = WebDriverWait(driver, 3)

driver.get("https://www.google.com")

search = wait.until(EC.visibility_of_element_located((By.NAME,"q")))
search.send_keys("Ali Zain" + Keys.ENTER)

input("Press enter to Continue...")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

next_btn = wait.until(EC.visibility_of_element_located((By.CLASS_NAME,"oeN89d"))).click()

input("Press enter to quit...")
driver.quit()

InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: unable to send message to renderer
  (Session info: chrome=143.0.7499.170); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7c16b88e5
	0x7ff7c16b8940
	0x7ff7c149165d
	0x7ff7c147d82c
	0x7ff7c147d78c
	0x7ff7c147bfaa
	0x7ff7c147cbcf
	0x7ff7c1496ef6
	0x7ff7c14979fa
	0x7ff7c149c3b4
	0x7ff7c149c47f
	0x7ff7c14e92e5
	0x7ff7c14e9d3c
	0x7ff7c153df67
	0x7ff7c153ac97
	0x7ff7c14dac29
	0x7ff7c14dba93
	0x7ff7c19d0640
	0x7ff7c19caf80
	0x7ff7c19e96e6
	0x7ff7c16d5de4
	0x7ff7c16ded8c
	0x7ff7c16c2004
	0x7ff7c16c21b5
	0x7ff7c16a7ee2
	0x7ffc9e377614
	0x7ffc9f1026a1


In [None]:
# Headless Chrome Setup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
service = Service()

options.add_argument(" --headless")
options.add_argument(" --window-size=1920,1080")

driver = webdriver.Chrome(service= service, options=options)

driver.get("https://www.google.com")
print(driver.title)
print(driver.current_url)
driver.quit()


Google
https://www.google.com/


In [None]:
# avoiding CAPTCHA 

import undetected_chromedriver as uc
import time

# Initialize undetected chrome
driver = uc.Chrome()

driver.get("https://www.google.com")
time.sleep(2)  # Add realistic delays

# Search for something
search_box = driver.find_element("name", "q")
search_box.send_keys("web scraping tutorial")
search_box.submit()

time.sleep(3)
input("Press Enter to quit...")
driver.quit()


---
---
---


<center> Nitish sir lecture code begin here

In [None]:
# i) opening google.com automatically
# ii) then search campusx automatically
# learnwith.campusx.in
# open dsmp page

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36") # custom user agent

service = Service()
driver = webdriver.Chrome(service = service, options = options)

wait = WebDriverWait(driver, 10)

driver.get("https://www.google.com")
driver.implicitly_wait(2) 

search_bar = driver.find_element(By.NAME, "q")

search_bar.send_keys("campusx" + Keys.ENTER)

input("Please complete the captcha")

campusx_link = wait.until(EC.element_to_be_clickable((By.XPATH, "//cite[text()='https://learnwith.campusx.in']/ancestor::a")))
campusx_link.click()

course_link = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='1698390585510d']/div/div[1]/div/div/div/div[1]/div/div/div[2]/a[2]")))
course_link.click()

<span style="font-size: 18px;"> <b>Question: Open ajio website from google and then open `Caps & Hats` section then scroll until the end after that save the html.

In [19]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

service = Service()

options = Options()
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36")

driver = webdriver.Chrome(service=service, options=options)

wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)

driver.get("https://www.google.com")
driver.maximize_window()

search_bar = wait.until(EC.visibility_of_element_located((By.NAME, "q")))
search_bar.send_keys("ajio" + Keys.ENTER)

input("Please complete the captcha")

ajio_link = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, "//cite[contains(text(),'ajio')]/ancestor::a")
    )
)
ajio_link.click()

men_menu = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH,
         "/html/body/div[1]/div[2]/div/div[2]/div/header/div[3]/div[1]/ul/li[1]")
    )
)

actions.move_to_element(men_menu).perform()

backpack_menu = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, "/html[1]/body[1]/div[1]/div[2]/div[1]/div[2]/div[1]/header[1]/div[3]/div[1]/ul[1]/li[1]/div[2]/ul[1]/li[1]/div[2]/div[1]/div[3]/div[2]/div[2]/div[3]/span[1]/a[1]")
    )
)
backpack_menu.click()

last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.implicitly_wait(2)

    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)') # direct scrolling to the end, do not work use passive scrolling
    driver.implicitly_wait(4)

    new_height = driver.execute_script('return document.body.scrollHeight')

    if new_height == last_height:
        break
    new_height = last_height

InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=143.0.7499.170); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7aa3e88e5
	0x7ff7aa3e8940
	0x7ff7aa1c165d
	0x7ff7aa1ad202
	0x7ff7aa1d27af
	0x7ff7aa249a29
	0x7ff7aa26a5c2
	0x7ff7aa20ac29
	0x7ff7aa20ba93
	0x7ff7aa700640
	0x7ff7aa6faf80
	0x7ff7aa7196e6
	0x7ff7aa405de4
	0x7ff7aa40ed8c
	0x7ff7aa3f2004
	0x7ff7aa3f21b5
	0x7ff7aa3d7ee2
	0x7ffc9e377614
	0x7ffc9f1026a1


<span style="font-size: 18px;"> <b>Above is open the Caps & Hats section using Google but that is for practice, now below i am gonna open the ajio Caps& Hats section directly by url.

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import undetected_chromedriver as uc 
service = Service()


driver = uc.Chrome(service=service)

wait = WebDriverWait(driver, 10)
driver.maximize_window()

driver.get("https://www.ajio.com/men-caps-hats/c/830202001")

time.sleep(5)

print("✅ Page loaded successfully")

# Infinite Scrolling (Slow Scroll)
print("Starting slow scroll to load all products...")



# Scrolling configuration
SCROLL_PAUSE_TIME = 2.5  # Wait time after each scroll
scroll_increment = 300    # Pixels to scroll each time
last_height = driver.execute_script('return document.body.scrollHeight')
current_position = 0
no_change_count = 0
scroll_count = 0

while True:
    # Scroll down incrementally
    current_position += scroll_increment
    driver.execute_script(f'window.scrollTo(0, {current_position})')
    scroll_count += 1
    
    # Wait for content to load
    time.sleep(SCROLL_PAUSE_TIME)
    
    # Get current page dimensions
    new_height = driver.execute_script('return document.body.scrollHeight')
    current_scroll_position = driver.execute_script('return window.pageYOffset + window.innerHeight')
    
    # Progress indicator
    if scroll_count % 5 == 0:
        print(f"Scrolled {scroll_count} times... Current height: {new_height}px")
    
    
    # Check if we've reached the bottom
    if current_scroll_position >= new_height - 200:
        time.sleep(4)
        final_height = driver.execute_script('return document.body.scrollHeight')
        
        if final_height == last_height:
            no_change_count += 1
            if no_change_count >= 3:
                print(f"✅ Scrolling completed! Total scrolls: {scroll_count}")
                print(f"✅ Final page height: {final_height}px")
                break
        else:
            no_change_count = 0
            last_height = final_height
    else:
        last_height = new_height

print("\n✅ All products loaded! Ready for extraction.")


KeyboardInterrupt: 

In [27]:
print("Extracting product data...")

products = []

try:
    # Find all product cards
    product_elements = driver.find_elements(By.CSS_SELECTOR, ".item")
    
    print(f"Found {len(product_elements)} products\n")
    
    for index, product in enumerate(product_elements, 1):
        try:
            # Brand
            try:
                brand = product.find_element(By.CSS_SELECTOR, ".brand").text.strip()
            except:
                brand = "N/A"
            
            # Product Name
            try:
                name = product.find_element(By.CSS_SELECTOR, ".nameCls").text.strip()
            except:
                name = "N/A"
            
            # Price
            try:
                price = product.find_element(By.CSS_SELECTOR, ".price").text.strip()
            except:
                price = "N/A"
            
            # Original Price
            try:
                original_price = product.find_element(By.CSS_SELECTOR, ".orginal-price").text.strip()
            except:
                original_price = "N/A"
            
            # Discount
            try:
                discount = product.find_element(By.CSS_SELECTOR, ".discount").text.strip()
            except:
                discount = "N/A"
            
            # Product Link
            try:
                link = product.find_element(By.CSS_SELECTOR, "a.rilrtl-products-list__link").get_attribute("href")
            except:
                try:
                    link = product.find_element(By.TAG_NAME, "a").get_attribute("href")
                except:
                    link = "N/A"
            
            # Image URL
            try:
                image = product.find_element(By.CSS_SELECTOR, "img").get_attribute("src")
            except:
                image = "N/A"
            
            products.append({
                'Serial_No': index,
                'Brand': brand,
                'Product_Name': name,
                'Price': price,
                'Original_Price': original_price,
                'Discount': discount,
                'Product_Link': link,
                'Image_URL': image
            })
            
            # Progress indicator
            if index % 20 == 0:
                print(f"Extracted {index} products...")
            
        except Exception as e:
            print(f"⚠️ Error extracting product {index}: {e}")
            continue
    
    print(f"\n✅ Successfully extracted {len(products)} products!")
    
except Exception as e:
    print(f"❌ Error during extraction: {e}")


Extracting product data...
Found 1350 products

Extracted 20 products...
Extracted 40 products...
Extracted 60 products...
Extracted 80 products...
Extracted 100 products...
Extracted 120 products...
Extracted 140 products...
Extracted 160 products...
Extracted 180 products...
Extracted 200 products...
Extracted 220 products...
Extracted 240 products...
Extracted 260 products...
Extracted 280 products...
Extracted 300 products...
Extracted 320 products...
Extracted 340 products...
Extracted 360 products...
Extracted 380 products...
Extracted 400 products...
Extracted 420 products...
Extracted 440 products...
Extracted 460 products...
Extracted 480 products...
Extracted 500 products...
Extracted 520 products...
Extracted 540 products...
Extracted 560 products...
Extracted 580 products...
Extracted 600 products...
Extracted 620 products...
Extracted 640 products...
Extracted 660 products...
Extracted 680 products...
Extracted 700 products...
Extracted 720 products...
Extracted 740 produc

In [None]:
import pandas as pd
if products:
    # Create DataFrame
    df = pd.DataFrame(products)
    
    # Display basic info
    print(f"Total products: {len(df)}")
    print(f"\nDataFrame shape: {df.shape}")
    print(f"\nColumn names: {list(df.columns)}")
    
    # Save to CSV
    filename = 'ajio_mens_caps_hats.csv'
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"\n✅ Data saved to '{filename}'")
    
    # Display first few rows
    print("\n" + "="*80)
    print("Preview of extracted data:")
    print("="*80)
    display(df.head())
    
    # Display summary statistics
    print("\n" + "="*80)
    print("Data Summary:")
    print("="*80)
    print(df.info())
    
else:
    print("❌ No products found")




Total products: 1350

DataFrame shape: (1350, 8)

Column names: ['Serial_No', 'Brand', 'Product_Name', 'Price', 'Original_Price', 'Discount', 'Product_Link', 'Image_URL']

✅ Data saved to 'ajio_mens_caps_hats.csv'

Preview of extracted data:


Unnamed: 0,Serial_No,Brand,Product_Name,Price,Original_Price,Discount,Product_Link,Image_URL
0,1,Decathlon,WEDZE - Men Warm Winter Beanie,₹399,₹699,(43% off),https://www.ajio.com/decathlon-wedze--men-warm...,https://assets-jiocdn.ajio.com/medias/sys_mast...
1,2,Decathlon,WEDZE - Men Warm Winter Headband,₹249,₹299,(17% off),https://www.ajio.com/decathlon-wedze--men-warm...,https://assets-jiocdn.ajio.com/medias/sys_mast...
2,3,Decathlon,WEDZE - Unisex Wide Winter Headband,₹249,₹499,(50% off),https://www.ajio.com/decathlon-wedze--unisex-w...,https://assets-jiocdn.ajio.com/medias/sys_mast...
3,4,Decathlon,WEDZE - Unisex Solid Winter Beanie,₹149,₹199,(25% off),https://www.ajio.com/decathlon-wedze--unisex-s...,https://assets-jiocdn.ajio.com/medias/sys_mast...
4,5,Puma,Essential Running Cap,₹270,₹599,(55% off),https://www.ajio.com/puma-essential-running-ca...,https://assets-jiocdn.ajio.com/medias/sys_mast...



Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Serial_No       1350 non-null   int64 
 1   Brand           1350 non-null   object
 2   Product_Name    1350 non-null   object
 3   Price           1350 non-null   object
 4   Original_Price  1350 non-null   object
 5   Discount        1350 non-null   object
 6   Product_Link    1350 non-null   object
 7   Image_URL       561 non-null    object
dtypes: int64(1), object(7)
memory usage: 84.5+ KB
None


<span style="font-size: 22px;"> <b> Now third part of the class which is scrapping data from `https://www.smartprix.com/mobiles`

As there are too many phones in this website so in the availability checkbox we are gonna tick the `Exclude Out Of Stock` and `Exclude Upcoming`.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

service = Service()

driver = webdriver.Chrome(service = service)
wait = WebDriverWait(driver, 10)

driver.maximize_window()
time.sleep(1)
driver.get('https://www.smartprix.com/mobiles')
time.sleep(3)

driver.find_element(By.XPATH, "//span[normalize-space()='Exclude Out Of Stock']").click()
time.sleep(3)
driver.find_element(By.XPATH, "//span[normalize-space()='Exclude Upcoming']").click()


print("Loading all products...")

last_height = driver.execute_script('return document.body.scrollHeight')
load_count = 0

while True:
    try:
        # Find and click the "Load More" button
        load_more_button = driver.find_element(By.XPATH, '//*[@id="app"]/main/div[1]/div[2]/div[3]')
        load_more_button.click()
        load_count += 1
        
        print(f"Clicked Load More: {load_count} times")
        time.sleep(3)
        
        # Check if page height changed
        new_height = driver.execute_script('return document.body.scrollHeight')
        
        if new_height == last_height:
            print("No more products to load")
            break
            
        last_height = new_height
        
    except Exception as e:
        print(f"Load More button not found or error: {e}")
        break

print(f"\n✅ All products loaded! Total clicks: {load_count}")


Loading all products...
Clicked Load More: 1 times
Clicked Load More: 2 times
Clicked Load More: 3 times
Clicked Load More: 4 times
Clicked Load More: 5 times
Clicked Load More: 6 times
Clicked Load More: 7 times
Clicked Load More: 8 times
Clicked Load More: 9 times
Clicked Load More: 10 times
Clicked Load More: 11 times
Clicked Load More: 12 times
Clicked Load More: 13 times
Clicked Load More: 14 times
Clicked Load More: 15 times
Clicked Load More: 16 times
Clicked Load More: 17 times
Clicked Load More: 18 times
Clicked Load More: 19 times
Clicked Load More: 20 times
Clicked Load More: 21 times
Clicked Load More: 22 times
Clicked Load More: 23 times
Clicked Load More: 24 times
Clicked Load More: 25 times
Clicked Load More: 26 times
Clicked Load More: 27 times
Clicked Load More: 28 times
Clicked Load More: 29 times
Clicked Load More: 30 times
Clicked Load More: 31 times
Clicked Load More: 32 times
Clicked Load More: 33 times
Clicked Load More: 34 times
Clicked Load More: 35 times
Click

In [11]:
from bs4 import BeautifulSoup
import time

print("Extracting product data with BeautifulSoup...")
start_time = time.time()

# Get entire page HTML once
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

# Find all product cards
products = soup.find_all('div', class_='sm-product')
print(f"Found {len(products)} products\n")

mobiles_data = []

for i, product in enumerate(products, 1):
    try:
        # Extract name
        name_tag = product.find('h2')
        name = name_tag.text.strip() if name_tag else "N/A"
        
        # Extract price
        price_tag = product.find('span', class_='price')
        price = price_tag.text.strip() if price_tag else "N/A"
        
        # Extract rating
        rating_tag = product.find('span', class_='sm-rating')
        if rating_tag and rating_tag.get('style'):
            style = rating_tag['style']
            rating_value = style.split('--rating:')[1].split(';')[0] if '--rating:' in style else "N/A"
        else:
            rating_value = "N/A"
        
        # Extract specifications
        specs_list = product.find('ul', class_='sm-feat specs')
        if specs_list:
            specs = [li.text.strip() for li in specs_list.find_all('li')]
            specifications = " | ".join(specs)
        else:
            specifications = "N/A"
        
        # Extract Spec Score
        score_tag = product.find('div', class_='score')
        if score_tag:
            score_b = score_tag.find('b')
            spec_score = score_b.text.strip() if score_b else "N/A"
        else:
            spec_score = "N/A"
        
        # Extract link
        link_tag = product.find('a')
        link = link_tag['href'] if link_tag and link_tag.get('href') else "N/A"
        if link != "N/A" and not link.startswith('http'):
            link = 'https://www.smartprix.com' + link
        
        # Extract image
        img_tag = product.find('img', class_='sm-img')
        image_url = img_tag['src'] if img_tag and img_tag.get('src') else "N/A"
        
        mobile_info = {
            'Serial_No': i,
            'Product_Name': name,
            'Price': price,
            'Rating': rating_value,
            'Spec_Score': spec_score,
            'Specifications': specifications,
            'Product_Link': link,
            'Image_URL': image_url
        }
        
        mobiles_data.append(mobile_info)
        
        if i % 100 == 0:
            print(f"Extracted {i} products...")
        
    except Exception as e:
        print(f"Error extracting product {i}: {e}")
        continue

elapsed_time = time.time() - start_time
print(f"\n✅ Successfully extracted {len(mobiles_data)} products in {elapsed_time:.2f} seconds!")


Extracting product data with BeautifulSoup...
Found 1028 products

Extracted 100 products...
Extracted 200 products...
Extracted 300 products...
Extracted 400 products...
Extracted 500 products...
Extracted 600 products...
Extracted 700 products...
Extracted 800 products...
Extracted 900 products...
Extracted 1000 products...

✅ Successfully extracted 1028 products in 6.75 seconds!


In [14]:
import pandas as pd
# Create DataFrame
df = pd.DataFrame(mobiles_data)

# Display basic info
print(f"Total products: {len(df)}")
print(f"DataFrame shape: {df.shape}")
print(f"\nColumn names: {list(df.columns)}")

# Save to CSV
filename = 'smartprix_mobiles.csv'
df.to_csv(filename, index=False, encoding='utf-8-sig')
print(f"\n✅ Data saved to '{filename}'")

# Display first 5 rows
print("\n" + "="*80)
print("Preview of data:")
print("="*80)
display(df.head())


Total products: 1028
DataFrame shape: (1028, 8)

Column names: ['Serial_No', 'Product_Name', 'Price', 'Rating', 'Spec_Score', 'Specifications', 'Product_Link', 'Image_URL']

✅ Data saved to 'smartprix_mobiles.csv'

Preview of data:


Unnamed: 0,Serial_No,Product_Name,Price,Rating,Spec_Score,Specifications,Product_Link,Image_URL
0,1,Samsung Galaxy S25 Ultra,"₹1,08,460",4.15,93,"Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, Wi-Fi, NFC ...",https://www.smartprix.com/mobiles/samsung-gala...,https://cdn1.smartprix.com/rx-i3jZEfawx-w280-h...
1,2,Samsung Galaxy S25 FE,"₹59,999",4.7,86,"Dual Sim, 3G, 4G, 5G, VoLTE, Vo5G, Wi-Fi, NFC ...",https://www.smartprix.com/mobiles/samsung-gala...,https://cdn1.smartprix.com/rx-igNNWnxPy-w280-h...
2,3,OnePlus Nord CE 5 5G,"₹24,998",4.1,83,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, IR Blaster...",https://www.smartprix.com/mobiles/oneplus-nord...,https://cdn1.smartprix.com/rx-i3qMJS8fr-w280-h...
3,4,Realme Narzo 90,"₹16,999",4.4,72,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi | Dimensity...",https://www.smartprix.com/mobiles/realme-narzo...,https://cdn1.smartprix.com/rx-iuYfvkRUc-w280-h...
4,5,Vivo T3 Ultra,"₹22,799",4.1,82,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi | Dimensity...",https://www.smartprix.com/mobiles/vivo-t3-ultr...,https://cdn1.smartprix.com/rx-iKhtBU6Ja-w280-h...


In [19]:
df['Rating'].unique()

array([' 4.15', ' 4.7', ' 4.1', ' 4.4', ' 4.3', ' 4', ' 4.45', ' 4.35',
       ' 4.6', ' 4.25', ' 4.65', ' 4.75', ' 4.5', ' 4.55', ' 4.05',
       ' 4.2', ' 3.95', ' 3.9', 'N/A'], dtype=object)

In [22]:
df.query("Rating == 'N/A'")

Unnamed: 0,Serial_No,Product_Name,Price,Rating,Spec_Score,Specifications,Product_Link,Image_URL
1020,1021,,"₹94,800",,,,https://www.smartprix.com/mobiles/samsung-gala...,https://cdn1.smartprix.com/rx-iz5ZKOGY0-w280-h...
1021,1022,,"₹1,59,990",,,,https://www.smartprix.com/mobiles/samsung-gala...,https://cdn1.smartprix.com/rx-irL2QXDwz-w280-h...
1022,1023,,"₹1,49,900",,,,https://www.smartprix.com/mobiles/apple-iphone...,https://cdn1.smartprix.com/rx-ipwKK83hK-w280-h...
1023,1024,,"₹1,09,999",,,,https://www.smartprix.com/mobiles/vivo-x300-pr...,https://cdn1.smartprix.com/rx-iCS5Q1pzT-w280-h...
1024,1025,,"₹72,999",,,,https://www.smartprix.com/mobiles/oneplus-15-p...,https://cdn1.smartprix.com/rx-iAe49sioe-w280-h...
1025,1026,,"₹68,690",,,,https://www.smartprix.com/mobiles/samsung-gala...,https://cdn1.smartprix.com/rx-iGH33GWY9-w280-h...
1026,1027,,"₹1,09,999",,,,https://www.smartprix.com/mobiles/oppo-find-x9...,https://cdn1.smartprix.com/rx-in2v0JdJP-w280-h...
1027,1028,,"₹75,999",,,,https://www.smartprix.com/mobiles/vivo-x300-pp...,https://cdn1.smartprix.com/rx-iP8wcGkpt-w280-h...
