### Import Necessary Libraries

* **selenium** : `selenium` is an open-source tool that automates web browsers for testing web applications and performing various web-based tasks
* **BeautifulSoup** : `BeautifulSoup` is a Python library for parsing HTML and XML documents, enabling easy navigation, searching,etc
* **requests** : The `requests` is a Python library used for making HTTP requests to interact with web services and retrieve web content.
* **time** : The `time` library in Python provides functions for handling time-related tasks,like accessing current time, pausing execution, etc.
* **numpy** : `numpy` is a fundamental Python library for numerical computing, providing support for large, multi-dimensional arrays and matrices, along with a wide range of mathematical functions to operate on them.
* **pandas** : `pandas` is a powerful Python library for data manipulation and analysis, providing data structures like DataFrames and Series to efficiently handle and analyze structured data.
* **re** : The `re` module in Python provides support for regular expressions, allowing for pattern matching, searching, and manipulating strings.


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

import requests
import time
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

### Defining functions to extract/scrape data from webpage

In [1]:
# Function to get the name of mobile devices.
def get_name(new_soup):
    try:
        temp_name = new_soup.find('h1', attrs={'class': "h1_pro_head"})
        product_name = temp_name.text.strip()
    except:
        product_name = "N/A"
    return product_name

# Function to get the specification score of mobile devices.
def get_specsscore(new_soup):
    try:
        temp_specs_score = new_soup.find('div', class_=re.compile(r'^rating_box_new_list'))
        specs_score = temp_specs_score.text.strip()
    except:
        specs_score = "N/A"
    return specs_score

# Function to get the rating of mobile devices.
def get_rating(new_soup):
    try:
        temp_rating = new_soup.find('span', attrs={'class': 'ratpt'})
        rating = temp_rating.text.strip()
    except:
        rating = "N/A"
    return rating

# Function to get the price of mobile devices.
def get_price(new_soup):
    try:
        temp_price = new_soup.find('div', attrs={'class': "price_div"})
        price = temp_price.text.split('(onwards)')[0].replace('\nRs.\xa0 ', '').replace('\n ', '').strip()
    except:
        price = "N/A"
    return price

# Funtion to get specifications of the mobile
def get_specs(soup):
    specs = {}
    for spec_key in sample_phone_specs.keys():
        spec = soup.find(text=re.compile(spec_key))
        if spec:
            spec_value = spec.find_next().text.strip()
            specs[to_valid_variable_name(spec_key)] = spec_value
        else:
            specs[to_valid_variable_name(spec_key)] = 'N/A'
    return specs

# Function to get the RAM of mobile devices.
def get_ram(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*RAM\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the screen resolution of mobile devices.
def get_resolution(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Resolution\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the refresh rate of mobile devices.
def get_refresh_rate(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Refresh\s*Rate\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the display type of mobile devices.    
def get_displaytype(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Display\s*Type\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the screen size of mobile devices.
def get_screensize(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Screen\s*Size\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the processor of mobile devices.
def get_processor(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Processor\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the rear camera information of mobile devices.
def get_Rear_Camera(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Rear\s*Camera\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the front camera information of mobile devices.
def get_front_camera(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Front\s*Camera\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the battery of mobile devices.
def get_battery(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Battery\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the display of mobile devices.
def get_display(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Display\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the operating system of mobile devices.
def get_os(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Operating\s*System\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the RAM-type of mobile devices.
def get_ramtype(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*RAM\s*TYPE\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the chipset information of mobile devices.
def get_Chipset(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Chipset\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the internal memory of mobile devices.
def get_internal_memory(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Internal\s*Memory\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the color of mobile devices.
def get_color(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Color\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

# Function to get the weight of mobile devices.
def get_Weight(soup):
    try:
        spec_divs = soup.select("div.spec_box")
        for div in spec_divs:
            tr_elements = div.find_all('tr')
            for tr in tr_elements:
                td_elements = tr.find_all('td')
                if len(td_elements) >= 2:
                    first_td_text = td_elements[0].get_text(strip=True)
                    second_td_text = td_elements[1].get_text(strip=True)
                    if re.search(r'^\s*Weight\s*$', first_td_text, re.IGNORECASE):
                        return second_td_text
    except:
        return "N/A"

### Python dictionary named *`HEADERS`* that contains various HTTP headers commonly used in web requests.

In [12]:
HEADERS = ({
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
)

###  Web scraping using *Selenium* and *BeautifulSoup* in Python

In [23]:
# Setting up Chrome WebDriver with Custom Options:
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")  
chrome_options.add_argument("--headless")  

chromedriver_path = r"C:\Users\Shashank\Downloads\chromedriver-win64\chromedriver.exe"  
driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

# Defining URL:
URL = "https://www.91mobiles.com/phonefinder.php"
driver.get(URL)

# Initializing Lists:
names = []
specs_scores = []
ratings = []
prices = []
ram = []
ramtype = []
resolution = []
refresh_rate = []
display_type = []
screen_size = []
chipset = []
cpu = []
architecture = []
fabrication = []
graphics = []
pixel_density = []
quick_charging = []
type_c = []
internal_memory = []
expandable_memory = []
sim_slot = []
wifi = []
wifi_features = []
bluetooth = []
gps = []
nfc = []
radio = []
fingerprint_sensor = []

# Setting up a Requests Session:
page = 1
session = requests.Session()
session.max_redirects = 10000  

# Main Scraping Loop. Scraping Data from Each Page. Handling Errors and Pagination:
while True:
    print(f"Processing page {page}")

    
    while True:
        try:
            
            WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME, "listing-btns4")))
            print("Element with class name 'listing-btns4' is visible on the screen.")
            break  
        except Exception as e:
            
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print("Scrolling...")

    try:
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        links = soup.find_all("a", class_='hover_blue_link name gaclick')
        links_list = [link.get('href') for link in links]

        HEADERS = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        for link in links_list:
            full_url = "https://www.91mobiles.com/" + link
            try:
                response = session.get(full_url, headers=HEADERS, allow_redirects=True)
                new_soup = BeautifulSoup(response.content, "html.parser")
                names.append(get_name(new_soup))
                specs_scores.append(get_specsscore(new_soup))
                ratings.append(get_rating(new_soup))
                prices.append(get_price(new_soup))
                ram.append(get_ram(new_soup))
                ramtype.append(get_ramtype(new_soup))
                resolution.append(get_resolution(new_soup))
                refresh_rate.append(get_refresh_rate(new_soup))
                display_type.append(get_displaytype(new_soup))
                screen_size.append(get_screensize(new_soup))
                chipset.append(get_Chipset(new_soup))
                cpu.append(get_CPU(new_soup))
                architecture.append(get_architecture(new_soup))
                fabrication.append(get_Fabrication(new_soup))
                graphics.append(get_graphics(new_soup))
                pixel_density.append(get_pixel_density(new_soup))
                quick_charging.append(get_quick_charging(new_soup))
                type_c.append(get_type_c(new_soup))
                internal_memory.append(get_internal_memory(new_soup))
                expandable_memory.append(get_expandable_Memory(new_soup))
                sim_slot.append(get_sim_slot(new_soup))
                wifi.append(get_wifi(new_soup))
                wifi_features.append(get_wifi_features(new_soup))
                bluetooth.append(get_bluetooth(new_soup))
                gps.append(get_gps(new_soup))
                nfc.append(get_nfc(new_soup))
                radio.append(get_radio(new_soup))
                fingerprint_sensor.append(get_fingerprint(new_soup))
            except requests.exceptions.TooManyRedirects as e:
                print(f"Too many redirects for URL: {full_url}")
            except Exception as e:
                print(f"Error fetching URL {full_url}: {e}")

        try:
            next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "listing-btns4")))
            next_button.click()
            time.sleep(4)  
            page += 1
        except Exception as e:
            print("No more pages or error occurred while clicking next button:", e)
            break  

    except Exception as e:
        print(f"Error on page {page}: {e}")
        break

# Quitting the WebDriver:
driver.quit()

Processing page 1
Scrolling...
Element with class name 'listing-btns4' is visible on the screen.
Processing page 2
Element with class name 'listing-btns4' is visible on the screen.
Processing page 3
Element with class name 'listing-btns4' is visible on the screen.
Processing page 4
Element with class name 'listing-btns4' is visible on the screen.
Processing page 5
Element with class name 'listing-btns4' is visible on the screen.
Processing page 6
Element with class name 'listing-btns4' is visible on the screen.
Processing page 7
Element with class name 'listing-btns4' is visible on the screen.
Processing page 8
Element with class name 'listing-btns4' is visible on the screen.
Processing page 9
Element with class name 'listing-btns4' is visible on the screen.
Processing page 10
Element with class name 'listing-btns4' is visible on the screen.
Processing page 11
Element with class name 'listing-btns4' is visible on the screen.
Processing page 12
Element with class name 'listing-btns4' is

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Processing page 385
Element with class name 'listing-btns4' is visible on the screen.
Processing page 386
Element with class name 'listing-btns4' is visible on the screen.
Processing page 387
Element with class name 'listing-btns4' is visible on the screen.
Processing page 388
Element with class name 'listing-btns4' is visible on the screen.
Processing page 389
Element with class name 'listing-btns4' is visible on the screen.
Processing page 390
Element with class name 'listing-btns4' is visible on the screen.
Processing page 391
Element with class name 'listing-btns4' is visible on the screen.
Processing page 392
Element with class name 'listing-btns4' is visible on the screen.
Processing page 393
Element with class name 'listing-btns4' is visible on the screen.
Processing page 394
Element with class name 'listing-btns4' is visible on the screen.
Processing page 395
Element with class name 'listing-btns4' is visible on the screen.
Processing page 396
Element with class name 'listing-b

KeyboardInterrupt: 

### Creating the DataFrame using initialized lists which contains scraped data

In [24]:
df = pd.DataFrame({
    'Name': names,
    'Specs Score': specs_scores,
    'Rating': ratings,
    'Price': prices,
    'RAM': ram,
    'RAM Type': ramtype,
    'Resolution': resolution,
    'Refresh Rate': refresh_rate,
    'Display Type': display_type,
    'Screen Size': screen_size,
    'Chipset': chipset,
    'CPU': cpu,
    'Architecture': architecture,
    'Fabrication': fabrication,
    'Graphics': graphics,
    'Pixel Density': pixel_density,
    'Quick Charging': quick_charging,
    'Type-C': type_c,
    'Internal Memory': internal_memory,
    'Expandable Memory': expandable_memory,
    'SIM Slot': sim_slot,
    'WiFi': wifi,
    'WiFi Features': wifi_features,
    'Bluetooth': bluetooth,
    'GPS': gps,
    'NFC': nfc,
    'Radio': radio,
    'Fingerprint Sensor': fingerprint_sensor
})

### Viewing the created dataframe

In [53]:
df

Unnamed: 0,Name,Specs Score,Rating,Price,RAM,RAM Type,Resolution,Refresh Rate,Display Type,Screen Size,...,Internal Memory,Expandable Memory,SIM Slot,WiFi,WiFi Features,Bluetooth,GPS,NFC,Radio,Fingerprint Sensor
0,AGM A7,50%,,"15,271(Last Known Price)",2 GB,,480x800 px,,TFT,4 inches (10.16 cm),...,16 GBVery Good▾,"Yes, Up to 32 GB","Dual SIM, GSM+GSM","Yes, Wi-Fi 4 (802.11 b/g/n)",Mobile Hotspot,"Yes, v4.0",Yes with A-GPS,,,No
1,AGM M1,86%,,"5,893(Last Known Price)",128 MB,,176x220 px,,TFT,2 inches (5.08 cm),...,64 MB,"Yes, Up to 32 GB","Dual SIM, GSM+GSM",,,"Yes, v2.1",,,,
2,UHANS U300,68%,,"20,402(Last Known Price)",4 GB,,1080x1920 px (FULL HD),,IPS LCD,5.5 inches (13.97 cm),...,32 GBExcellent▾,"Yes, Up to 128 GB","Dual SIM, GSM+GSM","Yes, Wi-Fi 4 (802.11 b/g/n)",Mobile Hotspot,"Yes, v4.0",Yes with A-GPS,,Yes,Yes
3,UHANS Note 4,65%,,"14,745(Last Known Price)",3 GB,,720x1280 px (HD),,IPS LCD,5.5 inches (13.97 cm),...,32 GBExcellent▾,"Yes, Up to 128 GB","Dual SIM, GSM+GSM","Yes, Wi-Fi 4 (802.11 a/b/g/n)","Wi-Fi Direct, Mobile Hotspot","Yes, v4.0",Yes with A-GPS,,,Yes
4,Yuho H1,86%,,"8,999(Last Known Price)",2 GB,,720x1280 px (HD),,IPS LCD,5 inches (12.7 cm),...,16 GB,"Yes, Up to 64 GB","Dual SIM, GSM+GSM","Yes, Wi-Fi 4 (802.11 a/b/g/n)",,Yes,Yes,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5122,VOX Mobile VGS-603,45%,4.1 / 5,"1,609(Last Known Price)",,,240 x 400 pixels,,LCD,3 inches (7.62 cm),...,,"Yes, Up to 2 GB","Dual SIM, GSM+GSM",No,,"Yes, v2",,,Yes,
5123,Onida F950,86%,,"1,099(Last Known Price)",,,176 x 220 pixels,,LCD,1.9 inches (4.83 cm),...,,"Yes, Up to 2 GB","Single SIM, GSM",No,,Yes,,,,
5124,HTC Tattoo,45%,1.9 / 5,"22,900(Last Known Price)",256 MB,,240 x 320 pixels,,LCD,2.8 inches (7.11 cm),...,512 MB,Yes,"Single SIM, GSM","Yes, Wi-Fi 3 (802.11 a/b/g)",,"Yes, v2",Yes with A-GPS,,"Yes, RDS",
5125,Karbonn K20,45%,5 / 5,"1,489(Last Known Price)",,,320 x 240 pixels,,,2.4 inches (6.1 cm),...,259 KB,"Yes, Up to 8 GB","Dual SIM, GSM+GSM",No,,Yes,,,Yes,


### Exporting the DataFrame into csv(Comma seperated file).

In [77]:
df.to_csv("mobiles_data.csv",index=False)