# Q1

In [2]:
import requests
from bs4 import BeautifulSoup

def search_amazon(product_name):
    search_url = f"https://www.amazon.in/s?k={product_name.replace(' ', '+')}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Example usage
product_name = input("Enter the product name to search on Amazon: ")
soup = search_amazon(product_name)

# Display the titles of the products in the search results
products = soup.find_all('span', class_='a-size-medium a-color-base a-text-normal')
for idx, product in enumerate(products, 1):
    print(f"{idx}. {product.get_text(strip=True)}")


Enter the product name to search on Amazon: guitar


# Q2

In [4]:
import pandas as pd

def get_product_details(soup):
    product_list = []
    
    products = soup.find_all('div', {'data-component-type': 's-search-result'})
    
    for product in products:
        title = product.find('span', class_='a-size-medium a-color-base a-text-normal')
        price = product.find('span', class_='a-price-whole')
        url = product.find('a', class_='a-link-normal a-text-normal')['href']
        product_url = f"https://www.amazon.in{url}"
        try:
            brand = product.find('span', class_='a-size-base-plus a-color-base').get_text(strip=True)
        except:
            brand = "-"
        
        product_details = {
            'Brand Name': brand if brand else "-",
            'Name of the Product': title.get_text(strip=True) if title else "-",
            'Price': price.get_text(strip=True) if price else "-",
            'Return/Exchange': "-",
            'Expected Delivery': "-",
            'Availability': "-",
            'Product URL': product_url
        }
        product_list.append(product_details)
    
    return product_list

def scrape_amazon(product_name, pages=3):
    all_products = []
    for page in range(1, pages + 1):
        search_url = f"https://www.amazon.in/s?k={product_name.replace(' ', '+')}&page={page}"
        response = requests.get(search_url, 'headers=headers')
        soup = BeautifulSoup(response.content, 'html.parser')
        products = get_product_details(soup)
        all_products.extend(products)
    return all_products

# Scrape details for the first 3 pages of search results
product_details = scrape_amazon(product_name, pages=3)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(product_details)
df.to_csv(f"{product_name}_amazon_products.csv", index=False)
print(f"Scraped data saved to {product_name}_amazon_products.csv")

# Example usage
print(df.head())


Scraped data saved to guitar_amazon_products.csv
Empty DataFrame
Columns: []
Index: []


# Q3

In [8]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
"from webdriver_manager.chrome import ChromeDriverManager"
import time

def scrape_google_images(keywords, num_images=10):
    # Set up the Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    image_urls = {}
    
    for keyword in keywords:
        driver.get("https://images.google.com/")
        search_box = driver.find_element(By.NAME, "q")
        search_box.send_keys(keyword)
        search_box.send_keys(Keys.RETURN)
        time.sleep(2)  # Allow time for the search results to load
        
        image_urls[keyword] = []
        thumbnails = driver.find_elements(By.CSS_SELECTOR, "img.Q4LuWd")
        
        for i in range(num_images):
            try:
                thumbnails[i].click()
                time.sleep(2)  # Allow time for the full image to load
                images = driver.find_elements(By.CSS_SELECTOR, "img.n3VNCb")
                for image in images:
                    src = image.get_attribute("src")
                    if src and "http" in src:
                        image_urls[keyword].append(src)
                        if len(image_urls[keyword]) >= num_images:
                            break
                if len(image_urls[keyword]) >= num_images:
                    break
            except Exception as e:
                print(f"Error fetching image {i+1} for {keyword}: {e}")
                continue
    
    driver.quit()
    return image_urls

# Example usage
keywords = ['fruits', 'cars', 'Machine Learning', 'Guitar', 'Cakes']
image_urls = scrape_google_images(keywords)

# Display the image URLs
for keyword, urls in image_urls.items():
    print(f"\nImages for '{keyword}':")
    for url in urls:
        print(url)


NameError: name 'ChromeDriverManager' is not defined

# Q4

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_flipkart(smartphone_name):
    search_url = f"https://www.flipkart.com/search?q={smartphone_name.replace(' ', '+')}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    products = soup.find_all('div', {'class': '_1AtVbE'})
    
    product_list = []
    
    for product in products:
        try:
            name = product.find('div', {'class': '_4rR01T'}).get_text(strip=True)
            link = "https://www.flipkart.com" + product.find('a', {'class': '_1fQZEK'})['href']
            price = product.find('div', {'class': '_30jeq3 _1_WHN1'}).get_text(strip=True)
            details = product.find_all('li', {'class': 'rgWa7D'})
            details_text = [detail.get_text(strip=True) for detail in details]
            
            product_details = {
                'Brand Name': name.split(' ')[0],
                'Smartphone name': name,
                'Colour': details_text[0] if len(details_text) > 0 else "-",
                'RAM': details_text[1] if len(details_text) > 1 else "-",
                'Storage(ROM)': details_text[2] if len(details_text) > 2 else "-",
                'Primary Camera': details_text[3] if len(details_text) > 3 else "-",
                'Secondary Camera': details_text[4] if len(details_text) > 4 else "-",
                'Display Size': details_text[5] if len(details_text) > 5 else "-",
                'Battery Capacity': details_text[6] if len(details_text) > 6 else "-",
                'Price': price,
                'Product URL': link
            }
            product_list.append(product_details)
        except Exception as e:
            continue
    
    return product_list

# Example usage
smartphone_name = "Oneplus Nord"
product_details = scrape_flipkart(smartphone_name)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(product_details)
df.to_csv(f"{smartphone_name}_flipkart_products.csv", index=False)
print(f"Scraped data saved to {smartphone_name}_flipkart_products.csv")

# Display some scraped data
print(df.head())


Scraped data saved to Oneplus Nord_flipkart_products.csv
Empty DataFrame
Columns: []
Index: []


# Q5

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

def get_coordinates(city_name):
    # Set up the Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    driver.get("https://www.google.com/maps")
    search_box = driver.find_element(By.ID, "searchboxinput")
    search_box.send_keys(city_name)
    search_box.send_keys(Keys.RETURN)
    time.sleep(3)  # Allow time for the search results to load
    
    current_url = driver.current_url
    driver.quit()
    
    try:
        lat_long = current_url.split('@')[1].split(',')[:2]
        latitude = lat_long[0]
        longitude = lat_long[1]
    except Exception as e:
        latitude, longitude = "-", "-"
    
    return latitude, longitude

# Example usage
city_name = "London"
latitude, longitude = get_coordinates(city_name)
print(f"Coordinates of {city_name}: Latitude = {latitude}, Longitude = {longitude}")


# Q6

In [None]:
def scrape_digit_best_gaming_laptops():
    url = "https://www.digit.in/top-products/best-gaming-laptops-40.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    laptops = soup.find_all('div', {'class': 'TopNumbeHeading sticky-footer'})
    
    laptop_list = []
    
    for laptop in laptops:
        try:
            name = laptop.find('h2').get_text(strip=True)
            details = laptop.find_all('li')
            details_text = [detail.get_text(strip=True) for detail in details]
            laptop_details = {
                'Name': name,
                'Details': details_text
            }
            laptop_list.append(laptop_details)
        except Exception as e:
            continue
    
    return laptop_list

# Example usage
laptop_details = scrape_digit_best_gaming_laptops()

# Convert to DataFrame and save as CSV
df = pd.DataFrame(laptop_details)
df.to_csv("best_gaming_laptops_digit.csv", index=False)
print("Scraped data saved to best_gaming_laptops_digit.csv")

# Display some scraped data
print(df.head())


# Q7

In [None]:
def scrape_forbes_billionaires():
    url = "https://www.forbes.com/billionaires/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    billionaires = soup.find_all('div', {'class': 'personName'})
    
    billionaire_list = []
    
    for billionaire in billionaires:
        try:
            rank = billionaire.find('div', {'class': 'rank'}).get_text(strip=True)
            name = billionaire.find('div', {'class': 'name'}).get_text(strip=True)
            net_worth = billionaire.find('div', {'class': 'netWorth'}).get_text(strip=True)
            age = billionaire.find('div', {'class': 'age'}).get_text(strip=True)
            citizenship = billionaire.find('div', {'class': 'country'}).get_text(strip=True)
            source = billionaire.find('div', {'class': 'source'}).get_text(strip=True)
            industry = billionaire.find('div', {'class': 'category'}).get_text(strip=True)
            billionaire_details = {
                'Rank': rank,
                'Name': name,
                'Net worth': net_worth,
                'Age': age,
                'Citizenship': citizenship,
                'Source': source,
                'Industry': industry
            }
            billionaire_list.append(billionaire_details)
        except Exception as e:
            continue
    
    return billionaire_list

# Example usage
billionaire_details = scrape_forbes_billionaires()

# Convert to DataFrame and save as CSV
df = pd.DataFrame(billionaire_details)
df.to_csv("forbes_billionaires.csv", index=False)
print("Scraped data saved to forbes_billionaires.csv")

# Display some scraped data
print(df.head())


# Q8

In [None]:
from selenium.webdriver.common.action_chains import ActionChains

def scrape_youtube_comments(video_url, num_comments=500):
    # Set up the Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    driver.get(video_url)
    time.sleep(5)  # Allow time for the video page to load
    
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    
    comments = []
    while len(comments) < num_comments:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(2)
        
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        
        comment_elements = driver.find_elements(By.CSS_SELECTOR, "#content-text")
        for element in comment_elements:
            try:
                comment_text = element.text
                comment_upvotes = element.find_element(By.CSS_SELECTOR, "#vote-count-middle").text
                comment_time = element.find_element(By.CSS_SELECTOR, "#header-author > yt-formatted-string > a").text
                comment_details = {
                    'Comment': comment_text,
                    'Upvotes': comment_upvotes,
                    'Time': comment_time
                }
                if len(comments) < num_comments:
                    comments.append(comment_details)
            except Exception as e:
                continue
    
    driver.quit()
    return comments

# Example usage
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
comments = scrape_youtube_comments(video_url, num_comments=500)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(comments)
df.to_csv("youtube_comments.csv", index=False)
print("Scraped data saved to youtube_comments.csv")

# Display some scraped data
print(df.head())


# Q9

In [None]:
def scrape_hostelworld(location):
    search_url = f"https://www.hostelworld.com/search?search_keywords={location.replace(' ', '%20')}&country=England"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    hostels = soup.find_all('div', {'class': 'property-card'})
    
    hostel_list = []
    
    for hostel in hostels:
        try:
            name = hostel.find('h2', {'class': 'title'}).get_text(strip=True)
            distance = hostel.find('span', {'class': 'distance'}).get_text(strip=True)
            rating = hostel.find('div', {'class': 'score'}).get_text(strip=True)
            reviews = hostel.find('span', {'class': 'reviews'}).get_text(strip=True)
            overall = hostel.find('div', {'class': 'keyword'}).get_text(strip=True)
            privates_price = hostel.find('a', {'class': 'prices'}).get_text(strip=True)
            dorms_price = hostel.find('a', {'class': 'prices'}).get_text(strip=True)
            facilities = hostel.find('div', {'class': 'facilities'}).get_text(strip=True)
            description = hostel.find('div', {'class': 'desc'}).get_text(strip=True)
            hostel_details = {
                'Hostel Name': name,
                'Distance from City Centre': distance,
                'Rating': rating,
                'Total Reviews': reviews,
                'Overall Reviews': overall,
                'Privates from Price': privates_price,
                'Dorms from Price': dorms_price,
                'Facilities': facilities,
                'Property Description': description
            }
            hostel_list.append(hostel_details)
        except Exception as e:
            continue
    
    return hostel_list

# Example usage
location = "London"
hostel_details = scrape_hostelworld(location)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(hostel_details)
df.to_csv(f"hostels_in_{location}.csv", index=False)
print(f"Scraped data saved to hostels_in_{location}.csv")

# Display some scraped data
print(df.head())


- Summary
- Q1 & Q2: Scrape Amazon product listings based on a user input and save to CSV.
- Q3: Scrape image URLs from Google Images for specified keywords.
- Q4: Scrape smartphone details from Flipkart's first search results page.
- Q5: Scrape geospatial coordinates of a city from Google Maps.
- Q6: Scrape best gaming laptops details from Digit.in.
- Q7: Scrape billionaire details from Forbes.
- Q8: Scrape YouTube comments and save to CSV.
- Q9: Scrape hostel details from Hostelworld for a specified location.

These solutions utilize a combination of requests, BeautifulSoup, selenium, and pandas to automate the web scraping tasks and save the results in a structured format like CSV.