# Web Scraping for Competitor and Price Analysis

## Problem Statement

An online book retailer has identified lower sales in the "Travel" and "Nonfiction" categories. To address this, the company needs to scrape data from the competitor's website, [Books to Scrape](https://books.toscrape.com/), specifically from the "Travel" and "Nonfiction" categories. The objective is to collect detailed information about the books in these categories for competitor and price analysis.

## Project Tasks

### Task 1: Configure and Launch the Browser

1. **Set Up Chrome Options**
   Use Selenium's `webdriver` module to create a `ChromeOptions` object that holds the browser launch flags.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Build the Chrome options object and register each launch flag in turn.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--disable-notifications",
    "--start-maximized",
):
    options.add_argument(chrome_flag)

2. **Start ChromeDriver Service**
   Initialize the ChromeDriver service with the path to the executable.

In [None]:
# Point the ChromeDriver service at the local chromedriver binary.
# 'path_to_your_chromedriver' is a placeholder and must be replaced with a real path.
service = Service(executable_path='path_to_your_chromedriver')
# Launch Chrome through that service, applying the options configured earlier.
driver = webdriver.Chrome(service=service, options=options)

### Task 2: Inspect and Scrape the Homepage

1. **Open the Homepage**
   Load the homepage and observe its structure.

In [None]:
import time
# Pause length, in seconds, applied after loading the homepage.
SLEEP_TIME = 2

# Navigate the browser to the site's homepage, then wait SLEEP_TIME seconds
# before the following cells query the page.
driver.get("http://books.toscrape.com")
time.sleep(SLEEP_TIME)

2. **Extract Category Links**
   Write an XPath query to find elements for "Travel" and "Nonfiction" category links.

In [None]:
# Matches every <a> element whose text contains "Travel" or "Nonfiction".
# The match is case-sensitive and is not restricted to any particular part of the page.
category_elements_xpath = "//a[contains(text(), 'Travel') or contains(text(), 'Nonfiction')]"

3. **Scrape Category Links**
   Find the elements using the XPath query and extract the category URLs.

In [None]:
# Locate all anchors matched by the category XPath on the current page.
category_elements = driver.find_elements(By.XPATH, category_elements_xpath)
# Keep only the link target (href) of each matched anchor.
category_urls = [element.get_attribute('href') for element in category_elements]
# Bare expression: presumably here so the notebook cell displays the list.
category_urls

### Task 3: Inspect and Scrape the Category Page

1. **Extract Book Links from Category Page**
   Write an XPath query to find book detail links.

In [None]:
# Matches every <a> nested inside a <div class="image_container">; the cells
# below read each match's href as a book detail link.
book_elements_xpath = "//div[@class='image_container']//a"

2. **Scrape Book Links**
   Find and extract the book detail links from the category pages, including handling pagination.

In [None]:
# Upper bound on how many listing pages are visited per category.
MAX_PAGINATION = 3
book_urls = []
# Walk every matched category instead of a hard-coded index, so no category
# is silently skipped and a short category list cannot raise IndexError.
for category_url in category_urls:
    for i in range(1, MAX_PAGINATION + 1):
        # Page 1 is the category URL itself; later pages swap "index" for "page-<n>".
        update_url = category_url.replace("index", f"page-{i}") if i > 1 else category_url
        driver.get(update_url)
        book_elements = driver.find_elements(By.XPATH, book_elements_xpath)
        # No book links on this page: stop paginating this category.
        if not book_elements:
            break
        book_urls.extend([element.get_attribute('href') for element in book_elements])
# Bare expression: presumably here so the notebook cell displays the list.
book_urls

### Task 4: Scrape Product Detail Pages

1. **Extract Book Details**
   For each book detail page, scrape the following information:
   - Book Title
   - Book Price
   - Book Star Rating
   - Book Description
   - Product Information

In [None]:
from bs4 import BeautifulSoup
import re

def get_book_details(driver, book_url):
    """Load a book detail page and return its scraped fields.

    Args:
        driver: Selenium WebDriver used to load the page.
        book_url: URL of the book detail page.

    Returns:
        A dict with the keys "Name", "Price", "Stars", "Description" and
        "Product Information". "Product Information" is itself a dict built
        from the header/value cells of the first table on the page.
        "Description" is an empty string when the page has no description
        block.
    """
    driver.get(book_url)
    time.sleep(0.3)
    # Hand the rendered markup of the content container to BeautifulSoup so
    # the remaining lookups run locally instead of through WebDriver calls.
    content_div = driver.find_element(By.XPATH, "//div[@class='content']")
    inner_html = content_div.get_attribute('innerHTML')
    soup = BeautifulSoup(inner_html, 'html.parser')

    book_name = soup.find('h1').get_text()
    book_price = soup.find("p", attrs={"class": "price_color"}).get_text()
    # The rating <p> carries the classes "star-rating <Word>"; the last class
    # token is kept as the rating value.
    regex = re.compile('^star-rating ')
    book_star_count = soup.find("p", attrs={"class": regex})["class"][-1]

    # `find` returns None when the page has no description block; fall back
    # to an empty string instead of raising AttributeError on None.
    desc_header = soup.find("div", attrs={"id": "product_description"})
    desc_node = desc_header.find_next_sibling() if desc_header is not None else None
    book_desc = desc_node.get_text() if desc_node is not None else ""

    # Each table row contributes one header -> value pair.
    product_info = {
        row.find("th").get_text(): row.find("td").get_text()
        for row in soup.find("table").find_all("tr")
    }

    return {
        "Name": book_name,
        "Price": book_price,
        "Stars": book_star_count,
        "Description": book_desc,
        "Product Information": product_info
    }

### Task 5: Automate and Organize the Process

1. **Function Implementation**
   Define functions to automate the scraping and data collection process.

In [None]:
def setup_driver(executable_path='path_to_your_chromedriver'):
    """Create and return a Chrome WebDriver configured for scraping.

    Args:
        executable_path: Path to the chromedriver binary. The default is the
            placeholder string used elsewhere in this notebook; pass the real
            location of the binary when calling.

    Returns:
        A ``webdriver.Chrome`` instance started with the launch flags below.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        "--start-maximized",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-popup-blocking",
        "--disable-notifications",
    ):
        options.add_argument(flag)
    service = Service(executable_path=executable_path)
    return webdriver.Chrome(service=service, options=options)

def get_category_urls(driver):
    """Return the links and names of the Travel/Nonfiction category anchors.

    Searches the page currently loaded in ``driver`` for anchors whose text
    contains "Travel" or "Nonfiction".

    Returns:
        A ``(urls, names)`` tuple of two parallel lists: each anchor's href
        and its whitespace-stripped visible text.
    """
    xpath = "//a[contains(text(), 'Travel') or contains(text(), 'Nonfiction')]"
    urls = []
    names = []
    for anchor in driver.find_elements(By.XPATH, xpath):
        urls.append(anchor.get_attribute('href'))
        # Also keep the category's display name alongside its URL.
        names.append(anchor.text.strip())
    return urls, names

def get_book_urls(driver, category_url, max_pagination=3):
    """Collect book detail links from the listing pages of one category.

    Args:
        driver: Selenium WebDriver used to load each listing page.
        category_url: URL of the category's first listing page.
        max_pagination: Maximum number of listing pages to visit.

    Returns:
        A list of href values, in page order. Collection stops early at the
        first page that yields no book links.
    """
    links_xpath = "//div[@class='image_container']//a"
    collected = []
    for page_no in range(1, max_pagination + 1):
        # The first page is the category URL itself; later pages swap
        # "index" for "page-<n>".
        if page_no == 1:
            page_url = category_url
        else:
            page_url = category_url.replace("index", f"page-{page_no}")
        driver.get(page_url)
        time.sleep(0.3)
        anchors = driver.find_elements(By.XPATH, links_xpath)
        if not anchors:
            break
        for anchor in anchors:
            collected.append(anchor.get_attribute('href'))
    return collected

def scrape_books(driver):
    """Scrape every book in the matched categories into a DataFrame.

    Loads the homepage, finds the category links, visits each category's
    listing pages, and scrapes each book's detail page.

    Args:
        driver: Selenium WebDriver used for all page loads.

    Returns:
        A pandas DataFrame with one row per scraped book, holding the fields
        returned by ``get_book_details`` plus a "Category" column.
    """
    driver.get("http://books.toscrape.com")
    urls, names = get_category_urls(driver)

    records = []
    for url, name in zip(urls, names):
        for book_url in get_book_urls(driver, url):
            details = get_book_details(driver, book_url)
            # Tag the record with the category it was found under.
            details['Category'] = name
            records.append(details)

    import pandas as pd
    # Build the DataFrame from the collected records.
    return pd.DataFrame(records)

2. **Run the Scraping Process**
   Use the functions to collect data and create a DataFrame.

In [None]:
# An earlier cell may already have bound `driver` to a running browser.
# Rebinding the name below would leave that session open with nothing left
# to close it, so shut it down first (skipped when no `driver` exists yet).
if "driver" in globals():
    driver.quit()

# Start a fresh browser and run the full scraping pipeline.
driver = setup_driver()
books_df = scrape_books(driver)

### Data Visualization

1. **Price vs. Star Ratings**
   Create a scatter plot to visualize the relationship between book prices and star ratings.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert star ratings to numerical values
star_mapping = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
books_df['Stars'] = books_df['Stars'].map(star_mapping)

# Convert price to float by stripping the pound sign
books_df['Price'] = books_df['Price'].replace('£', '', regex=True).astype(float)

# Scatter plot, coloured by category so the legend requested below has
# labelled entries to show (without `hue` the legend is empty).
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Price', y='Stars', hue='Category', data=books_df)
plt.title('Book Prices vs. Star Ratings')
plt.xlabel('Price (£)')
plt.ylabel('Star Ratings')
plt.legend(title='Category')
plt.show()

2. **Price Distribution by Category**
   Create a box plot to show price distribution by category.

In [None]:
# One box per category, matching the plot title and the x-axis label
# (the x variable was previously the star rating).
plt.figure(figsize=(10, 6))
sns.boxplot(x='Category', y='Price', data=books_df)
plt.title('Price Distribution by Category')
plt.xlabel('Category')
plt.ylabel('Price (£)')
plt.show()

3. **Distribution of Prices and Star Ratings**
   Plot histograms for the distribution of prices and star ratings.

In [None]:
# Two side-by-side panels: prices on the left, star ratings on the right.
fig, (price_ax, stars_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Price distribution
sns.histplot(books_df['Price'], kde=True, ax=price_ax)
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price (£)')

# Star rating distribution
sns.histplot(books_df['Stars'], kde=True, bins=5, ax=stars_ax)
stars_ax.set_title('Star Rating Distribution')
stars_ax.set_xlabel('Stars')

fig.tight_layout()
plt.show()

4. **Close the Browser**
   After completing the scraping and analysis, close the browser.

In [None]:
# End the WebDriver session now that scraping and analysis are finished.
driver.quit()