## LXML SCRAPING

In [11]:
import lxml.html as web
from lxml.etree import XPath
import math
import csv

### Making URLs and Columns

In [35]:
# Site root; every other URL in this notebook is derived from it.
baseUrl = "http://books.toscrape.com/"

# Index page of the category being scraped (not referenced by the scraping
# loop, which uses the numbered pages built from pageUrl instead).
bookUrl = "".join((baseUrl, "catalogue/category/books/mystery_3/index.html"))

# Prefix of the numbered listing pages; the scraping loop appends
# "<page number>.html" to it (page-1.html, page-2.html, ...).
pageUrl = "".join((baseUrl, "catalogue/category/books/mystery_3/page-"))

# CSV header. The order must match the order in which the scraping loop
# appends values to each row of dataSet.
columns = [
    "title",
    "price",
    "stock",
    "imageUrl",
    "rating",
    "url",
]

### Making empty Dataset and default page values

In [36]:
# One list per scraped book is appended here by the scraping loop.
dataSet = list()

# The crawl starts on page 1 and assumes a single page until the real page
# count has been read from the first response.
page = totalPages = 1

### Define a helper that saves the dataset to a CSV file

In [37]:
def writeto_csv(data, filename, columns):
    """Write ``data`` to ``filename`` as a UTF-8 CSV file with a header row.

    Parameters
    ----------
    data : iterable of sequences
        One sequence per output row. Values are written positionally, so
        each row must list its values in the same order as ``columns``.
    filename : str or path-like
        Destination file. It is created if missing and truncated otherwise.
    columns : sequence of str
        Column names, written as the first line of the file.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    """
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; otherwise blank lines can appear on Windows.
    with open(filename, "w", newline="", encoding="UTF-8") as file:
        # A single plain writer handles both the header and the rows: the
        # rows are positional sequences, so no DictWriter is needed.
        writer = csv.writer(file)
        writer.writerow(columns)
        writer.writerows(data)

### Loop through pages

In [38]:
# Reset the crawl state in this cell so that re-running it starts a fresh
# scrape. Without the reset a second run would do nothing, because `page`
# is already greater than `totalPages` after the first run.
dataSet = []
page = 1
totalPages = 1


def _first(matches, default=""):
    """Return the first XPath result in `matches`, or `default` when empty."""
    return matches[0] if matches else default


# The XPath expressions do not depend on the page being processed, so they
# are compiled once up front instead of once per loop iteration.
articles = XPath("//ol[contains(@class,'row')]/li[position()>0]")  # one <li> per book
# The remaining paths are relative (".//") and are evaluated per book block.
titlePath = XPath(".//article[contains(@class,'product_pod')]/h3/a/@title")
linkPath = XPath(".//article[contains(@class,'product_pod')]/h3/a/@href")
pricePath = XPath(".//article/div[2]/p[contains(@class,'price_color')]/text()")
stockPath = XPath(
    # [normalize-space()] keeps only text nodes that are not pure whitespace
    ".//article/div[2]/p[contains(@class,'availability')]/text()[normalize-space()]"
)
imagePath = XPath(
    ".//article/div[1][contains(@class,'image_container')]/a/img/@src"
)
ratingPath = XPath(".//article/p[contains(@class,'star-rating')]/@class")

while page <= totalPages:
    # Download and parse the numbered listing page (page-1.html, page-2.html, ...).
    source = web.parse(pageUrl + str(page) + ".html").getroot()

    if page == 1:
        # Pagination: the first page carries the counters needed to work out
        # how many pages exist. The third <strong> is used as the page size
        # and the first <strong> as the total number of books.
        perpageArticles = source.xpath(
            '//form[@class="form-horizontal"]/strong[3]/text()'
        )
        totalArticles = source.xpath(
            '//form[@class="form-horizontal"]/strong[1]/text()'
        )
        if perpageArticles and totalArticles:
            # e.g. 32 books at 20 per page -> ceil(1.6) = 2 pages
            totalPages = math.ceil(
                int(totalArticles[0]) / int(perpageArticles[0])
            )
        # Otherwise the counters are absent (presumably a listing that fits
        # on one page - TODO confirm against the site) and totalPages stays 1
        # instead of raising IndexError.
        print("TotalPages found:", totalPages)
    print("Processing Page " + str(page) + " from ", totalPages)

    # Extract every field from each book block on this page.
    for row in articles(source):
        # A field whose XPath matches nothing becomes "" instead of raising
        # IndexError, so the title check below can actually skip bad rows.
        title = _first(titlePath(row)).strip()
        # hrefs are relative to the category page; rewrite them as absolute URLs
        link = (
            _first(linkPath(row))
            .replace("../../../", baseUrl + "catalogue/")
            .strip()
        )
        price = _first(pricePath(row))
        availability = _first(stockPath(row)).strip()
        image = _first(imagePath(row)).replace("../../../../", baseUrl).strip()
        # class attribute looks like "star-rating Three"; keep only the rating word
        rating = _first(ratingPath(row)).replace("star-rating", "").strip()

        # Only keep rows that have a title; the value order matches `columns`.
        if title:
            dataSet.append([title, price, availability, image, rating, link])

    print("Rows in Dataset: ", len(dataSet))
    page += 1  # advance to the next listing page

TotalPages found: 2
Processing Page 1 from  2
Rows in Dataset:  20
Processing Page 2 from  2
Rows in Dataset:  32


### Write the dataset (a list of rows) to a CSV file

In [40]:
# Persist every scraped row to MysteryBooks.csv in the working directory,
# using `columns` as the header line.
writeto_csv(
    data=dataSet,
    filename="MysteryBooks.csv",
    columns=columns,
)