## Part-1 LXML Basics

In [1]:
from lxml import etree


In [43]:
tree = etree.parse("web_page.html")
print(tree )

<lxml.etree._ElementTree object at 0x000002234B91E048>


In [44]:
print (etree.tostring(tree, pretty_print=True).decode("utf-8"))

<html lang="en">

<head>
    <title>This is the title</title>
</head>

<body>
    <p>Hello World</p>
    <ul>
        <li id="myID">Web Scraping with Python using Requests, LXML and Splash</li>
        <li class="myClass">Created by:
            <a href="https://twitter.com/AhmedRafik__">Ahmed Rafik</a>
        </li>
    </ul>
</body>

</html>



In [51]:
title_element = tree.find("head/title")
print(title_element.text)

paragraph_element = tree.find("body/p")
print( paragraph_element.text )


list_items = tree.findall("body/ul/li")

# list all paragraph
print("---------")
for li in list_items:
    print(li.text)

print("------------")
for li in list_items:
    a = li.find('a')
    if a is not None:
        print(f"{li.text.strip()} {a.text}")
    else:
        print(li.text)

This is the title
Hello World
---------
Web Scraping with Python using Requests, LXML and Splash
Created by:
            
------------
Web Scraping with Python using Requests, LXML and Splash
Created by: Ahmed Rafik


In [59]:
# method 2: LXML with XPATH

tree = etree.parse('web_page.html')

title_element = tree.xpath('//title/text()')[0]
print(title_element)

paragraph_element = tree.xpath('//p/text()')[0]
print(paragraph_element)

#list all paragraph;
list_items = tree.xpath('//li')

for li in list_items:
    # remove spaces and \n
    # print in strings;
    text = ' '.join( map( str.strip, li.xpath(".//text()") ) )
    print( text)
    


This is the title
Hello World
Web Scraping with Python using Requests, LXML and Splash
Created by: Ahmed Rafik 


In [69]:
# method 3: LXML with CSS selectors

tree = etree.parse('web_page.html')
html = tree.getroot()

title_element = html.cssselect("title")[0]
print(title_element.text)

paragraph_element = html.cssselect('p')[0]
print(paragraph_element.text)

print("-------")
list_items = html.cssselect("li")
for li in list_items:
    a = li.cssselect('a')
   
    if len(a) == 0:
        print(li.text)
    else:
        print(f"{li.text.strip()} {a[0].text} ")

This is the title
Hello World
-------
Web Scraping with Python using Requests, LXML and Splash
Created by: Ahmed Rafik 


In [37]:
# 
for li in list_items:
    a = li.find("a")
    if a is not None:
        print(f"{li.text.strip()} {a.text}")
    else:
        print(f"{li.text}" )

Web Scraping with Python using Requests, LXML and Splash
Created by: Ahmed Rafik


## Part-2 XPATH and CSS Selectors

XPATH: XML file<br>

https://scrapinghub.github.io/xpath-playground/ <br>

CSS Selectors: HTML <br>

https://try.jsoup.org/

**Parse data from the web page**

In [27]:
import pandas as pd

df = pd.read_html("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
df[0]

Unnamed: 0,0,1
0,UPC,a897fe39b1053632
1,Product Type,Books
2,Price (excl. tax),£51.77
3,Price (incl. tax),£51.77
4,Tax,£0.00
5,Availability,In stock (22 available)
6,Number of reviews,0


In [10]:
import requests
from lxml import html
import re


In [30]:
import requests

bookurl ="http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
resp = requests.get(url=bookurl, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    })
resp

<Response [200]>

In [14]:
tree = html.fromstring(html=resp.text)
product_main = tree.xpath("//div[contains(@class, 'product_main')]")[0]
print(etree.tostring(product_main, pretty_print=True).decode("utf-8"))

<div class="col-sm-6 product_main">
            
            
            <h1>A Light in the Attic</h1>

            
                






    
        <p class="price_color">&#194;&#163;51.77</p>
    

<p class="instock availability">
    <i class="icon-ok"/>
    
        In stock (22 available)
    
</p>

            

            
                



    <p class="star-rating Three">
        <i class="icon-star"/>
        <i class="icon-star"/>
        <i class="icon-star"/>
        <i class="icon-star"/>
        <i class="icon-star"/>

        <!-- <small><a href="/catalogue/a-light-in-the-attic_1000/reviews/">
        
                
                    0 customer reviews
                
        </a></small>
         -->&#160;


<!-- 
    <a id="write_review" href="/catalogue/a-light-in-the-attic_1000/reviews/add/#addreview" class="btn btn-success btn-sm">
        Write a review
    </a>

 --></p>

            

            <hr/>



            
                






       

In [17]:
title = product_main.xpath(".//h1/text()")[0]
title

'A Light in the Attic'

In [18]:
price = product_main.xpath(".//p[1]/text()")[0]


In [22]:
availability = product_main.xpath(".//p[2]/text()")[1].strip()
availability


'In stock (22 available)'

In [23]:
in_stock = ''.join(list(filter(lambda x: x.isdigit(), availability)))
in_stock

'22'

In [21]:
description = tree.xpath(
    "//div[@id='product_description']/following-sibling::p/text()")[0]
description

"It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounde

In [25]:
book_information = {
        'title': title,
        'price': price,
        'in_stock': in_stock,
        'description': description
    }
book_information

{'title': 'A Light in the Attic',
 'price': 'Â£51.77',
 'in_stock': '22',
 'description': "It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And y

In [32]:

web_page = "https://www.ebay.com/trending"
#Response object
resp = requests.get(url= web_page, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
})

resp

<Response [200]>

In [33]:
#Element Tree object
tree = html.fromstring(html=resp.text)

#Get trending items
trending_items = tree.xpath("//div[contains(@id, 'topic')]/div[@class='topic-container']")

trending_items

[<Element div at 0x19be80a5a48>,
 <Element div at 0x19be8015d68>,
 <Element div at 0x19be8015b88>,
 <Element div at 0x19be8015db8>,
 <Element div at 0x19be8015c78>,
 <Element div at 0x19be8015a98>,
 <Element div at 0x19be8015408>,
 <Element div at 0x19be8015e08>,
 <Element div at 0x19be8015e58>,
 <Element div at 0x19be8015ea8>]

In [36]:

all_products = []

for trending_item in trending_items:
    item = {
        'name': trending_item.xpath(".//h2[@class='title']/a/text()")[0],
        'url': trending_item.xpath(".//h2[@class='title']/a/@href")[0],
        'info': trending_item.xpath(".//div[@class='info']/p/text()")[0],
        'searches': trending_item.xpath(".//div[@class='info']/div[@class='graph']/div[@class='stats']/div/strong/text()")[0]
    }
    all_products.append(item)

In [38]:
all_products

[{'name': 'Mahomes Magic Crunch',
  'url': 'https://www.ebay.com/sch/i.html?_nkw=mahomes+magic+crunch&_trksid=p2245348.m3768.l6570&_trkparms=topicId%3D7749&_trksid=p2245348.m3768.l6569&_trkparms=topicId%3D7749',
  'info': 'The reigning NFL MVP, Kansas City Chiefs quarterback Patrick Mahomes, just got his face on a cereal box. And not just any cereal, but a custom one, created just for him. Will he eat it from a super bowl?',
  'searches': '7950'},
 {'name': "Hot Wheels '55 Chevy Gasser",
  'url': 'https://www.ebay.com/sch/180506/i.html?_nkw=hot+wheels+rlc+gasser&_trksid=p2245348.m3769.l6569&_trkparms=topicId%3D7747',
  'info': 'Hot, indeed. The black and green FLYNTGR is the latest ‘55 Chevy Bel Air Gasser released exclusively for members of the Hot Wheels Red Line Club.',
  'searches': '5732'},
 {'name': 'Starter Jacket',
  'url': 'https://www.ebay.com/sch/i.html?_nkw=starter+jacket&_trksid=p2245348.m3770.l6569&_trkparms=topicId%3D7737',
  'info': "Platform sandals, scrunchies, choker