In [1]:
from lxml import etree

In [2]:
tree = etree.parse("code.html")
tree

<lxml.etree._ElementTree at 0x1111625dec8>

In [3]:
etree.tostring(tree)

b'<!DOCTYPE html>\n<html lang="en">\n        <head>\n            <title>XPath and CSS Selectors</title>\n        </head>\n        <body>\n            <h1>CSS Selectors simplified</h1>\n            <div class="intro">\n                <p>\n                    I\'m paragraph within a div with a class set to intro\n                    <span id="location">I\'m a span with ID set to location and i\'m within a paragraph</span>\n                </p>\n                <p id="outside">I\'m a paragraph with ID set to outside and i\'m within a div with a class set to intro</p>\n            </div>\n            <p>Hi i\'m placed immediately after a div with a class set to intro</p>\n            <span class="intro">Div with a class attribute set to intro</span>\n\n            <ul id="items">\n                <li data-identifier="7">Item 1</li>\n                <li>Item 2</li>\n                <li>Item 3</li>\n                <li>Item 4</li>\n            </ul>\n\n            <a href="https://www.googl

In [4]:
html = tree.getroot()
html

<Element html at 0x111162b1088>

## fundamentals of css selectors

In [5]:
## scrap the header
html.cssselect("h1")[0].text

'CSS Selectors simplified'

We need to be specific while selecting the css selectors. Otherwise if a generic name is provided then there might be too many elements of same tags. for example if we want to call elements with class = "intro":

In [6]:
for elem in html.cssselect(".intro"):
    print(etree.tostring(elem))
    print("-"*30)

b'<div class="intro">\n                <p>\n                    I\'m paragraph within a div with a class set to intro\n                    <span id="location">I\'m a span with ID set to location and i\'m within a paragraph</span>\n                </p>\n                <p id="outside">I\'m a paragraph with ID set to outside and i\'m within a div with a class set to intro</p>\n            </div>\n            '
------------------------------
b'<span class="intro">Div with a class attribute set to intro</span>\n\n            '
------------------------------


This will give two elements:
- div
- span
if we want a specific tag say for example, div with class intro then we can type:

In [7]:
for elem in html.cssselect("div.intro"):
    print(etree.tostring(elem))
    print("-"*30)

b'<div class="intro">\n                <p>\n                    I\'m paragraph within a div with a class set to intro\n                    <span id="location">I\'m a span with ID set to location and i\'m within a paragraph</span>\n                </p>\n                <p id="outside">I\'m a paragraph with ID set to outside and i\'m within a div with a class set to intro</p>\n            </div>\n            '
------------------------------


Similarly we can call by the element ids. remember ids are unique so just calling the ids is enough for an element to be unique:

In [8]:
for elem in html.cssselect("#location"):
    print(etree.tostring(elem))
    print("-"*30)

b'<span id="location">I\'m a span with ID set to location and i\'m within a paragraph</span>\n                '
------------------------------


Sometimes we might wanna call a element with two classes:

In [9]:
for elem in html.cssselect(".bold.italic"):
    print(etree.tostring(elem))
    print("-"*30)

b'<p class="bold italic">Hi, I have two classes</p>\n            '
------------------------------


### css selectors by attributes

like we want to select the below tag:

```html
<li data-identifier="7">Item 1</li>
```

In [10]:
html.cssselect("li[data-identifier='7']")

[<Element li at 0x11116393e08>]

Alternatively we can select it using directly:

In [11]:
html.cssselect("[data-identifier='7']")

[<Element li at 0x11116393e08>]

### css selectors with conditions

what if we want all a tag elements with href staring with "https":

In [12]:
elem = html.cssselect("a[href^=https]")
print(elem)
etree.tostring(elem[0])

[<Element a at 0x111163d3e48>]


b'<a href="https://www.google.com">Google</a>\n            '

Alternatively, a tag with href ending with "fr":

In [13]:
elem = html.cssselect("a[href$=fr]")
print(elem)
etree.tostring(elem[0])

[<Element a at 0x111163d3cc8>]


b'<a href="http://www.google.fr">Google France</a>\n\n            '

And finally finding some tag with attributes containg some string in between:

In [14]:
elems = html.cssselect("a[href*=google]")
print(elems,"\n")
for elem in elems:
    print(etree.tostring(elem))

[<Element a at 0x111163f0088>, <Element a at 0x111163d3cc8>] 

b'<a href="https://www.google.com">Google</a>\n            '
b'<a href="http://www.google.fr">Google France</a>\n\n            '


### selecting elements by positions:

what if we want a all the "p" tags inside the div tag with class="intro"

```html
<div class="intro">
    <p>
        I'm paragraph within a div with a class set to intro
    <span id="location">I'm a span with ID set to location and i'm within a paragraph</span>
    </p>
    <p id="outside">I'm a paragraph with ID set to outside and i'm within a div with a class set to intro</p>
</div>
```

In [15]:
elems = html.cssselect("div.intro p")
print(elems)
print("-"*30)

for elem in elems:
    print(elem.text)
#     print(etree.tostring(elem))
    print("-"*30)

[<Element p at 0x111163f05c8>, <Element p at 0x111163f0688>]
------------------------------

                    I'm paragraph within a div with a class set to intro
                    
------------------------------
I'm a paragraph with ID set to outside and i'm within a div with a class set to intro
------------------------------


Here the span is not selected which is inside the p tag. We can specify other tags for example span here to be selected like this:

In [16]:
elems = html.cssselect("div.intro p, span#location")
print(elems)
print("-"*30)

for elem in elems:
    print(elem.text)
#     print(etree.tostring(elem))
    print("-"*30)

[<Element p at 0x111163f05c8>, <Element span at 0x111163f0808>, <Element p at 0x111163f0688>]
------------------------------

                    I'm paragraph within a div with a class set to intro
                    
------------------------------
I'm a span with ID set to location and i'm within a paragraph
------------------------------
I'm a paragraph with ID set to outside and i'm within a div with a class set to intro
------------------------------


Alternatively we can specify the direct children symbol ">", which will give the same result. This will return the direct children of the parent tag.

To select element placed immediately after a tag, we use + symbol:

In [17]:
elems = html.cssselect("div.intro + p")
print(elems)
print("-"*30)

for elem in elems:
    print(elem.text)
#     print(etree.tostring(elem))
    print("-"*30)

[<Element p at 0x111163f0bc8>]
------------------------------
Hi i'm placed immediately after a div with a class set to intro
------------------------------


Alternatively, if we want to select elements after an element that are not immediately placed after it:

In [18]:
elems = html.cssselect("div.intro ~ p")
print(elems)
print("-"*30)

for elem in elems:
    print(elem.text)
#     print(etree.tostring(elem))
    print("-"*30)

[<Element p at 0x111163f0bc8>, <Element p at 0x111163e8088>, <Element p at 0x111163e80c8>]
------------------------------
Hi i'm placed immediately after a div with a class set to intro
------------------------------
Hi, I have two classes
------------------------------
Hi i'm bold
------------------------------


**Selecting list items by their index:**

In [19]:
elems = html.cssselect("li:nth-child(1)")
elems[0].text

'Item 1'

Selecting two or more:

In [20]:
elems = html.cssselect("li:nth-child(1),li:nth-child(3)")
for e in elems:
    print(e.text)

Item 1
Item 3


Selecting only odd rows:

In [21]:
elems = html.cssselect("li:nth-child(odd)")
for e in elems:
    print(e.text)

Item 1
Item 3


Similarly, selecting only even rows:

In [22]:
elems = html.cssselect("li:nth-child(even)")
for e in elems:
    print(e.text)

Item 2
Item 4


## Fundamentals of xpath selectors

In [23]:
tree = etree.parse("code2.html")
tree

<lxml.etree._ElementTree at 0x111163e8648>

selecting header:

In [24]:
elems = tree.xpath("//h1")
print(elems)
elems[0].text

[<Element h1 at 0x111163f01c8>]


'XPath Selectors simplified'

Now let's say we want all the ```<p>``` tags inside a particular ```<div>``` tag:

```html
<div class="intro">
    <p>
        I'm paragraph within a div with a class set to intro
        <span id="location">I'm a span with ID set to location and i'm within a paragraph</span>
    </p>
    <p id="outside">I'm a paragraph with ID set to outside and i'm within a div with a class set to intro</p>
</div>

<div class="outro">
    <p id="unique">I'm in a div with a class attribute set to outro</p>
</div>
```

In [25]:
elems = tree.xpath("//div[@class='intro']/p/text()")
for e in elems:
    print(e.strip())

I'm paragraph within a div with a class set to intro

I'm a paragraph with ID set to outside and i'm within a div with a class set to intro


And suppose we want ```<p>``` tags from both the ```<div>``` tags:

In [26]:
elems = tree.xpath("//div[@class='intro' or @class='outro']/p/text()")
for e in elems:
    print(e.strip())

I'm paragraph within a div with a class set to intro

I'm a paragraph with ID set to outside and i'm within a div with a class set to intro
I'm in a div with a class attribute set to outro


To get the value of attributes inside a tag:

In [27]:
elems = tree.xpath("//a/@href")
for e in elems:
    print(e.strip())

https://www.google.com
http://www.google.fr


Suppose we want ```<a>``` tag with attribute starting with ```https```:

In [28]:
elems = tree.xpath("//a[starts-with(@href,'https')]")
print(elems)
for e in elems:
    print(e.text)

[<Element a at 0x111163e8b48>]
Google


**Suppose we want ```<a>``` tag with attribute ending with ```fr```:**

Using ```ends-with``` function will throw an error. Because this function is not present in xpath 1.0, it is in xpath 2.0.

```python
elems = tree.xpath("//a[ends-with(@href,'fr')]")
print(elems)
for e in elems:
    print(e.text)
    
XPathEvalError: Unregistered function
```
And lxml library only supports xpath 1.0 version. Also google chrome supports xpath 1.0 version.


Now lets look at the tags containing some string value in their attributes using ```contains()``` function:

In [29]:
elems = tree.xpath("//a[contains(@href,'google')]")
print(elems)
for e in elems:
    print(e.text)

[<Element a at 0x111163e8b48>, <Element a at 0x111162b1208>]
Google
Google France


Tags containing some value as text inside:

In [30]:
elems = tree.xpath("//a[contains(text(),'France')]")
print(elems)
for e in elems:
    print(e.text)

[<Element a at 0x111162b1208>]
Google France


**Note:** the text value is case sensitive


Now selecting elements by position usign xpath:

In [31]:
elems = tree.xpath("//ul[@id='items']/li")
print(elems)
for e in elems:
    print(e.text)

[<Element li at 0x111163e8c48>, <Element li at 0x111163e8348>, <Element li at 0x111163e84c8>, <Element li at 0x111163e8ac8>]
Item 1
Item 2
Item 3
Item 4


In [32]:
elems = tree.xpath("//ul[@id='items']/li[1]")
print(elems)
for e in elems:
    print(e.text)

[<Element li at 0x111163e8c48>]
Item 1


In [33]:
elems = tree.xpath("//ul[@id='items']/li[position()=1 or position()=4]")
print(elems)
for e in elems:
    print(e.text)

[<Element li at 0x111163e8c48>, <Element li at 0x111163e8d08>]
Item 1
Item 4


In [63]:
elems = tree.xpath("//ul[@id='items']/li[position()>1]")
print(elems)
for e in elems:
    print(e.text)

[<Element li at 0x1111647d188>, <Element li at 0x1111647d048>, <Element li at 0x111163e8d08>]
Item 2
Item 3
Item 4


## Xpath Axes

### Navigating using xpath selectors (going up)

**These are also down as xpath axes**:

for going down, these axes will be covered in this sections:
- parent
- ancestor
- preceding
- preceding-sibling

Sometimes we might want to return the parent element from the child element.

Suppose we want to know the parent node from the ```<p>``` element:

```html
<div class="outro">
    <p id="unique">I'm in a div with a class attribute set to outro</p>
</div>
```

In [62]:
parent = tree.xpath("//p[@id='unique']/parent::div")
print(parent)

[<Element div at 0x111163e8dc8>]


sometimes we might not know what is the parent element is. In that case we can write ```node()``` function:

In [36]:
parent = tree.xpath("//p[@id='unique']/parent::node()")
print(parent)

[<Element div at 0x111163e8dc8>]


We can also get the html tags that are above the current child tad by using ```ancester```:

This will return the parent as well as the grand parent's node and so on...

In [61]:
parent = tree.xpath("//p[@id='unique']/ancestor::node()")
[print(x) for x in parent]

<Element html at 0x111163e86c8>
<Element body at 0x111163e8ec8>
<Element div at 0x111163e8dc8>


[None, None, None]

we get the tags that are all above the current tag. Also if we want to include the child element:

In [39]:
parent = tree.xpath("//p[@id='unique']/ancestor-or-self::node()")
[print(x) for x in parent]

<Element html at 0x111163e86c8>
<Element body at 0x111163e8ec8>
<Element div at 0x111163e8dc8>
<Element p at 0x111163e8e48>


[None, None, None, None]

There is another tag called ```preceding```:

In [60]:
parent = tree.xpath("//p[@id='unique']/preceding::node()")
y = [print(x) for x in parent]


        
<Element head at 0x111163f01c8>

            
<Element title at 0x111163e8888>
XPath and CSS Selectors

        

        

            
<Element h1 at 0x11116400108>
XPath Selectors simplified


            
<Element div at 0x11116400288>

                
<Element p at 0x11116400188>

                    I'm paragraph within a div with a class set to intro
                    
<Element span at 0x111164002c8>
I'm a span with ID set to location and i'm within a paragraph

                

                
<Element p at 0x111164003c8>
I'm a paragraph with ID set to outside and i'm within a div with a class set to intro

            


            

                


This returns all the html elements except for the ancestors

There is another one called ```preceding-sibling``` tag. That returns the previous sibling having the same parent:

for example:

```html
<div class="intro">
    <p>
        I'm paragraph within a div with a class set to intro
        <span id="location">I'm a span with ID set to location and i'm within a paragraph</span>
    </p>
    <p id="outside">I'm a paragraph with ID set to outside and i'm within a div with a class set to intro</p>
</div>
```

here the two ```<p>``` tags share the same parent. We can get the preceding ```<p>``` tag from the latter one by:

In [55]:
elem = tree.xpath("//p[@id='outside']/preceding-sibling::node()")
for x in elem:
    if type(x) is etree._Element:
        print(x)
        print(x.text.strip())

<Element p at 0x11116400188>
I'm paragraph within a div with a class set to intro


## Navigating using xpath (going down)

**These are also down as xpath axes**:

for going down, these axes will be covered in this sections:
- child
- following
- following-sibling
- descendant

navigating down is quite straight forward which we have been doing from the first xpath i.e.:

```python
tree.xpath("//div[@class='intro']/p")
```

alternatively we can call the following tags by using ```child```:


In [64]:
elems = tree.xpath("//div[@class='intro']/child::node()")
elems

['\n                ',
 <Element p at 0x11116400188>,
 '\n                ',
 <Element p at 0x111164003c8>,
 '\n            ']

If we want all the elements following a closing html tag, we can use ```following```:

In [58]:
elems = tree.xpath("//div[@class='intro']/following::node()")
elems

['\n\n            ',
 <Element div at 0x111163e8dc8>,
 '\n                ',
 <Element p at 0x111162b6548>,
 "I'm in a div with a class attribute set to outro",
 '\n            ',
 '\n\n            ',
 <Element p at 0x1111644ae88>,
 "Hi i'm placed immediately after a div",
 '\n            \n            ',
 <Element span at 0x1111644afc8>,
 'Div with a class attribute set to intro',
 '\n\n            ',
 <Element ul at 0x1111644af08>,
 '\n                ',
 <Element li at 0x1111644a6c8>,
 'Item 1',
 '\n                ',
 <Element li at 0x1111647d188>,
 'Item 2',
 '\n                ',
 <Element li at 0x1111647d048>,
 'Item 3',
 '\n                ',
 <Element li at 0x111163e8d08>,
 'Item 4',
 '\n            ',
 '\n\n            ',
 <Element a at 0x1111647d248>,
 'Google',
 '\n            ',
 <Element a at 0x1111647d288>,
 'Google France',
 '\n        ',
 '\n    ']

Also we can get the html tags that share the same parent and follows a particular tag:

In [59]:
elems = tree.xpath("//div[@class='intro']/following-sibling::node()")
elems

['\n\n            ',
 <Element div at 0x111163e8dc8>,
 '\n\n            ',
 <Element p at 0x1111644ae88>,
 '\n            \n            ',
 <Element span at 0x1111644afc8>,
 '\n\n            ',
 <Element ul at 0x1111644af08>,
 '\n\n            ',
 <Element a at 0x1111647d248>,
 '\n            ',
 <Element a at 0x1111647d288>,
 '\n        ']

To get the children of children of a parent html tag, we use ```descendant```:

In [65]:
elems = tree.xpath("//div[@class='intro']/descendant::node()")
elems

['\n                ',
 <Element p at 0x11116400188>,
 "\n                    I'm paragraph within a div with a class set to intro\n                    ",
 <Element span at 0x11116268108>,
 "I'm a span with ID set to location and i'm within a paragraph",
 '\n                ',
 '\n                ',
 <Element p at 0x111164003c8>,
 "I'm a paragraph with ID set to outside and i'm within a div with a class set to intro",
 '\n            ']