<h1>Getting product links from subcategory_lvl2_link</h1>

In [3]:
import re
import time
import urllib.request
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as BSoup

<h3>Get links from single product page</h3>
<p>First, we get all product cards from the plain HTML. This particular page should yield 24 cards.</p>
<img src="screenshots/4x6_catalog_page.png" align="left" width="300">

<p>Try extracting the link from a single card.</p>

In [4]:
# Sample catalog page used throughout the notebook.
url = 'https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c'

# Download the page and parse the returned HTML.
products_page = requests.get(url)
products_page_soup = BSoup(products_page.text, 'html.parser')

# Stage 1: collect the image box of every product card inside the catalog grid.
product_cards = products_page_soup.select(
    'div.catalog-grid div.product-item_img-box'
)

# Stage 2: inside the first card, locate the anchor and read its relative href.
first_card = product_cards[0]
product_link_raw = first_card.select('a.no-border-product')
relative_href = product_link_raw[0].attrs['href']

# Prefix the site root to obtain an absolute product URL.
product_link = 'https://www.oma.by' + relative_href
print(product_link)


https://www.oma.by/drel-shurupovert-akkumulyatornaya-werker-ewcdl814-1-244685-p


<p>Write a function that extracts all product links from a page, and test it.</p>

In [5]:
def extract_product_links(soup, base_url='https://www.oma.by'):
    """Return the absolute URL of every product card found in ``soup``.

    Args:
        soup: Parsed catalog page (a BeautifulSoup object, or anything that
            exposes a compatible ``select`` method).
        base_url: Site root that relative ``href`` values are resolved
            against. Defaults to the oma.by root used across this notebook.

    Returns:
        A list of absolute product URLs, in the order the cards are returned
        by ``soup.select``. Cards that contain no ``a.no-border-product``
        anchor, or whose anchor has no ``href``, are skipped instead of
        raising ``IndexError`` / ``KeyError``.
    """
    # Descendant selector: image boxes of the cards inside the catalog grid.
    product_cards = soup.select('div.catalog-grid div.product-item_img-box')

    link_array = []
    for card in product_cards:
        anchors = card.select('a.no-border-product')
        if not anchors:
            # Card without a product anchor: nothing to extract.
            continue
        href = anchors[0].attrs.get('href')
        if not href:
            continue
        # urljoin copes with relative paths as well as already-absolute hrefs,
        # which plain string concatenation would turn into a broken URL.
        link_array.append(urljoin(base_url, href))

    return link_array
    

# Smoke test: run the helper on the page parsed above and list each link
# together with its position, so the total count can be checked by eye.
link_array_test = extract_product_links(products_page_soup)
for counter, link in enumerate(link_array_test):
    print(f'{counter} {link}')

0 https://www.oma.by/drel-shurupovert-akkumulyatornaya-werker-ewcdl814-1-244685-p
1 https://www.oma.by/shurupovert-otvertka-akkumulyatornaya-werker-ewcd001-1-244684-p
2 https://www.oma.by/drel-shurupovert-bosch-easydrill-1200-06039a210b-1-247717-p
3 https://www.oma.by/perforator-makita-hr-2470-1-149453-p
4 https://www.oma.by/perforator-bytovoy-werker-ewrh-606-1-229957-p
5 https://www.oma.by/drel-shurupovert-akkumulyatornaya-bosch-easydrill-12-2-060397290x-1-247573-p
6 https://www.oma.by/drel-werker-ewid-652-1-229966-p
7 https://www.oma.by/drel-shurupovert-werker-ewed-651-1-229975-p
8 https://www.oma.by/perforator-bosch-pbh-2100-re-2-zubila-2-bura-1-216760-p
9 https://www.oma.by/perforator-bosch-gbh-2-26-dfr--1-127233-p
10 https://www.oma.by/setevaya-drel-shurupovert-werker-dse-280-1-260565-p
11 https://www.oma.by/akkumulyatornaya-drel-shurupovert-werker-cd-1411-li-1-261826-p
12 https://www.oma.by/perforator-bytovoy-bosch-pbh-2000-re-06033a9322-1-257514-p
13 https://www.oma.by/drel-bosc

<p>The function works fine and outputs 24 product links from the first page as expected.</p>
<h3 style="color:red"><i>Problem</i></h3>
<p>There is a problem, though. Products normally span more than one page, and the page is expanded through an AJAX call. The idea is to keep expanding the page until it cannot be expanded any further, and only then extract all links. We will try to use Selenium for that.</p>
<img src="screenshots/expand_buttons.png" align="left" width="600">

In [6]:
#driver = webdriver.Firefox()
#driver.get(url)
#html = driver.execute_script("BX.ajax.insertToNode('/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=2&amp;bxajaxid=eefedf70222bb04a3200139a477e17f3', 'comp_eefedf70222bb04a3200139a477e17f3'); return false;")
#sel_soup = BSoup(html,'html.parser')


In [7]:
# class Client(QWebPage):
    
#     def __init__(self, url):
#         #we define an application
#         self.app = QApplication(sys.argv)
#         #start a QWebPage
#         QWebPage.__init__(self)
#         self.loadFinished.connect(self.on_page_load)
#         self.mainFrame().load(QUrl(url))
#         self.app.exec_()
        
#     def on_page_load(self):
#         self.app.quit()

# client_response = Client(url)   
# source = client_response.mainFrame().toHtml()
# products_page_soup = BSoup(source, 'lxml')
# link_array_test_qt = extract_product_links(products_page_soup)
# for counter, link in enumerate(link_array_test_qt):
#     print(counter, link)

<h3 style="color:green"><i>Solution?</i></h3>
<p>Get the links to all subpages from the pagination block and iterate over them.</p>
<img src="screenshots/page_icons_block.png" align="left" width="600">

In [8]:
# Anchors of the hidden pagination block; each one points at a catalog subpage.
button_combo_object = products_page_soup.select('div.btn-combo \
                                                 div.hide a')

# Show the relative link carried by every pagination anchor.
for a_tag in button_combo_object:
    subpage_href = a_tag['href']
    print(subpage_href)


/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=1
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=2
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=3
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=4
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=5
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=6
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=7
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=8
/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=9


In [9]:
def page_links(soup, base_url='https://www.oma.by'):
    """Return the absolute URLs of the pagination links found in ``soup``.

    Args:
        soup: Parsed catalog page (a BeautifulSoup object, or anything that
            exposes a compatible ``select`` method).
        base_url: Site root that relative ``href`` values are resolved
            against. Defaults to the oma.by root used across this notebook.

    Returns:
        A list of absolute subpage URLs, in the order the anchors are
        returned by ``soup.select``. The list is empty when the page has no
        ``div.btn-combo div.hide a`` anchors. Anchors without an ``href`` are
        skipped instead of raising ``KeyError``.
    """
    page_link_array = []
    for a_tag in soup.select('div.btn-combo div.hide a'):
        href = a_tag.attrs.get('href')
        if href:
            # urljoin handles both relative and already-absolute hrefs.
            page_link_array.append(urljoin(base_url, href))

    return page_link_array

# Smoke test: list every subpage URL the helper builds for the sample page.
test = page_links(products_page_soup)
for link in test:
    print(f'{link}')

https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=1
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=2
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=3
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=4
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=5
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=6
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=7
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=8
https://www.oma.by/dreli-shurupoverty-perforatory-meshalki-13236-c?PAGEN_1=9


In [21]:
def product_url_list(subcat_lvl_2_url, timeout=30):
    """Collect the product URLs from every subpage of a level-2 subcategory.

    The subcategory page is downloaded first to discover its pagination
    links; each linked subpage is then downloaded and scanned for product
    cards.

    Args:
        subcat_lvl_2_url: URL of the subcategory catalog page.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of product URLs in page order. When the subcategory page
        exposes no pagination links, the products of that single page are
        returned instead of an empty list.

    Raises:
        requests.HTTPError: if any page responds with an error status
            (raised by ``Response.raise_for_status``).
    """
    # One session for the whole crawl lets requests reuse the connection.
    with requests.Session() as session:

        def fetch_soup(page_url):
            # Download one page, fail loudly on HTTP errors, and parse it.
            response = session.get(page_url, timeout=timeout)
            response.raise_for_status()
            return BSoup(response.text, 'html.parser')

        subcat_lvl_2_soup = fetch_soup(subcat_lvl_2_url)
        pagination_urls = page_links(subcat_lvl_2_soup)

        if not pagination_urls:
            # No pagination block found: the page already downloaded is the
            # only one, so extract its products rather than returning nothing.
            return extract_product_links(subcat_lvl_2_soup)

        product_urls = []
        for page_link in pagination_urls:
            product_urls.extend(extract_product_links(fetch_soup(page_link)))

    return product_urls
    

In [22]:
%%time
# Crawl every subpage of this subcategory and keep the collected product URLs;
# the %%time cell magic above reports how long the whole crawl takes.
ankera_urls = product_url_list('https://www.oma.by/bolty-ankernye-13253-c')

CPU times: user 5.43 s, sys: 310 ms, total: 5.74 s
Wall time: 8.52 s


In [23]:
# List the collected product URLs, each prefixed with its position.
for counter, link in enumerate(ankera_urls):
    print(f'{counter} {link}')

0 https://www.oma.by/ankernyy-bolt-s-gaykoy-10kh130-mm-1-211171-p
1 https://www.oma.by/ankernyy-bolt-s-gaykoy-8kh85-mm-1-115013-p
2 https://www.oma.by/ankernyy-bolt-s-gaykoy-10-0kh-95-97-mm-10-sht-1-158119-p
3 https://www.oma.by/ankernyy-bolt-s-kryuchkom-8kh40-mm-1-117543-p
4 https://www.oma.by/ankernyy-bolt-s-gaykoy-12kh150-mm-1-211175-p
5 https://www.oma.by/ankernyy-bolt-s-gaykoy-10kh95-97-mm-1-115017-p
6 https://www.oma.by/ankernyy-bolt-s-gaykoy-16kh147-150-mm-1-115022-p
7 https://www.oma.by/ankernyy-bolt-s-gaykoy-12kh129-130-mm-1-115020-p
8 https://www.oma.by/ankernyy-bolt-s-gaykoy-8kh65-mm-1-115012-p
9 https://www.oma.by/ankernyy-bolt-s-gaykoy-16kh110-111-mm-1-115021-p
10 https://www.oma.by/ankernyy-bolt-s-gaykoy-12kh99-100-mm-1-115019-p
11 https://www.oma.by/ankernyy-bolt-s-gaykoy-10kh75-77-mm-1-115016-p
12 https://www.oma.by/ankernyy-bolt-s-shestigr-gol-8kh80-10-sht--1-158107-p
13 https://www.oma.by/ankernyy-bolt-s-koltsom-8kh40-10-sht--1-158126-p
14 https://www.oma.by/ankernyy-