<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/scraping_with_beautiful_soup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import requests
from tabulate import tabulate
import pandas as pd

In [2]:
# Colab-only step: mount Google Drive into the runtime at the path 'drive'
# (relative to the current working directory).
# NOTE(review): none of the cells visible here read from or write to the
# mount — confirm it is still needed before keeping this cell.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [3]:
# Highest listing page to visit. Scraping the whole category would need a
# value of about 200, but such a run can take several hours, so a small
# number is used here.
limit_wohnen = 3
# Listing URL of the category; the page number is appended to it.
link_base_wohnen = 'https://www.avocadostore.at/wohnen/wohnen-und-leben?page='
# Accumulator for the scraped product rows (one list per product).
result_list_wohnen = []

In [4]:
# Seconds to wait on any single HTTP request before giving up, so that a
# stalled connection cannot hang a long scraping run indefinitely.
_REQUEST_TIMEOUT = 30


def _tag_text(soup, tag_name, attrs, default):
  '''Return the text of the first matching tag, or default if none matches.'''
  tag = soup.find(tag_name, attrs=attrs)
  return default if tag is None else tag.text


def _meta_content(soup, attrs):
  '''Return the content attribute of the first matching <meta> tag, or None.'''
  tag = soup.find('meta', attrs=attrs)
  return None if tag is None else tag.get('content')


def scrap_page(link_base, limit, result_list, start_page):
  '''
  Scrape the product listing pages start_page..limit (both inclusive) and
  append one row per product to result_list.

  link_base: listing URL without the page number; the page number is
    appended to it to build each listing link.
  limit: number of the last listing page to cover.
  result_list: list defined outside of the function, so that the rows
    collected so far survive an error and the function can be restarted.
    Each appended row is
    [name, link, price, postage, brand, metadescription, description,
    criteria].
  start_page: number of the first listing page to cover; pass a later page
    when calling the function again after an error.

  Fields missing from a product page are replaced by placeholder values
  instead of aborting the run. Network errors (including timeouts) raised
  by urllib or requests are not caught. Rows of a partially processed page
  stay in result_list, so restarting at that same page appends them again.
  Returns None.
  '''
  for page_no in range(start_page, limit + 1):
    link = link_base + str(page_no)
    # the context manager closes the HTTP response once the page is parsed
    with urllib.request.urlopen(link, timeout=_REQUEST_TIMEOUT) as page:
      soup = BeautifulSoup(page, 'html.parser')
    # find all product item elements on the page of the store
    for element in soup.find_all('a', attrs={'class': 'product-item product-item-content'}):
      product_internal_link = element['href']
      product_link = 'https://www.avocadostore.at' + product_internal_link
      # create a soup of the product page
      response = requests.get(product_link, timeout=_REQUEST_TIMEOUT)
      product_soup = BeautifulSoup(response.content, 'html.parser')
      # retrieve specific information about the product: name, price, brand, etc.
      name = _tag_text(product_soup, 'h1', {}, 'no name given')
      price = _tag_text(product_soup, 'div', {'class': 'product-price'},
                        'no price given')
      postage = _tag_text(product_soup, 'p',
                          {'class': 'shipping-cost-general'}, 0)
      brand = _tag_text(product_soup, 'span', {'itemprop': 'seller'},
                        'Brand not given')
      # prefer the standard meta description, fall back to the Open Graph one
      metadescription = _meta_content(product_soup, {'name': 'description'})
      if metadescription is None:
        metadescription = _meta_content(product_soup,
                                        {'property': 'og:description'})
      if metadescription is None:
        metadescription = 'no metadescription'
      description = _tag_text(product_soup, 'div',
                              {'class': 'col-product-description'},
                              'no description')
      criteria = _tag_text(product_soup, 'div',
                           {'class': 'col-product-criteria'},
                           'no criteria given')
      result_list.append([name, product_internal_link, price, postage, brand,
                          metadescription, description, criteria])
      print('processed product {}!\n'.format(name))
    print('{}\nProcessed page {}!\n{}\n'.format('*'*60, page_no, '*'*60))

In [5]:
# Column labels for the scraped product rows, one label per row field.
column_names = [
    'name',
    'link',
    'price',
    'postage',
    'brand',
    'metadescription',
    'description',
    'criteria',
]

In [6]:
# Fill result_list_wohnen with the products of listing pages 1..limit_wohnen.
scrap_page(
    link_base=link_base_wohnen,
    limit=limit_wohnen,
    result_list=result_list_wohnen,
    start_page=1,
)
# Build the table from the collected rows and tag every row with its category.
wohnen_df = (
    pd.DataFrame(result_list_wohnen, columns=column_names)
    .assign(category='wohnen')
)

processed product 
vegane Zahnbürste bpa frei
!

processed product 
Veggie Berries
!

processed product 
PARSA Vita Mikrofaser-Abschminktuch
!

processed product 
Power-Guarana
!

processed product 
Ingwer Kurkuma Kaugummi von True Gum
!

processed product 
Veganes Kaugummi Himbeere Vanille
!

processed product 
Briefumschläge mit Sichtfenster und Klebestreifen
!

processed product 
True Gum - Minze Matcha Kaugummi
!

processed product 
Bio-Arganöl, ungeröstet 250 ml, kaltgepresst, DLG-GOLD prämiert
!

processed product 
Postkarte "A Plätzchen a day"
!

processed product 
Untersetzer Wasser unterm Kiel
!

processed product 
Pflanzbare Grußkarten Wildblumen 2
!

processed product 
Bio-Arganöl, geröstet 250 ml
!

processed product 
Lakritz Eukalyptus von True Gum
!

processed product 
Edelstahlbehälter mit Deckel
!

processed product 
Tischset Seepferdchen
!

processed product 
BioCase - Smartphone Hülle aus nachhaltigem Bio-Material
!

processed product 
INSIGHT LENITIVE VEGAN ZERTIFIZI

In [16]:
# The most basic way to clean this dataframe is to collapse every run of
# consecutive line breaks into a single one. Matching two *or more* newlines
# (instead of exactly two) also cleans up runs of three or more, and makes
# re-running this cell a no-op.
wohnen_df = wohnen_df.replace(r'\n{2,}', '\n', regex=True)

In [17]:
# Show the first rows as a plain-text table with the column names as headers.
head_rows = wohnen_df.head()
print(tabulate(head_rows, headers='keys'))

    name                                  link                                                                            price          postage                        brand           metadescription                                                                                                                                           description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           