In [54]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import re
from datetime import datetime
import numpy as np
from configparser import ConfigParser
import unidecode

# Notebook display limits: show up to 100 columns and up to 50 rows per frame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

# Get all Lidl offer URLs

In [55]:
def get_all_link_lidl():
    """Collect the URL of every offer page linked from the Lidl overview page.

    Downloads ``https://www.lidl.hu/ajanlataink``, looks at every ``div`` with
    the class ``tabnavaccordion__content`` and gathers the ``href`` of each
    anchor found inside those divs.

    Returns:
        list[str]: Offer-page URLs in document order. Duplicates are kept.

    Raises:
        urllib.error.URLError: If the overview page cannot be downloaded.
    """
    # Function-scope import so the cell does not need an extra top-level import.
    from urllib.parse import urljoin

    base_url = 'https://www.lidl.hu'
    offer_page = base_url + '/ajanlataink'

    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(offer_page) as page:
        # Name the parser explicitly; otherwise BeautifulSoup picks whichever
        # parser happens to be installed and results can differ per machine.
        soup = bs(page, 'html.parser')

    # Fall back to the whole document when the page has no <body> element.
    root = soup.body if soup.body is not None else soup
    divs_body = root.find_all('div', {'class': ['tabnavaccordion__content']})

    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs ("/c/...") and additionally copes with absolute or relative ones.
    return [
        urljoin(base_url, a['href'])
        for div in divs_body
        for a in div.find_all('a', href=True)
    ]

In [56]:
# Fetch the overview page and keep the offer-page URLs for the crawl further down.
all_link = get_all_link_lidl()

In [57]:
# Display the collected offer-page URLs.
all_link

['https://www.lidl.hu/c/akcioink-07-07-csutortoktol/c3530/w1',
 'https://www.lidl.hu/c/nemzetkozi-sor-ajanlataink/c3532/w1',
 'https://www.lidl.hu/c/barkacs-ajanlataink/c3537/w1',
 'https://www.lidl.hu/c/strand-es-nyari-textil-ajanlataink/c3538/w1',
 'https://www.lidl.hu/c/akcioink-07-11-hetfotol/c3539/w1',
 'https://www.lidl.hu/c/virag-ajanlataink-07-11-hetfotol/c3540/w1',
 'https://www.lidl.hu/c/xxl-ajanlataink/c3541/w1',
 'https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w1',
 'https://www.lidl.hu/c/jatek-es-iroszer-ajanlataink/c3550/w1',
 'https://www.lidl.hu/c/zoldseg-gyumolcs-akcioink-07-14-csutortoktol/c3544/w1',
 'https://www.lidl.hu/c/virag-ajanlataink-07-14-csutortoktol/c3545/w1',
 'https://www.lidl.hu/c/ajanlataink-froccsozeshez/c3546/w1',
 'https://www.lidl.hu/c/textil-es-lakberendezesi-ajanlataink/c3547/w1',
 'https://www.lidl.hu/c/szuper-hetvege-07-16-szombattol/c3549/w1',
 'https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w2',
 'https://www.lidl.hu/c/ajanla

In [58]:
def get_all_offer_lidl(all_link):
    """Scrape every offer page in ``all_link`` into a single DataFrame.

    Each URL is downloaded and every ``article`` with class ``ret-o-card``
    found inside a ``div`` with class ``nuc-a-flex-item`` becomes one row.

    Args:
        all_link: Sized collection of offer-page URLs (``len()`` is used for
            the progress message).

    Returns:
        pandas.DataFrame: One row per article with the columns ``itemId``,
        ``itemName``, ``itemCleanName``, ``imageUrl``, ``price``, ``measure``,
        ``salesStart``, ``source``, ``runDate`` and ``shopName``. The columns
        are present even when nothing was scraped. ``imageUrl``, ``measure``
        and ``salesStart`` are NaN when the page does not provide them.

    Raises:
        KeyError: If an article lacks ``data-id``, ``data-name`` or
            ``data-price``.
        urllib.error.URLError: If a page cannot be downloaded.
    """
    columns = ['itemId', 'itemName', 'itemCleanName', 'imageUrl', 'price',
               'measure', 'salesStart', 'source', 'runDate', 'shopName']

    # Compiled once instead of being searched twice per article.
    # NOTE(review): the '.' between the digit pairs is unescaped, so it matches
    # any separator character (and any run of five digits) - confirm whether a
    # literal dot was intended before tightening it.
    sales_from_pattern = re.compile(r'(?P<group_1>[\d]{2}.[\d]{2})')

    def source_from_url(url):
        """Return the text between the 4th and 5th '/' of ``url`` ('' if absent)."""
        parts = url.split('/')
        return parts[4] if len(parts) > 4 else ''

    def parse_article(article, source):
        """Turn one ``article`` tag into a row dict."""
        item_id = article['data-id']
        item_name = article['data-name']

        # The optional content paragraph is appended to the product name.
        brand = article.find('p', {'class': 'ret-o-card__content'})
        if brand is not None:
            item_name = item_name + ' - ' + brand.get_text().strip()

        # Only the first image is used; an <img> without src yields NaN
        # instead of aborting the whole crawl with a KeyError.
        first_img = article.find('img')
        img_url = first_img.get('src', np.nan) if first_img is not None else np.nan

        measure = article.find('div', {'class': 'lidl-m-pricebox__basic-quantity'})

        # .get() returns None for a missing attribute, which makes the
        # "no data-list" case reachable (indexing would raise KeyError).
        sales_match = sales_from_pattern.search(article.get('data-list') or '')
        if sales_match is not None:
            # NOTE(review): the current year is assumed for the sale date.
            sales_start = str(datetime.now().year) + '.' + sales_match[0]
        else:
            sales_start = np.nan

        return {
            'itemId': item_id,
            'itemName': item_name,
            'itemCleanName': unidecode.unidecode(item_name).lower(),
            'imageUrl': img_url,
            'price': article['data-price'],
            'measure': measure.get_text() if measure is not None else np.nan,
            'salesStart': sales_start,
            'source': source,
            'runDate': datetime.now().strftime('%Y.%m.%d-%H:%M:%S'),
            'shopName': 'lidl',
        }

    all_items = []

    for counter, url in enumerate(all_link, start=1):

        print(f'crawl url: {url} done {counter} from {len(all_link)}')

        # Close the HTTP response deterministically and name the parser so the
        # result does not depend on which parsers are installed.
        with urllib.request.urlopen(url) as page:
            soup = bs(page, 'html.parser')

        # The source is the same for every article on the page.
        source = source_from_url(url)

        # Fall back to the whole document when the page has no <body> element.
        root = soup.body if soup.body is not None else soup

        for div in root.find_all('div', {'class': ['nuc-a-flex-item']}):
            for article in div.find_all('article', {'class': 'ret-o-card'}):
                all_items.append(parse_article(article, source))

    # Passing the column list keeps the schema stable for an empty crawl.
    return pd.DataFrame(all_items, columns=columns)

In [59]:
# Crawl every collected offer page (one HTTP request per URL) into a single frame.
df = get_all_offer_lidl(all_link)

crawl url: https://www.lidl.hu/c/akcioink-07-07-csutortoktol/c3530/w1 done 1 from 22
crawl url: https://www.lidl.hu/c/nemzetkozi-sor-ajanlataink/c3532/w1 done 2 from 22
crawl url: https://www.lidl.hu/c/barkacs-ajanlataink/c3537/w1 done 3 from 22
crawl url: https://www.lidl.hu/c/strand-es-nyari-textil-ajanlataink/c3538/w1 done 4 from 22
crawl url: https://www.lidl.hu/c/akcioink-07-11-hetfotol/c3539/w1 done 5 from 22
crawl url: https://www.lidl.hu/c/virag-ajanlataink-07-11-hetfotol/c3540/w1 done 6 from 22
crawl url: https://www.lidl.hu/c/xxl-ajanlataink/c3541/w1 done 7 from 22
crawl url: https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w1 done 8 from 22
crawl url: https://www.lidl.hu/c/jatek-es-iroszer-ajanlataink/c3550/w1 done 9 from 22
crawl url: https://www.lidl.hu/c/zoldseg-gyumolcs-akcioink-07-14-csutortoktol/c3544/w1 done 10 from 22
crawl url: https://www.lidl.hu/c/virag-ajanlataink-07-14-csutortoktol/c3545/w1 done 11 from 22
crawl url: https://www.lidl.hu/c/ajanlataink-froc

In [60]:
# Replace every missing value (NaN) with the placeholder string 'N.a'.
df = df.fillna('N.a')

In [61]:
# Sanity check: list the rows whose price string is not purely numeric.
df.loc[~df['price'].str.isnumeric()]

Unnamed: 0,itemId,itemName,itemCleanName,imageUrl,price,measure,salesStart,source,runDate,shopName


In [62]:
# Sanity check: list the rows without an image URL. The fillna('N.a') step
# above has already replaced NaN, so isnull() could never match here;
# compare against the placeholder instead.
df[df['imageUrl'] == 'N.a']

Unnamed: 0,itemId,itemName,itemCleanName,imageUrl,price,measure,salesStart,source,runDate,shopName


In [63]:
# Write the result to an Excel workbook in the working directory, without the index column.
df.to_excel('lidl_result.xlsx', index=False)

# Backup: earlier draft of the article-parsing loop

In [8]:
# Scratch copy of the per-article parsing loop from get_all_offer_lidl, using
# snake_case keys and without the image URL, clean name and shop name fields.
# NOTE(review): this cell reads `soup`, `url` and `find_nth_occurrence`, none of
# which is assigned at notebook level in the visible cells (they only exist
# inside functions), so on a fresh kernel it stops with NameError, as the
# recorded output shows. When it does run, it overwrites `df`.
all_items = []

# Every product tile container on the already-parsed page.
divs = soup.body.findAll('div', {'class' : ['nuc-a-flex-item']})

for div in divs:
    articles = div.findAll('article', {'class' : 'ret-o-card'})
    for article in articles:
            
            # One dict per article; it becomes one DataFrame row.
            item_dict = {}
            
            #print(article['data-id'])
            item_dict['item_id'] = article['data-id']
            
            #print(article['data-name'])
            brand = article.find('p', {'class' : 'ret-o-card__content'})
            
            # Append the optional content paragraph to the product name.
            if brand != None:
                #print(brand.get_text().strip())
                item_dict['item_name'] = article['data-name'] + ' - ' + brand.get_text().strip()
            else:
                item_dict['item_name'] = article['data-name']
            
            #print(article['data-price'])
            item_dict['price'] = article['data-price']
            
            # The measure is not parsed in this draft; it is always NaN.
            item_dict['measure'] = np.nan
            
            #print(article['data-list'])
            
            # Two digits, any single character, two digits (the '.' is unescaped).
            sales_from_pattern = r'(?P<group_1>[\d]{2}.[\d]{2})'
            
            # NOTE(review): article['data-list'] raises KeyError when the
            # attribute is missing, so the `!= None` test cannot be False.
            if (article['data-list'] != None) and (re.search(sales_from_pattern, article['data-list']) != None):
            
                # sales_data is assigned but not used afterwards.
                sales_data = article['data-list']
                # Prefix the matched text with the current year.
                item_dict['sales_start'] = str(datetime.now().year) + '.' + re.search(sales_from_pattern,
                                                                                      article['data-list'])[0]
            else:
                item_dict['sales_start'] = np.nan
                
            # Keep the part of the URL between its 4th and 5th '/'.
            cut_url = url[find_nth_occurrence(url, '/', 4)+1:]
            
            item_dict['source'] = cut_url[:cut_url.find('/')]
            item_dict['run_date'] = datetime.now().strftime('%Y.%m.%d-%H:%M:%S')
            
            # Always true at this point: the dict has been filled above.
            if len(item_dict) > 0:
                all_items.append(item_dict)
            
            #print('-----')

# Build the frame once from the collected row dicts.
df = pd.DataFrame(all_items)

NameError: name 'soup' is not defined

In [None]:
# Display the current contents of df.
df