In [108]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import re
from datetime import datetime
import numpy as np
from configparser import ConfigParser
import unidecode

# Notebook-wide display limits: render at most 100 columns and 50 rows of a frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

# Get all Lidl offer URLs

In [109]:
def get_all_link_lidl(offer_page='https://www.lidl.hu/ajanlataink',
                      base_url='https://www.lidl.hu',
                      timeout=30):
    """Collect the URLs of the offer pages linked from the Lidl overview page.

    Every ``<a href=...>`` found inside a ``div`` with the CSS class
    ``tabnavaccordion__content`` is resolved against ``base_url`` and returned.

    Parameters
    ----------
    offer_page : str
        URL of the overview page that lists the individual offer pages.
    base_url : str
        Base against which the (site-relative) hrefs are resolved.
    timeout : float
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    list of str
        Absolute offer-page URLs in document order; empty if the page has no
        ``<body>`` or no matching links.

    Raises
    ------
    urllib.error.URLError
        If the overview page cannot be fetched.
    """
    # Function-scope import so the cell does not depend on an extra top-level import.
    from urllib.parse import urljoin

    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(offer_page, timeout=timeout) as page:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed, so results can differ between environments.
        soup = bs(page, 'html.parser')

    if soup.body is None:
        return []

    all_link = []
    for div in soup.body.find_all('div', {'class': ['tabnavaccordion__content']}):
        for a in div.find_all('a', href=True):
            # urljoin handles site-relative ('/c/...') and already-absolute hrefs;
            # plain string concatenation would corrupt the latter.
            all_link.append(urljoin(base_url, a['href']))

    return all_link

In [110]:
# Fetch the list of offer-page URLs (performs a live HTTP request).
all_link = get_all_link_lidl()

In [111]:
# Show the collected offer-page URLs.
all_link

['https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w1',
 'https://www.lidl.hu/c/ajanlataink-froccsozeshez/c3546/w1',
 'https://www.lidl.hu/c/akcioink-07-18-hetfotol/c3551/w1',
 'https://www.lidl.hu/c/virag-ajanlataink-07-18-hetfotol/c3552/w1',
 'https://www.lidl.hu/c/kostolja-meg-a-karibi-izvilagot/c3553/w1',
 'https://www.lidl.hu/c/ajanlataink-grillezeshez/c3554/w1',
 'https://www.lidl.hu/c/jegkrem-ajanlataink/c3555/w1',
 'https://www.lidl.hu/c/furdoszobai-es-lakberendezesi-ajanlataink/c3548/w1',
 'https://www.lidl.hu/c/akcioink-07-21-csutortoktol/c3556/w1',
 'https://www.lidl.hu/c/zoldseg-gyumolcs-akcioink-07-21-csutortoktol/c3557/w1',
 'https://www.lidl.hu/c/orchidea-ajanlataink-07-21-csutortoktol/c3558/w1',
 'https://www.lidl.hu/c/ajanlataink-froccsozeshez/c3559/w1',
 'https://www.lidl.hu/c/barkacs-ajanlataink/c3560/w1',
 'https://www.lidl.hu/c/konyhai-ajanlataink/c3561/w1',
 'https://www.lidl.hu/c/szuper-hetvege-07-23-szombattol/c3562/w1',
 'https://www.lidl.hu/c/akcioink-0

In [112]:
# Columns of the frame returned by get_all_offer_lidl, in output order.
_LIDL_OFFER_COLUMNS = [
    'itemId', 'itemName', 'itemCleanName', 'imageUrl', 'price',
    'measure', 'salesStart', 'source', 'runDate', 'shopName',
]

# Two digits, a date separator, two digits (e.g. '07.14').
# NOTE(review): the recorded output only shows '.'-separated dates; '-' and '/'
# are accepted defensively and normalised to '.' -- confirm against live data.
_LIDL_SALES_START_RE = re.compile(r'(\d{2})[.\-/](\d{2})')


def _lidl_source_from_url(url):
    """Return the fifth '/'-separated piece of ``url`` (the offer slug), or ''.

    For 'https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w1' this is
    'akcioink-07-14-csutortoktol'. Splitting avoids chopping off the last
    character when the slug is the final piece of the URL.
    """
    parts = url.split('/')
    return parts[4] if len(parts) > 4 else ''


def _lidl_sales_start(data_list, year):
    """Build 'YYYY.MM.DD' from the first date-like token in ``data_list``.

    Returns NaN when ``data_list`` is missing/empty or holds no such token.
    """
    if not data_list:
        return np.nan
    match = _LIDL_SALES_START_RE.search(data_list)
    if match is None:
        return np.nan
    return f'{year}.{match.group(1)}.{match.group(2)}'


def _lidl_item_from_article(article, source):
    """Turn one ``<article class="ret-o-card">`` tag into an offer record (dict)."""
    now = datetime.now()

    # The product name is extended with the text of the card's content
    # paragraph when present (the original code calls this the brand).
    item_name = article['data-name']
    brand = article.find('p', {'class': 'ret-o-card__content'})
    if brand is not None:
        item_name = item_name + ' - ' + brand.get_text().strip()

    # First image that actually carries a src; an <img> without one no longer
    # raises KeyError.
    img_url = next(
        (img['src'] for img in article.find_all('img') if img.has_attr('src')),
        np.nan,
    )

    measure = article.find('div', {'class': 'lidl-m-pricebox__basic-quantity'})

    return {
        'itemId': article['data-id'],
        'itemName': item_name,
        'itemCleanName': unidecode.unidecode(item_name).lower(),
        'imageUrl': img_url,
        'price': article['data-price'],
        'measure': measure.get_text() if measure is not None else np.nan,
        # .get() returns None for a missing attribute; indexing would raise
        # KeyError before any "is it None?" check could run.
        'salesStart': _lidl_sales_start(article.get('data-list'), now.year),
        'source': source,
        'runDate': now.strftime('%Y.%m.%d-%H:%M:%S'),
        'shopName': 'lidl',
    }


def get_all_offer_lidl(all_link, timeout=30):
    """Crawl every offer page in ``all_link`` and return the offers as a frame.

    For each page, every ``article.ret-o-card`` inside a
    ``div.nuc-a-flex-item`` becomes one row. Progress is printed per page.

    Parameters
    ----------
    all_link : sequence of str
        Offer-page URLs, e.g. the result of ``get_all_link_lidl()``.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per offer with the columns itemId, itemName, itemCleanName,
        imageUrl, price, measure, salesStart, source, runDate, shopName.
        The columns are present even when no offer was found. imageUrl,
        measure and salesStart are NaN when the page does not provide them.

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    KeyError
        If an article lacks ``data-id``, ``data-name`` or ``data-price``.
    """
    all_items = []
    total = len(all_link)

    for counter, url in enumerate(all_link, start=1):
        print(f'crawl url: {url} done {counter} from {total}')

        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(url, timeout=timeout) as page:
            # Explicit parser: the result no longer depends on which optional
            # parser libraries happen to be installed.
            soup = bs(page, 'html.parser')

        if soup.body is None:
            continue

        source = _lidl_source_from_url(url)

        for div in soup.body.find_all('div', {'class': ['nuc-a-flex-item']}):
            for article in div.find_all('article', {'class': 'ret-o-card'}):
                all_items.append(_lidl_item_from_article(article, source))

    # Passing columns keeps the schema stable for an empty crawl.
    return pd.DataFrame(all_items, columns=_LIDL_OFFER_COLUMNS)

In [113]:
# Crawl every collected offer page (one HTTP request per URL) into a DataFrame.
df = get_all_offer_lidl(all_link)

crawl url: https://www.lidl.hu/c/akcioink-07-14-csutortoktol/c3543/w1 done 1 from 21
crawl url: https://www.lidl.hu/c/ajanlataink-froccsozeshez/c3546/w1 done 2 from 21
crawl url: https://www.lidl.hu/c/akcioink-07-18-hetfotol/c3551/w1 done 3 from 21
crawl url: https://www.lidl.hu/c/virag-ajanlataink-07-18-hetfotol/c3552/w1 done 4 from 21
crawl url: https://www.lidl.hu/c/kostolja-meg-a-karibi-izvilagot/c3553/w1 done 5 from 21
crawl url: https://www.lidl.hu/c/ajanlataink-grillezeshez/c3554/w1 done 6 from 21
crawl url: https://www.lidl.hu/c/jegkrem-ajanlataink/c3555/w1 done 7 from 21
crawl url: https://www.lidl.hu/c/furdoszobai-es-lakberendezesi-ajanlataink/c3548/w1 done 8 from 21
crawl url: https://www.lidl.hu/c/akcioink-07-21-csutortoktol/c3556/w1 done 9 from 21
crawl url: https://www.lidl.hu/c/zoldseg-gyumolcs-akcioink-07-21-csutortoktol/c3557/w1 done 10 from 21
crawl url: https://www.lidl.hu/c/orchidea-ajanlataink-07-21-csutortoktol/c3558/w1 done 11 from 21
crawl url: https://www.lidl.

In [114]:
# Replace every missing value with the placeholder string 'N.a'.
# From here on isnull()/isna() no longer finds these cells; compare against 'N.a' instead.
df = df.fillna('N.a')

In [115]:
# Sanity check: list the rows whose price string is not made of digits only.
df[~df['price'].str.isnumeric()]

Unnamed: 0,itemId,itemName,itemCleanName,imageUrl,price,measure,salesStart,source,runDate,shopName


In [116]:
# Sanity check: rows without an image URL.
# Missing values were replaced by the 'N.a' placeholder earlier in the notebook,
# so isnull() alone can never match; test for the placeholder as well.
df[df['imageUrl'].isnull() | df['imageUrl'].eq('N.a')]

Unnamed: 0,itemId,itemName,itemCleanName,imageUrl,price,measure,salesStart,source,runDate,shopName


In [117]:
# Preview the crawled offers (truncated by the display.max_rows option).
df

Unnamed: 0,itemId,itemName,itemCleanName,imageUrl,price,measure,salesStart,source,runDate,shopName
0,1068,Pálcikás mogyorós jégkrém - BALLINO,palcikas mogyoros jegkrem - ballino,https://hu.cat-ret.assets.lidl/catalog5media/h...,139,"120 ml, 1 l = 1159 Ft",2022.07.14,akcioink-07-14-csutortoktol,2022.07.19-11:36:57,lidl
1,7816,Pálcikás jégkrém - OREO / TOBLERONE,palcikas jegkrem - oreo / toblerone,https://hu.cat-ret.assets.lidl/catalog5media/h...,369,"100 / 110 ml, 1 l = 3690 / 3355 Ft",2022.07.14,akcioink-07-14-csutortoktol,2022.07.19-11:36:57,lidl
2,9454,Szendvics jégkrém - OREO,szendvics jegkrem - oreo,https://hu.cat-ret.assets.lidl/catalog5media/h...,479,"135 ml, 1 l = 3549 Ft",2022.07.14,akcioink-07-14-csutortoktol,2022.07.19-11:36:57,lidl
3,2797,Mini csokoládés banán - MISTER CHOC,mini csokolades banan - mister choc,https://hu.cat-ret.assets.lidl/catalog5media/h...,339,"200 g, 1 kg = 1695 Ft",2022.07.14,akcioink-07-14-csutortoktol,2022.07.19-11:36:57,lidl
4,5524399,Granola - ONE DAY MORE,granola - one day more,https://hu.cat-ret.assets.lidl/catalog5media/h...,1299,"450 / 500 g, 1 kg = 2887 / 2598 Ft",2022.07.14,akcioink-07-14-csutortoktol,2022.07.19-11:36:57,lidl
...,...,...,...,...,...,...,...,...,...,...
427,6470,Pom-Bär original,pom-bar original,https://hu.cat-ret.assets.lidl/catalog5media/h...,539,"110 g, 1 kg = 4900 Ft",N.a,nagy-markak-hete,2022.07.19-11:37:18,lidl
428,2539,Pöttyös multipack - ALGIDA,pottyos multipack - algida,https://hu.cat-ret.assets.lidl/catalog5media/h...,1299,"6x40 / 4x90 ml, 1 l = 5413 / 3609 Ft",N.a,nagy-markak-hete,2022.07.19-11:37:18,lidl
429,6405847,Világos sör - KŐBÁNYAI,vilagos sor - kobanyai,https://hu.cat-ret.assets.lidl/catalog5media/h...,129,"0,33 l, 1 l = 391 Ft",N.a,nagy-markak-hete,2022.07.19-11:37:18,lidl
430,9391,All in One mosókapszula - ARIEL,all in one mosokapszula - ariel,https://hu.cat-ret.assets.lidl/catalog5media/h...,1899,"20 db, 1 db = 95 Ft",N.a,nagy-markak-hete,2022.07.19-11:37:18,lidl


In [118]:
# Write the result to lidl_result.xlsx (path relative to the working directory),
# leaving out the DataFrame index.
df.to_excel('lidl_result.xlsx', index=False)

# Backup: earlier scratch version of the parsing loop

In [8]:
# Scratch copy of the per-page parsing loop, using snake_case keys and a
# constant NaN for 'measure'.
# It relies on `soup`, `url` and `find_nth_occurrence` already existing in the
# kernel namespace. None of them is defined at module level in the visible part
# of the notebook (`find_nth_occurrence` only exists nested inside
# get_all_offer_lidl), and the recorded run of this cell ends in a NameError.
all_items = []

# Outer containers that hold the product cards.
divs = soup.body.findAll('div', {'class' : ['nuc-a-flex-item']})

for div in divs:
    articles = div.findAll('article', {'class' : 'ret-o-card'})
    for article in articles:

            # One record per product card.
            item_dict = {}

            #print(article['data-id'])
            item_dict['item_id'] = article['data-id']

            #print(article['data-name'])
            brand = article.find('p', {'class' : 'ret-o-card__content'})

            # Append the content paragraph's text to the product name when present.
            if brand != None:
                #print(brand.get_text().strip())
                item_dict['item_name'] = article['data-name'] + ' - ' + brand.get_text().strip()
            else:
                item_dict['item_name'] = article['data-name']

            #print(article['data-price'])
            item_dict['price'] = article['data-price']

            # No measure is extracted in this version.
            item_dict['measure'] = np.nan

            #print(article['data-list'])

            # Two digits, any single character, two digits (the '.' is unescaped).
            sales_from_pattern = r'(?P<group_1>[\d]{2}.[\d]{2})'

            # NOTE(review): article['data-list'] raises KeyError when the attribute is
            # absent, so the `!= None` comparison cannot act as a missing-attribute guard.
            if (article['data-list'] != None) and (re.search(sales_from_pattern, article['data-list']) != None):

                # sales_data is assigned but not used afterwards.
                sales_data = article['data-list']
                # Prefix the matched text with the current year.
                item_dict['sales_start'] = str(datetime.now().year) + '.' + re.search(sales_from_pattern,
                                                                                      article['data-list'])[0]
            else:
                item_dict['sales_start'] = np.nan

            # Everything after the 4th '/' of the URL ...
            cut_url = url[find_nth_occurrence(url, '/', 4)+1:]

            # ... up to the next '/'. If there is none, find() returns -1 and the
            # slice drops the last character instead.
            item_dict['source'] = cut_url[:cut_url.find('/')]
            item_dict['run_date'] = datetime.now().strftime('%Y.%m.%d-%H:%M:%S')

            # Always true at this point: keys were assigned unconditionally above.
            if len(item_dict) > 0:
                all_items.append(item_dict)

            #print('-----')

df = pd.DataFrame(all_items)

NameError: name 'soup' is not defined

In [None]:
# Display the frame built by the backup cell (this cell has no recorded execution).
df