# Rema1000 Discount Crawler

## Imports

In [53]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time

## Crawler

In [175]:
from urllib.parse import urljoin

url = "https://shop.rema1000.dk/"
columns = ['product_title', 'product_price', 'product_price_per_unit', 'product_subtitle', 'product_weight', 'product_brand', 'product_discount', 'date']


def _markup_after(tag, occurrence, closer):
    """Return the markup following the `occurrence`-th '"">' marker in `tag`, cut at `closer`.

    The '"">' marker is the end of an opening tag whose last attribute is empty
    (presumably the front-end framework's scoped-style attribute -- TODO confirm
    against the live page). Raises IndexError when `tag` does not contain enough
    markers, e.g. when the element was not found and `tag` is None.
    """
    return str(tag).split('"">')[occurrence].split(closer)[0]


def _parse_product(product, scraped_on):
    """Turn one `a.product-wrap` element into a row ordered like `columns`."""
    price = product.find("div", class_="price")

    # The subtitle has the form "<weight> / <brand>"; parse the element once and reuse it.
    subtitle = _markup_after(product.find("div", class_="extra"), 2, "</")
    subtitle_parts = subtitle.split(" / ")
    weight = subtitle_parts[0]
    # A subtitle without the " / " separator has no brand; use "" instead of raising.
    brand = subtitle_parts[1] if len(subtitle_parts) > 1 else ""

    return [
        _markup_after(product.find("div", class_="title"), 1, "</div>"),
        # The decimals sit in a nested <span>; replacing its opening tag yields e.g. "6.50".
        _markup_after(price.find("span", class_="price-normal"), 1, "</").replace("<span>", "."),
        _markup_after(price.find("span", class_="price-per-unit"), 1, "</"),
        subtitle,
        weight,
        brand,
        # Products are flagged as discounted when "avisvare" appears in their "top" div(s).
        1 if "avisvare" in str(product.find_all("div", class_="top")) else 0,
        scraped_on,
    ]


# Rows are collected in a plain list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loops copies all data on every step.
product_rows = []

driver = webdriver.Firefox()
try:
    driver.get(url)

    # dismiss the cookie banner through its decline button
    WebDriverWait(driver, 2).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#declineButton"))
    ).click()

    # wait and refresh to load correct page content
    time.sleep(1)
    driver.refresh()
    time.sleep(1)
    # Name the parser explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # get all category urls; read the href attribute directly and resolve it against the base url
    urls = [
        urljoin(url, item["href"])
        for item in soup.find_all("a", class_="item")
        if item.get("href")
    ]

    # One date for the whole run, so a crawl that crosses midnight stays consistent.
    scraped_on = datetime.datetime.now().strftime("%Y-%m-%d")

    # go through each category and get all products
    for category_url in urls:
        driver.get(category_url)
        time.sleep(1)
        driver.refresh()
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        #! ISSUE HERE: the wrappers only load a set amount of products
        #! SOLUTION: enter every subcategory and scrape all products
        #! CONSIDERATION: this will take a lot of time
        #! SOLUTION 2: store and then open all "Se alle" buttons and then scrape all products
        # get all products
        products = soup.find_all("a", class_="product-wrap")
        #! ISSUE ENDS HERE

        # go through each product
        for product in products:
            product_rows.append(_parse_product(product, scraped_on))
finally:
    # Always close the browser, even when a page fails to load or parse.
    driver.quit()
    # Keep whatever was scraped before a failure. dtype=object keeps the column
    # types identical to a frame assembled row by row with pd.concat.
    all_products = pd.DataFrame(product_rows, columns=columns, dtype=object)

# RUNTIME: 2m 17s

## Stats

In [179]:
# Summarise the crawl: total number of rows and how many carry the discount flag
scraped_count = len(all_products)
discount_count = int((all_products['product_discount'] == 1).sum())

print(f"products scraped: {scraped_count}")
print(f"products with discount: {discount_count}")

products scraped: 430
products with discount: 57


## Inspect data

In [178]:
# Show the full scraped table; as the cell's last expression it is rendered by the notebook
all_products

Unnamed: 0,product_title,product_price,product_price_per_unit,product_subtitle,product_weight,product_brand,product_discount,date
0,FLUTES,6.50,21.67 per Kg.,300 GR. / REMA 1000,300 GR.,REMA 1000,0,2024-01-12
1,CIABATTA,6.95,23.17 per Kg.,300 GR. / REMA 1000,300 GR.,REMA 1000,0,2024-01-12
2,FULDKORNSFLUTES,12.00,40.00 per Kg.,300 GR. / REMA 1000,300 GR.,REMA 1000,0,2024-01-12
3,SOLSIKKEBOLLER,25.50,51.00 per Kg.,500 GR. / DET GODE,500 GR.,DET GODE,0,2024-01-12
4,KRYDDERBOLLER,8.50,24.29 per Kg.,350 GR. / REMA 1000,350 GR.,REMA 1000,0,2024-01-12
...,...,...,...,...,...,...,...,...
425,TYRKISK PEBER 32%,36.95,369.50 per Ltr.,10 CL. / HOT N´SWEET,10 CL.,HOT N´SWEET,0,2024-01-12
426,"SMÅ FUGLE 16,4%",99.00,99.00 per Ltr.,1 LTR. /,1 LTR.,,0,2024-01-12
427,"SOUR BUBBLE FIZZ 16,4%",99.00,99.00 per Ltr.,1 LTR. / SMÅ SURE,1 LTR.,SMÅ SURE,0,2024-01-12
428,EXTRA GAS,12.50,41.67 per Ltr.,300 ML. / LIGHTER,300 ML.,LIGHTER,0,2024-01-12


In [1]:
# Show only the rows whose discount flag is set (product_discount == 1)
is_discounted = all_products['product_discount'] == 1
all_products[is_discounted]

NameError: name 'all_products' is not defined

## Save data as Excel

In [None]:
# File-name stamp: week of the year (%U), lower-cased month abbreviation (%b) and year (%Y)
now = datetime.datetime.now()
week, month, year = (now.strftime(code) for code in ("%U", "%b", "%Y"))
month = month.lower()

# Write the scraped table to an Excel workbook named after the stamp
all_products.to_excel(f"rema1000_scraper{week}_{month}_{year}.xlsx")

# Save weekly excel files in a separate folder
# Keep a consolidated file with all data in root folder

End of document