## Import useful packages and functions

In [None]:
import json
import pandas as pd

from auchan_scraper.database import create_table, select_all
from auchan_scraper.get_categories import get_categories

### create database

In [None]:
# Create the database tables before any scraping is done.
# NOTE(review): presumably these are the SQLite tables the item pipeline
# writes to — confirm in auchan_scraper.database.
create_table()

After creating the tables, we can start collecting the URLs of all categories. For debugging purposes we use the logging package. While scraping the URL of each subcategory, the scraper also saves the cookies for each one, because without them the requests would get a 400 response. When the process finishes, both the categories and the cookies are saved to JSON files (for later use by the crawler).

In [None]:
# Entry page listing the food categories; the scrape starts from here.
url = "https://zakupy.auchan.pl/shop/artykuly-spozywcze.c-11908"
categories, cookies = get_categories(url)

# Persist both results as JSON, categories first and cookies second.
for path, payload in (
    ("auchan_scraper/categories.json", categories),
    ("auchan_scraper/cookies.json", cookies),
):
    with open(path, "w") as out_file:
        json.dump(payload, out_file)

## preview cookies

In [None]:
# Reload the saved cookies and print every key/value pair they contain.
with open("auchan_scraper/cookies.json", "r") as cookies_file:
    cookies = json.load(cookies_file)

for cookie in cookies:
    for name, value in cookie.items():
        # The explicit "\n" argument leaves a blank line after each entry.
        print(name, value, "\n")

## count number of subcategories

In [None]:
# Count the subcategory entries that carry a 'url' key, across all categories.
with open('auchan_scraper/categories.json', 'r') as categories_file:
    categories = json.load(categories_file)

url_count = sum(
    1
    for category in categories
    for subcategory in category['subcategories']
    if 'url' in subcategory
)
print(url_count)

## import spider to run without command line

In [None]:
from auchan_scraper.spiders.shop import ShopSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import nest_asyncio

In ShopSpider we used the following settings:
- ROBOTSTXT_OBEY = True (obey the robots.txt file: get response 200)
- DOWNLOAD_DELAY = 3 (3 second delay between each fetch of the data)
- COOKIES_ENABLED = True (without cookies the user is not able to access the API that fetches data from the server)
- REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
- TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
- FEED_EXPORT_ENCODING = "utf-8"

We also used additional middlewares to rotate headers:
- 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None,
- 'scrapy_user_agents.middlewares.RandomUserAgentMiddleware' : 400

And custom pipelines:
- "auchan_scraper.pipelines.DuplicatesPipeline": 100, (removes duplicated items)
- "auchan_scraper.pipelines.SavingTosqlitePipeline": 200 (saves the yielded items to the SQLite DB)

To run Scrapy in a Jupyter notebook without hitting the error "RuntimeError: This event loop is already running", we can use nest_asyncio to patch asyncio so that the crawler's event loop can coexist with the one already used by Jupyter.

In [None]:
# Patch asyncio first so the crawler can start even though Jupyter's event
# loop is already running.
nest_asyncio.apply()

settings = get_project_settings()
process = CrawlerProcess(settings)

# NOTE(review): `number=3` is passed through to ShopSpider as a spider
# argument; its meaning is defined in the spider — confirm there.
process.crawl(ShopSpider, number=3)
process.start()


Finally, we can select all saved items from the DB and build a pandas DataFrame from them.

In [None]:
# Column labels applied, in order, to the rows returned by select_all().
columns = [
    "product_id",
    "product_name",
    "category_name",
    "price",
    "currency",
    "volume",
    "unit",
    "volume_info",
    "package_unit",
    "package_size",
]
rows = select_all()

# Build the frame and show its first rows as the cell output.
df = pd.DataFrame(rows, columns=columns)
df.head()