# Recipes scraping

## Spiders

In [3]:
import locale
import time
import unicodedata
from datetime import datetime

import requests
from fake_useragent import UserAgent
from lxml import html, etree

# Switch the LC_TIME category to the Russian UTF-8 locale for this process.
# LC_TIME governs locale-dependent date/time formatting and parsing (e.g. the
# month and weekday names used by strftime/strptime).
# Raises locale.Error if 'ru_RU.UTF-8' is not installed on the machine.
locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8')

'ru_RU.UTF-8'

In [6]:
class BaseScrapper:
    """Iterator that crawls a paginated listing one page at a time.

    Subclasses provide three hooks:

    * ``_next_url``   -- URL of the next page to download;
    * ``_get_items``  -- the item elements found on a parsed page;
    * ``_parse_item`` -- one item element converted to a dict with a ``'url'`` key.

    Iterating yields the parsed item dicts.  Iteration stops as soon as a
    freshly crawled page produces exactly the same set of item URLs as the
    previously crawled page (this includes an empty very first page).
    """

    # NOTE: the default ``user_agent`` is evaluated once, when the class is
    # defined, so every instance created without an explicit value shares the
    # same random user agent string.
    def __init__(self, user_agent: str = UserAgent().random, crawl_delay: int = 0):
        """Create the HTTP session.

        :param user_agent: value sent in the ``User-Agent`` header.
        :param crawl_delay: seconds to sleep before every page request.
        """
        self._crawl_delay = crawl_delay
        # URLs seen on the most recently crawled page; used to detect the end.
        self._last_item_urls: set = set()
        # Items of the current page that have not been yielded yet.
        self._data_to_return: list = []
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': user_agent})

    def __iter__(self) -> 'BaseScrapper':
        return self

    def __next__(self) -> dict:
        """Return the next item, crawling further pages when the buffer is empty.

        Items of a page are handed out from the end of the page's list
        (``list.pop()``), i.e. in reverse page order.

        :raises StopIteration: when a crawled page repeats the previous
            page's set of item URLs.
        """
        # A loop (rather than recursion) keeps the stack flat no matter how
        # many consecutive pages turn out to be empty.
        while not self._data_to_return:
            data = self._crawl_data()
            item_urls = {item['url'] for item in data}
            if item_urls == self._last_item_urls:
                raise StopIteration
            self._last_item_urls = item_urls
            self._data_to_return = data
        return self._data_to_return.pop()

    def _crawl_data(self) -> list:
        """Download the next page and return the list of items parsed from it."""
        url = self._next_url()
        page = self._get_page(url)
        return self._extract_data(page)

    def _get_page(self, url: str) -> html.HtmlElement:
        """Fetch ``url`` (after the configured delay) and parse it as HTML.

        :raises Exception: when the response status is not 200 OK.
        """
        time.sleep(self._crawl_delay)
        response = self._session.get(url)
        # The status code lives on the response object, not on the session.
        if response.status_code != requests.codes.ok:
            raise Exception('Response code', response.status_code, 'for url', url)
        return html.fromstring(response.text)

    def _extract_data(self, parsed: html.HtmlElement) -> list:
        """Parse every item on the page; on failure print the item and re-raise."""
        data = []
        for item in self._get_items(parsed):
            try:
                data.append(self._parse_item(item))
            except Exception:
                # Show the offending markup to make the failure debuggable,
                # then propagate the original exception with its traceback.
                print('item:', etree.tostring(item))
                raise
        return data

    def _next_url(self) -> str:
        """Return the URL of the next page to crawl (subclass hook)."""
        raise NotImplementedError

    def _get_items(self, page: html.HtmlElement) -> list:
        """Return the item elements contained in ``page`` (subclass hook)."""
        raise NotImplementedError

    def _parse_item(self, item: html.HtmlElement) -> dict:
        """Convert one item element into a dict with a ``'url'`` key (subclass hook)."""
        raise NotImplementedError

In [7]:
class RussianFoodScrapper(BaseScrapper):
    """Scrapper for the paginated listing at https://www.russianfood.com."""

    _HOST = 'https://www.russianfood.com'

    def __init__(self):
        super().__init__()
        # Must exist before the first `_next_url` call; None means
        # "no page requested yet".
        self._cur_page = None

    def _next_url(self) -> str:
        """Return the URL of the next listing page.

        Pages run from ``start_page`` to ``end_page``.  Once ``end_page`` has
        been reached the same URL is returned again, so the base iterator sees
        a repeated set of item URLs and stops.
        """
        start_page = 1
        end_page = 220
        if not self._cur_page:
            self._cur_page = start_page
        elif self._cur_page < end_page:
            # Strict comparison: never advance past `end_page`.
            self._cur_page += 1
        return f'{self._HOST}/?page={self._cur_page}'

    def _get_items(self, page: html.HtmlElement) -> list:
        """Return the announcement blocks found on a listing page."""
        return page.xpath('//div[@class="annonce annonce_orange"]')

    def _parse_item(self, item: html.HtmlElement) -> dict:
        """Build the recipe dict for one announcement block.

        :raises IndexError: when one of the XPath lookups matches nothing.
        """
        # NOTE(review): the `newsitem__*` selectors and the datetime parsing
        # stored under 'ingr' look like leftovers from a news scrapper --
        # confirm them against the actual russianfood.com markup.
        return {
            'raw': etree.tostring(item),
            'url': self._HOST + item.xpath('.//table[@class="blog_content_table"]//noindex/a[@class="detail"]/@href')[0],
            'name': unicodedata.normalize('NFKD', item.xpath('.//span[@class="newsitem__title-inner"]/text()')[0]),
            'ingr': datetime.strptime(item.xpath('.//div[@class="newsitem__params"]/span[contains(@class,"js-ago")]/@datetime')[0], '%Y-%m-%dT%H:%M:%S%z'),
            'recipe': '',
            'rating': unicodedata.normalize('NFKD', item.xpath('.//div[@class="newsitem__params"]/span[@class="newsitem__param"]/text()')[0]),
        }

## Persisting

Create the MongoDB Docker container

In [1]:
# Run the `mongo` image detached (-d) as a container named "mongodb",
# publishing container port 27017 on host port 27017.
!docker run -d -p 27017:27017 --name mongodb mongo

a30840108e9e6d7f28141e005ef0024c6b8d01b52b7cd03c7bc88e7ab564c2b2


Start the MongoDB container

In [None]:
# Start the existing container. It was created with `--name mongodb`;
# "mongo" is the image name, not the container name.
!docker start mongodb

Run spiders and fill database

In [None]:
# Connect to the MongoDB server published on localhost:27017 and select the
# database that holds the scraped recipes.
# NOTE(review): `MongoClient` is not imported in the visible cells --
# presumably `from pymongo import MongoClient`; confirm and add the import.
mongo = MongoClient(host='localhost', port=27017)
db = mongo.get_database('food_db')

In [None]:
def fill_via_scrapper(recipes, collection):
    """Upsert recipes into ``collection`` until an already stored one is hit.

    Every recipe is written with its ``'url'`` as the document ``_id``.  The
    first update that matches an existing document ends the run; that recipe
    is still updated, the remaining ones are not consumed.

    :param recipes: iterable of recipe dicts, each with a ``'url'`` key.
    :param collection: object exposing ``update_one(filter, update, upsert=...)``
        whose result has a ``matched_count`` attribute.
    """
    for entry in recipes:
        outcome = collection.update_one(
            {'_id': entry['url']},
            {'$set': entry},
            upsert=True,
        )
        already_stored = outcome.matched_count != 0
        if already_stored:
            return

Export a dump from the MongoDB container to the host

In [None]:
# 1. Dump the `recipes` collection of `food_db` into /backup/ inside the container.
!docker exec -it mongodb mongodump --out=/backup/ --db=food_db --collection=recipes 
# 2. Pack /backup into dump.mongo.tgz. The archive path is relative to the
#    container's working directory -- presumably `/`, given the paths used below.
!docker exec -it mongodb tar czf dump.mongo.tgz /backup
# 3. Copy the archive from the container to the current host directory.
!docker cp mongodb:/dump.mongo.tgz dump.mongo.tgz
# 4. Remove the temporary dump directory and archive from the container.
!docker exec -it mongodb rm -rf /backup /dump.mongo.tgz

Restore the dump from the host into the MongoDB container

In [None]:
# 1. Copy the archive from the current host directory into the container.
!docker cp dump.mongo.tgz mongodb:/dump.mongo.tgz
# 2. Unpack it inside the container; the archive was built from /backup, so
#    this presumably recreates /backup (assuming the working directory is `/`).
!docker exec -it mongodb tar xzf dump.mongo.tgz
# 3. Load the unpacked dump into the running MongoDB instance.
!docker exec -it mongodb mongorestore /backup
# 4. Remove the temporary dump directory and archive from the container.
!docker exec -it mongodb rm -rf /backup /dump.mongo.tgz

Shut down the MongoDB container

In [None]:
# Stop the container. It was created with `--name mongodb`;
# "mongo" is the image name, not the container name.
!docker stop mongodb

## Data validation

In [None]:
from pprint import pprint

In [2]:
def get_data(ingr, collection, limit):
    """Print up to ``limit`` recipes whose ``ingr`` field matches ``ingr``.

    For every recipe the name, the URL and the pretty-printed ``ingr`` value
    are printed, followed by a blank line.

    :param ingr: value used in the ``{'ingr': ingr}`` query filter.
    :param collection: object exposing ``find(filter, projection)`` that
        returns an iterable of dict-like documents.
    :param limit: maximum number of recipes to print; nothing is printed
        (and no query is issued) when it is zero or negative.
    """
    if limit <= 0:
        # Previously one recipe was printed before the limit was checked.
        return
    projection = {'name': 1, 'ingr': 1, 'url': 1}
    matches = collection.find({'ingr': ingr}, projection)
    for shown, recipe in enumerate(matches, start=1):
        print(recipe['name'])
        print('url:', recipe['url'])
        pprint(recipe['ingr'])
        print()
        if shown >= limit:
            break

In [None]:
# Show up to three stored recipes for the ingredient 'фарш'.  The database
# handle created earlier in this notebook is `db`; `food_db` is only the
# database's name, not a variable.
get_data('фарш', db.recipes, 3)