# Recipes crawler

## Spiders

In [3]:
import locale
import unicodedata
from datetime import datetime
from urllib.parse import urlsplit

import requests
from fake_useragent import UserAgent
from lxml import html, etree

# Switch the process-wide LC_TIME category to Russian (UTF-8).
# NOTE(review): presumably meant for locale-dependent date parsing/formatting;
# the spider visible in this notebook only parses numeric ISO timestamps — confirm it is still needed.
locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8')

'ru_RU.UTF-8'

In [4]:
class BaseScrapper:
    """Single-pass iterator over the articles scraped from one page.

    The page at ``url`` is downloaded lazily on the first ``next()`` call,
    split into article elements by :meth:`_get_articles`, and every element is
    turned into a dict by :meth:`_extract_data_elem`.  Subclasses must
    implement both hooks.
    """

    def __init__(self, url: str, user_agent: str = None):
        """Prepare the HTTP session; nothing is downloaded yet.

        :param url: address of the page to scrape.
        :param user_agent: value for the ``User-Agent`` header.  When omitted,
            a random one is drawn from ``fake_useragent`` for this instance.
        """
        if user_agent is None:
            # Resolved here rather than in the signature: a default written in
            # the signature is evaluated once, when the class is defined, and
            # would then be shared by every instance.
            user_agent = UserAgent().random
        self._url: str = url
        # scheme://netloc prefix of the URL, used by __repr__ and available to
        # subclasses for resolving site-relative links.
        parts = urlsplit(url)
        if parts.scheme and parts.netloc:
            self._host: str = '{}://{}'.format(parts.scheme, parts.netloc)
        else:
            self._host: str = url
        self._data_to_return: list = None  # filled on first __next__
        self._current_pos: int = -1
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': user_agent})

    def __iter__(self) -> 'BaseScrapper':
        """Return ``self``; the scrapper is its own (one-shot) iterator."""
        return self

    def __next__(self):
        """Return the next extracted article dict, fetching the page on first use.

        :raises StopIteration: when every extracted article has been returned.
        """
        if self._data_to_return is None:
            self._data_to_return = self._fetch_data()
        self._current_pos += 1
        if self._current_pos >= len(self._data_to_return):
            raise StopIteration
        return self._data_to_return[self._current_pos]

    def _fetch_data(self) -> list:
        """Download the configured page and extract all articles from it."""
        page = self._get_page(self._url)
        return self._extract_data(page)

    def _get_page(self, url) -> html.HtmlElement:
        """Download ``url`` and parse the body into an lxml HTML tree.

        :raises Exception: when the response status is not 200 OK.
        """
        response = self._session.get(url)
        # The status code belongs to the response; the Session object has no
        # ``status_code`` attribute.
        if response.status_code != requests.codes.ok:
            raise Exception('Response code', response.status_code, 'for url', url)
        return html.fromstring(response.text)

    def _extract_data(self, parsed: html.HtmlElement) -> list:
        """Convert every article element of the parsed page into a dict.

        If an article cannot be converted, its markup is printed to help
        debugging and the original exception is re-raised.
        """
        result = []
        for article in self._get_articles(parsed):
            try:
                result.append(self._extract_data_elem(article))
            except Exception:
                print('Article:', etree.tostring(article))
                raise  # bare raise keeps the original traceback intact
        return result

    def _get_articles(self, parsed: html.HtmlElement) -> list:
        """Return the article elements found in the parsed page (subclass hook)."""
        raise NotImplementedError

    def _extract_data_elem(self, article: html.HtmlElement) -> dict:
        """Turn one article element into a dict of fields (subclass hook)."""
        raise NotImplementedError

    def __repr__(self) -> str:
        return '{clazz}({sep}host={host},{sep}fetched={fetched},\n)'.format(
            sep='\n\t',
            clazz=self.__class__.__name__,
            host=self._host,
            fetched=(self._data_to_return is not None),
        )

In [None]:
class MailRuScrapper(BaseScrapper):
    """Scrapper for the news items listed on the news.mail.ru front page."""

    _HOST = 'https://news.mail.ru'

    def __init__(self):
        # The base constructor's parameter is named ``url``; passing ``host=``
        # raised TypeError.
        super().__init__(url=self._HOST)
        # Prefix that the relative article links are appended to in
        # _extract_data_elem; set here so this class does not rely on the
        # base class providing it.
        self._host = self._HOST

    def _next_page_url(self, parsed: html.HtmlElement) -> str:
        """Placeholder for pagination; not implemented, returns ``None``."""
        pass

    def _get_articles(self, parsed: html.HtmlElement) -> list:
        """Return every element whose class attribute contains ``newsitem_height``."""
        return parsed.xpath('//div[contains(@class,"newsitem_height")]')

    def _extract_data_elem(self, article: html.HtmlElement) -> dict:
        """Extract the fields of one news item.

        :returns: dict with the serialized element (``raw``), the article
            ``url``, the NFKD-normalized ``header`` and ``source`` texts, and
            the ``date`` parsed from the item's ``datetime`` attribute.
        :raises IndexError: when one of the expected nodes is missing.
        """
        href = article.xpath('.//span[@class="cell"]/a[@class="newsitem__title link-holder"]/@href')[0]
        header = article.xpath('.//span[@class="newsitem__title-inner"]/text()')[0]
        stamp = article.xpath('.//div[@class="newsitem__params"]/span[contains(@class,"js-ago")]/@datetime')[0]
        source = article.xpath('.//div[@class="newsitem__params"]/span[@class="newsitem__param"]/text()')[0]
        return {
            'raw': etree.tostring(article),
            'url': self._host + href,
            'header': unicodedata.normalize('NFKD', header),
            'date': datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S%z'),
            'source': unicodedata.normalize('NFKD', source),
        }

## Persisting

Create mongodb docker container

In [1]:
# Create and start a detached container named `mongodb` from the `mongo` image,
# publishing container port 27017 on host port 27017.
!docker run --detach --publish 27017:27017 --name mongodb mongo

a30840108e9e6d7f28141e005ef0024c6b8d01b52b7cd03c7bc88e7ab564c2b2


Start the MongoDB container

In [None]:
# Start the existing container. It was created with `--name mongodb`;
# `mongo` is the image name, not the container name.
!docker start mongodb

Run spiders and fill database

In [None]:
# Connect to the MongoDB server on localhost:27017 and select the `food_db` database.
# NOTE(review): MongoClient is not imported in any visible cell — presumably
# `from pymongo import MongoClient`; confirm and add the import.
mongo = MongoClient('localhost', 27017)
db = mongo['food_db']

In [None]:
def fill_via_scrapper(recipes, collection):
    """Upsert scraped items into ``collection`` until a known one is met.

    Each item is stored under ``_id`` equal to its ``'url'`` value.  As soon
    as an upsert matches a document that already existed, processing stops
    and the remaining items are not consumed.

    :param recipes: iterable of dicts, each having a ``'url'`` key.
    :param collection: object exposing ``update_one(filter, update, upsert=...)``
        whose result has a ``matched_count`` attribute.
    """
    for item in recipes:
        selector = {'_id': item['url']}
        outcome = collection.update_one(selector, {'$set': item}, upsert=True)
        already_stored = outcome.matched_count != 0
        if already_stored:
            return

Get dump from mongodb container

In [None]:
# 1. Dump the `recipes` collection of `food_db` into /backup/ inside the `mongodb` container.
!docker exec -it mongodb mongodump --out=/backup/ --db=food_db --collection=recipes 
# 2. Pack /backup into dump.mongo.tgz (relative path; the next step copies it
#    from `/`, so this assumes the exec working directory is `/` — confirm).
!docker exec -it mongodb tar czf dump.mongo.tgz /backup
# 3. Copy the archive from the container to the current host directory.
!docker cp mongodb:/dump.mongo.tgz dump.mongo.tgz
# 4. Remove the temporary dump directory and archive from the container.
!docker exec -it mongodb rm -rf /backup /dump.mongo.tgz

Put dump into mongodb container

In [None]:
# 1. Copy the archive from the current host directory into the `mongodb` container.
!docker cp dump.mongo.tgz mongodb:/dump.mongo.tgz
# 2. Unpack it (relative path; assumes the exec working directory is `/`,
#    where the archive was copied — confirm).
!docker exec -it mongodb tar xzf dump.mongo.tgz
# 3. Restore the dump found under /backup into the running server.
!docker exec -it mongodb mongorestore /backup
# 4. Remove the unpacked dump and the archive from the container.
!docker exec -it mongodb rm -rf /backup /dump.mongo.tgz

Shut down the MongoDB container

In [None]:
# Stop the container. It was created with `--name mongodb`;
# `mongo` is the image name, not the container name.
!docker stop mongodb

## Data validation

In [None]:
from pprint import pprint

In [2]:
def get_data(ingr, collection, limit):
    """Print up to ``limit`` recipes whose ``ingr`` field matches ``ingr``.

    For every recipe the name, the URL and the pretty-printed ingredient
    value are written to stdout, followed by a blank line.

    :param ingr: value to look for in the ``ingr`` field.
    :param collection: object exposing ``find(filter, projection)`` that
        returns an iterable of dicts with ``name``, ``url`` and ``ingr`` keys.
    :param limit: maximum number of recipes to print; nothing is printed (and
        no query is issued) when it is zero or negative.
    """
    if limit <= 0:
        # Checking the limit only after printing would still show one recipe.
        return
    projection = {'name': 1, 'ingr': 1, 'url': 1}
    cursor = collection.find({'ingr': ingr}, projection)
    for shown, recipe in enumerate(cursor, start=1):
        print(recipe['name'])
        print('url:', recipe['url'])
        pprint(recipe['ingr'])
        print()
        if shown >= limit:
            break

In [None]:
# Show three sample recipes that list 'фарш' (minced meat) as an ingredient.
# The database handle defined in this notebook is `db` (= mongo['food_db']);
# no `food_db` name is defined in the visible cells.
get_data('фарш', db.recipes, 3)