# Методы сбора и обработки данных из сети Интернет
## Урок 3. MongoDB, SQLAlchemy
### Задание
1. Развернуть у себя на компьютере/виртуальной машине/хостинге MongoDB и реализовать функцию, записывающую собранные данные в созданную БД
2. Написать функцию, которая производит поиск и выводит на экран рецепты с перечисленными ингредиентами
3. *Написать функцию, которая будет добавлять в вашу базу данных только новые рецепты с сайта. Доработать функцию, которая будет обновлять старые.

### Запуск MongoDB

In [3]:
!docker run -d -p 27017:27017 --name scrapy_mongo mongo

Unable to find image 'mongo:latest' locally
latest: Pulling from library/mongo

a4a261c9: Pulling fs layer
20cdee96: Pulling fs layer
60e1d0de: Pulling fs layer
7668deea: Pulling fs layer
87a82b4c: Pulling fs layer
139e0836: Pulling fs layer
9c8680b4: Pulling fs layer
df30d947: Pulling fs layer
5ef3d2ce: Pulling fs layer
54ed6b43: Pulling fs layer
e535ddb8: Pulling fs layer
dad81b2a: Pulling fs layer
Digest: sha256:7a1406bfc05547b33a3b7b112eda6346f42ea93ee06b74d30c4c47dfeca0d5f2

In [7]:
!docker start scrapy_mongo

scrapy_mongo


In [6]:
!docker stop scrapy_mongo

scrapy_mongo


### Секция импорта

In [20]:
from pymongo import MongoClient
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

### Работа с БД

In [9]:
# Create a client for the MongoDB server at localhost:27017 (the port
# published by the docker container started above).
client = MongoClient('localhost', 27017)
# Handle to the 'users_db_224' database on that server.
db = client['users_db_224']

In [14]:
# Handle to the 'users' collection of the database.
users = db.users
# Insert a single document containing only an 'author' field
# (presumably a connectivity check, judging by the test value).
users.insert_one({'author':'Test_author2'})

<pymongo.results.InsertOneResult at 0x10adbbfa0>

In [18]:
# Remove every document from the collection: an empty filter matches all documents.
users.delete_many({})

<pymongo.results.DeleteResult at 0x10adc4910>

In [17]:
# Print every document currently stored in the collection. A plain loop is
# used because the iteration is performed only for its side effect; a list
# comprehension would build a throwaway list of None values.
for document in users.find():
    print(document)

[]

### Скраппер

In [21]:
class BaseScrapper:
    """Base iterator over the recipes of a paginated recipe listing.

    Subclasses provide page download (``_get_page``) and page parsing
    (``_extract_recipes``). Iterating the scrapper fetches pages one by one,
    yields the recipes of each page, and stops as soon as a fetched page
    contains exactly the same set of recipe URLs as the previously fetched one.
    """

    def __init__(self, category: str, init_page_num: int = 0, crawl_delay: int = 1):
        """Store the crawl settings and initialise the iteration state.

        Args:
            category: Category identifier; stored as-is in ``_category``.
            init_page_num: Page counter start value. The counter is incremented
                before every fetch, so the first fetched page is
                ``init_page_num + 1``.
            crawl_delay: Seconds to sleep after each newly accepted page.
        """
        self._host: str = 'https://www.povarenok.ru/recipes'
        self._category: str = category
        self._crawl_delay: int = crawl_delay
        self._current_page_num: int = init_page_num
        # URLs seen on the most recently accepted page (end-of-listing detection).
        self._last_recipes_urls: set = set()
        # Recipes fetched but not yet yielded. Recipes are dicts, which are
        # unhashable, so the buffer is a list rather than a set.
        self._data_to_return: list = []

    def __iter__(self) -> 'BaseScrapper':
        """Return the scrapper itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next recipe, fetching a new page when the buffer is empty.

        Raises:
            StopIteration: When a fetched page has the same set of recipe URLs
                as the previously accepted page (this includes an empty first
                page, whose URL set equals the initial empty set).
        """
        while not self._data_to_return:
            page = self._next_page()
            recipes = list(self._extract_recipes(page))
            recipes_urls = {recipe['url'] for recipe in recipes}
            if recipes_urls == self._last_recipes_urls:
                raise StopIteration
            self._last_recipes_urls = recipes_urls
            # Stored reversed so that pop() from the end yields page order.
            self._data_to_return = recipes[::-1]
            time.sleep(self._crawl_delay)
        return self._data_to_return.pop()

    def _next_page(self) -> 'bs':
        """Advance the page counter and fetch the corresponding page."""
        self._current_page_num += 1
        return self._get_page(self._current_page_num)

    def _get_page(self, page_num: int) -> 'bs':
        """Download and parse listing page ``page_num``; implemented by subclasses."""
        raise NotImplementedError

    def _extract_recipes(self, page: 'bs') -> list:
        """Return the recipes found on ``page``; implemented by subclasses.

        Each recipe must be a mapping with at least a ``'url'`` key.
        """
        raise NotImplementedError

    def reset(self):
        """Rewind the scrapper so that iteration starts again from page 1."""
        self._current_page_num = 0
        # Both pieces of iteration state must be cleared; otherwise a re-run
        # over a single-page category would stop immediately because the first
        # page would match the remembered URL set.
        self._last_recipes_urls = set()
        self._data_to_return = []

    def __repr__(self) -> str:
        """Return a multi-line description with host, category and crawl delay."""
        return '{clazz}({sep}host={host},{sep}category={category},{sep}crawl_delay={crawl_delay},\n)'.format(
            sep='\n\t',
            clazz=self.__class__.__name__,
            host=self._host,
            category=self._category,
            crawl_delay=self._crawl_delay,
        )

In [None]:
class PovarenokScrapper(BaseScrapper):
    """Scrapper for the recipe listings of povarenok.ru."""

    # Short category code -> listing path segment appended to the host.
    _CATEGORIES = {
        'ch': 'kitchen/73',
        'fr': 'kitchen/64',
        'it': 'kitchen/56',
        'jp': 'kitchen/79',
        'ru': 'kitchen/101',
        'ua': 'kitchen/104',
    }

    # Seconds to wait for the server before giving up on a request.
    _REQUEST_TIMEOUT = 30

    def __init__(self, category: str, crawl_delay: int = 1):
        """Create a scrapper for one category.

        Args:
            category: One of the keys of ``_CATEGORIES``.
            crawl_delay: Seconds to sleep between page fetches.

        Raises:
            KeyError: If ``category`` is not a known category code.
        """
        # crawl_delay must be passed by keyword: the second positional
        # parameter of the base constructor is init_page_num.
        super().__init__(category, crawl_delay=crawl_delay)
        self._host: str = 'https://www.povarenok.ru/recipes'
        self._category: str = self._CATEGORIES[category]

    def _get_page(self, page_num: int) -> bs:
        """Download listing page ``page_num`` and parse it with the lxml parser."""
        url = f'{self._host}/{self._category}/~{page_num}/'
        # Without a timeout requests may wait on an unresponsive server forever.
        content = requests.get(url, timeout=self._REQUEST_TIMEOUT).text
        return bs(content, 'lxml')

    def _extract_recipes(self, page: bs) -> list:
        """Return one recipe dict per ``<article>`` element of ``page``.

        A list is returned because the recipe dicts are unhashable and cannot
        be stored in a set.
        """
        return [self._extract_recipe(article) for article in page.find_all('article')]

    def _extract_recipe(self, article: bs) -> dict:
        """Build a recipe dict from a single ``<article>`` element.

        The result has the keys ``raw``, ``name``, ``url``, ``ingr``
        (comma-separated, lower-cased ingredient names), ``views``, ``comm``,
        ``likes`` and ``category``. Missing elements are not guarded against:
        a selector that matches nothing propagates as AttributeError/TypeError.
        """
        recipe = {}
        recipe['raw'] = str(article)
        recipe['name'] = article.select_one('article > h2').get_text().strip()
        recipe['url'] = article.select_one('article > h2 > a')['href']
        recipe['ingr'] = ','.join(
            span.get_text().strip().lower() for span in article.select('div.ingr_fast span')
        )
        recipe['views'] = article.select_one('ul.icons-wrap .i-views').get_text()
        recipe['comm'] = article.select_one('ul.icons-wrap .i-comments').get_text()
        recipe['likes'] = article.select_one('ul.icons-wrap .i-likes').get_text()
        recipe['category'] = self._category
        return recipe

In [None]:
class RussianFoodScrapper(BaseScrapper):
    """Scrapper for the recipe listings of russianfood.com.

    NOTE(review): the CSS selectors in ``_extract_recipe`` are identical to the
    ones used for povarenok.ru; confirm that they match the russianfood.com
    markup before relying on the extracted fields.
    """

    # Short category code -> listing path (including its query string).
    _CATEGORIES = {
        'ch': 'bytype/?fid=132',
        'fr': 'bytype/?fid=102',
        'it': 'bytype/?fid=110',
        'jp': 'bytype/?fid=154',
        'ru': 'bytype/?fid=103',
        'ua': 'bytype/?fid=104',
    }

    # Seconds to wait for the server before giving up on a request.
    _REQUEST_TIMEOUT = 30

    def __init__(self, category: str, crawl_delay: int = 1):
        """Create a scrapper for one category.

        Args:
            category: One of the keys of ``_CATEGORIES``.
            crawl_delay: Seconds to sleep between page fetches.

        Raises:
            KeyError: If ``category`` is not a known category code.
        """
        # crawl_delay must be passed by keyword: the second positional
        # parameter of the base constructor is init_page_num.
        super().__init__(category, crawl_delay=crawl_delay)
        # No trailing slash: _get_page joins host and category with '/'.
        self._host: str = 'https://www.russianfood.com/recipes'
        self._category: str = self._CATEGORIES[category]

    def _get_page(self, page_num: int) -> bs:
        """Download listing page ``page_num`` and parse it with the lxml parser."""
        # The category already ends with a query string, so the page number is
        # added as another query parameter instead of a path suffix (a path
        # suffix would become part of the fid value).
        # NOTE(review): the parameter name 'page' is assumed; verify on the site.
        url = f'{self._host}/{self._category}&page={page_num}'
        # Without a timeout requests may wait on an unresponsive server forever.
        content = requests.get(url, timeout=self._REQUEST_TIMEOUT).text
        return bs(content, 'lxml')

    def _extract_recipes(self, page: bs) -> list:
        """Return one recipe dict per ``<article>`` element of ``page``.

        A list is returned because the recipe dicts are unhashable and cannot
        be stored in a set.
        """
        return [self._extract_recipe(article) for article in page.find_all('article')]

    def _extract_recipe(self, article: bs) -> dict:
        """Build a recipe dict from a single ``<article>`` element.

        The result has the keys ``raw``, ``name``, ``url``, ``ingr``
        (comma-separated, lower-cased ingredient names), ``views``, ``comm``,
        ``likes`` and ``category``. Missing elements are not guarded against:
        a selector that matches nothing propagates as AttributeError/TypeError.
        """
        recipe = {}
        recipe['raw'] = str(article)
        recipe['name'] = article.select_one('article > h2').get_text().strip()
        recipe['url'] = article.select_one('article > h2 > a')['href']
        recipe['ingr'] = ','.join(
            span.get_text().strip().lower() for span in article.select('div.ingr_fast span')
        )
        recipe['views'] = article.select_one('ul.icons-wrap .i-views').get_text()
        recipe['comm'] = article.select_one('ul.icons-wrap .i-comments').get_text()
        recipe['likes'] = article.select_one('ul.icons-wrap .i-likes').get_text()
        recipe['category'] = self._category
        return recipe