In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
import pandas as pd

# Trying out Cookpad

In [108]:
# Scratch cell: the condition is the literal False, so nothing is printed and the
# expression evaluates to None.
print("meong") if False else None

In [2]:
def write_into_jsonl_files(file: str, content: dict):
    """Append a single record to a JSON Lines file.

    Args:
        file: Path of the ``.jsonl`` file; it is opened in append mode.
        content: The record handed to the jsonlines writer.
    """
    # Imported lazily so the third-party package is only needed when a record is written.
    import jsonlines

    with jsonlines.open(file, mode='a') as jsonl_out:
        jsonl_out.write(content)

def write_normal_txt_files(file: str, content: str):
    """Append one line of text to a plain-text file.

    The file is opened with an explicit UTF-8 encoding so the output is the
    same on every platform and non-ASCII text cannot fail on systems whose
    default encoding is a legacy code page.

    Args:
        file: Path of the text file; it is opened in append mode.
        content: The text to append. A trailing newline is added.
    """
    with open(file, mode='a', encoding='utf-8') as writer:
        writer.write(content + '\n')

In [3]:
class SeleniumCrawlOption(object):
    """Bag of crawl settings, each falling back to ``default_opt`` when not given.

    Attributes set on the instance:
        limit: read by the crawler as the maximum number of scraped items.
        sleep_time: seconds the crawler sleeps between page loads.
        file_output_type: output format name (``'jsonl'`` by default).
        file_output_path: path prefix the crawler prepends to its output file names.
    """

    # Fallback values used for any option that is not passed to __init__.
    default_opt = {
        'limit' : 1000,
        'sleep_time' : 2,
        'file_output_type' : 'jsonl',
        'file_output_path' : 'tweets.jsonl'
    }

    def __init__(self, **args):
        # Caller-supplied values win over the defaults; unrecognised keys are ignored.
        resolved = {**self.default_opt, **args}
        self.limit = resolved['limit']
        self.sleep_time = resolved['sleep_time']
        self.file_output_type = resolved['file_output_type']
        self.file_output_path = resolved['file_output_path']

class BaseSeleniumCrawler(object):
    """Base class for Selenium-driven crawlers.

    The constructor builds the browser options and starts the driver.
    Subclasses implement ``crawl``, ``_scrape`` and ``_get_next_url``.
    """

    def __init__(self, crawl_ops: SeleniumCrawlOption, browser:str = 'firefox', 
                 driver_path:str = None, headless:bool = False):
        """Configure the browser and start the WebDriver.

        Args:
            crawl_ops: Crawl settings (limit, sleep time, output type/path).
            browser: Browser name, case-insensitive. Only ``'firefox'`` is supported.
            driver_path: Path to the driver executable; required for Firefox.
            headless: When true, Firefox is started with the ``-headless`` argument.

        Raises:
            NotImplementedError: If ``browser`` is not Firefox.
            AssertionError: If ``driver_path`` is ``None``.
        """
        self.browser = browser.lower()
        if self.browser != 'firefox':
            raise NotImplementedError()
        assert driver_path is not None, "driver_path is required for Firefox"
        self.ops = Options()
        self.driver_path = driver_path
        if headless:
            # The options object lives on the instance; the original code referenced
            # a bare `ops` here, which raised NameError whenever headless=True.
            self.ops.add_argument("-headless")
        self.crawl_ops = crawl_ops
        self._init_driver()
    
    def _init_driver(self):
        """Create ``self.driver`` for the configured browser."""
        if self.browser == 'firefox':
            self.driver = webdriver.Firefox(executable_path=self.driver_path, options=self.ops)
        
    def crawl(self, start_webpage, crawl_args: SeleniumCrawlOption = SeleniumCrawlOption,
              output_file='jsonl', output_type='jsonl'):
        """Run the crawl starting from ``start_webpage``; implemented by subclasses."""
        raise NotImplementedError
    
    def _scrape(self):
        """Scrape the pages queued for the current step; implemented by subclasses."""
        raise NotImplementedError
    
    def _get_next_url(self):
        """Return the URL of the next page to visit; implemented by subclasses."""
        raise NotImplementedError
    
    def _write_result(self, content) -> None:
        """Append one scraped record to the configured output.

        Only the ``'jsonl'`` output type is handled; any other type is ignored.
        The record goes to ``file_output_path + 'output.jsonl'``, so the path is
        used as a plain string prefix.
        """
        if self.crawl_ops.file_output_type == 'jsonl':
            write_into_jsonl_files(self.crawl_ops.file_output_path + 'output.jsonl', content)


In [225]:
# Scratch cell: show the current Unix timestamp.
# `time` is imported here because no earlier cell in the notebook imports it,
# so on a fresh kernel this cell would otherwise fail with NameError.
import time
time.time()

1569774970.1472015

In [33]:
import time
import traceback
class CookpadSelector:
    """CSS selectors the Cookpad crawler uses to locate page elements."""

    # Selectors read on a single recipe page.
    title_recipe = 'h1.recipe-title'
    description_recipe = 'div.recipe-show__story'
    ingredient_recipe = 'li.ingredient'
    step_recipe = 'div.step__description'

    # Selectors read on a listing page: recipe links and the "next page" link.
    list_url_scrape = 'a.media'
    next_page_url = 'a.pagination__next'

class CookpadSeleniumCrawler(BaseSeleniumCrawler):
    """Crawler that walks Cookpad listing pages and scrapes each linked recipe.

    For every listing page it collects the recipe links, records the next
    listing URL in a checkpoint file, scrapes the recipes, and then moves on
    until ``crawl_ops.limit`` recipes were written or no next page exists.
    """
    
    def __init__(self,crawl_ops: SeleniumCrawlOption, browser:str = 'firefox', 
                 driver_path:str = None, headless:bool = False):
        super().__init__(crawl_ops, browser, driver_path, headless)
        
    def crawl(self, start_webpage, crawl_args: SeleniumCrawlOption = SeleniumCrawlOption,
              output_file='jsonl', output_type='jsonl'):
        """Crawl listing pages starting at ``start_webpage``.

        Args:
            start_webpage: URL of the first listing page.
            crawl_args: Unused; kept for signature compatibility with the base class.
            output_file: Unused; kept for signature compatibility.
            output_type: Unused; kept for signature compatibility.

        Any exception raised while crawling is printed with its traceback and
        not re-raised. The browser session is always shut down at the end.
        """
        self.start_webpage = start_webpage
        self._count_scraped = 0
        print(self.crawl_ops.limit)
        try:
            # The first navigation is inside the try block so a failure here
            # still reaches the `finally` clause and releases the browser.
            self.driver.get(start_webpage)
            while self._count_scraped < self.crawl_ops.limit:
                self._set_list_scrapped_url()
                next_url = self._get_next_url()
                if next_url is not None:
                    write_normal_txt_files(self.crawl_ops.file_output_path + 'checkpoint.txt', next_url)

                self._scrape()
                if next_url is not None:
                    print("Next to {}".format(next_url))
                print("Scrapped {} times".format(self._count_scraped))

                if next_url is None:
                    # Last listing page: its recipes were scraped above, nothing left to visit.
                    break
                self.driver.get(next_url)
        except Exception:
            traceback.print_exc()
        finally:
            # quit() ends the whole WebDriver session; close() only closes the window.
            self.driver.quit()

    def _trick_avoid_get_banned(self):
        """Clear cookies and pause for ``crawl_ops.sleep_time`` seconds between pages."""
        self.driver.delete_all_cookies() # avoid getting banned
        time.sleep(self.crawl_ops.sleep_time) # avoid getting banned

    def _handle_soft_ban(self):
        """Click-and-hold on the page root, then wait, after a failed extraction."""
        from selenium.webdriver.common.action_chains import ActionChains
        print("Light banned, need to click the bot clicker")
        # Use this crawler's own driver rather than a notebook-level variable.
        page_root = self.driver.find_element_by_css_selector('html')
        ActionChains(self.driver).click_and_hold(page_root).perform()
        time.sleep(5)
    
    def _scrape(self):
        """Scrape queued recipe URLs until the queue is empty or the limit is hit.

        A recipe whose content cannot be extracted is skipped: nothing is
        written for it and it does not count towards the limit.
        """
        while self._list_scraped_url and self._count_scraped < self.crawl_ops.limit:
            site_recipe = self._list_scraped_url.pop()
            self.driver.get(site_recipe)
            self._trick_avoid_get_banned()
            try:
                content = self._scrape_get_content(site_recipe)
            except Exception:
                self._handle_soft_ban()
                # No content was produced for this URL, so there is nothing to
                # write; writing here would reuse the previous recipe's record
                # or fail on an unbound variable.
                continue
            self._write_result(content)
            self._count_scraped += 1
    
    def _scrape_get_content(self, site_recipe) -> dict:
        """Extract one recipe from the page currently loaded in the driver.

        Args:
            site_recipe: URL of the recipe page, stored under ``'link'``.

        Returns:
            Dict with title, description, ingredient list, step list, the link,
            the extraction time from ``time.time()`` and the crawl's start URL.
        """
        print("Crawled {}".format(site_recipe))
        find_one = self.driver.find_element_by_css_selector
        find_all = self.driver.find_elements_by_css_selector
        title_recipe = find_one(CookpadSelector.title_recipe).text
        description_recipe = find_one(CookpadSelector.description_recipe).text
        ingredient_recipe = [node.text for node in find_all(CookpadSelector.ingredient_recipe)]
        step_recipe = [node.text for node in find_all(CookpadSelector.step_recipe)]
        return {'title' : title_recipe, 
                'description' : description_recipe, 
                'ingredient' : ingredient_recipe,
                'step' : step_recipe,
                'link' : site_recipe,
                'timestamps' : time.time(),
                'start' : self.start_webpage}
        
    def _set_list_scrapped_url(self):
        """Collect the distinct recipe links of the current listing page."""
        link_to_recipe = self.driver.find_elements_by_css_selector(CookpadSelector.list_url_scrape)
        hrefs = {anchor.get_attribute('href') for anchor in link_to_recipe}
        # Anchors without an href yield None, which cannot be navigated to.
        hrefs.discard(None)
        self._list_scraped_url = list(hrefs)
    
    def _get_next_url(self):
        """Return the href of the "next page" link, or ``None`` on the last page."""
        # The plural lookup returns an empty list instead of raising when the link is absent.
        next_links = self.driver.find_elements_by_css_selector(CookpadSelector.next_page_url)
        if not next_links:
            return None
        return next_links[0].get_attribute('href')

In [None]:
# Crawl the "masakan padang" search results, starting at page 3, with a 1 s pause per page.
# NOTE(review): `path` is an absolute, machine-specific geckodriver location.
path = "C:/Users/HaryoAW/Documents/Driver/selenium_gecko/geckodriver.exe"
start_webpage = 'https://cookpad.com/id/cari/masakan%20padang?page=3'
opt = SeleniumCrawlOption(
    file_output_path='../data/scrapped/cookpad_selenium/',
    sleep_time=1,
)
crawler_cookpad = CookpadSeleniumCrawler(opt, driver_path=path, headless=False)
crawler_cookpad.crawl(start_webpage)

1000
Crawled https://cookpad.com/id/resep/10653666-gulai-kikil
Crawled https://cookpad.com/id/resep/10749826-dendeng-balado
Crawled https://cookpad.com/id/resep/10667748-kentang-balado
Crawled https://cookpad.com/id/resep/10748878-terong-balado-simple-pedas
Crawled https://cookpad.com/id/resep/10683005-gulai-ayam-padang
Crawled https://cookpad.com/id/resep/10754294-gulai-ayam-%F0%9F%90%93%F0%9F%90%94%F0%9F%90%A4%F0%9F%90%A3
Crawled https://cookpad.com/id/resep/10754805-gulai-jengkol
Crawled https://cookpad.com/id/resep/10754902-udang-saos-padang
Crawled https://cookpad.com/id/resep/10754276-gulai-ayam-kentang
Crawled https://cookpad.com/id/resep/10754766-kepiting-saos-padang-resep-sesuai-selera-saya
Crawled https://cookpad.com/id/resep/10755188-gule-kepala-kakap
Crawled https://cookpad.com/id/resep/10754356-keripik-balado-udang
Crawled https://cookpad.com/id/resep/10755531-sapi-balado-extra-pedas-non-msg
Crawled https://cookpad.com/id/resep/10755921-gulai-asam-ikan-kembung
Crawled http

In [13]:
# Build a crawler (which starts a browser) without running a crawl, for the manual cells below.
# NOTE(review): `path` is an absolute, machine-specific geckodriver location.
path = "C:/Users/HaryoAW/Documents/Driver/selenium_gecko/geckodriver.exe"
start_webpage = 'https://cookpad.com/id/cari/masakan%20padang?page=3'
opt = SeleniumCrawlOption(
    file_output_path='../data/scrapped/cookpad_selenium/',
    sleep_time=2,
)
crawler_cookpad = CookpadSeleniumCrawler(opt, driver_path=path, headless=False)

In [18]:
# Load the start listing page in the crawler's browser.
crawler_cookpad.driver.get(start_webpage)

In [25]:
# Grab the root <html> element of the page currently loaded in the browser.
elemnt = crawler_cookpad.driver.find_element_by_css_selector('html')

In [28]:
from selenium.webdriver.common.action_chains import ActionChains

In [30]:
# Action chain bound to the crawler's driver, used by the click-and-hold cell below.
test = ActionChains(crawler_cookpad.driver)

In [31]:
# Manual try-out of the click-and-hold on the <html> element that `_scrape` performs
# when content extraction fails.
test.click_and_hold(elemnt).perform()

In [None]:
# Let pandas show up to 1000 characters per column before truncating.
pd.set_option('display.max_colwidth', 1000)

In [234]:
# Start a standalone Firefox driver (not wrapped in a crawler); relies on `path` from an earlier cell.
driver = webdriver.Firefox(executable_path=path)

In [236]:
# Open page 17 of the "masakan indonesia" search results in the standalone driver.
driver.get("https://cookpad.com/id/cari/masakan%20indonesia?page=17")

In [37]:
# Start a new crawler (rebinding `crawler_cookpad`) and open the search-categories page.
# Relies on `opt` and `path` from an earlier cell.
crawler_cookpad = CookpadSeleniumCrawler(opt, driver_path = path, headless=False)
crawler_cookpad.driver.get("https://cookpad.com/id/search_categories")

In [46]:
# Collect the anchor elements matching '.table-view__item a' on the current page.
extract_url = crawler_cookpad.driver.find_elements_by_css_selector('.table-view__item a')

In [48]:
# Turn the collected anchors into a list of their href values; this feeds the crawl loop below.
link_todo = list(map(lambda x : x.get_attribute('href'),extract_url))

In [None]:
# Crawl every link collected in `link_todo`, using a fresh browser per link.
# The driver path and crawl options are the same for every link, so they are built once.
# NOTE(review): `path` is an absolute, machine-specific geckodriver location.
path = "C:/Users/HaryoAW/Documents/Driver/selenium_gecko/geckodriver.exe"
opt = SeleniumCrawlOption(file_output_path = '../data/scrapped/cookpad_selenium/', sleep_time = 2.8)
for link in link_todo:
    if not link:
        # Anchors without an href yield None; there is nothing to crawl.
        continue
    start_webpage = link
    try:
        crawler_cookpad = CookpadSeleniumCrawler(opt, driver_path = path, headless=False)
        # Without this call a browser is opened for each link but nothing is scraped.
        crawler_cookpad.crawl(start_webpage)
    except Exception:
        # Show why this link failed instead of hiding the error, then move on.
        traceback.print_exc()
        print("go to next link")