In [28]:
from bs4 import BeautifulSoup
import urllib2
import urllib
import re
import sys

# Switch the interpreter-wide default string encoding to UTF-8.
# sys.setdefaultencoding() is not available on the already-imported module,
# so `sys` is reloaded first to expose it again; the order of the two
# statements below therefore matters.
# NOTE(review): relies on the `reload` builtin, i.e. a Python 2 kernel.
reload(sys)  # Reload makes sys.setdefaultencoding() callable again.
sys.setdefaultencoding('utf-8')

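## Required Info
For every recipe listing page, the scraper collects:

0. webpage URL (the listing page handed to `Scrape`)
1. image URL of each recipe [func: scrape_image_url()]
2. entry title of each recipe [func: scrape_entry_title()]
3. detail URL of each recipe [func: scrape_detail_url()]
4. ingredients (read from the page behind each detail URL) [func: scrape_ingredient()]
5. URLs of the following listing pages [func: scrape_next_url()]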

In [29]:
class Scrape(object):
    """Web Scrape: ruled.me/keto-recipes

    Constructing an instance downloads and parses one recipe listing page.
    The ``scrape_*`` methods then extract one piece of information each,
    store it on the instance and also return it.

    Attributes:
        url: A string with the listing page URL of the keto recipes.
        mealtype: A string with type of meal for classification.
        data: BeautifulSoup tree of the downloaded listing page.
        post: All ``<div class="type-post">`` elements of the page
            (the code treats each one as a single recipe entry).
        image_url: List of image URLs (set by ``scrape_image_url``).
        title: List of recipe titles (set by ``scrape_entry_title``).
        detail_url: List of detail page URLs (set by ``scrape_detail_url``).
        ingredient: List of ingredient lists (set by ``scrape_ingredient``).
        next_url: List of follow-up listing URLs (set by ``scrape_next_url``).
    """

    def __init__(self, url, mealtype):
        """Download ``url`` and locate the recipe entries on that page.

        Raises whatever ``urllib2.urlopen`` raises when the page cannot
        be fetched (for example ``urllib2.HTTPError``).
        """
        self.data = self._fetch_soup(url)
        self.url = url
        self.mealtype = mealtype
        self.post = self.data.findAll("div", {"class": "type-post"})

    @staticmethod
    def _fetch_soup(url):
        """Download ``url`` and return its content parsed with lxml."""
        # A User-Agent header is sent explicitly with every request.
        req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        con = urllib2.urlopen(req)
        try:
            output = con.read()
        finally:
            # Always release the connection, even if reading fails.
            con.close()
        return BeautifulSoup(output, "lxml")

    # 1.image url [func: scrape_image_url()]
    def scrape_image_url(self):
        """Store and return the ``src`` of the first image of every entry."""
        self.image_url = [i.find('img')['src'] for i in self.post]
        return self.image_url

    def save_image(self, location='pic/'):
        """Download every image to ``<location>/<title>.jpg``.

        Requires ``scrape_image_url`` and ``scrape_entry_title`` to have
        been called first.

        Args:
            location: Target directory; ``None`` or ``''`` means the
                current working directory. It is created when missing.
        """
        import os

        if location is None:
            location = ''
        if location and not os.path.isdir(location):
            os.makedirs(location)

        for title, image_url in zip(self.title, self.image_url):
            # A path separator inside a title would point into a
            # sub-directory that does not exist, so neutralise it.
            for sep in (os.sep, os.altsep):
                if sep:
                    title = title.replace(sep, '-')
            urllib.urlretrieve(image_url, os.path.join(location, title + '.jpg'))

    # 2.entry title [func: scrape_entry_title()]
    def scrape_entry_title(self):
        """Store and return the ``<h2>`` text of every entry."""
        self.title = [i.find('h2').getText() for i in self.post]
        return self.title

    # 3.detail url [func: scrape_detail_url()]
    def scrape_detail_url(self):
        """Store and return the detail page link of every entry."""
        self.detail_url = [
            i.find("div", {"class": "post-data"}).find('a')['href']
            for i in self.post
        ]
        return self.detail_url

    # 4.ingredient (inside detail url) [func: scrape_ingredient()]
    def scrape_ingredient(self):
        """Store and return one ingredient list per detail page.

        Requires ``scrape_detail_url`` to have been called first. Every
        detail page is downloaded, so this issues one request per entry.
        """
        self.ingredient = [self._scrape_ingredient_list(i) for i in self.detail_url]
        return self.ingredient

    def _scrape_ingredient_list(self, detail_url):
        """Return the text of every ``<li>`` in the first list of the entry content."""
        data = self._fetch_soup(detail_url)
        try:
            li_data = data.find("div", {"class": "entry-content"}).find('ul').findAll('li')
        except AttributeError:
            # find() returned None: the page has no entry-content div or
            # no <ul> inside it, so there is nothing to report.
            return []
        return [i.getText() for i in li_data]

    # 5.next webpage url [func: scrape_next_url()]
    def scrape_next_url(self, n=20):
        """Store and return the listing URLs for pages 2 .. n-1.

        Args:
            n: Exclusive upper bound of the page numbers to generate.
        """
        # Normalise the base so a URL without trailing slash still yields
        # ".../page/2" instead of "...page/2".
        base = self.url.rstrip('/') + '/page/'
        self.next_url = [base + str(i) for i in range(2, n)]
        return self.next_url

    def scrape_all(self):
        """Run every scrape step for this page and download the images."""
        self.scrape_image_url()
        self.scrape_entry_title()
        self.scrape_detail_url()
        self.scrape_next_url()
        self.scrape_ingredient()
        self.save_image()

    def as_df(self):
        """Return the scraped data as a pandas DataFrame.

        Columns: MealType, Title, ImageUrl, Ingredient, DetailUrl. The
        ingredients of one recipe are joined with ``' // '``.
        """
        import pandas as pd

        df = pd.DataFrame()
        df['MealType'] = [self.mealtype] * len(self.title)
        df['Title'] = self.title
        df['ImageUrl'] = self.image_url
        df['Ingredient'] = [' // '.join(i) for i in self.ingredient]
        df['DetailUrl'] = self.detail_url

        return df

    def roll_through(self):
        """Scrape this page and all ``next_url`` pages, then write a CSV.

        Pages that fail (for example because they do not exist) are
        reported and skipped. The result is written to
        ``Keto-Recipe-<mealtype>.csv`` in the working directory.
        """
        import pandas as pd

        self.scrape_all()
        frames = [self.as_df()]

        for n_url in self.next_url:
            try:
                scrape_object = Scrape(n_url, self.mealtype)
                scrape_object.scrape_all()
                frames.append(scrape_object.as_df())
            except Exception as exc:
                # Best effort: report the page and continue with the next
                # one. KeyboardInterrupt is deliberately not caught.
                print("%s (skipped: %r)" % (n_url, exc))

        # Concatenate once instead of growing the frame inside the loop.
        output_df = pd.concat(frames)

        file_name = "Keto-Recipe-" + self.mealtype + ".csv"
        print(file_name)

        output_df.to_csv(file_name, sep=",")
            
            
        
    

In [30]:
# One [mealtype, listing-url] pair per recipe category to be scraped.
all_urls = list(map(list, (
    ('breakfast', 'http://www.ruled.me/keto-recipes/breakfast/'),
    ('lunch', 'http://www.ruled.me/keto-recipes/lunch/'),
    ('dinner', 'http://www.ruled.me/keto-recipes/dinner/'),
    ('dessert', 'http://www.ruled.me/keto-recipes/dessert/'),
    ('snacks', 'http://www.ruled.me/keto-recipes/snacks/'),
    ('side items', 'http://www.ruled.me/keto-recipes/side-items/'),
    ('condiments', 'http://www.ruled.me/keto-recipes/condiments/'),
)))

In [31]:
# Fetch and parse the first breakfast listing page (performs a network request).
breakfast = Scrape(
    url='http://www.ruled.me/keto-recipes/breakfast/',
    mealtype='breakfast',
)

In [32]:
# Walk through every category; each entry of all_urls is [mealtype, url].
for mealtype, category_url in all_urls:
    scrape_object = Scrape(category_url, mealtype)
    scrape_object.roll_through()

In [8]:
# Run every scrape step on the `breakfast` listing page created above:
# image URLs, titles, detail URLs, next-page URLs and ingredients are
# collected, then the images are downloaded (network access required).
breakfast.scrape_all()