## Notebook for testing out the NYT Scraper

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np

In [2]:
url = 'http://cooking.nytimes.com/recipes/1017728-spicy-fried-shrimp-with-green-chutney'

if 'cooking.nytimes.com' in url:
    html = requests.get(url)
else:
    print("URL is no good")

In [3]:
soup = BeautifulSoup(html.content, "html.parser")

In [5]:
try:
    recipe_name = soup.find('h1', {'class': 'recipe-title title name'}).text.strip()
except AttributeError:
    recipe_name = []
    
print(recipe_name)

Spicy Fried Shrimp With Green Chutney


In [6]:
# <span class="byline-name" itemprop="author">Martha Rose Shulman</span>

try:
    recipe_author = soup.find('span', {'class': 'byline-name', 'itemprop': 'author'}).text.strip()
except AttributeError:
    recipe_author = []
    
print(recipe_author)

David Tanis


In [7]:
try:
    img_url = soup.find('div', {'class': 'recipe-intro'}).find('img')['src']
except AttributeError:
    img_url = None

print(img_url)

http://graphics8.nytimes.com/images/2015/10/07/dining/07KITCHEN1/07KITCHEN1-articleLarge.jpg


In [8]:
try:
    time_yield = [ty.text.strip() for ty in soup.find('ul', {'class': 'recipe-time-yield'}).findAll('li')]
except AttributeError:
    time_yield = []

print(time_yield)

['Time40 minutes', 'Yield4 to 6 servings']


In [9]:
try:
    description = soup.find('div', {'itemprop': 'description'}).text.strip()
except AttributeError:
    description = ''
    
print(description)

This highly seasoned Indian approach to fried shrimp elevates the concept. Perfect for snacking with drinks, it can be a meal with rice, dal and vegetables.

Featured in: 
Fried Shrimp That Welcome The Spice.


In [10]:
try:
    categories = [a.text for a in soup.find('p', {'class': 'special-diets tag-block'}).findAll('a')]
except AttributeError:
    categories = []
    
print(categories)

['Chickpea Flour', 'Shrimp', 'Indian']


In [13]:
rec_in_wrap = soup.find('section', {'class': 'recipe-ingredients-wrap'})
headers = [t.text for t in rec_in_wrap.findAll('h4', {'class': 'part-name'})]
print(headers)

['For the shrimp:', 'For the chutney:']


In [16]:
[t.text for l.findAll('li', {'itemprop': 'recipeIngredient'}) 
 for l in rec_in_wrap.findAll('ul', {'class': 'recipe-ingredients'})
for t in ]

# l = [t.findAll('ul', {'class': 'recipe-ingredients'}) for t in rec_in_wrap]
# i = [t.findAll('li', {'itemprop': 'recipeIngredient'}) for t in l[0]]
# for recipe_group in i:
#     print([r.text for r in recipe_group])

[[<li itemprop="recipeIngredient">
  <span class="quantity">1</span>
  <span class="ingredient-name">pound medium <span itemprop="name">shrimp</span>, peeled and deveined</span>
  </li>, <li itemprop="recipeIngredient">
  <span class="quantity">1</span>
  <span class="ingredient-name">teaspoon <span itemprop="name">salt</span></span>
  </li>, <li itemprop="recipeIngredient">
  <span class="quantity">½</span>
  <span class="ingredient-name">teaspoon <span itemprop="name">turmeric</span></span>
  </li>, <li itemprop="recipeIngredient">
  <span class="quantity">¼</span>
  <span class="ingredient-name">teaspoon <span itemprop="name">cayenne</span></span>
  </li>, <li itemprop="recipeIngredient">
  <span class="quantity">½</span>
  <span class="ingredient-name">teaspoon <span itemprop="name">black pepper</span></span>
  </li>, <li itemprop="recipeIngredient">
  <span class="quantity">½</span>
  <span class="ingredient-name">teaspoon grated <span itemprop="name">garlic</span></span>
  </li>,

In [30]:
try:
    ingredients_full = [n.text.strip().replace('\n', ' ') 
                        for n in soup.find('ul', {'class': 'recipe-ingredients'})
                                                .findAll('li')]
    ingredients_name = [n.text for n in soup.find('ul', {'class': 'recipe-ingredients'})
                                    .findAll('span', {'itemprop': 'name'})]
except AttributeError:
    ingredients_full = []
    ingredients_name = []
    
#print(ingredients_name)
print(ingredients_full)

[u'1 pound medium shrimp, peeled and deveined', u'1 teaspoon salt', u'\xbd teaspoon turmeric', u'\xbc teaspoon cayenne', u'\xbd teaspoon black pepper', u'\xbd teaspoon grated garlic', u'\xbd teaspoon grated ginger', u'\xbd teaspoon garam masala', u'1 teaspoon green or red chile, minced', u'2 tablespoons chopped cilantro', u'2 tablespoons chopped mint', u'2 tablespoons lemon juice', u'3 tablespoons chickpea flour (besan flour)', u'3 tablespoons rice flour', u'Vegetable oil for frying']


In [13]:
try:
    directions = [l.text for l in soup.find('ol', {'class': 'recipe-steps'}).findAll('li')]
except AttributeError:
    directions = []
    
print(directions)

[u'Preheat oven to 450.', u'Scrub potatoes under running water; dry them, and rub the skin of each with the oil and a little salt. Pierce the skin of each in three or four places with the tines of a fork.', u'Place the potatoes in the oven, and roast for 45 minutes to an hour, depending on the size of the potatoes, until they offer no resistance when a knife is inserted in their centers.', u'Remove the potatoes from the oven, slice them open down the middle, apply a tablespoon of butter to each one and serve immediately.']


In [14]:
try:
    notes = [l.text.strip() for l in soup.find('ul', {'class': 'recipe-notes'}).findAll('li')]
except AttributeError:
    notes = []
    
print(notes)

[]


In [15]:
try:
    servings = soup.find('div', {'class': 'nutrition-tooltip'}).find('h5').text.strip()
    nutrition = soup.find('span', {'class': 'description', 'itemprop': 'nutrition'}).text.strip()
except AttributeError:
    servings, nutrition = [], []
    
print(servings)
print(nutrition)

Nutritional analysis per serving (4 servings)
330 calories; 12 grams fat; 7 grams saturated fat; 0 grams trans fat; 3 grams monounsaturated fat; 0 grams polyunsaturated fat; 50 grams carbohydrates; 3 grams dietary fiber; 1 gram sugars; 6 grams protein; 30 milligrams cholesterol; 566 milligrams sodium


In [16]:
from scrapers import NYT
#html = requests.get('http://cooking.nytimes.com/recipes/1014382-hazelnut-cheesecake-with-salted-caramel-glaze')
nyt = NYT(html.content)

In [17]:
print(nyt)

UnicodeEncodeError: 'ascii' codec can't encode character u'\xf1' in position 188: ordinal not in range(128)