Skip to content

Commit

Permalink
Merge pull request #6 from ecrax/@type_array
Browse files Browse the repository at this point in the history
added support for "@type" arrays
  • Loading branch information
kkyr committed Mar 16, 2023
2 parents 60f6861 + ff31a9f commit f89e0c2
Show file tree
Hide file tree
Showing 7 changed files with 6,060 additions and 5 deletions.
88 changes: 88 additions & 0 deletions internal/html/scrape/custom/allrecipes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package custom

import (
"fmt"
"time"

"github.com/kkyr/go-recipe"
"github.com/kkyr/go-recipe/internal/html/scrape/schema"

"github.com/PuerkitoBio/goquery"
)

// AllRecipesHost is the host name of the website that AllRecipesScraper is
// written for.
const AllRecipesHost = "allrecipes.com"

// NewAllRecipesScraper builds an AllRecipesScraper for the given document.
//
// The returned scraper wraps a schema.RecipeScraper created from doc. If that
// schema scraper cannot be created, a nil scraper is returned together with an
// error wrapping the underlying cause.
func NewAllRecipesScraper(doc *goquery.Document) (recipe.Scraper, error) {
	schemaScraper, err := schema.NewRecipeScraper(doc)
	if err != nil {
		return nil, fmt.Errorf("unable to create schema scraper: %w", err)
	}

	scraper := &AllRecipesScraper{schema: schemaScraper}

	return scraper, nil
}

// AllRecipesScraper is a custom recipe scraper for allrecipes.com.
//
// Every accessor forwards to the wrapped schema scraper and returns its
// results unchanged.
type AllRecipesScraper struct {
	// schema is the scraper that all accessors delegate to.
	schema *schema.RecipeScraper
}

// Author returns the author reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Author() (string, bool) {
	return s.schema.Author()
}

// Categories returns the categories reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Categories() ([]string, bool) {
	return s.schema.Categories()
}

// CookTime returns the cook time reported by the wrapped schema scraper.
func (s *AllRecipesScraper) CookTime() (time.Duration, bool) {
	return s.schema.CookTime()
}

// Cuisine returns the cuisines reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Cuisine() ([]string, bool) {
	return s.schema.Cuisine()
}

// Description returns the description reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Description() (string, bool) {
	return s.schema.Description()
}

// ImageURL returns the image URL reported by the wrapped schema scraper.
func (s *AllRecipesScraper) ImageURL() (string, bool) {
	return s.schema.ImageURL()
}

// Ingredients returns the ingredients reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Ingredients() ([]string, bool) {
	return s.schema.Ingredients()
}

// Instructions returns the instructions reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Instructions() ([]string, bool) {
	return s.schema.Instructions()
}

// Language returns the language reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Language() (string, bool) {
	return s.schema.Language()
}

// Name returns the recipe name reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Name() (string, bool) {
	return s.schema.Name()
}

// Nutrition returns the nutrition data reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Nutrition() (recipe.Nutrition, bool) {
	return s.schema.Nutrition()
}

// PrepTime returns the preparation time reported by the wrapped schema scraper.
func (s *AllRecipesScraper) PrepTime() (time.Duration, bool) {
	return s.schema.PrepTime()
}

// SuitableDiets returns the diets reported by the wrapped schema scraper.
func (s *AllRecipesScraper) SuitableDiets() ([]recipe.Diet, bool) {
	return s.schema.SuitableDiets()
}

// TotalTime returns the total time reported by the wrapped schema scraper.
func (s *AllRecipesScraper) TotalTime() (time.Duration, bool) {
	return s.schema.TotalTime()
}

// Yields returns the yield reported by the wrapped schema scraper.
func (s *AllRecipesScraper) Yields() (string, bool) {
	return s.schema.Yields()
}
39 changes: 39 additions & 0 deletions internal/html/scrape/custom/allrecipes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package custom_test

import (
"testing"
"time"

"github.com/kkyr/go-recipe"
"github.com/kkyr/go-recipe/internal/html/scrape/custom"
"github.com/kkyr/go-recipe/internal/html/scrape/test"

"github.com/kkyr/assert"
)

// TestNewAllRecipesScraper builds an AllRecipesScraper from the stored
// allrecipes.com test page and checks every scraped field against the values
// expected for that page.
func TestNewAllRecipesScraper(t *testing.T) {
	page := test.ReadHTMLFileOrFail(t, custom.AllRecipesHost)

	got, err := custom.NewAllRecipesScraper(page)
	require := assert.New(t).Require()
	require.Nil(err)

	ingredients := []string{
		"2 (14.5 ounce) cans stewed tomatoes",
		"1 (6 ounce) can tomato paste",
		"4 tablespoons chopped fresh parsley",
		"1 clove garlic, minced",
		"1 teaspoon dried oregano",
		"1 teaspoon salt",
		"0.25 teaspoon ground black pepper",
		"6 tablespoons olive oil",
		"0.33333334326744 cup finely diced onion",
		"0.5 cup white wine",
	}

	steps := []string{
		"Place tomatoes, tomato paste, parsley, garlic, oregano, salt, and pepper in a food processor; blend until smooth.",
		"Heat oil in a large skillet over medium heat. Add onion and cook until slightly softened, about 2 minutes. Stir in blended tomato sauce and white wine.",
		"Simmer, stirring occasionally, until thickened, about 30 minutes.",
	}

	nutrition := recipe.Nutrition{
		Calories:              151,
		CarbohydrateGrams:     12,
		CholesterolMilligrams: 0,
		FatGrams:              11,
		FiberGrams:            2,
		ProteinGrams:          2,
		SaturatedFatGrams:     2,
		ServingSize:           "",
		SodiumMilligrams:      685,
		SugarGrams:            7,
		TransFatGrams:         0,
		UnsaturatedFatGrams:   0,
	}

	want := test.Scraper{
		Author:        "Jackie M",
		Categories:    nil,
		CookTime:      30 * time.Minute,
		Cuisine:       nil,
		Description:   "This marinara sauce recipe is made by blending tomatoes, parsley, garlic, and oregano before simmering with onion and white wine for amazing flavor.",
		ImageURL:      "https://www.allrecipes.com/thmb/rAJNjIWA7FHaI4bveYdZFkCJ7oM=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/11966-best-marinara-sauce-ddmfs-137-1x1-1-a9d124d5e86d463d87fa99de61bb9f02.jpg",
		Ingredients:   ingredients,
		Instructions:  steps,
		Language:      "",
		Name:          "Best Marinara Sauce Yet",
		Nutrition:     nutrition,
		PrepTime:      15 * time.Minute,
		SuitableDiets: nil,
		TotalTime:     45 * time.Minute,
		Yields:        "8",
	}

	want.Run(t, got)
}
5,686 changes: 5,686 additions & 0 deletions internal/html/scrape/custom/testdata/allrecipes.com.html

Large diffs are not rendered by default.

41 changes: 37 additions & 4 deletions internal/html/scrape/schema/json-ld/recipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ld

import (
"encoding/json"
"errors"
"fmt"

"github.com/PuerkitoBio/goquery"
Expand Down Expand Up @@ -64,8 +65,8 @@ func (rp *RecipeProcessor) GetRecipeNode(doc *goquery.Document) (map[string]any,
}

func (rp *RecipeProcessor) parseJSON(data string) (map[string]any, error) {
var nodeMap map[string]any
if err := json.Unmarshal([]byte(data), &nodeMap); err != nil {
nodeMap, err := unmarshalJSONObjectOrArray(data)
if err != nil {
return nil, fmt.Errorf("unmarshal data failed: %w", err)
}

Expand All @@ -83,14 +84,40 @@ func (rp *RecipeProcessor) parseJSON(data string) (map[string]any, error) {

addSchemaCtx(recipeNode)

recipeNode, err := rp.proc.Compact(recipeNode, rp.ctx, rp.opts)
recipeNode, err = rp.proc.Compact(recipeNode, rp.ctx, rp.opts)
if err != nil {
return nil, fmt.Errorf("could not compact Recipe node: %w", err)
}

return recipeNode, nil
}

// unmarshalJSONObjectOrArray decodes data, which is expected to hold either a
// JSON object or a JSON array, and returns a single JSON object from it.
//
// A top-level object is returned as is. For a top-level array, the first
// element that is itself an object is returned and all other elements are
// ignored. A top-level JSON null yields a nil map and a nil error, which is
// the result json.Unmarshal produces when decoding null into a map.
//
// An error is returned when data is not valid JSON (the error wraps the
// *json.SyntaxError and reports its byte offset), when the top-level value is
// a scalar, or when a top-level array contains no object.
func unmarshalJSONObjectOrArray(data string) (map[string]any, error) {
	// Decode once into an untyped value and dispatch on its shape. Trying a
	// map first and a slice second would parse the input twice and would
	// report a malformed object as a failure to unmarshal an array.
	var value any
	if err := json.Unmarshal([]byte(data), &value); err != nil {
		var syntaxErr *json.SyntaxError
		if errors.As(err, &syntaxErr) {
			return nil, fmt.Errorf("invalid JSON at byte offset %d: %w", syntaxErr.Offset, syntaxErr)
		}

		return nil, fmt.Errorf("unmarshal failed: %w", err)
	}

	switch v := value.(type) {
	case nil:
		// JSON null: keep the (nil, nil) result callers have always received.
		return nil, nil
	case map[string]any:
		return v, nil
	case []any:
		for _, node := range v {
			if m, ok := node.(map[string]any); ok {
				return m, nil
			}
		}

		return nil, errors.New("no JSON object found in array")
	default:
		return nil, fmt.Errorf("expected JSON object or array, got %T", v)
	}
}

func isGraphNode(v any) bool {
vMap, isMap := v.(map[string]any)
_, containsGraph := vMap[graphKey]
Expand All @@ -110,8 +137,14 @@ func addSchemaCtx(v any) {
func findRecipeNode(nodes []any) (map[string]any, bool) {
for _, node := range nodes {
if m, ok := node.(map[string]any); ok {
if m[typeKey] == recipeType {
if t, ok := m[typeKey].(string); ok && t == recipeType {
return m, true
} else if t, ok := m[typeKey].([]interface{}); ok {
for _, v := range t {
if v == recipeType {
return m, true
}
}
}
}
}
Expand Down
35 changes: 34 additions & 1 deletion internal/html/scrape/schema/json-ld/recipe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ func TestRecipeProcessor_GetRecipeNode(t *testing.T) {
{name: "parses graph", file: "json-ld-schema-graph.html"},
{name: "parses graph with no schema", file: "json-ld-schema-graph-no-schema.html"},
{name: "parses node", file: "json-ld-schema-node.html"},
{name: "parses graph with array", file: "json-ld-schema-as-array-type-array.html"},
} {
t.Run(tc.name, func(t *testing.T) {
require := assert.New(t).Require()
Expand All @@ -36,7 +37,16 @@ func TestRecipeProcessor_GetRecipeNode(t *testing.T) {
data, err := rp.GetRecipeNode(doc)
require.Nil(err)

require.Field("type").Equal("Recipe", data["type"])
recipeType := "Recipe"

if typeData, ok := data["type"].(string); ok {
require.Field("type").Equal(typeData, recipeType)
} else if typeData, ok := data["type"].([]interface{}); ok {
require.Field("type[0]").Equal(typeData[0], recipeType)
} else {
t.Fatal("type attribute not in expected shape")
}

require.Field("name").NotZero(data["name"])
})
}
Expand All @@ -53,4 +63,27 @@ func TestRecipeProcessor_GetRecipeNode(t *testing.T) {
_, err = rp.GetRecipeNode(doc)
require.NotNil(err)
})

t.Run("returns err when syntax error in json", func(t *testing.T) {
require := assert.New(t).Require()

rp := ld.NewRecipeProcessor()

const html = `<html>
<head>
<script type="application/ld+json">
{
"@type": "Recipe",
"
}
</script>
</head>
</html>
`
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
require.Nil(err)

_, err = rp.GetRecipeNode(doc)
require.NotNil(err)
})
}

0 comments on commit f89e0c2

Please sign in to comment.