Skip to content

Commit

Permalink
1228 fix instagram download (#1252)
Browse files Browse the repository at this point in the history
* Extract Instagram payload

Read the payload of stories and images

* Extract images and videos

* Update the file sizes in Instagram tests

* go mod tidy

---------

Co-authored-by: Xinzhao Xu <z2d@jifangcheng.com>
  • Loading branch information
shavit and iawia002 committed Jul 6, 2023
1 parent 3da4af3 commit 99f8093
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 94 deletions.
1 change: 1 addition & 0 deletions extractors/errors.go
Expand Up @@ -9,4 +9,5 @@ var (
ErrURLParseFailed = errors.New("url parse failed")
ErrInvalidRegularExpression = errors.New("invalid regular expression")
ErrURLQueryParamsParseFailed = errors.New("url query params parse failed")
ErrBodyParseFailed = errors.New("body parse failed")
)
214 changes: 124 additions & 90 deletions extractors/instagram/instagram.go
Expand Up @@ -3,13 +3,12 @@ package instagram
import (
"encoding/json"
netURL "net/url"
"path"
"strings"

"github.com/pkg/errors"
"golang.org/x/net/html"

"github.com/iawia002/lux/extractors"
"github.com/iawia002/lux/parser"
"github.com/iawia002/lux/request"
"github.com/iawia002/lux/utils"
)
Expand All @@ -18,18 +17,30 @@ func init() {
extractors.Register("instagram", New())
}

type instagram struct {
ShortcodeMedia struct {
EdgeSidecar struct {
Edges []struct {
Node struct {
DisplayURL string `json:"display_url"`
IsVideo bool `json:"is_video"`
VideoURL string `json:"video_url"`
} `json:"node"`
} `json:"edges"`
} `json:"edge_sidecar_to_children"`
} `json:"shortcode_media"`
// instagramPayload holds the fields this extractor decodes from the JSON
// document found in the page's <script type="application/ld+json"> element.
// Keys that are absent from the JSON leave the matching field at its zero
// value.
type instagramPayload struct {
	// ArticleBody is decoded from the "articleBody" key.
	ArticleBody string `json:"articleBody"`
	// Author is decoded from the "author" object.
	Author struct {
		Image           string `json:"image"`
		Name            string `json:"name"`
		AlternativeName string `json:"alternativeName"`
		Url             string `json:"url"`
	} `json:"author"`
	// Videos is decoded from the "video" array; ContentURL is the address
	// of the media file itself.
	Videos []struct {
		// UploadData is decoded from the "uploadDate" key. The field name is
		// kept unchanged so existing references keep compiling; only the
		// tag, which previously pointed at a non-existent "string" key, is
		// corrected.
		UploadData   string `json:"uploadDate"`
		Description  string `json:"description"`
		Name         string `json:"name"`
		Caption      string `json:"caption"`
		Height       string `json:"height"`
		Width        string `json:"width"`
		ContentURL   string `json:"contentUrl"`
		ThumbnailURL string `json:"thumbnailUrl"`
	} `json:"video"`
	// Images is decoded from the "image" array; URL is the address of the
	// image file itself.
	Images []struct {
		Caption string `json:"caption"`
		Height  string `json:"height"`
		Width   string `json:"width"`
		URL     string `json:"url"`
	} `json:"image"`
}

type extractor struct{}
Expand All @@ -39,104 +50,65 @@ func New() extractors.Extractor {
return &extractor{}
}

func extractImageFromPage(html, url string) (map[string]*extractors.Stream, error) {
_, realURLs, err := parser.GetImages(html, "EmbeddedMediaImage", nil)
// Extract is the main function to extract the data.
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
u, err := netURL.Parse(url)
if err != nil {
return nil, errors.WithStack(err)
}

urls := make([]*extractors.Part, 0, len(realURLs))
var totalSize int64
for _, realURL := range realURLs {
size, err := request.Size(realURL, url)
if err != nil {
return nil, errors.WithStack(err)
}
urlData := &extractors.Part{
URL: realURL,
Size: size,
Ext: "jpg",
}
urls = append(urls, urlData)
totalSize += size
htmlResp, err := request.Get(u.String(), url, nil)
if err != nil {
return nil, errors.WithStack(err)
}

return map[string]*extractors.Stream{
"default": {
Parts: urls,
Size: totalSize,
},
}, nil
}
reader := strings.NewReader(htmlResp)
htmlRoot, err := html.Parse(reader)
if err != nil {
return nil, errors.WithStack(err)
}

func extractFromData(dataString, url string) (map[string]*extractors.Stream, error) {
var data instagram
if err := json.Unmarshal([]byte(dataString), &data); err != nil {
sNode, err := dfsFindScript(htmlRoot)
if err != nil {
return nil, errors.WithStack(err)
}

var payload instagramPayload
if err = json.Unmarshal([]byte(sNode.Data), &payload); err != nil {
return nil, errors.WithStack(err)
}

urls := make([]*extractors.Part, 0, len(data.ShortcodeMedia.EdgeSidecar.Edges))
var totalSize int64
for _, u := range data.ShortcodeMedia.EdgeSidecar.Edges {
// Image
realURL := u.Node.DisplayURL
ext := "jpg"
if u.Node.IsVideo {
// Video
realURL = u.Node.VideoURL
ext = "mp4"
var parts []*extractors.Part
if len(payload.Videos) > 0 {
videoParts, err := createPartVideos(&payload, url)
if err != nil {
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}

size, err := request.Size(realURL, url)
parts = append(parts, videoParts...)
}
if len(payload.Images) > 0 {
imageParts, err := createPartImages(&payload, url)
if err != nil {
return nil, errors.WithStack(err)
}
urlData := &extractors.Part{
URL: realURL,
Size: size,
Ext: ext,
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}
urls = append(urls, urlData)
totalSize += size

parts = append(parts, imageParts...)
}

for _, part := range parts {
totalSize += part.Size
}

return map[string]*extractors.Stream{
streams := map[string]*extractors.Stream{
"default": {
Parts: urls,
Parts: parts,
Size: totalSize,
},
}, nil
}

// Extract is the main function to extract the data.
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
// Instagram is forcing a login to access the page, so we use the embed page to bypass that.
u, err := netURL.Parse(url)
if err != nil {
return nil, errors.WithStack(err)
}
id := u.Path[strings.LastIndex(u.Path, "/")+1:]
u.Path = path.Join(u.Path, "embed")

html, err := request.Get(u.String(), url, nil)
if err != nil {
return nil, errors.WithStack(err)
}
dataStrings := utils.MatchOneOf(html, `window\.__additionalDataLoaded\('graphql',(.*)\);`)
if dataStrings == nil || len(dataStrings) < 2 {
return nil, errors.WithStack(extractors.ErrURLParseFailed)
}
dataString := dataStrings[1]

var streams map[string]*extractors.Stream
if dataString == "" || dataString == "null" {
streams, err = extractImageFromPage(html, url)
} else {
streams, err = extractFromData(dataString, url)
}
if err != nil {
return nil, errors.WithStack(err)
}
id := u.Path[strings.LastIndex(u.Path, "/")+1:]

return []*extractors.Data{
{
Expand All @@ -148,3 +120,65 @@ func (e *extractor) Extract(url string, option extractors.Options) ([]*extractor
},
}, nil
}

// dfsFindScript searches the tree rooted at n depth-first, in document order,
// for a <script type="application/ld+json"> element and returns that
// element's first child node.
//
// A matching script element that has no children is skipped, so a nil error
// is always accompanied by a non-nil node that the caller can dereference.
// When nothing suitable is found (or n is nil) it returns
// extractors.ErrBodyParseFailed wrapped with a stack trace.
func dfsFindScript(n *html.Node) (*html.Node, error) {
	if found := findLDJSONContent(n); found != nil {
		return found, nil
	}

	return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}

// findLDJSONContent is the recursive worker behind dfsFindScript. It signals
// a miss with nil rather than an error so that the error (and its stack
// trace) is built once by the entry point instead of once per visited subtree.
func findLDJSONContent(n *html.Node) *html.Node {
	if n == nil {
		return nil
	}

	if n.Type == html.ElementNode && n.Data == "script" && n.FirstChild != nil {
		for _, attr := range n.Attr {
			if attr.Key == "type" && attr.Val == "application/ld+json" {
				return n.FirstChild
			}
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if found := findLDJSONContent(c); found != nil {
			return found
		}
	}

	return nil
}

// createPartVideos builds one extractors.Part per entry of payload.Videos.
//
// For every video the file extension is taken from utils.GetNameAndExt
// applied to its ContentURL, and the size from request.Size called with the
// ContentURL and ref (presumably the referer sent with the request; confirm
// against request.Size).
//
// It stops at the first failure and returns nil together with the wrapped
// error; on success the parts are in the same order as payload.Videos.
func createPartVideos(payload *instagramPayload, ref string) ([]*extractors.Part, error) {
	parts := make([]*extractors.Part, 0, len(payload.Videos))
	for i := range payload.Videos {
		contentURL := payload.Videos[i].ContentURL

		_, ext, err := utils.GetNameAndExt(contentURL)
		if err != nil {
			return nil, errors.WithStack(err)
		}
		size, err := request.Size(contentURL, ref)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		parts = append(parts, &extractors.Part{
			URL:  contentURL,
			Size: size,
			Ext:  ext,
		})
	}

	return parts, nil
}

// createPartImages builds one extractors.Part per entry of payload.Images.
//
// For every image the file extension is taken from utils.GetNameAndExt
// applied to its URL, and the size from request.Size called with the URL and
// ref (presumably the referer sent with the request; confirm against
// request.Size).
//
// It stops at the first failure and returns nil together with the wrapped
// error; on success the parts are in the same order as payload.Images.
func createPartImages(payload *instagramPayload, ref string) ([]*extractors.Part, error) {
	parts := make([]*extractors.Part, 0, len(payload.Images))
	for i := range payload.Images {
		imageURL := payload.Images[i].URL

		_, ext, err := utils.GetNameAndExt(imageURL)
		if err != nil {
			return nil, errors.WithStack(err)
		}
		size, err := request.Size(imageURL, ref)
		if err != nil {
			return nil, errors.WithStack(err)
		}

		parts = append(parts, &extractors.Part{
			URL:  imageURL,
			Size: size,
			Ext:  ext,
		})
	}

	return parts, nil
}
6 changes: 3 additions & 3 deletions extractors/instagram/instagram_test.go
Expand Up @@ -17,23 +17,23 @@ func TestDownload(t *testing.T) {
args: test.Args{
URL: "https://www.instagram.com/p/BlIka1ZFCNr",
Title: "Instagram BlIka1ZFCNr",
Size: 3003662,
Size: 577298,
},
},
{
name: "image test",
args: test.Args{
URL: "https://www.instagram.com/p/Bl5oVUyl9Yx",
Title: "Instagram Bl5oVUyl9Yx",
Size: 250596,
Size: 101611,
},
},
{
name: "image album test",
args: test.Args{
URL: "https://www.instagram.com/p/Bjyr-gxF4Rb",
Title: "Instagram Bjyr-gxF4Rb",
Size: 4599909,
Size: 241466,
},
},
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Expand Up @@ -16,6 +16,7 @@ require (
github.com/pkg/errors v0.9.1
github.com/robertkrimen/otto v0.0.0-20211024170158-b87d35c0b86f
github.com/urfave/cli/v2 v2.6.0
golang.org/x/net v0.7.0
)

require (
Expand All @@ -37,7 +38,6 @@ require (
github.com/rogpeppe/go-internal v1.9.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
golang.org/x/exp v0.0.0-20220518171630-0b5c67f07fdf // indirect
golang.org/x/net v0.7.0 // indirect
golang.org/x/sys v0.5.0 // indirect
golang.org/x/text v0.7.0 // indirect
gopkg.in/sourcemap.v1 v1.0.5 // indirect
Expand Down

0 comments on commit 99f8093

Please sign in to comment.