From 3f57debefa41d3e84eee1054c24da1c1a80f235a Mon Sep 17 00:00:00 2001
From: iawia002
Date: Tue, 13 Mar 2018 21:28:46 +0800
Subject: [PATCH] extractors/youtube: Add support

---
 README.md                       |   1 +
 extractors/youtube.go           |  94 ++++++++++++++++++
 extractors/youtube_signature.go | 153 +++++++++++++++++++++++++++++
 extractors/youtube_test.go      |  41 ++++++++
 main.go                         |   2 +
 utils/utils.go                  |  13 ++-
 6 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 extractors/youtube.go
 create mode 100644 extractors/youtube_signature.go
 create mode 100644 extractors/youtube_test.go

diff --git a/README.md b/README.md
index d43026a3b..b47fa65ac 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,7 @@ Site | URL | Videos | Images | Playlist
 半次元 | | | ✓ | |
 pixivision | | | ✓ | |
 优酷 | | ✓ | | |
+YouTube | | ✓ | | |
 
 
 ## Contributing
diff --git a/extractors/youtube.go b/extractors/youtube.go
new file mode 100644
index 000000000..39e76dc97
--- /dev/null
+++ b/extractors/youtube.go
@@ -0,0 +1,94 @@
+package extractors
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/url"
+	"strings"
+
+	"github.com/iawia002/annie/downloader"
+	"github.com/iawia002/annie/request"
+	"github.com/iawia002/annie/utils"
+)
+
+type args struct {
+	Title  string `json:"title"`
+	Stream string `json:"url_encoded_fmt_stream_map"`
+}
+
+type assets struct {
+	JS string `json:"js"`
+}
+
+type youtubeData struct {
+	Args   args   `json:"args"`
+	Assets assets `json:"assets"`
+}
+
+// getSig downloads the player script at the site-relative path js and uses
+// its transformation steps to decipher the scrambled signature sig.
+func getSig(sig, js string) string {
+	html := request.Get(fmt.Sprintf("https://www.youtube.com%s", js))
+	return decipherTokens(getSigTokens(html), sig)
+}
+
+// Youtube download function
+func Youtube(uri string) downloader.VideoData {
+	patterns := []string{
+		// Video IDs may contain "-", which \w does not match
+		`watch\?v=([^/&#]+)`,
+		`youtu\.be/([^?/]+)`,
+		`embed/([^/?]+)`,
+		`v/([^/?]+)`,
+	}
+	vid := utils.MatchOneOf(patterns, uri)
+	if len(vid) < 2 {
+		log.Fatal("Can't find vid")
+	}
+	videoURL := fmt.Sprintf(
+		"https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999",
+		vid[1],
+	)
+	html := request.Get(videoURL)
+	ytplayer := utils.Match1(`;ytplayer\.config\s*=\s*({.+?});`, html)
+	if len(ytplayer) < 2 {
+		log.Fatal("Can't find ytplayer.config")
+	}
+	var youtube youtubeData
+	if err := json.Unmarshal([]byte(ytplayer[1]), &youtube); err != nil {
+		log.Fatal(err)
+	}
+	title := youtube.Args.Title
+	streams := strings.Split(youtube.Args.Stream, ",")
+	stream, err := url.ParseQuery(streams[0]) // Best quality
+	if err != nil {
+		log.Fatal(err)
+	}
+	quality := stream.Get("quality")
+	ext := utils.Match1(`video/(\w+);`, stream.Get("type"))
+	if len(ext) < 2 {
+		log.Fatal("Can't find video type")
+	}
+	sig := stream.Get("sig")
+	if sig == "" {
+		sig = getSig(stream.Get("s"), youtube.Assets.JS)
+	}
+	realURL := fmt.Sprintf("%s&signature=%s", stream.Get("url"), sig)
+	size := request.Size(realURL, uri)
+	urlData := downloader.URLData{
+		URL:  realURL,
+		Size: size,
+		Ext:  ext[1],
+	}
+	data := downloader.VideoData{
+		Site:    "YouTube youtube.com",
+		Title:   title,
+		Type:    "video",
+		URLs:    []downloader.URLData{urlData},
+		Size:    size,
+		Quality: quality,
+	}
+	data.Download(uri)
+	return data
+}
diff --git a/extractors/youtube_signature.go b/extractors/youtube_signature.go
new file mode 100644
index 000000000..ea308688b
--- /dev/null
+++ b/extractors/youtube_signature.go
@@ -0,0 +1,153 @@
+package extractors
+
+import (
+	"fmt"
+	"log"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// The algorithm comes from https://github.com/rylio/ytdl, it's also MIT License
+// Many thanks
+const (
+	jsvarStr   = `[a-zA-Z_\$][a-zA-Z_0-9]*`
+	reverseStr = `:function\(a\)\{` +
+		`(?:return )?a\.reverse\(\)` +
+		`\}`
+	sliceStr = `:function\(a,b\)\{` +
+		`return a\.slice\(b\)` +
+		`\}`
+	spliceStr = `:function\(a,b\)\{` +
+		`a\.splice\(0,b\)` +
+		`\}`
+	swapStr = `:function\(a,b\)\{` +
+		`var c=a\[0\];a\[0\]=a\[b%a\.length\];a\[b(?:%a\.length)?\]=c(?:;return a)?` +
+		`\}`
+)
+
+var actionsObjRegexp = regexp.MustCompile(fmt.Sprintf(
+	`var (%s)=\{((?:(?:%s%s|%s%s|%s%s|%s%s),?\n?)+)\};`,
+	jsvarStr, jsvarStr, reverseStr, jsvarStr, sliceStr, jsvarStr, spliceStr, jsvarStr, swapStr,
+))
+
+var actionsFuncRegexp = regexp.MustCompile(fmt.Sprintf(
+	`function(?: %s)?\(a\)\{`+
+		`a=a\.split\(""\);\s*`+
+		`((?:(?:a=)?%s\.%s\(a,\d+\);)+)`+
+		`return a\.join\(""\)`+
+		`\}`,
+	jsvarStr, jsvarStr, jsvarStr,
+))
+
+var reverseRegexp = regexp.MustCompile(fmt.Sprintf(
+	`(?m)(?:^|,)(%s)%s`, jsvarStr, reverseStr,
+))
+var sliceRegexp = regexp.MustCompile(fmt.Sprintf(
+	`(?m)(?:^|,)(%s)%s`, jsvarStr, sliceStr,
+))
+var spliceRegexp = regexp.MustCompile(fmt.Sprintf(
+	`(?m)(?:^|,)(%s)%s`, jsvarStr, spliceStr,
+))
+var swapRegexp = regexp.MustCompile(fmt.Sprintf(
+	`(?m)(?:^|,)(%s)%s`, jsvarStr, swapStr,
+))
+
+// getSigTokens parses the player script and returns its signature
+// transformations, in call order, as tokens: "r" reverses, "w<n>" swaps the
+// first element with the one at n, "s<n>" and "p<n>" drop the first n
+// elements.
+func getSigTokens(html string) []string {
+	objResult := actionsObjRegexp.FindStringSubmatch(html)
+	funcResult := actionsFuncRegexp.FindStringSubmatch(html)
+
+	if len(objResult) < 3 || len(funcResult) < 2 {
+		log.Fatal("Error parsing signature tokens")
+	}
+	obj, objBody, funcBody := objResult[1], objResult[2], funcResult[1]
+
+	// Helper names are kept verbatim and only escaped where they are embedded
+	// in a pattern, so names containing "$" still compare equal further down.
+	actions := []struct {
+		re     *regexp.Regexp
+		prefix string
+	}{
+		{swapRegexp, "w"},
+		{reverseRegexp, "r"},
+		{sliceRegexp, "s"},
+		{spliceRegexp, "p"},
+	}
+	prefixes := make(map[string]string, len(actions))
+	keys := make([]string, 0, len(actions))
+	for _, action := range actions {
+		result := action.re.FindStringSubmatch(objBody)
+		if len(result) < 2 {
+			continue // The script does not define this transformation
+		}
+		if _, ok := prefixes[result[1]]; !ok {
+			prefixes[result[1]] = action.prefix
+			keys = append(keys, regexp.QuoteMeta(result[1]))
+		}
+	}
+	if len(keys) == 0 {
+		return nil
+	}
+
+	regex, err := regexp.Compile(fmt.Sprintf(
+		`(?:a=)?%s\.(%s)\(a,(\d+)\)`, regexp.QuoteMeta(obj), strings.Join(keys, "|"),
+	))
+	if err != nil {
+		log.Fatal(err)
+	}
+	var tokens []string
+	for _, s := range regex.FindAllStringSubmatch(funcBody, -1) {
+		token := prefixes[s[1]]
+		if token != "r" {
+			token += s[2] // Every transformation but reverse takes an argument
+		}
+		tokens = append(tokens, token)
+	}
+	return tokens
+}
+
+// reverseStringSlice reverses s in place.
+func reverseStringSlice(s []string) {
+	for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
+		s[i], s[j] = s[j], s[i]
+	}
+}
+
+// decipherTokens applies the transformations described by tokens (as
+// produced by getSigTokens) to sig and returns the deciphered signature.
+func decipherTokens(tokens []string, sig string) string {
+	sigSplit := strings.Split(sig, "")
+	for _, tok := range tokens {
+		if tok == "" {
+			continue
+		}
+		// The Atoi error is ignored on purpose: "r" has no argument (pos stays
+		// 0 and is unused), and on overflow Atoi returns the largest int,
+		// which the bounds handling below takes care of.
+		pos, _ := strconv.Atoi(tok[1:])
+		if pos < 0 {
+			pos = 0
+		}
+		switch tok[0] {
+		case 'r':
+			reverseStringSlice(sigSplit)
+		case 'w':
+			// The player script swaps with index b%a.length
+			if len(sigSplit) > 0 {
+				i := pos % len(sigSplit)
+				sigSplit[0], sigSplit[i] = sigSplit[i], sigSplit[0]
+			}
+		case 's', 'p':
+			// Dropping more elements than there are leaves nothing
+			if pos > len(sigSplit) {
+				pos = len(sigSplit)
+			}
+			sigSplit = sigSplit[pos:]
+		}
+	}
+	return strings.Join(sigSplit, "")
+}
diff --git a/extractors/youtube_test.go b/extractors/youtube_test.go
new file mode 100644
index 000000000..aef46981a
--- /dev/null
+++ b/extractors/youtube_test.go
@@ -0,0 +1,41 @@
+package extractors
+
+import (
+	"testing"
+
+	"github.com/iawia002/annie/config"
+	"github.com/iawia002/annie/test"
+)
+
+func TestYoutube(t *testing.T) {
+	config.InfoOnly = true
+	tests := []struct {
+		name string
+		args test.Args
+	}{
+		{
+			name: "normal test",
+			args: test.Args{
+				URL:     "https://www.youtube.com/watch?v=Gnbch2osEeo",
+				Title:   "Multifandom Mashup 2017",
+				Size:    60785404,
+				Quality: "hd720",
+			},
+		},
+		{
+			name: "short URL test",
+			args: test.Args{
+				URL:     "https://youtu.be/z8eFzkfto2w",
+				Title:   "Circle Of Love | Rudy Mancuso",
+				Size:    27183162,
+				Quality: "hd720",
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			data := Youtube(tt.args.URL)
+			test.Check(t, tt.args, data)
+		})
+	}
+}
diff --git a/main.go b/main.go
index d7ed8afa1..933ad3e2c 100644
--- a/main.go
+++ b/main.go
@@ -51,6 +51,8 @@ func main() {
 		extractors.Pixivision(videoURL)
 	case "youku":
 		extractors.Youku(videoURL)
+	case "youtube", "youtu": // youtu.be
+		extractors.Youtube(videoURL)
 	default:
 		extractors.Universal(videoURL)
 	}
diff --git a/utils/utils.go b/utils/utils.go
index d3afddfbb..0719cd760 100644
--- a/utils/utils.go
+++ b/utils/utils.go
@@ -17,6 +17,17 @@ func Match1(pattern, text string) []string {
 	return value
 }
 
+// MatchOneOf returns the result of the first pattern that matches text, or nil
+func MatchOneOf(patterns []string, text string) []string {
+	for _, pattern := range patterns {
+		value := Match1(pattern, text)
+		if len(value) > 0 {
+			return value
+		}
+	}
+	return nil
+}
+
 // MatchAll return all matching results
 func MatchAll(pattern, text string) [][]string {
 	re := regexp.MustCompile(pattern)
@@ -37,7 +48,7 @@ func FileSize(filePath string) int64 {
 func Domain(url string) string {
 	domainPattern := `([a-z0-9][-a-z0-9]{0,62})\.` +
 		`(com\.cn|com\.hk|` +
-		`cn|com|net|edu|gov|biz|org|info|pro|name|xxx|xyz|` +
+		`cn|com|net|edu|gov|biz|org|info|pro|name|xxx|xyz|be|` +
 		`me|top|cc|tv|tt)`
 	domain := Match1(domainPattern, url)[1]
 	return domain