Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Supporting drama type and sorting URLs to make resuming deterministic #3

Merged
merged 2 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions proto/category.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion proto/category.proto
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ option go_package = "github.com/its-my-data/doubak/proto";
// - 游戏 game
// - 移动应用 app
// - 评论 review
// - 舞台剧 drama
// - 小组 group (not supported)
// - 日记 note (not supported)
// - 图片 album (not supported)
// - 小站 site (not supported)
// - 同城活动 activity (not supported)
// - 舞台剧 drama (not supported)
// - 豆品 thing (not supported)
enum Category {
broadcast = 0;
Expand All @@ -28,4 +28,5 @@ enum Category {
music = 4;
app = 5;
review = 6;
drama = 7;
}
63 changes: 60 additions & 3 deletions task/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"log"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
Expand All @@ -27,6 +28,7 @@ const PeopleURL = DoubanURL + "people/"
const MoviePeopleURL = MovieURL + "people/"
const BookPeopleURL = BookURL + "people/"
const MusicPeopleURL = MusicURL + "people/"
const LocationPeopleURL = DoubanURL + "location/people/"

const startingPage = 1
const startingItemId = 0
Expand Down Expand Up @@ -101,6 +103,9 @@ func (task *Collector) Execute() error {
case proto.Category_music.String():
task.crawlMusicListDispatcher()
task.crawlItemDetails(proto.Category_music, "div.item > div.info > ul > li.title > a:nth-child(1)")
case proto.Category_drama.String():
task.crawlDramaListDispatcher()
task.crawlItemDetails(proto.Category_drama, "div.item > div.info > ul > li.title > a:nth-child(1)")
default:
return errors.New("Category not implemented " + c)
}
Expand Down Expand Up @@ -409,6 +414,50 @@ func (task *Collector) crawlMusicLists(totalItems int, tag string, urlAction str
return task.crawlItemLists(proto.Category_music, totalItems, pageStep, tag, urlTemplate)
}

// crawlDramaListDispatcher discovers how many dramas the user has marked and
// then crawls both lists (watched first, then to-watch).
//
// The drama section has no overview entry page
// (https://www.douban.com/location/people/<user_name>/drama/ itself is the
// landing page); its header contains two counters we scrape:
//   - 想看 (to-watch) dramas.
//   - 看过 (watched) dramas.
//
// Drama lists are paginated by item offset starting at 0, 15 items per page. Example:
// https://www.douban.com/location/people/mewcatcher/drama/collect?sort=time&start=0&filter=all&mode=grid&tags_sort=count
func (task *Collector) crawlDramaListDispatcher() error {
	nToWatch := 0
	nWatched := 0
	// Compile once, not per matched element.
	countRe := regexp.MustCompile("[0-9]+")
	c := util.NewColly()
	c.OnHTML("div.article > div.mod > h2", func(e *colly.HTMLElement) {
		secText := e.Text
		// Best effort: a header without digits parses to 0, which is harmless
		// because crawling an empty list is a no-op.
		nParsed, _ := strconv.Atoi(countRe.FindString(secText))

		switch {
		case strings.Contains(secText, "想看"):
			nToWatch = nParsed
			log.Println("Found to-watch dramas:", nToWatch)
		case strings.Contains(secText, "看过"):
			nWatched = nParsed
			log.Println("Found watched dramas:", nWatched)
		default:
			log.Println("Ignoring:", util.MergeSpaces(&secText))
		}
	})
	// Propagate fetch failures instead of silently crawling nothing: if the
	// landing page cannot be loaded, both counters would stay 0.
	if err := c.Visit(LocationPeopleURL + task.user + "/drama/"); err != nil {
		return err
	}

	if err := task.crawlDramaLists(nWatched, "watched", "collect"); err != nil {
		return err
	}
	if err := task.crawlDramaLists(nToWatch, "towatch", "wish"); err != nil {
		return err
	}
	return nil
}

// crawlDramaLists downloads one drama list (watched or to-watch) for the user.
// tag labels the output files; urlAction is the Douban path segment
// ("collect" for watched, "wish" for to-watch).
func (task *Collector) crawlDramaLists(totalItems int, tag string, urlAction string) error {
	// Each drama list page holds 15 entries.
	const itemsPerPage = 15
	// %d is filled in later by the generic list crawler with the page offset.
	pageTemplate := "https://www.douban.com/location/people/" + task.user + "/drama/" + urlAction +
		"?sort=time&start=%d&filter=all&mode=grid&tags_sort=count"
	return task.crawlItemLists(proto.Category_drama, totalItems, itemsPerPage, tag, pageTemplate)
}

// TODO: implement more crawlers.

// crawlItemLists downloads an item list universally.
Expand Down Expand Up @@ -452,7 +501,8 @@ func (task *Collector) crawlItemLists(cat proto.Category, totalItems int, pageSt
}

func (task *Collector) crawlItemDetails(cat proto.Category, selector string) error {
var urls []string
// Run statistical URL counter.
urlCounter := make(map[string]int)
inputFileNamePattern := fmt.Sprintf("*_%s_*.html", cat)
files := util.GetFilePathListWithPattern(task.outputDir, inputFileNamePattern)
for _, fn := range files {
Expand All @@ -466,10 +516,17 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
if !exists {
log.Fatal("Found item without link", sel.Text())
}
urls = append(urls, url)
urlCounter[url]++
})
}

// Convert map to sorted slices to make resuming idempotent.
var urls []string
for url := range urlCounter {
urls = append(urls, url)
}
sort.Strings(urls)

// Hack around to continue progress. Set to the last downloaded progress count (1-based, 0 by default).
// This hack will continue with the next URL in the queue.
const iResume = 0
Expand Down Expand Up @@ -542,7 +599,7 @@ func (task *Collector) getItemMatcherPattern(cat proto.Category) string {
switch cat {
case proto.Category_book:
return "class=\"subject-item\""
case proto.Category_movie, proto.Category_music:
case proto.Category_movie, proto.Category_music, proto.Category_drama:
return "class=\"item\""
case proto.Category_game:
return "class=\"common-item\""
Expand Down
5 changes: 3 additions & 2 deletions util/crawlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const RequestInterval = 1 * time.Second
// TODO: add a rate limiter.
func NewQueue() *queue.Queue {
q, err := queue.New(
1, // Number of consumer threads
1, // Number of consumer threads
&queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
)
if err != nil {
Expand All @@ -42,7 +42,7 @@ func NewColly() *colly.Collector {

c := colly.NewCollector(
colly.MaxDepth(1),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"),
colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61"),
)

c.OnError(func(r *colly.Response, err error) {
Expand All @@ -64,6 +64,7 @@ func NewColly() *colly.Collector {
r.Headers.Set("Cookie", cookies)
}

r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
r.Headers.Set("Referer", "https://www.douban.com/")
r.Headers.Set("Host", "https://www.douban.com/")
})
Expand Down
Loading